Enha: binary whisper stt impl

author: Grail Finder <wohilas@gmail.com> 2025-11-09 12:59:50 +0300
committer: Grail Finder <wohilas@gmail.com> 2025-11-09 12:59:50 +0300
commit: 48f32ba36a889d2514888589bc6c5da710a19feb (patch)
tree: 128e8e9b4b71d49bfd8e5b9841cf50bbfde8ec5e
parent: 4a581f6c122255bddcb3580539ff24b3c7d7c657 (diff)
3 files changed, 234 insertions, 0 deletions
diff --git a/config.example.toml b/config.example.toml
index 8a38073..ca88e3b 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -20,8 +20,14 @@ TTS_URL = "http://localhost:8880/v1/audio/speech"
 TTS_SPEED = 1.0
 # extra stt
 STT_ENABLED = false
+STT_TYPE = "WHISPER_SERVER" # WHISPER_SERVER or WHISPER_BINARY
 STT_URL = "http://localhost:8081/inference"
+WhisperBinaryPath = "./whisper-cli"  # Path to whisper binary (for WHISPER_BINARY mode)
+WhisperModelPath = "./ggml-model.bin"  # Path to whisper model file (for WHISPER_BINARY mode)
+STT_LANG = "en"  # Language for speech recognition (for WHISPER_BINARY mode)
+STT_SR = 16000  # Sample rate for audio recording
 DBPATH = "gflt.db"
+#
 FetchModelNameAPI = "http://localhost:8080/v1/models"
 # external search tool
 SearchAPI = "" # url to call the tool by
diff --git a/extra/stt.go b/extra/stt.go
index ddcc851..6712071 100644
--- a/extra/stt.go
+++ b/extra/stt.go
@@ -31,6 +31,7 @@ type StreamCloser interface {
 func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
 	switch cfg.STT_TYPE {
 	case "WHISPER_BINARY":
+		return NewWhisperBinary(logger, cfg)
 	case "WHISPER_SERVER":
 		return NewWhisperServer(logger, cfg)
 	}
diff --git a/extra/whisper_binary.go b/extra/whisper_binary.go
index 1002a97..17345f2 100644
--- a/extra/whisper_binary.go
+++ b/extra/whisper_binary.go
@@ -1,13 +1,21 @@
 package extra
 
 import (
+	"bytes"
 	"context"
+	"errors"
+	"fmt"
 	"gf-lt/config"
 	"log/slog"
+	"os"
 	"os/exec"
 	"sync"
+
+	"github.com/gordonklaus/portaudio"
 )
 
+
+
 type WhisperBinary struct {
 	whisperPath string
 	modelPath   string
@@ -17,6 +25,7 @@ type WhisperBinary struct {
 	mu          sync.Mutex
 	running     bool
 	cmd         *exec.Cmd
+	audioBuffer []int16
 }
 
 func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
@@ -29,3 +38,221 @@ func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
 		cancel:      cancel,
 	}
 }
+
+// StartRecording opens the default PortAudio input device (mono, 16 kHz) and
+// launches a goroutine that accumulates captured samples in w.audioBuffer.
+//
+// It returns an error if a recording is already in progress, if PortAudio
+// cannot be initialized, or if the input stream cannot be opened.
+func (w *WhisperBinary) StartRecording() error {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+
+	if w.running {
+		return errors.New("recording is already in progress")
+	}
+
+	// StopRecording cancels w.ctx. Without a fresh context the capture loop of
+	// every later session would observe Done() and exit immediately.
+	// NOTE(review): assumes w.ctx is a context.Context and w.cancel its cancel
+	// func, as set up by NewWhisperBinary — confirm against the struct.
+	if w.ctx == nil || w.ctx.Err() != nil {
+		w.ctx, w.cancel = context.WithCancel(context.Background())
+	}
+
+	if err := portaudio.Initialize(); err != nil {
+		return fmt.Errorf("portaudio init failed: %w", err)
+	}
+
+	// Each session starts with an empty sample buffer.
+	w.audioBuffer = make([]int16, 0)
+
+	// in is the buffer PortAudio fills on every stream.Read; with a single
+	// channel its length is also the number of frames per read.
+	in := make([]int16, 1024)
+	stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in)
+	if err != nil {
+		if paErr := portaudio.Terminate(); paErr != nil {
+			return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
+		}
+		return fmt.Errorf("failed to open microphone: %w", err)
+	}
+
+	// Placeholder command bound to the session context; it is not started
+	// anywhere in the code visible here.
+	w.cmd = exec.CommandContext(w.ctx, "sh", "-c", "echo 'dummy command'")
+
+	// Mark the session as running BEFORE launching the goroutine: the capture
+	// loop exits as soon as it observes running == false.
+	w.running = true
+	go w.recordAudio(stream, in)
+
+	return nil
+}
+
+// recordAudio is the capture loop run in its own goroutine by StartRecording.
+// It starts stream and, after every successful stream.Read, appends the
+// samples PortAudio placed in in to w.audioBuffer. The loop ends when the
+// session context is cancelled, when w.running is cleared, or when the stream
+// reports an error. On exit the stream is closed and PortAudio is terminated.
+func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) {
+	// Snapshot the context once: the go statement that launched this goroutine
+	// orders this read after the caller's writes, so no lock is needed, and a
+	// later replacement of w.ctx cannot revive this loop.
+	ctx := w.ctx
+
+	defer func() {
+		// Best-effort teardown; errors are ignored because the session is
+		// ending and there is no caller to report them to.
+		_ = stream.Close()
+		_ = portaudio.Terminate()
+	}()
+
+	if err := stream.Start(); err != nil {
+		return
+	}
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		default:
+		}
+
+		// running is written under w.mu elsewhere, so read it under w.mu too.
+		w.mu.Lock()
+		active := w.running
+		w.mu.Unlock()
+		if !active {
+			return
+		}
+
+		if err := stream.Read(); err != nil {
+			return
+		}
+
+		// append copies the samples out of in, so the next Read can safely
+		// overwrite it; append also handles a nil destination slice.
+		w.mu.Lock()
+		w.audioBuffer = append(w.audioBuffer, in...)
+		w.mu.Unlock()
+	}
+}
+
+// StopRecording ends the current capture session, writes the captured samples
+// to a temporary WAV file, runs the whisper binary on that file and returns
+// the binary's standard output unmodified.
+//
+// It returns an error if no recording is in progress, if the audio cannot be
+// saved, or if the whisper binary fails (its stderr is included in the error).
+func (w *WhisperBinary) StopRecording() (string, error) {
+	w.mu.Lock()
+	if !w.running {
+		w.mu.Unlock()
+		return "", errors.New("not currently recording")
+	}
+	w.running = false
+	w.cancel() // signals the capture goroutine to stop
+	w.mu.Unlock()
+
+	// Release the captured samples on every exit path, not only on success.
+	defer func() {
+		w.mu.Lock()
+		w.audioBuffer = nil
+		w.mu.Unlock()
+	}()
+
+	tempFile, err := w.saveAudioToTempFile()
+	if err != nil {
+		return "", fmt.Errorf("failed to save audio to temp file: %w", err)
+	}
+	defer os.Remove(tempFile) // the WAV file is only needed for this one run
+
+	// w.ctx was cancelled just above, and a command bound to an already
+	// cancelled context is never started (Run returns the context error).
+	// Transcription therefore must not be tied to the recording context.
+	cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile)
+
+	var outBuf, errBuf bytes.Buffer
+	cmd.Stdout = &outBuf
+	cmd.Stderr = &errBuf
+
+	if err := cmd.Run(); err != nil {
+		return "", fmt.Errorf("whisper binary failed: %w, stderr: %s", err, errBuf.String())
+	}
+
+	return outBuf.String(), nil
+}
+
+// saveAudioToTempFile writes the recorded audio to a newly created temporary
+// WAV file and returns its path. The caller owns the file and must remove it.
+// On failure no file is left behind and the returned path is empty.
+func (w *WhisperBinary) saveAudioToTempFile() (string, error) {
+	tempFile, err := os.CreateTemp("", "recording_*.wav")
+	if err != nil {
+		return "", fmt.Errorf("failed to create temp file: %w", err)
+	}
+	name := tempFile.Name()
+
+	// Only the unique name is needed: writeWAVFile reopens the path itself, so
+	// release this handle first instead of keeping two descriptors open.
+	if err := tempFile.Close(); err != nil {
+		_ = os.Remove(name) // best effort; the close error is what gets reported
+		return "", fmt.Errorf("failed to close temp file: %w", err)
+	}
+
+	if err := w.writeWAVFile(name); err != nil {
+		// The caller never learns the name on failure, so clean up here.
+		_ = os.Remove(name)
+		return "", fmt.Errorf("failed to write WAV file: %w", err)
+	}
+
+	return name, nil
+}
+
+// writeWAVFile encodes the recorded samples as 16-bit mono PCM at 16 kHz and
+// writes them, preceded by a WAV header, to filename (created or truncated).
+// It returns an error if nothing has been recorded or if the write fails.
+func (w *WhisperBinary) writeWAVFile(filename string) error {
+	// Work on a snapshot so the lock is not held during encoding and I/O.
+	w.mu.Lock()
+	audioData := make([]int16, len(w.audioBuffer))
+	copy(audioData, w.audioBuffer)
+	w.mu.Unlock()
+
+	// Check before touching the filesystem so an empty recording does not
+	// create or truncate the target.
+	if len(audioData) == 0 {
+		return errors.New("no audio data to write")
+	}
+
+	// Two bytes per int16 sample.
+	dataSize := len(audioData) * 2
+	header := w.createWAVHeader(16000, 1, 16, dataSize)
+
+	// Assemble the whole file in memory and write it in one call instead of
+	// issuing one write syscall per sample.
+	buf := make([]byte, len(header), len(header)+dataSize)
+	copy(buf, header)
+	for _, sample := range audioData {
+		// Little-endian: low byte first.
+		buf = append(buf, byte(sample), byte(sample>>8))
+	}
+
+	// 0o666 (before umask) is the mode os.Create uses; WriteFile also reports
+	// write and close errors.
+	return os.WriteFile(filename, buf, 0o666)
+}
+
+// createWAVHeader builds the 44-byte header of a canonical PCM WAV file.
+//
+// sampleRate is in Hz, channels is the channel count, bitsPerSample is the
+// sample width in bits and dataSize is the length in bytes of the sample data
+// that will follow the header. All multi-byte fields are little-endian.
+func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte {
+	header := make([]byte, 44)
+
+	// put16 and put32 store the low 16 or 32 bits of v at off, little-endian.
+	put16 := func(off, v int) {
+		header[off] = byte(v)
+		header[off+1] = byte(v >> 8)
+	}
+	put32 := func(off, v int) {
+		header[off] = byte(v)
+		header[off+1] = byte(v >> 8)
+		header[off+2] = byte(v >> 16)
+		header[off+3] = byte(v >> 24)
+	}
+
+	copy(header[0:4], "RIFF")
+	// RIFF chunk size: everything after this field, i.e. the remaining 36
+	// header bytes plus the sample data. Leaving it zero yields a header that
+	// strict WAV parsers may reject.
+	put32(4, 36+dataSize)
+	copy(header[8:12], "WAVE")
+
+	copy(header[12:16], "fmt ")
+	// fmt chunk size is 16 for PCM.
+	put32(16, 16)
+	// Audio format 1 = PCM.
+	put16(20, 1)
+	put16(22, channels)
+	put32(24, sampleRate)
+	// Byte rate: bytes of audio per second.
+	put32(28, sampleRate*channels*bitsPerSample/8)
+	// Block align: bytes per frame (one sample for every channel).
+	put16(32, channels*bitsPerSample/8)
+	put16(34, bitsPerSample)
+
+	copy(header[36:40], "data")
+	put32(40, dataSize)
+
+	return header
+}
+
+// IsRecording reports whether a recording session is currently marked as
+// running. The flag is read while holding w.mu.
+func (w *WhisperBinary) IsRecording() bool {
+	w.mu.Lock()
+	active := w.running
+	w.mu.Unlock()
+	return active
+}
author	Grail Finder <wohilas@gmail.com>	2025-11-09 12:59:50 +0300
committer	Grail Finder <wohilas@gmail.com>	2025-11-09 12:59:50 +0300
commit	48f32ba36a889d2514888589bc6c5da710a19feb (patch)
tree	128e8e9b4b71d49bfd8e5b9841cf50bbfde8ec5e
parent	4a581f6c122255bddcb3580539ff24b3c7d7c657 (diff)