summaryrefslogtreecommitdiff
path: root/extra
diff options
context:
space:
mode:
authorGrail Finder <wohilas@gmail.com>2026-03-09 07:07:36 +0300
committerGrail Finder <wohilas@gmail.com>2026-03-09 07:07:36 +0300
commit0e42a6f069ceea40485162c014c04cf718568cfe (patch)
tree583a6a6cb91b315e506990a03fdda1b32d0fe985 /extra
parent2687f38d00ceaa4f61034e3e02b9b59d08efc017 (diff)
parenta1b5f9cdc59938901123650fc0900067ac3447ca (diff)
Merge branch 'master' into feat/agent-flow
Diffstat (limited to 'extra')
-rw-r--r--extra/google_tts.go218
-rw-r--r--extra/kokoro.go259
-rw-r--r--extra/stt.go132
-rw-r--r--extra/tts.go406
-rw-r--r--extra/whisper_binary.go382
-rw-r--r--extra/whisper_server.go156
6 files changed, 750 insertions, 803 deletions
diff --git a/extra/google_tts.go b/extra/google_tts.go
new file mode 100644
index 0000000..782075d
--- /dev/null
+++ b/extra/google_tts.go
@@ -0,0 +1,218 @@
+//go:build extra
+// +build extra
+
+package extra
+
+import (
+ "fmt"
+ "gf-lt/models"
+ "io"
+ "log/slog"
+ "os/exec"
+ "strings"
+ "sync"
+
+ google_translate_tts "github.com/GrailFinder/google-translate-tts"
+ "github.com/neurosnap/sentences/english"
+)
+
+type GoogleTranslateOrator struct {
+ logger *slog.Logger
+ mu sync.Mutex
+ speech *google_translate_tts.Speech
+ // fields for playback control
+ cmd *exec.Cmd
+ cmdMu sync.Mutex
+ stopCh chan struct{}
+ // text buffer and interrupt flag
+ textBuffer strings.Builder
+ interrupt bool
+ Speed float32
+}
+
+func (o *GoogleTranslateOrator) stoproutine() {
+ for {
+ <-TTSDoneChan
+ o.logger.Debug("orator got done signal")
+ o.Stop()
+ for len(TTSTextChan) > 0 {
+ <-TTSTextChan
+ }
+ o.mu.Lock()
+ o.textBuffer.Reset()
+ o.interrupt = true
+ o.mu.Unlock()
+ }
+}
+
+func (o *GoogleTranslateOrator) readroutine() {
+ tokenizer, _ := english.NewSentenceTokenizer(nil)
+ for {
+ select {
+ case chunk := <-TTSTextChan:
+ o.mu.Lock()
+ o.interrupt = false
+ _, err := o.textBuffer.WriteString(chunk)
+ if err != nil {
+ o.logger.Warn("failed to write to stringbuilder", "error", err)
+ o.mu.Unlock()
+ continue
+ }
+ text := o.textBuffer.String()
+ sentences := tokenizer.Tokenize(text)
+ o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
+ if len(sentences) <= 1 {
+ o.mu.Unlock()
+ continue
+ }
+ completeSentences := sentences[:len(sentences)-1]
+ remaining := sentences[len(sentences)-1].Text
+ o.textBuffer.Reset()
+ o.textBuffer.WriteString(remaining)
+ o.mu.Unlock()
+ for _, sentence := range completeSentences {
+ o.mu.Lock()
+ interrupted := o.interrupt
+ o.mu.Unlock()
+ if interrupted {
+ return
+ }
+ cleanedText := models.CleanText(sentence.Text)
+ if cleanedText == "" {
+ continue
+ }
+ o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
+ if err := o.Speak(cleanedText); err != nil {
+ o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
+ }
+ }
+ case <-TTSFlushChan:
+ o.logger.Debug("got flushchan signal start")
+ // lln is done get the whole message out
+ if len(TTSTextChan) > 0 { // otherwise might get stuck
+ for chunk := range TTSTextChan {
+ o.mu.Lock()
+ _, err := o.textBuffer.WriteString(chunk)
+ o.mu.Unlock()
+ if err != nil {
+ o.logger.Warn("failed to write to stringbuilder", "error", err)
+ continue
+ }
+ if len(TTSTextChan) == 0 {
+ break
+ }
+ }
+ }
+ o.mu.Lock()
+ remaining := o.textBuffer.String()
+ remaining = models.CleanText(remaining)
+ o.textBuffer.Reset()
+ o.mu.Unlock()
+ if remaining == "" {
+ continue
+ }
+ o.logger.Debug("calling Speak with remainder", "rem", remaining)
+ sentencesRem := tokenizer.Tokenize(remaining)
+ for _, rs := range sentencesRem { // to avoid dumping large volume of text
+ o.mu.Lock()
+ interrupt := o.interrupt
+ o.mu.Unlock()
+ if interrupt {
+ break
+ }
+ if err := o.Speak(rs.Text); err != nil {
+ o.logger.Error("tts failed", "sentence", rs.Text, "error", err)
+ }
+ }
+ }
+ }
+}
+
+func (o *GoogleTranslateOrator) GetLogger() *slog.Logger {
+ return o.logger
+}
+
+func (o *GoogleTranslateOrator) Speak(text string) error {
+ o.logger.Debug("fn: Speak is called", "text-len", len(text))
+ // Generate MP3 data directly as an io.Reader
+ reader, err := o.speech.GenerateSpeech(text)
+ if err != nil {
+ return fmt.Errorf("generate speech failed: %w", err)
+ }
+ // Wrap in io.NopCloser since GenerateSpeech returns io.Reader (no close needed)
+ body := io.NopCloser(reader)
+ defer body.Close()
+ // Build ffplay command with optional speed filter
+ args := []string{"-nodisp", "-autoexit"}
+ if o.Speed > 0.1 && o.Speed != 1.0 {
+ // atempo range is 0.5 to 2.0; you might clamp it here
+ args = append(args, "-af", fmt.Sprintf("atempo=%.2f", o.Speed))
+ }
+ args = append(args, "-i", "pipe:0")
+ cmd := exec.Command("ffplay", args...)
+ stdin, err := cmd.StdinPipe()
+ if err != nil {
+ return fmt.Errorf("failed to get stdin pipe: %w", err)
+ }
+ o.cmdMu.Lock()
+ o.cmd = cmd
+ o.stopCh = make(chan struct{})
+ o.cmdMu.Unlock()
+ if err := cmd.Start(); err != nil {
+ return fmt.Errorf("failed to start ffplay: %w", err)
+ }
+ copyErr := make(chan error, 1)
+ go func() {
+ _, err := io.Copy(stdin, body)
+ stdin.Close()
+ copyErr <- err
+ }()
+ done := make(chan error, 1)
+ go func() {
+ done <- cmd.Wait()
+ }()
+ select {
+ case <-o.stopCh:
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ }
+ <-done
+ return nil
+ case copyErrVal := <-copyErr:
+ if copyErrVal != nil {
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ }
+ <-done
+ return copyErrVal
+ }
+ return <-done
+ case err := <-done:
+ return err
+ }
+}
+
+func (o *GoogleTranslateOrator) Stop() {
+ o.cmdMu.Lock()
+ defer o.cmdMu.Unlock()
+ // Signal any running Speak to stop
+ if o.stopCh != nil {
+ select {
+ case <-o.stopCh: // already closed
+ default:
+ close(o.stopCh)
+ }
+ o.stopCh = nil
+ }
+ // Kill the external player process if it's still running
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ o.cmd.Wait() // clean up zombie process
+ o.cmd = nil
+ }
+ // Also reset text buffer and interrupt flag (with o.mu)
+ o.mu.Lock()
+ o.textBuffer.Reset()
+ o.interrupt = true
+ o.mu.Unlock()
+}
diff --git a/extra/kokoro.go b/extra/kokoro.go
new file mode 100644
index 0000000..e3ca047
--- /dev/null
+++ b/extra/kokoro.go
@@ -0,0 +1,259 @@
+//go:build extra
+// +build extra
+
+package extra
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "gf-lt/models"
+ "io"
+ "log/slog"
+ "net/http"
+ "os/exec"
+ "strings"
+ "sync"
+
+ "github.com/neurosnap/sentences/english"
+)
+
+type KokoroOrator struct {
+ logger *slog.Logger
+ mu sync.Mutex
+ URL string
+ Format models.AudioFormat
+ Stream bool
+ Speed float32
+ Language string
+ Voice string
+ // fields for playback control
+ cmd *exec.Cmd
+ cmdMu sync.Mutex
+ stopCh chan struct{}
+ // textBuffer, interrupt etc. remain the same
+ textBuffer strings.Builder
+ interrupt bool
+}
+
+func (o *KokoroOrator) GetLogger() *slog.Logger {
+ return o.logger
+}
+
+func (o *KokoroOrator) Speak(text string) error {
+ o.logger.Debug("fn: Speak is called", "text-len", len(text))
+ body, err := o.requestSound(text)
+ if err != nil {
+ return fmt.Errorf("request failed: %w", err)
+ }
+ defer body.Close()
+ cmd := exec.Command("ffplay", "-nodisp", "-autoexit", "-i", "pipe:0")
+ stdin, err := cmd.StdinPipe()
+ if err != nil {
+ return fmt.Errorf("failed to get stdin pipe: %w", err)
+ }
+ o.cmdMu.Lock()
+ o.cmd = cmd
+ o.stopCh = make(chan struct{})
+ o.cmdMu.Unlock()
+ if err := cmd.Start(); err != nil {
+ return fmt.Errorf("failed to start ffplay: %w", err)
+ }
+ // Copy audio in background
+ copyErr := make(chan error, 1)
+ go func() {
+ _, err := io.Copy(stdin, body)
+ stdin.Close()
+ copyErr <- err
+ }()
+ // Wait for player in background
+ done := make(chan error, 1)
+ go func() {
+ done <- cmd.Wait()
+ }()
+ // Wait for BOTH copy and player, but ensure we block until done
+ select {
+ case <-o.stopCh:
+ // Stop requested: kill player and wait for it to exit
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ }
+ <-done // Wait for process to actually exit
+ return nil
+ case copyErrVal := <-copyErr:
+ if copyErrVal != nil {
+ // Copy failed: kill player and wait
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ }
+ <-done
+ return copyErrVal
+ }
+ // Copy succeeded, now wait for playback to complete
+ return <-done
+ case err := <-done:
+ // Playback finished normally (copy must have succeeded or player would have exited early)
+ return err
+ }
+}
+func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
+ if o.URL == "" {
+ return nil, fmt.Errorf("TTS URL is empty")
+ }
+ payload := map[string]interface{}{
+ "input": text,
+ "voice": o.Voice,
+ "response_format": o.Format,
+ "download_format": o.Format,
+ "stream": o.Stream,
+ "speed": o.Speed,
+ // "return_download_link": true,
+ "lang_code": o.Language,
+ }
+ payloadBytes, err := json.Marshal(payload)
+ if err != nil {
+ return nil, fmt.Errorf("failed to marshal payload: %w", err)
+ }
+ req, err := http.NewRequest("POST", o.URL, bytes.NewBuffer(payloadBytes)) //nolint:noctx
+ if err != nil {
+ return nil, fmt.Errorf("failed to create request: %w", err)
+ }
+ req.Header.Set("accept", "application/json")
+ req.Header.Set("Content-Type", "application/json")
+ resp, err := http.DefaultClient.Do(req)
+ if err != nil {
+ return nil, fmt.Errorf("request failed: %w", err)
+ }
+ if resp.StatusCode != http.StatusOK {
+ defer resp.Body.Close()
+ return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+ }
+ return resp.Body, nil
+}
+
+func (o *KokoroOrator) stoproutine() {
+ for {
+ <-TTSDoneChan
+ o.logger.Debug("orator got done signal")
+ // 1. Stop any ongoing playback (kills external player, closes stopCh)
+ o.Stop()
+ // 2. Drain any pending text chunks
+ for len(TTSTextChan) > 0 {
+ <-TTSTextChan
+ }
+ // 3. Reset internal state
+ o.mu.Lock()
+ o.textBuffer.Reset()
+ o.interrupt = true
+ o.mu.Unlock()
+ }
+}
+
+func (o *KokoroOrator) Stop() {
+ o.cmdMu.Lock()
+ defer o.cmdMu.Unlock()
+ // Signal any running Speak to stop
+ if o.stopCh != nil {
+ select {
+ case <-o.stopCh: // already closed
+ default:
+ close(o.stopCh)
+ }
+ o.stopCh = nil
+ }
+ // Kill the external player process if it's still running
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ o.cmd.Wait() // clean up zombie process
+ o.cmd = nil
+ }
+ // Also reset text buffer and interrupt flag (with o.mu)
+ o.mu.Lock()
+ o.textBuffer.Reset()
+ o.interrupt = true
+ o.mu.Unlock()
+}
+
+func (o *KokoroOrator) readroutine() {
+ tokenizer, _ := english.NewSentenceTokenizer(nil)
+ for {
+ select {
+ case chunk := <-TTSTextChan:
+ o.mu.Lock()
+ o.interrupt = false
+ _, err := o.textBuffer.WriteString(chunk)
+ if err != nil {
+ o.logger.Warn("failed to write to stringbuilder", "error", err)
+ o.mu.Unlock()
+ continue
+ }
+ text := o.textBuffer.String()
+ sentences := tokenizer.Tokenize(text)
+ o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
+ if len(sentences) <= 1 {
+ o.mu.Unlock()
+ continue
+ }
+ completeSentences := sentences[:len(sentences)-1]
+ remaining := sentences[len(sentences)-1].Text
+ o.textBuffer.Reset()
+ o.textBuffer.WriteString(remaining)
+ o.mu.Unlock()
+ for _, sentence := range completeSentences {
+ o.mu.Lock()
+ interrupted := o.interrupt
+ o.mu.Unlock()
+ if interrupted {
+ return
+ }
+ cleanedText := models.CleanText(sentence.Text)
+ if cleanedText == "" {
+ continue
+ }
+ o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
+ if err := o.Speak(cleanedText); err != nil {
+ o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
+ }
+ }
+ case <-TTSFlushChan:
+ o.logger.Debug("got flushchan signal start")
+ // lln is done get the whole message out
+ if len(TTSTextChan) > 0 { // otherwise might get stuck
+ for chunk := range TTSTextChan {
+ o.mu.Lock()
+ _, err := o.textBuffer.WriteString(chunk)
+ o.mu.Unlock()
+ if err != nil {
+ o.logger.Warn("failed to write to stringbuilder", "error", err)
+ continue
+ }
+ if len(TTSTextChan) == 0 {
+ break
+ }
+ }
+ }
+ // flush remaining text
+ o.mu.Lock()
+ remaining := o.textBuffer.String()
+ remaining = models.CleanText(remaining)
+ o.textBuffer.Reset()
+ o.mu.Unlock()
+ if remaining == "" {
+ continue
+ }
+ o.logger.Debug("calling Speak with remainder", "rem", remaining)
+ sentencesRem := tokenizer.Tokenize(remaining)
+ for _, rs := range sentencesRem { // to avoid dumping large volume of text
+ o.mu.Lock()
+ interrupt := o.interrupt
+ o.mu.Unlock()
+ if interrupt {
+ break
+ }
+ if err := o.Speak(rs.Text); err != nil {
+ o.logger.Error("tts failed", "sentence", rs, "error", err)
+ }
+ }
+ }
+ }
+}
diff --git a/extra/stt.go b/extra/stt.go
index 86fcf9c..7bbf2fd 100644
--- a/extra/stt.go
+++ b/extra/stt.go
@@ -6,18 +6,10 @@ package extra
import (
"bytes"
"encoding/binary"
- "errors"
- "fmt"
"gf-lt/config"
"io"
"log/slog"
- "mime/multipart"
- "net/http"
"regexp"
- "strings"
- "syscall"
-
- "github.com/gordonklaus/portaudio"
)
var specialRE = regexp.MustCompile(`\[.*?\]`)
@@ -44,14 +36,6 @@ func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
return NewWhisperServer(logger, cfg)
}
-type WhisperServer struct {
- logger *slog.Logger
- ServerURL string
- SampleRate int
- AudioBuffer *bytes.Buffer
- recording bool
-}
-
func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
return &WhisperServer{
logger: logger,
@@ -61,69 +45,6 @@ func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
}
}
-func (stt *WhisperServer) StartRecording() error {
- if err := stt.microphoneStream(stt.SampleRate); err != nil {
- return fmt.Errorf("failed to init microphone: %w", err)
- }
- stt.recording = true
- return nil
-}
-
-func (stt *WhisperServer) StopRecording() (string, error) {
- stt.recording = false
- // wait loop to finish?
- if stt.AudioBuffer == nil {
- err := errors.New("unexpected nil AudioBuffer")
- stt.logger.Error(err.Error())
- return "", err
- }
- // Create WAV header first
- body := &bytes.Buffer{}
- writer := multipart.NewWriter(body)
- // Add audio file part
- part, err := writer.CreateFormFile("file", "recording.wav")
- if err != nil {
- stt.logger.Error("fn: StopRecording", "error", err)
- return "", err
- }
- // Stream directly to multipart writer: header + raw data
- dataSize := stt.AudioBuffer.Len()
- stt.writeWavHeader(part, dataSize)
- if _, err := io.Copy(part, stt.AudioBuffer); err != nil {
- stt.logger.Error("fn: StopRecording", "error", err)
- return "", err
- }
- // Reset buffer for next recording
- stt.AudioBuffer.Reset()
- // Add response format field
- err = writer.WriteField("response_format", "text")
- if err != nil {
- stt.logger.Error("fn: StopRecording", "error", err)
- return "", err
- }
- if writer.Close() != nil {
- stt.logger.Error("fn: StopRecording", "error", err)
- return "", err
- }
- // Send request
- resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx
- if err != nil {
- stt.logger.Error("fn: StopRecording", "error", err)
- return "", err
- }
- defer resp.Body.Close()
- // Read and print response
- responseTextBytes, err := io.ReadAll(resp.Body)
- if err != nil {
- stt.logger.Error("fn: StopRecording", "error", err)
- return "", err
- }
- resptext := strings.TrimRight(string(responseTextBytes), "\n")
- // in case there are special tokens like [_BEG_]
- resptext = specialRE.ReplaceAllString(resptext, "")
- return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
-}
-
func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
header := make([]byte, 44)
copy(header[0:4], "RIFF")
@@ -147,56 +68,3 @@ func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
func (stt *WhisperServer) IsRecording() bool {
return stt.recording
}
-
-func (stt *WhisperServer) microphoneStream(sampleRate int) error {
- // Temporarily redirect stderr to suppress ALSA warnings during PortAudio init
- origStderr, errDup := syscall.Dup(syscall.Stderr)
- if errDup != nil {
- return fmt.Errorf("failed to dup stderr: %w", errDup)
- }
- nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
- if err != nil {
- _ = syscall.Close(origStderr) // Close the dup'd fd if open fails
- return fmt.Errorf("failed to open /dev/null: %w", err)
- }
- // redirect stderr
- _ = syscall.Dup2(nullFD, syscall.Stderr)
- // Initialize PortAudio (this is where ALSA warnings occur)
- defer func() {
- // Restore stderr
- _ = syscall.Dup2(origStderr, syscall.Stderr)
- _ = syscall.Close(origStderr)
- _ = syscall.Close(nullFD)
- }()
- if err := portaudio.Initialize(); err != nil {
- return fmt.Errorf("portaudio init failed: %w", err)
- }
- in := make([]int16, 64)
- stream, err := portaudio.OpenDefaultStream(1, 0, float64(sampleRate), len(in), in)
- if err != nil {
- if paErr := portaudio.Terminate(); paErr != nil {
- return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
- }
- return fmt.Errorf("failed to open microphone: %w", err)
- }
- go func(stream *portaudio.Stream) {
- if err := stream.Start(); err != nil {
- stt.logger.Error("microphoneStream", "error", err)
- return
- }
- for {
- if !stt.IsRecording() {
- return
- }
- if err := stream.Read(); err != nil {
- stt.logger.Error("reading stream", "error", err)
- return
- }
- if err := binary.Write(stt.AudioBuffer, binary.LittleEndian, in); err != nil {
- stt.logger.Error("writing to buffer", "error", err)
- return
- }
- }
- }(stream)
- return nil
-}
diff --git a/extra/tts.go b/extra/tts.go
index 1960aa7..2ddb0ae 100644
--- a/extra/tts.go
+++ b/extra/tts.go
@@ -4,25 +4,13 @@
package extra
import (
- "bytes"
- "encoding/json"
- "fmt"
"gf-lt/config"
"gf-lt/models"
- "io"
"log/slog"
- "net/http"
"os"
"strings"
- "sync"
- "time"
google_translate_tts "github.com/GrailFinder/google-translate-tts"
- "github.com/GrailFinder/google-translate-tts/handlers"
- "github.com/gopxl/beep/v2"
- "github.com/gopxl/beep/v2/mp3"
- "github.com/gopxl/beep/v2/speaker"
- "github.com/neurosnap/sentences/english"
)
var (
@@ -39,142 +27,6 @@ type Orator interface {
GetLogger() *slog.Logger
}
-// impl https://github.com/remsky/Kokoro-FastAPI
-type KokoroOrator struct {
- logger *slog.Logger
- mu sync.Mutex
- URL string
- Format models.AudioFormat
- Stream bool
- Speed float32
- Language string
- Voice string
- currentStream *beep.Ctrl // Added for playback control
- currentDone chan bool
- textBuffer strings.Builder
- interrupt bool
- // textBuffer bytes.Buffer
-}
-
-// Google Translate TTS implementation
-type GoogleTranslateOrator struct {
- logger *slog.Logger
- mu sync.Mutex
- speech *google_translate_tts.Speech
- currentStream *beep.Ctrl
- currentDone chan bool
- textBuffer strings.Builder
- interrupt bool
-}
-
-func (o *KokoroOrator) stoproutine() {
- for {
- <-TTSDoneChan
- o.logger.Debug("orator got done signal")
- o.Stop()
- // drain the channel
- for len(TTSTextChan) > 0 {
- <-TTSTextChan
- }
- o.mu.Lock()
- o.textBuffer.Reset()
- if o.currentDone != nil {
- select {
- case o.currentDone <- true:
- default:
- // Channel might be closed, ignore
- }
- }
- o.interrupt = true
- o.mu.Unlock()
- }
-}
-
-func (o *KokoroOrator) readroutine() {
- tokenizer, _ := english.NewSentenceTokenizer(nil)
- for {
- select {
- case chunk := <-TTSTextChan:
- o.mu.Lock()
- o.interrupt = false
- _, err := o.textBuffer.WriteString(chunk)
- if err != nil {
- o.logger.Warn("failed to write to stringbuilder", "error", err)
- o.mu.Unlock()
- continue
- }
- text := o.textBuffer.String()
- sentences := tokenizer.Tokenize(text)
- o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
- if len(sentences) <= 1 {
- o.mu.Unlock()
- continue
- }
- completeSentences := sentences[:len(sentences)-1]
- remaining := sentences[len(sentences)-1].Text
- o.textBuffer.Reset()
- o.textBuffer.WriteString(remaining)
- o.mu.Unlock()
-
- for _, sentence := range completeSentences {
- o.mu.Lock()
- interrupted := o.interrupt
- o.mu.Unlock()
- if interrupted {
- return
- }
- cleanedText := models.CleanText(sentence.Text)
- if cleanedText == "" {
- continue
- }
- o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
- if err := o.Speak(cleanedText); err != nil {
- o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
- }
- }
- case <-TTSFlushChan:
- o.logger.Debug("got flushchan signal start")
- // lln is done get the whole message out
- if len(TTSTextChan) > 0 { // otherwise might get stuck
- for chunk := range TTSTextChan {
- o.mu.Lock()
- _, err := o.textBuffer.WriteString(chunk)
- o.mu.Unlock()
- if err != nil {
- o.logger.Warn("failed to write to stringbuilder", "error", err)
- continue
- }
- if len(TTSTextChan) == 0 {
- break
- }
- }
- }
- // flush remaining text
- o.mu.Lock()
- remaining := o.textBuffer.String()
- remaining = models.CleanText(remaining)
- o.textBuffer.Reset()
- o.mu.Unlock()
- if remaining == "" {
- continue
- }
- o.logger.Debug("calling Speak with remainder", "rem", remaining)
- sentencesRem := tokenizer.Tokenize(remaining)
- for _, rs := range sentencesRem { // to avoid dumping large volume of text
- o.mu.Lock()
- interrupt := o.interrupt
- o.mu.Unlock()
- if interrupt {
- break
- }
- if err := o.Speak(rs.Text); err != nil {
- o.logger.Error("tts failed", "sentence", rs, "error", err)
- }
- }
- }
- }
-}
-
func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
provider := cfg.TTS_PROVIDER
if provider == "" {
@@ -204,270 +56,14 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
Language: language,
Proxy: "", // Proxy not supported
Speed: cfg.TTS_SPEED,
- Handler: &handlers.Beep{},
}
orator := &GoogleTranslateOrator{
logger: log,
speech: speech,
+ Speed: cfg.TTS_SPEED,
}
go orator.readroutine()
go orator.stoproutine()
return orator
}
}
-
-func (o *KokoroOrator) GetLogger() *slog.Logger {
- return o.logger
-}
-
-func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
- if o.URL == "" {
- return nil, fmt.Errorf("TTS URL is empty")
- }
- payload := map[string]interface{}{
- "input": text,
- "voice": o.Voice,
- "response_format": o.Format,
- "download_format": o.Format,
- "stream": o.Stream,
- "speed": o.Speed,
- // "return_download_link": true,
- "lang_code": o.Language,
- }
- payloadBytes, err := json.Marshal(payload)
- if err != nil {
- return nil, fmt.Errorf("failed to marshal payload: %w", err)
- }
- req, err := http.NewRequest("POST", o.URL, bytes.NewBuffer(payloadBytes)) //nolint:noctx
- if err != nil {
- return nil, fmt.Errorf("failed to create request: %w", err)
- }
- req.Header.Set("accept", "application/json")
- req.Header.Set("Content-Type", "application/json")
- resp, err := http.DefaultClient.Do(req)
- if err != nil {
- return nil, fmt.Errorf("request failed: %w", err)
- }
- if resp.StatusCode != http.StatusOK {
- defer resp.Body.Close()
- return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- }
- return resp.Body, nil
-}
-
-func (o *KokoroOrator) Speak(text string) error {
- o.logger.Debug("fn: Speak is called", "text-len", len(text))
- body, err := o.requestSound(text)
- if err != nil {
- o.logger.Error("request failed", "error", err)
- return fmt.Errorf("request failed: %w", err)
- }
- defer body.Close()
- // Decode the mp3 audio from response body
- streamer, format, err := mp3.Decode(body)
- if err != nil {
- o.logger.Error("mp3 decode failed", "error", err)
- return fmt.Errorf("mp3 decode failed: %w", err)
- }
- defer streamer.Close()
- // here it spams with errors that speaker cannot be initialized more than once, but how would we deal with many audio records then?
- if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
- o.logger.Debug("failed to init speaker", "error", err)
- }
- done := make(chan bool)
- o.mu.Lock()
- o.currentDone = done
- o.currentStream = &beep.Ctrl{Streamer: beep.Seq(streamer, beep.Callback(func() {
- o.mu.Lock()
- close(done)
- o.currentStream = nil
- o.currentDone = nil
- o.mu.Unlock()
- })), Paused: false}
- o.mu.Unlock()
- speaker.Play(o.currentStream)
- <-done
- return nil
-}
-
-func (o *KokoroOrator) Stop() {
- // speaker.Clear()
- o.logger.Debug("attempted to stop orator", "orator", o)
- speaker.Lock()
- defer speaker.Unlock()
- o.mu.Lock()
- defer o.mu.Unlock()
- if o.currentStream != nil {
- // o.currentStream.Paused = true
- o.currentStream.Streamer = nil
- }
-}
-
-func (o *GoogleTranslateOrator) stoproutine() {
- for {
- <-TTSDoneChan
- o.logger.Debug("orator got done signal")
- o.Stop()
- // drain the channel
- for len(TTSTextChan) > 0 {
- <-TTSTextChan
- }
- o.mu.Lock()
- o.textBuffer.Reset()
- if o.currentDone != nil {
- select {
- case o.currentDone <- true:
- default:
- // Channel might be closed, ignore
- }
- }
- o.interrupt = true
- o.mu.Unlock()
- }
-}
-
-func (o *GoogleTranslateOrator) readroutine() {
- tokenizer, _ := english.NewSentenceTokenizer(nil)
- for {
- select {
- case chunk := <-TTSTextChan:
- o.mu.Lock()
- o.interrupt = false
- _, err := o.textBuffer.WriteString(chunk)
- if err != nil {
- o.logger.Warn("failed to write to stringbuilder", "error", err)
- o.mu.Unlock()
- continue
- }
- text := o.textBuffer.String()
- sentences := tokenizer.Tokenize(text)
- o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
- if len(sentences) <= 1 {
- o.mu.Unlock()
- continue
- }
- completeSentences := sentences[:len(sentences)-1]
- remaining := sentences[len(sentences)-1].Text
- o.textBuffer.Reset()
- o.textBuffer.WriteString(remaining)
- o.mu.Unlock()
-
- for _, sentence := range completeSentences {
- o.mu.Lock()
- interrupted := o.interrupt
- o.mu.Unlock()
- if interrupted {
- return
- }
- cleanedText := models.CleanText(sentence.Text)
- if cleanedText == "" {
- continue
- }
- o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
- if err := o.Speak(cleanedText); err != nil {
- o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
- }
- }
- case <-TTSFlushChan:
- o.logger.Debug("got flushchan signal start")
- // lln is done get the whole message out
- if len(TTSTextChan) > 0 { // otherwise might get stuck
- for chunk := range TTSTextChan {
- o.mu.Lock()
- _, err := o.textBuffer.WriteString(chunk)
- o.mu.Unlock()
- if err != nil {
- o.logger.Warn("failed to write to stringbuilder", "error", err)
- continue
- }
- if len(TTSTextChan) == 0 {
- break
- }
- }
- }
- o.mu.Lock()
- remaining := o.textBuffer.String()
- remaining = models.CleanText(remaining)
- o.textBuffer.Reset()
- o.mu.Unlock()
- if remaining == "" {
- continue
- }
- o.logger.Debug("calling Speak with remainder", "rem", remaining)
- sentencesRem := tokenizer.Tokenize(remaining)
- for _, rs := range sentencesRem { // to avoid dumping large volume of text
- o.mu.Lock()
- interrupt := o.interrupt
- o.mu.Unlock()
- if interrupt {
- break
- }
- if err := o.Speak(rs.Text); err != nil {
- o.logger.Error("tts failed", "sentence", rs.Text, "error", err)
- }
- }
- }
- }
-}
-
-func (o *GoogleTranslateOrator) GetLogger() *slog.Logger {
- return o.logger
-}
-
-func (o *GoogleTranslateOrator) Speak(text string) error {
- o.logger.Debug("fn: Speak is called", "text-len", len(text))
- // Generate MP3 data using google-translate-tts
- reader, err := o.speech.GenerateSpeech(text)
- if err != nil {
- o.logger.Error("generate speech failed", "error", err)
- return fmt.Errorf("generate speech failed: %w", err)
- }
- // Decode the mp3 audio from reader (wrap with NopCloser for io.ReadCloser)
- streamer, format, err := mp3.Decode(io.NopCloser(reader))
- if err != nil {
- o.logger.Error("mp3 decode failed", "error", err)
- return fmt.Errorf("mp3 decode failed: %w", err)
- }
- defer streamer.Close()
- playbackStreamer := beep.Streamer(streamer)
- speed := o.speech.Speed
- if speed <= 0 {
- speed = 1.0
- }
- if speed != 1.0 {
- playbackStreamer = beep.ResampleRatio(3, float64(speed), streamer)
- }
- // Initialize speaker with the format's sample rate
- if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
- o.logger.Debug("failed to init speaker", "error", err)
- }
- done := make(chan bool)
- o.mu.Lock()
- o.currentDone = done
- o.currentStream = &beep.Ctrl{Streamer: beep.Seq(playbackStreamer, beep.Callback(func() {
- o.mu.Lock()
- close(done)
- o.currentStream = nil
- o.currentDone = nil
- o.mu.Unlock()
- })), Paused: false}
- o.mu.Unlock()
- speaker.Play(o.currentStream)
- <-done // wait for playback to complete
- return nil
-}
-
-func (o *GoogleTranslateOrator) Stop() {
- o.logger.Debug("attempted to stop google translate orator")
- speaker.Lock()
- defer speaker.Unlock()
- o.mu.Lock()
- defer o.mu.Unlock()
- if o.currentStream != nil {
- o.currentStream.Streamer = nil
- }
- // Also stop the speech handler if possible
- if o.speech != nil {
- _ = o.speech.Stop()
- }
-}
diff --git a/extra/whisper_binary.go b/extra/whisper_binary.go
index 6b7ddc8..1c35952 100644
--- a/extra/whisper_binary.go
+++ b/extra/whisper_binary.go
@@ -9,15 +9,13 @@ import (
"errors"
"fmt"
"gf-lt/config"
- "io"
"log/slog"
"os"
"os/exec"
"strings"
"sync"
"syscall"
-
- "github.com/gordonklaus/portaudio"
+ "time"
)
type WhisperBinary struct {
@@ -25,24 +23,14 @@ type WhisperBinary struct {
whisperPath string
modelPath string
lang string
- ctx context.Context
- cancel context.CancelFunc
- mu sync.Mutex
- recording bool
- audioBuffer []int16
-}
-
-func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
- ctx, cancel := context.WithCancel(context.Background())
- // Set ALSA error handler first
- return &WhisperBinary{
- logger: logger,
- whisperPath: cfg.WhisperBinaryPath,
- modelPath: cfg.WhisperModelPath,
- lang: cfg.STT_LANG,
- ctx: ctx,
- cancel: cancel,
- }
+ // Per-recording fields (protected by mu)
+ mu sync.Mutex
+ recording bool
+ tempFile string
+ ctx context.Context
+ cancel context.CancelFunc
+ cmd *exec.Cmd
+ cmdMu sync.Mutex
}
func (w *WhisperBinary) StartRecording() error {
@@ -51,276 +39,138 @@ func (w *WhisperBinary) StartRecording() error {
if w.recording {
return errors.New("recording is already in progress")
}
- // If context is cancelled, create a new one for the next recording session
- if w.ctx.Err() != nil {
- w.logger.Debug("Context cancelled, creating new context")
- w.ctx, w.cancel = context.WithCancel(context.Background())
- }
- // Temporarily redirect stderr to suppress ALSA warnings during PortAudio init
- origStderr, errDup := syscall.Dup(syscall.Stderr)
- if errDup != nil {
- return fmt.Errorf("failed to dup stderr: %w", errDup)
- }
- nullFD, err := syscall.Open("/dev/null", syscall.O_WRONLY, 0)
+ // Fresh context for this recording
+ ctx, cancel := context.WithCancel(context.Background())
+ w.ctx = ctx
+ w.cancel = cancel
+ // Create temporary file
+ tempFile, err := os.CreateTemp("", "recording_*.wav")
if err != nil {
- _ = syscall.Close(origStderr) // Close the dup'd fd if open fails
- return fmt.Errorf("failed to open /dev/null: %w", err)
- }
- // redirect stderr
- _ = syscall.Dup2(nullFD, syscall.Stderr)
- // Initialize PortAudio (this is where ALSA warnings occur)
- portaudioErr := portaudio.Initialize()
- defer func() {
- // Restore stderr
- _ = syscall.Dup2(origStderr, syscall.Stderr)
- _ = syscall.Close(origStderr)
- _ = syscall.Close(nullFD)
- }()
- if portaudioErr != nil {
- return fmt.Errorf("portaudio init failed: %w", portaudioErr)
- }
- // Initialize audio buffer
- w.audioBuffer = make([]int16, 0)
- in := make([]int16, 1024) // buffer size
- stream, err := portaudio.OpenDefaultStream(1, 0, 16000.0, len(in), in)
+ cancel()
+ return fmt.Errorf("failed to create temp file: %w", err)
+ }
+ tempFile.Close()
+ w.tempFile = tempFile.Name()
+ // ffmpeg command: capture from default microphone, write WAV
+ args := []string{
+ "-f", "alsa", // or "pulse" if preferred
+ "-i", "default",
+ "-acodec", "pcm_s16le",
+ "-ar", "16000",
+ "-ac", "1",
+ "-y", // overwrite output file
+ w.tempFile,
+ }
+ cmd := exec.CommandContext(w.ctx, "ffmpeg", args...)
+ // Capture stderr for debugging (optional, but useful for diagnosing)
+ stderr, err := cmd.StderrPipe()
if err != nil {
- if paErr := portaudio.Terminate(); paErr != nil {
- return fmt.Errorf("failed to open microphone: %w; terminate error: %w", err, paErr)
- }
- return fmt.Errorf("failed to open microphone: %w", err)
- }
- go w.recordAudio(stream, in)
- w.recording = true
- w.logger.Debug("Recording started")
- return nil
-}
-
-func (w *WhisperBinary) recordAudio(stream *portaudio.Stream, in []int16) {
- defer func() {
- w.logger.Debug("recordAudio defer function called")
- _ = stream.Stop() // Stop the stream
- _ = portaudio.Terminate() // ignoring error as we're shutting down
- w.logger.Debug("recordAudio terminated")
- }()
- w.logger.Debug("Starting audio stream")
- if err := stream.Start(); err != nil {
- w.logger.Error("Failed to start audio stream", "error", err)
- return
- }
- w.logger.Debug("Audio stream started, entering recording loop")
- for {
- select {
- case <-w.ctx.Done():
- w.logger.Debug("Context done, exiting recording loop")
- return
- default:
- // Check recording status with minimal lock time
- w.mu.Lock()
- recording := w.recording
- w.mu.Unlock()
-
- if !recording {
- w.logger.Debug("Recording flag is false, exiting recording loop")
- return
+ cancel()
+ os.Remove(w.tempFile)
+ return fmt.Errorf("failed to create stderr pipe: %w", err)
+ }
+ go func() {
+ buf := make([]byte, 1024)
+ for {
+ n, err := stderr.Read(buf)
+ if n > 0 {
+ w.logger.Debug("ffmpeg stderr", "output", string(buf[:n]))
}
- if err := stream.Read(); err != nil {
- w.logger.Error("Error reading from stream", "error", err)
- return
+ if err != nil {
+ break
}
- // Append samples to buffer - only acquire lock when necessary
- w.mu.Lock()
- if w.audioBuffer == nil {
- w.audioBuffer = make([]int16, 0)
- }
- // Make a copy of the input buffer to avoid overwriting
- tempBuffer := make([]int16, len(in))
- copy(tempBuffer, in)
- w.audioBuffer = append(w.audioBuffer, tempBuffer...)
- w.mu.Unlock()
}
+ }()
+ w.cmdMu.Lock()
+ w.cmd = cmd
+ w.cmdMu.Unlock()
+ if err := cmd.Start(); err != nil {
+ cancel()
+ os.Remove(w.tempFile)
+ return fmt.Errorf("failed to start ffmpeg: %w", err)
}
+ w.recording = true
+ w.logger.Debug("Recording started", "file", w.tempFile)
+ return nil
}
func (w *WhisperBinary) StopRecording() (string, error) {
- w.logger.Debug("StopRecording called")
w.mu.Lock()
+ defer w.mu.Unlock()
if !w.recording {
- w.mu.Unlock()
return "", errors.New("not currently recording")
}
- w.logger.Debug("Setting recording to false and cancelling context")
w.recording = false
- w.cancel() // This will stop the recording goroutine
- w.mu.Unlock()
- // // Small delay to allow the recording goroutine to react to context cancellation
- // time.Sleep(20 * time.Millisecond)
- // Save the recorded audio to a temporary file
- tempFile, err := w.saveAudioToTempFile()
- if err != nil {
- w.logger.Error("Error saving audio to temp file", "error", err)
- return "", fmt.Errorf("failed to save audio to temp file: %w", err)
- }
- w.logger.Debug("Saved audio to temp file", "file", tempFile)
- // Run the whisper binary with a separate context to avoid cancellation during transcription
- cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, tempFile, "2>/dev/null")
- var outBuf bytes.Buffer
- cmd.Stdout = &outBuf
- // Redirect stderr to suppress ALSA warnings and other stderr output
- cmd.Stderr = io.Discard // Suppress stderr output from whisper binary
- w.logger.Debug("Running whisper binary command")
- if err := cmd.Run(); err != nil {
- // Clean up audio buffer
- w.mu.Lock()
- w.audioBuffer = nil
- w.mu.Unlock()
- // Since we're suppressing stderr, we'll just log that the command failed
- w.logger.Error("Error running whisper binary", "error", err)
- return "", fmt.Errorf("whisper binary failed: %w", err)
+ // Gracefully stop ffmpeg
+ w.cmdMu.Lock()
+ if w.cmd != nil && w.cmd.Process != nil {
+ w.logger.Debug("Sending SIGTERM to ffmpeg")
+ w.cmd.Process.Signal(syscall.SIGTERM)
+ // Wait for process to exit (up to 2 seconds)
+ done := make(chan error, 1)
+ go func() {
+ done <- w.cmd.Wait()
+ }()
+ select {
+ case <-done:
+ w.logger.Debug("ffmpeg exited after SIGTERM")
+ case <-time.After(2 * time.Second):
+ w.logger.Warn("ffmpeg did not exit, sending SIGKILL")
+ w.cmd.Process.Kill()
+ <-done
+ }
}
- result := outBuf.String()
- w.logger.Debug("Whisper binary completed", "result", result)
- // Clean up audio buffer
- w.mu.Lock()
- w.audioBuffer = nil
- w.mu.Unlock()
- // Clean up the temporary file after transcription
- w.logger.Debug("StopRecording completed")
- os.Remove(tempFile)
- result = strings.TrimRight(result, "\n")
- // in case there are special tokens like [_BEG_]
- result = specialRE.ReplaceAllString(result, "")
- return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil
-}
-
-// saveAudioToTempFile saves the recorded audio data to a temporary WAV file
-func (w *WhisperBinary) saveAudioToTempFile() (string, error) {
- w.logger.Debug("saveAudioToTempFile called")
- // Create temporary WAV file
- tempFile, err := os.CreateTemp("", "recording_*.wav")
- if err != nil {
- w.logger.Error("Failed to create temp file", "error", err)
- return "", fmt.Errorf("failed to create temp file: %w", err)
+ w.cmdMu.Unlock()
+ // Cancel context (already done, but for cleanliness)
+ if w.cancel != nil {
+ w.cancel()
}
- w.logger.Debug("Created temp file", "file", tempFile.Name())
- defer tempFile.Close()
-
- // Write WAV header and data
- w.logger.Debug("About to write WAV file", "file", tempFile.Name())
- err = w.writeWAVFile(tempFile.Name())
- if err != nil {
- w.logger.Error("Error writing WAV file", "error", err)
- return "", fmt.Errorf("failed to write WAV file: %w", err)
+ // Validate temp file
+ if w.tempFile == "" {
+ return "", errors.New("no recording file")
}
- w.logger.Debug("WAV file written successfully", "file", tempFile.Name())
-
- return tempFile.Name(), nil
-}
-
-// writeWAVFile creates a WAV file from the recorded audio data
-func (w *WhisperBinary) writeWAVFile(filename string) error {
- w.logger.Debug("writeWAVFile called", "filename", filename)
- // Open file for writing
- file, err := os.Create(filename)
+ defer os.Remove(w.tempFile)
+ info, err := os.Stat(w.tempFile)
if err != nil {
- w.logger.Error("Error creating file", "error", err)
- return err
+ return "", fmt.Errorf("failed to stat temp file: %w", err)
}
- defer file.Close()
-
- w.logger.Debug("About to acquire mutex in writeWAVFile")
- w.mu.Lock()
- w.logger.Debug("Locked mutex, copying audio buffer")
- audioData := make([]int16, len(w.audioBuffer))
- copy(audioData, w.audioBuffer)
- w.mu.Unlock()
- w.logger.Debug("Unlocked mutex", "audio_data_length", len(audioData))
-
- if len(audioData) == 0 {
- w.logger.Warn("No audio data to write")
- return errors.New("no audio data to write")
+ if info.Size() < 44 { // WAV header is 44 bytes
+ // Log ffmpeg stderr? Already captured in debug logs.
+ return "", fmt.Errorf("recording file too small (%d bytes), possibly no audio captured", info.Size())
}
-
- // Calculate data size (number of samples * size of int16)
- dataSize := len(audioData) * 2 // 2 bytes per int16 sample
- w.logger.Debug("Calculated data size", "size", dataSize)
-
- // Write WAV header with the correct data size
- header := w.createWAVHeader(16000, 1, 16, dataSize)
- _, err = file.Write(header)
- if err != nil {
- w.logger.Error("Error writing WAV header", "error", err)
- return err
- }
- w.logger.Debug("WAV header written successfully")
-
- // Write audio data
- w.logger.Debug("About to write audio data samples")
- for i, sample := range audioData {
- // Write little-endian 16-bit sample
- _, err := file.Write([]byte{byte(sample), byte(sample >> 8)})
- if err != nil {
- w.logger.Error("Error writing sample", "index", i, "error", err)
- return err
- }
- // Log progress every 10000 samples to avoid too much output
- if i%10000 == 0 {
- w.logger.Debug("Written samples", "count", i)
- }
+ // Run whisper.cpp binary
+ cmd := exec.Command(w.whisperPath, "-m", w.modelPath, "-l", w.lang, w.tempFile)
+ var outBuf, errBuf bytes.Buffer
+ cmd.Stdout = &outBuf
+ cmd.Stderr = &errBuf
+ if err := cmd.Run(); err != nil {
+ w.logger.Error("whisper binary failed",
+ "error", err,
+ "stderr", errBuf.String(),
+ "file_size", info.Size())
+ return "", fmt.Errorf("whisper binary failed: %w (stderr: %s)", err, errBuf.String())
}
- w.logger.Debug("All audio data written successfully")
-
- return nil
-}
-
-// createWAVHeader creates a WAV file header
-func (w *WhisperBinary) createWAVHeader(sampleRate, channels, bitsPerSample int, dataSize int) []byte {
- header := make([]byte, 44)
- copy(header[0:4], "RIFF")
- // Total file size will be updated later
- copy(header[8:12], "WAVE")
- copy(header[12:16], "fmt ")
- // fmt chunk size (16 for PCM)
- header[16] = 16
- header[17] = 0
- header[18] = 0
- header[19] = 0
- // Audio format (1 = PCM)
- header[20] = 1
- header[21] = 0
- // Number of channels
- header[22] = byte(channels)
- header[23] = 0
- // Sample rate
- header[24] = byte(sampleRate)
- header[25] = byte(sampleRate >> 8)
- header[26] = byte(sampleRate >> 16)
- header[27] = byte(sampleRate >> 24)
- // Byte rate
- byteRate := sampleRate * channels * bitsPerSample / 8
- header[28] = byte(byteRate)
- header[29] = byte(byteRate >> 8)
- header[30] = byte(byteRate >> 16)
- header[31] = byte(byteRate >> 24)
- // Block align
- blockAlign := channels * bitsPerSample / 8
- header[32] = byte(blockAlign)
- header[33] = 0
- // Bits per sample
- header[34] = byte(bitsPerSample)
- header[35] = 0
- // "data" subchunk
- copy(header[36:40], "data")
- // Data size
- header[40] = byte(dataSize)
- header[41] = byte(dataSize >> 8)
- header[42] = byte(dataSize >> 16)
- header[43] = byte(dataSize >> 24)
-
- return header
+ result := strings.TrimRight(outBuf.String(), "\n")
+ result = specialRE.ReplaceAllString(result, "")
+ return strings.TrimSpace(strings.ReplaceAll(result, "\n ", "\n")), nil
}
+// IsRecording returns true if a recording is in progress.
func (w *WhisperBinary) IsRecording() bool {
w.mu.Lock()
defer w.mu.Unlock()
return w.recording
}
+
+func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
+ ctx, cancel := context.WithCancel(context.Background())
+ // Set ALSA error handler first
+ return &WhisperBinary{
+ logger: logger,
+ whisperPath: cfg.WhisperBinaryPath,
+ modelPath: cfg.WhisperModelPath,
+ lang: cfg.STT_LANG,
+ ctx: ctx,
+ cancel: cancel,
+ }
+}
diff --git a/extra/whisper_server.go b/extra/whisper_server.go
new file mode 100644
index 0000000..7532f4a
--- /dev/null
+++ b/extra/whisper_server.go
@@ -0,0 +1,156 @@
+//go:build extra
+// +build extra
+
+package extra
+
+import (
+ "bytes"
+ "errors"
+ "fmt"
+ "io"
+ "log/slog"
+ "mime/multipart"
+ "net/http"
+ "os/exec"
+ "strings"
+ "sync"
+)
+
+type WhisperServer struct {
+ logger *slog.Logger
+ ServerURL string
+ SampleRate int
+ AudioBuffer *bytes.Buffer
+ recording bool // protected by mu
+ mu sync.Mutex // protects recording & AudioBuffer
+ cmd *exec.Cmd // protected by cmdMu
+ stopCh chan struct{} // protected by cmdMu
+ cmdMu sync.Mutex // protects cmd and stopCh
+}
+
+func (stt *WhisperServer) StartRecording() error {
+ stt.mu.Lock()
+ defer stt.mu.Unlock()
+ if stt.recording {
+ return nil
+ }
+ // Build ffmpeg command for microphone capture
+ args := []string{
+ "-f", "alsa",
+ "-i", "default",
+ "-acodec", "pcm_s16le",
+ "-ar", fmt.Sprint(stt.SampleRate),
+ "-ac", "1",
+ "-f", "s16le",
+ "-",
+ }
+ cmd := exec.Command("ffmpeg", args...)
+ stdout, err := cmd.StdoutPipe()
+ if err != nil {
+ return fmt.Errorf("failed to get stdout pipe: %w", err)
+ }
+ stt.cmdMu.Lock()
+ stt.cmd = cmd
+ stt.stopCh = make(chan struct{})
+ stt.cmdMu.Unlock()
+ if err := cmd.Start(); err != nil {
+ return fmt.Errorf("failed to start ffmpeg: %w", err)
+ }
+ stt.recording = true
+ stt.AudioBuffer.Reset()
+ // Read PCM data in goroutine
+ go func() {
+ buf := make([]byte, 4096)
+ for {
+ select {
+ case <-stt.stopCh:
+ return
+ default:
+ n, err := stdout.Read(buf)
+ if n > 0 {
+ stt.mu.Lock()
+ stt.AudioBuffer.Write(buf[:n])
+ stt.mu.Unlock()
+ }
+ if err != nil {
+ if err != io.EOF {
+ stt.logger.Error("recording read error", "error", err)
+ }
+ return
+ }
+ }
+ }
+ }()
+ return nil
+}
+
+func (stt *WhisperServer) StopRecording() (string, error) {
+ stt.mu.Lock()
+ defer stt.mu.Unlock()
+ if !stt.recording {
+ return "", errors.New("not recording")
+ }
+ stt.recording = false
+ // Stop ffmpeg
+ stt.cmdMu.Lock()
+ if stt.cmd != nil && stt.cmd.Process != nil {
+ stt.cmd.Process.Kill()
+ stt.cmd.Wait()
+ }
+ close(stt.stopCh)
+ stt.cmdMu.Unlock()
+ // Rest of StopRecording unchanged (WAV header + HTTP upload)
+ // ...
+ stt.recording = false
+ // wait loop to finish?
+ if stt.AudioBuffer == nil {
+ err := errors.New("unexpected nil AudioBuffer")
+ stt.logger.Error(err.Error())
+ return "", err
+ }
+ // Create WAV header first
+ body := &bytes.Buffer{}
+ writer := multipart.NewWriter(body)
+ // Add audio file part
+ part, err := writer.CreateFormFile("file", "recording.wav")
+ if err != nil {
+ stt.logger.Error("fn: StopRecording", "error", err)
+ return "", err
+ }
+ // Stream directly to multipart writer: header + raw data
+ dataSize := stt.AudioBuffer.Len()
+ stt.writeWavHeader(part, dataSize)
+ if _, err := io.Copy(part, stt.AudioBuffer); err != nil {
+ stt.logger.Error("fn: StopRecording", "error", err)
+ return "", err
+ }
+ // Reset buffer for next recording
+ stt.AudioBuffer.Reset()
+ // Add response format field
+ err = writer.WriteField("response_format", "text")
+ if err != nil {
+ stt.logger.Error("fn: StopRecording", "error", err)
+ return "", err
+ }
+ if writer.Close() != nil {
+ stt.logger.Error("fn: StopRecording", "error", err)
+ return "", err
+ }
+ // Send request
+ resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx
+ if err != nil {
+ stt.logger.Error("fn: StopRecording", "error", err)
+ return "", err
+ }
+ defer resp.Body.Close()
+ // Read and print response
+ responseTextBytes, err := io.ReadAll(resp.Body)
+ if err != nil {
+ stt.logger.Error("fn: StopRecording", "error", err)
+ return "", err
+ }
+ resptext := strings.TrimRight(string(responseTextBytes), "\n")
+ // in case there are special tokens like [_BEG_]
+ resptext = specialRE.ReplaceAllString(resptext, "")
+ return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
+}