summaryrefslogtreecommitdiff
path: root/extra
diff options
context:
space:
mode:
authorGrail Finder <wohilas@gmail.com>2026-03-07 16:24:39 +0300
committerGrail Finder <wohilas@gmail.com>2026-03-07 16:24:39 +0300
commit0f0c43f32701c314e2472ef1f9a1ec8a68ab0d1a (patch)
tree92b37336495972334c7c7f8ef4fe8fa29545f9c4 /extra
parent0e55e44f624d2839dc51fa293a18b323c497a6b1 (diff)
Dep: remove beep/portaudio dependancy
Diffstat (limited to 'extra')
-rw-r--r--extra/google_tts.go211
-rw-r--r--extra/kokoro.go42
-rw-r--r--extra/tts.go190
3 files changed, 223 insertions, 220 deletions
diff --git a/extra/google_tts.go b/extra/google_tts.go
new file mode 100644
index 0000000..5b46f34
--- /dev/null
+++ b/extra/google_tts.go
@@ -0,0 +1,211 @@
+//go:build extra
+// +build extra
+
+package extra
+
+import (
+ "fmt"
+ "gf-lt/models"
+ "io"
+ "log/slog"
+ "os/exec"
+ "strings"
+ "sync"
+
+ google_translate_tts "github.com/GrailFinder/google-translate-tts"
+ "github.com/neurosnap/sentences/english"
+)
+
+type GoogleTranslateOrator struct {
+ logger *slog.Logger
+ mu sync.Mutex
+ speech *google_translate_tts.Speech
+ // fields for playback control
+ cmd *exec.Cmd
+ cmdMu sync.Mutex
+ stopCh chan struct{}
+ // text buffer and interrupt flag
+ textBuffer strings.Builder
+ interrupt bool
+}
+
+func (o *GoogleTranslateOrator) stoproutine() {
+ for {
+ <-TTSDoneChan
+ o.logger.Debug("orator got done signal")
+ o.Stop()
+ for len(TTSTextChan) > 0 {
+ <-TTSTextChan
+ }
+ o.mu.Lock()
+ o.textBuffer.Reset()
+ o.interrupt = true
+ o.mu.Unlock()
+ }
+}
+
+func (o *GoogleTranslateOrator) readroutine() {
+ tokenizer, _ := english.NewSentenceTokenizer(nil)
+ for {
+ select {
+ case chunk := <-TTSTextChan:
+ o.mu.Lock()
+ o.interrupt = false
+ _, err := o.textBuffer.WriteString(chunk)
+ if err != nil {
+ o.logger.Warn("failed to write to stringbuilder", "error", err)
+ o.mu.Unlock()
+ continue
+ }
+ text := o.textBuffer.String()
+ sentences := tokenizer.Tokenize(text)
+ o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
+ if len(sentences) <= 1 {
+ o.mu.Unlock()
+ continue
+ }
+ completeSentences := sentences[:len(sentences)-1]
+ remaining := sentences[len(sentences)-1].Text
+ o.textBuffer.Reset()
+ o.textBuffer.WriteString(remaining)
+ o.mu.Unlock()
+ for _, sentence := range completeSentences {
+ o.mu.Lock()
+ interrupted := o.interrupt
+ o.mu.Unlock()
+ if interrupted {
+ return
+ }
+ cleanedText := models.CleanText(sentence.Text)
+ if cleanedText == "" {
+ continue
+ }
+ o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
+ if err := o.Speak(cleanedText); err != nil {
+ o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
+ }
+ }
+ case <-TTSFlushChan:
+ o.logger.Debug("got flushchan signal start")
+ // lln is done get the whole message out
+ if len(TTSTextChan) > 0 { // otherwise might get stuck
+ for chunk := range TTSTextChan {
+ o.mu.Lock()
+ _, err := o.textBuffer.WriteString(chunk)
+ o.mu.Unlock()
+ if err != nil {
+ o.logger.Warn("failed to write to stringbuilder", "error", err)
+ continue
+ }
+ if len(TTSTextChan) == 0 {
+ break
+ }
+ }
+ }
+ o.mu.Lock()
+ remaining := o.textBuffer.String()
+ remaining = models.CleanText(remaining)
+ o.textBuffer.Reset()
+ o.mu.Unlock()
+ if remaining == "" {
+ continue
+ }
+ o.logger.Debug("calling Speak with remainder", "rem", remaining)
+ sentencesRem := tokenizer.Tokenize(remaining)
+ for _, rs := range sentencesRem { // to avoid dumping large volume of text
+ o.mu.Lock()
+ interrupt := o.interrupt
+ o.mu.Unlock()
+ if interrupt {
+ break
+ }
+ if err := o.Speak(rs.Text); err != nil {
+ o.logger.Error("tts failed", "sentence", rs.Text, "error", err)
+ }
+ }
+ }
+ }
+}
+
+func (o *GoogleTranslateOrator) GetLogger() *slog.Logger {
+ return o.logger
+}
+
+func (o *GoogleTranslateOrator) Speak(text string) error {
+ o.logger.Debug("fn: Speak is called", "text-len", len(text))
+ // Generate MP3 data directly as an io.Reader
+ reader, err := o.speech.GenerateSpeech(text)
+ if err != nil {
+ return fmt.Errorf("generate speech failed: %w", err)
+ }
+ // Wrap in io.NopCloser since GenerateSpeech returns io.Reader (no close needed)
+ body := io.NopCloser(reader)
+ defer body.Close()
+ // Exactly the same ffplay piping as KokoroOrator
+ cmd := exec.Command("ffplay", "-nodisp", "-autoexit", "-i", "pipe:0")
+ stdin, err := cmd.StdinPipe()
+ if err != nil {
+ return fmt.Errorf("failed to get stdin pipe: %w", err)
+ }
+ o.cmdMu.Lock()
+ o.cmd = cmd
+ o.stopCh = make(chan struct{})
+ o.cmdMu.Unlock()
+ if err := cmd.Start(); err != nil {
+ return fmt.Errorf("failed to start ffplay: %w", err)
+ }
+ copyErr := make(chan error, 1)
+ go func() {
+ _, err := io.Copy(stdin, body)
+ stdin.Close()
+ copyErr <- err
+ }()
+ done := make(chan error, 1)
+ go func() {
+ done <- cmd.Wait()
+ }()
+ select {
+ case <-o.stopCh:
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ }
+ <-done
+ return nil
+ case copyErrVal := <-copyErr:
+ if copyErrVal != nil {
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ }
+ <-done
+ return copyErrVal
+ }
+ return <-done
+ case err := <-done:
+ return err
+ }
+}
+
+func (o *GoogleTranslateOrator) Stop() {
+ o.cmdMu.Lock()
+ defer o.cmdMu.Unlock()
+ // Signal any running Speak to stop
+ if o.stopCh != nil {
+ select {
+ case <-o.stopCh: // already closed
+ default:
+ close(o.stopCh)
+ }
+ o.stopCh = nil
+ }
+ // Kill the external player process if it's still running
+ if o.cmd != nil && o.cmd.Process != nil {
+ o.cmd.Process.Kill()
+ o.cmd.Wait() // clean up zombie process
+ o.cmd = nil
+ }
+ // Also reset text buffer and interrupt flag (with o.mu)
+ o.mu.Lock()
+ o.textBuffer.Reset()
+ o.interrupt = true
+ o.mu.Unlock()
+}
diff --git a/extra/kokoro.go b/extra/kokoro.go
index 15b173b..e3ca047 100644
--- a/extra/kokoro.go
+++ b/extra/kokoro.go
@@ -40,17 +40,13 @@ func (o *KokoroOrator) GetLogger() *slog.Logger {
return o.logger
}
-// Speak streams audio directly to an external player
func (o *KokoroOrator) Speak(text string) error {
o.logger.Debug("fn: Speak is called", "text-len", len(text))
- // 1. Get the audio stream (still an io.ReadCloser)
body, err := o.requestSound(text)
if err != nil {
return fmt.Errorf("request failed: %w", err)
}
defer body.Close()
- // 2. Prepare external player (ffplay as example)
- // -i pipe:0 tells ffplay to read from stdin
cmd := exec.Command("ffplay", "-nodisp", "-autoexit", "-i", "pipe:0")
stdin, err := cmd.StdinPipe()
if err != nil {
@@ -60,60 +56,46 @@ func (o *KokoroOrator) Speak(text string) error {
o.cmd = cmd
o.stopCh = make(chan struct{})
o.cmdMu.Unlock()
- // 3. Start the player
if err := cmd.Start(); err != nil {
return fmt.Errorf("failed to start ffplay: %w", err)
}
- // 4. Copy audio data to stdin in a goroutine
+ // Copy audio in background
copyErr := make(chan error, 1)
go func() {
_, err := io.Copy(stdin, body)
- stdin.Close() // signal EOF to player
+ stdin.Close()
copyErr <- err
}()
- // 5. Wait for player to finish or stop signal
+ // Wait for player in background
done := make(chan error, 1)
go func() {
done <- cmd.Wait()
}()
+ // Wait for BOTH copy and player, but ensure we block until done
select {
case <-o.stopCh:
- // Stop requested: kill the player
+ // Stop requested: kill player and wait for it to exit
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
}
- <-done // wait for process to exit
+ <-done // Wait for process to actually exit
return nil
- case err := <-done:
- // Playback finished normally
- return err
case copyErrVal := <-copyErr:
if copyErrVal != nil {
- // Copy failed – kill the player
+ // Copy failed: kill player and wait
if o.cmd != nil && o.cmd.Process != nil {
o.cmd.Process.Kill()
}
<-done
return copyErrVal
}
- return nil
+ // Copy succeeded, now wait for playback to complete
+ return <-done
+ case err := <-done:
+ // Playback finished normally (copy must have succeeded or player would have exited early)
+ return err
}
}
-
-// // Stop interrupts ongoing playback
-// func (o *KokoroOrator) Stop() {
-// o.cmdMu.Lock()
-// defer o.cmdMu.Unlock()
-// if o.stopCh != nil {
-// close(o.stopCh)
-// }
-// // Also clear the buffer and set interrupt flag as before
-// o.mu.Lock()
-// o.textBuffer.Reset()
-// o.interrupt = true
-// o.mu.Unlock()
-// }
-
func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
if o.URL == "" {
return nil, fmt.Errorf("TTS URL is empty")
diff --git a/extra/tts.go b/extra/tts.go
index a75678b..80085ab 100644
--- a/extra/tts.go
+++ b/extra/tts.go
@@ -4,22 +4,13 @@
package extra
import (
- "fmt"
"gf-lt/config"
"gf-lt/models"
- "io"
"log/slog"
"os"
"strings"
- "sync"
- "time"
google_translate_tts "github.com/GrailFinder/google-translate-tts"
- "github.com/GrailFinder/google-translate-tts/handlers"
- "github.com/gopxl/beep/v2"
- "github.com/gopxl/beep/v2/mp3"
- "github.com/gopxl/beep/v2/speaker"
- "github.com/neurosnap/sentences/english"
)
var (
@@ -36,17 +27,6 @@ type Orator interface {
GetLogger() *slog.Logger
}
-// Google Translate TTS implementation
-type GoogleTranslateOrator struct {
- logger *slog.Logger
- mu sync.Mutex
- speech *google_translate_tts.Speech
- currentStream *beep.Ctrl
- currentDone chan bool
- textBuffer strings.Builder
- interrupt bool
-}
-
func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
provider := cfg.TTS_PROVIDER
if provider == "" {
@@ -76,7 +56,6 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
Language: language,
Proxy: "", // Proxy not supported
Speed: cfg.TTS_SPEED,
- Handler: &handlers.Beep{},
}
orator := &GoogleTranslateOrator{
logger: log,
@@ -87,172 +66,3 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
return orator
}
}
-
-func (o *GoogleTranslateOrator) stoproutine() {
- for {
- <-TTSDoneChan
- o.logger.Debug("orator got done signal")
- o.Stop()
- // drain the channel
- for len(TTSTextChan) > 0 {
- <-TTSTextChan
- }
- o.mu.Lock()
- o.textBuffer.Reset()
- if o.currentDone != nil {
- select {
- case o.currentDone <- true:
- default:
- // Channel might be closed, ignore
- }
- }
- o.interrupt = true
- o.mu.Unlock()
- }
-}
-
-func (o *GoogleTranslateOrator) readroutine() {
- tokenizer, _ := english.NewSentenceTokenizer(nil)
- for {
- select {
- case chunk := <-TTSTextChan:
- o.mu.Lock()
- o.interrupt = false
- _, err := o.textBuffer.WriteString(chunk)
- if err != nil {
- o.logger.Warn("failed to write to stringbuilder", "error", err)
- o.mu.Unlock()
- continue
- }
- text := o.textBuffer.String()
- sentences := tokenizer.Tokenize(text)
- o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences))
- if len(sentences) <= 1 {
- o.mu.Unlock()
- continue
- }
- completeSentences := sentences[:len(sentences)-1]
- remaining := sentences[len(sentences)-1].Text
- o.textBuffer.Reset()
- o.textBuffer.WriteString(remaining)
- o.mu.Unlock()
-
- for _, sentence := range completeSentences {
- o.mu.Lock()
- interrupted := o.interrupt
- o.mu.Unlock()
- if interrupted {
- return
- }
- cleanedText := models.CleanText(sentence.Text)
- if cleanedText == "" {
- continue
- }
- o.logger.Debug("calling Speak with sentence", "sent", cleanedText)
- if err := o.Speak(cleanedText); err != nil {
- o.logger.Error("tts failed", "sentence", cleanedText, "error", err)
- }
- }
- case <-TTSFlushChan:
- o.logger.Debug("got flushchan signal start")
- // lln is done get the whole message out
- if len(TTSTextChan) > 0 { // otherwise might get stuck
- for chunk := range TTSTextChan {
- o.mu.Lock()
- _, err := o.textBuffer.WriteString(chunk)
- o.mu.Unlock()
- if err != nil {
- o.logger.Warn("failed to write to stringbuilder", "error", err)
- continue
- }
- if len(TTSTextChan) == 0 {
- break
- }
- }
- }
- o.mu.Lock()
- remaining := o.textBuffer.String()
- remaining = models.CleanText(remaining)
- o.textBuffer.Reset()
- o.mu.Unlock()
- if remaining == "" {
- continue
- }
- o.logger.Debug("calling Speak with remainder", "rem", remaining)
- sentencesRem := tokenizer.Tokenize(remaining)
- for _, rs := range sentencesRem { // to avoid dumping large volume of text
- o.mu.Lock()
- interrupt := o.interrupt
- o.mu.Unlock()
- if interrupt {
- break
- }
- if err := o.Speak(rs.Text); err != nil {
- o.logger.Error("tts failed", "sentence", rs.Text, "error", err)
- }
- }
- }
- }
-}
-
-func (o *GoogleTranslateOrator) GetLogger() *slog.Logger {
- return o.logger
-}
-
-func (o *GoogleTranslateOrator) Speak(text string) error {
- o.logger.Debug("fn: Speak is called", "text-len", len(text))
- // Generate MP3 data using google-translate-tts
- reader, err := o.speech.GenerateSpeech(text)
- if err != nil {
- o.logger.Error("generate speech failed", "error", err)
- return fmt.Errorf("generate speech failed: %w", err)
- }
- // Decode the mp3 audio from reader (wrap with NopCloser for io.ReadCloser)
- streamer, format, err := mp3.Decode(io.NopCloser(reader))
- if err != nil {
- o.logger.Error("mp3 decode failed", "error", err)
- return fmt.Errorf("mp3 decode failed: %w", err)
- }
- defer streamer.Close()
- playbackStreamer := beep.Streamer(streamer)
- speed := o.speech.Speed
- if speed <= 0 {
- speed = 1.0
- }
- if speed != 1.0 {
- playbackStreamer = beep.ResampleRatio(3, float64(speed), streamer)
- }
- // Initialize speaker with the format's sample rate
- if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
- o.logger.Debug("failed to init speaker", "error", err)
- }
- done := make(chan bool)
- o.mu.Lock()
- o.currentDone = done
- o.currentStream = &beep.Ctrl{Streamer: beep.Seq(playbackStreamer, beep.Callback(func() {
- o.mu.Lock()
- close(done)
- o.currentStream = nil
- o.currentDone = nil
- o.mu.Unlock()
- })), Paused: false}
- o.mu.Unlock()
- speaker.Play(o.currentStream)
- <-done // wait for playback to complete
- return nil
-}
-
-func (o *GoogleTranslateOrator) Stop() {
- o.logger.Debug("attempted to stop google translate orator")
- speaker.Lock()
- defer speaker.Unlock()
- o.mu.Lock()
- defer o.mu.Unlock()
- if o.currentStream != nil {
- o.currentStream.Streamer = nil
- }
- // Also stop the speech handler if possible
- if o.speech != nil {
- _ = o.speech.Stop()
- }
-}