diff options
Diffstat (limited to 'extra')
| -rw-r--r-- | extra/google_tts.go | 211 | ||||
| -rw-r--r-- | extra/kokoro.go | 42 | ||||
| -rw-r--r-- | extra/tts.go | 190 |
3 files changed, 223 insertions, 220 deletions
diff --git a/extra/google_tts.go b/extra/google_tts.go new file mode 100644 index 0000000..5b46f34 --- /dev/null +++ b/extra/google_tts.go @@ -0,0 +1,211 @@ +//go:build extra +// +build extra + +package extra + +import ( + "fmt" + "gf-lt/models" + "io" + "log/slog" + "os/exec" + "strings" + "sync" + + google_translate_tts "github.com/GrailFinder/google-translate-tts" + "github.com/neurosnap/sentences/english" +) + +type GoogleTranslateOrator struct { + logger *slog.Logger + mu sync.Mutex + speech *google_translate_tts.Speech + // fields for playback control + cmd *exec.Cmd + cmdMu sync.Mutex + stopCh chan struct{} + // text buffer and interrupt flag + textBuffer strings.Builder + interrupt bool +} + +func (o *GoogleTranslateOrator) stoproutine() { + for { + <-TTSDoneChan + o.logger.Debug("orator got done signal") + o.Stop() + for len(TTSTextChan) > 0 { + <-TTSTextChan + } + o.mu.Lock() + o.textBuffer.Reset() + o.interrupt = true + o.mu.Unlock() + } +} + +func (o *GoogleTranslateOrator) readroutine() { + tokenizer, _ := english.NewSentenceTokenizer(nil) + for { + select { + case chunk := <-TTSTextChan: + o.mu.Lock() + o.interrupt = false + _, err := o.textBuffer.WriteString(chunk) + if err != nil { + o.logger.Warn("failed to write to stringbuilder", "error", err) + o.mu.Unlock() + continue + } + text := o.textBuffer.String() + sentences := tokenizer.Tokenize(text) + o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences)) + if len(sentences) <= 1 { + o.mu.Unlock() + continue + } + completeSentences := sentences[:len(sentences)-1] + remaining := sentences[len(sentences)-1].Text + o.textBuffer.Reset() + o.textBuffer.WriteString(remaining) + o.mu.Unlock() + for _, sentence := range completeSentences { + o.mu.Lock() + interrupted := o.interrupt + o.mu.Unlock() + if interrupted { + return + } + cleanedText := models.CleanText(sentence.Text) + if cleanedText == "" { + continue + } + o.logger.Debug("calling Speak with sentence", "sent", cleanedText) + if err := o.Speak(cleanedText); err != nil { + o.logger.Error("tts failed", "sentence", cleanedText, "error", err) + } + } + case <-TTSFlushChan: + o.logger.Debug("got flushchan signal start") + // lln is done get the whole message out + if len(TTSTextChan) > 0 { // otherwise might get stuck + for chunk := range TTSTextChan { + o.mu.Lock() + _, err := o.textBuffer.WriteString(chunk) + o.mu.Unlock() + if err != nil { + o.logger.Warn("failed to write to stringbuilder", "error", err) + continue + } + if len(TTSTextChan) == 0 { + break + } + } + } + o.mu.Lock() + remaining := o.textBuffer.String() + remaining = models.CleanText(remaining) + o.textBuffer.Reset() + o.mu.Unlock() + if remaining == "" { + continue + } + o.logger.Debug("calling Speak with remainder", "rem", remaining) + sentencesRem := tokenizer.Tokenize(remaining) + for _, rs := range sentencesRem { // to avoid dumping large volume of text + o.mu.Lock() + interrupt := o.interrupt + o.mu.Unlock() + if interrupt { + break + } + if err := o.Speak(rs.Text); err != nil { + o.logger.Error("tts failed", "sentence", rs.Text, "error", err) + } + } + } + } +} + +func (o *GoogleTranslateOrator) GetLogger() *slog.Logger { + return o.logger +} + +func (o *GoogleTranslateOrator) Speak(text string) error { + o.logger.Debug("fn: Speak is called", "text-len", len(text)) + // Generate MP3 data directly as an io.Reader + reader, err := o.speech.GenerateSpeech(text) + if err != nil { + return fmt.Errorf("generate speech failed: %w", err) + } + // Wrap in io.NopCloser since GenerateSpeech returns io.Reader (no close needed) + body := io.NopCloser(reader) + defer body.Close() + // Exactly the same ffplay piping as KokoroOrator + cmd := exec.Command("ffplay", "-nodisp", "-autoexit", "-i", "pipe:0") + stdin, err := cmd.StdinPipe() + if err != nil { + return fmt.Errorf("failed to get stdin pipe: %w", err) + } + o.cmdMu.Lock() + o.cmd = cmd + o.stopCh = make(chan struct{}) + o.cmdMu.Unlock() + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start ffplay: %w", err) + } + copyErr := make(chan error, 1) + go func() { + _, err := io.Copy(stdin, body) + stdin.Close() + copyErr <- err + }() + done := make(chan error, 1) + go func() { + done <- cmd.Wait() + }() + select { + case <-o.stopCh: + if o.cmd != nil && o.cmd.Process != nil { + o.cmd.Process.Kill() + } + <-done + return nil + case copyErrVal := <-copyErr: + if copyErrVal != nil { + if o.cmd != nil && o.cmd.Process != nil { + o.cmd.Process.Kill() + } + <-done + return copyErrVal + } + return <-done + case err := <-done: + return err + } +} + +func (o *GoogleTranslateOrator) Stop() { + o.cmdMu.Lock() + defer o.cmdMu.Unlock() + // Signal any running Speak to stop + if o.stopCh != nil { + select { + case <-o.stopCh: // already closed + default: + close(o.stopCh) + } + o.stopCh = nil + } + // Kill the external player process if it's still running + if o.cmd != nil && o.cmd.Process != nil { + o.cmd.Process.Kill() + o.cmd.Wait() // clean up zombie process + o.cmd = nil + } + // Also reset text buffer and interrupt flag (with o.mu) + o.mu.Lock() + o.textBuffer.Reset() + o.interrupt = true + o.mu.Unlock() +} diff --git a/extra/kokoro.go b/extra/kokoro.go index 15b173b..e3ca047 100644 --- a/extra/kokoro.go +++ b/extra/kokoro.go @@ -40,17 +40,13 @@ func (o *KokoroOrator) GetLogger() *slog.Logger { return o.logger } -// Speak streams audio directly to an external player func (o *KokoroOrator) Speak(text string) error { o.logger.Debug("fn: Speak is called", "text-len", len(text)) - // 1. Get the audio stream (still an io.ReadCloser) body, err := o.requestSound(text) if err != nil { return fmt.Errorf("request failed: %w", err) } defer body.Close() - // 2. Prepare external player (ffplay as example) - // -i pipe:0 tells ffplay to read from stdin cmd := exec.Command("ffplay", "-nodisp", "-autoexit", "-i", "pipe:0") stdin, err := cmd.StdinPipe() if err != nil { @@ -60,60 +56,46 @@ func (o *KokoroOrator) Speak(text string) error { o.cmd = cmd o.stopCh = make(chan struct{}) o.cmdMu.Unlock() - // 3. Start the player if err := cmd.Start(); err != nil { return fmt.Errorf("failed to start ffplay: %w", err) } - // 4. Copy audio data to stdin in a goroutine + // Copy audio in background copyErr := make(chan error, 1) go func() { _, err := io.Copy(stdin, body) - stdin.Close() // signal EOF to player + stdin.Close() copyErr <- err }() - // 5. Wait for player to finish or stop signal + // Wait for player in background done := make(chan error, 1) go func() { done <- cmd.Wait() }() + // Wait for BOTH copy and player, but ensure we block until done select { case <-o.stopCh: - // Stop requested: kill the player + // Stop requested: kill player and wait for it to exit if o.cmd != nil && o.cmd.Process != nil { o.cmd.Process.Kill() } - <-done // wait for process to exit + <-done // Wait for process to actually exit return nil - case err := <-done: - // Playback finished normally - return err case copyErrVal := <-copyErr: if copyErrVal != nil { - // Copy failed – kill the player + // Copy failed: kill player and wait if o.cmd != nil && o.cmd.Process != nil { o.cmd.Process.Kill() } <-done return copyErrVal } - return nil + // Copy succeeded, now wait for playback to complete + return <-done + case err := <-done: + // Playback finished normally (copy must have succeeded or player would have exited early) + return err } } - -// // Stop interrupts ongoing playback -// func (o *KokoroOrator) Stop() { -// o.cmdMu.Lock() -// defer o.cmdMu.Unlock() -// if o.stopCh != nil { -// close(o.stopCh) -// } -// // Also clear the buffer and set interrupt flag as before -// o.mu.Lock() -// o.textBuffer.Reset() -// o.interrupt = true -// o.mu.Unlock() -// } - func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) { if o.URL == "" { return nil, fmt.Errorf("TTS URL is empty") diff --git a/extra/tts.go b/extra/tts.go index a75678b..80085ab 100644 --- a/extra/tts.go +++ b/extra/tts.go @@ -4,22 +4,13 @@ package extra import ( - "fmt" "gf-lt/config" "gf-lt/models" - "io" "log/slog" "os" "strings" - "sync" - "time" google_translate_tts "github.com/GrailFinder/google-translate-tts" - "github.com/GrailFinder/google-translate-tts/handlers" - "github.com/gopxl/beep/v2" - "github.com/gopxl/beep/v2/mp3" - "github.com/gopxl/beep/v2/speaker" - "github.com/neurosnap/sentences/english" ) var ( @@ -36,17 +27,6 @@ type Orator interface { GetLogger() *slog.Logger } -// Google Translate TTS implementation -type GoogleTranslateOrator struct { - logger *slog.Logger - mu sync.Mutex - speech *google_translate_tts.Speech - currentStream *beep.Ctrl - currentDone chan bool - textBuffer strings.Builder - interrupt bool -} - func NewOrator(log *slog.Logger, cfg *config.Config) Orator { provider := cfg.TTS_PROVIDER if provider == "" { @@ -76,7 +56,6 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator { Language: language, Proxy: "", // Proxy not supported Speed: cfg.TTS_SPEED, - Handler: &handlers.Beep{}, } orator := &GoogleTranslateOrator{ logger: log, @@ -87,172 +66,3 @@ func NewOrator(log *slog.Logger, cfg *config.Config) Orator { return orator } } - -func (o *GoogleTranslateOrator) stoproutine() { - for { - <-TTSDoneChan - o.logger.Debug("orator got done signal") - o.Stop() - // drain the channel - for len(TTSTextChan) > 0 { - <-TTSTextChan - } - o.mu.Lock() - o.textBuffer.Reset() - if o.currentDone != nil { - select { - case o.currentDone <- true: - default: - // Channel might be closed, ignore - } - } - o.interrupt = true - o.mu.Unlock() - } -} - -func (o *GoogleTranslateOrator) readroutine() { - tokenizer, _ := english.NewSentenceTokenizer(nil) - for { - select { - case chunk := <-TTSTextChan: - o.mu.Lock() - o.interrupt = false - _, err := o.textBuffer.WriteString(chunk) - if err != nil { - o.logger.Warn("failed to write to stringbuilder", "error", err) - o.mu.Unlock() - continue - } - text := o.textBuffer.String() - sentences := tokenizer.Tokenize(text) - o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences)) - if len(sentences) <= 1 { - o.mu.Unlock() - continue - } - completeSentences := sentences[:len(sentences)-1] - remaining := sentences[len(sentences)-1].Text - o.textBuffer.Reset() - o.textBuffer.WriteString(remaining) - o.mu.Unlock() - - for _, sentence := range completeSentences { - o.mu.Lock() - interrupted := o.interrupt - o.mu.Unlock() - if interrupted { - return - } - cleanedText := models.CleanText(sentence.Text) - if cleanedText == "" { - continue - } - o.logger.Debug("calling Speak with sentence", "sent", cleanedText) - if err := o.Speak(cleanedText); err != nil { - o.logger.Error("tts failed", "sentence", cleanedText, "error", err) - } - } - case <-TTSFlushChan: - o.logger.Debug("got flushchan signal start") - // lln is done get the whole message out - if len(TTSTextChan) > 0 { // otherwise might get stuck - for chunk := range TTSTextChan { - o.mu.Lock() - _, err := o.textBuffer.WriteString(chunk) - o.mu.Unlock() - if err != nil { - o.logger.Warn("failed to write to stringbuilder", "error", err) - continue - } - if len(TTSTextChan) == 0 { - break - } - } - } - o.mu.Lock() - remaining := o.textBuffer.String() - remaining = models.CleanText(remaining) - o.textBuffer.Reset() - o.mu.Unlock() - if remaining == "" { - continue - } - o.logger.Debug("calling Speak with remainder", "rem", remaining) - sentencesRem := tokenizer.Tokenize(remaining) - for _, rs := range sentencesRem { // to avoid dumping large volume of text - o.mu.Lock() - interrupt := o.interrupt - o.mu.Unlock() - if interrupt { - break - } - if err := o.Speak(rs.Text); err != nil { - o.logger.Error("tts failed", "sentence", rs.Text, "error", err) - } - } - } - } -} - -func (o *GoogleTranslateOrator) GetLogger() *slog.Logger { - return o.logger -} - -func (o *GoogleTranslateOrator) Speak(text string) error { - o.logger.Debug("fn: Speak is called", "text-len", len(text)) - // Generate MP3 data using google-translate-tts - reader, err := o.speech.GenerateSpeech(text) - if err != nil { - o.logger.Error("generate speech failed", "error", err) - return fmt.Errorf("generate speech failed: %w", err) - } - // Decode the mp3 audio from reader (wrap with NopCloser for io.ReadCloser) - streamer, format, err := mp3.Decode(io.NopCloser(reader)) - if err != nil { - o.logger.Error("mp3 decode failed", "error", err) - return fmt.Errorf("mp3 decode failed: %w", err) - } - defer streamer.Close() - playbackStreamer := beep.Streamer(streamer) - speed := o.speech.Speed - if speed <= 0 { - speed = 1.0 - } - if speed != 1.0 { - playbackStreamer = beep.ResampleRatio(3, float64(speed), streamer) - } - // Initialize speaker with the format's sample rate - if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil { - o.logger.Debug("failed to init speaker", "error", err) - } - done := make(chan bool) - o.mu.Lock() - o.currentDone = done - o.currentStream = &beep.Ctrl{Streamer: beep.Seq(playbackStreamer, beep.Callback(func() { - o.mu.Lock() - close(done) - o.currentStream = nil - o.currentDone = nil - o.mu.Unlock() - })), Paused: false} - o.mu.Unlock() - speaker.Play(o.currentStream) - <-done // wait for playback to complete - return nil -} - -func (o *GoogleTranslateOrator) Stop() { - o.logger.Debug("attempted to stop google translate orator") - speaker.Lock() - defer speaker.Unlock() - o.mu.Lock() - defer o.mu.Unlock() - if o.currentStream != nil { - o.currentStream.Streamer = nil - } - // Also stop the speech handler if possible - if o.speech != nil { - _ = o.speech.Stop() - } -} |
