diff options
| author | Grail Finder <wohilas@gmail.com> | 2026-01-08 09:28:16 +0300 |
|---|---|---|
| committer | Grail Finder <wohilas@gmail.com> | 2026-01-08 09:28:16 +0300 |
| commit | 51916895789fc2b166af752c14cd4696d6517bec (patch) | |
| tree | 584d8612449a8a2e4185211ba9a9581e358d8386 | |
| parent | 5f0de6f3112d2bbda6af408cc5d6e51c0a752f88 (diff) | |
Feat: google-translate-tts support
| -rw-r--r-- | config.example.toml | 2 | ||||
| -rw-r--r-- | config/config.go | 8 | ||||
| -rw-r--r-- | docs/config.md | 12 | ||||
| -rw-r--r-- | docs/tutorial_rp.md | 6 | ||||
| -rw-r--r-- | extra/tts.go | 193 | ||||
| -rw-r--r-- | go.mod | 4 | ||||
| -rw-r--r-- | go.sum | 1 |
7 files changed, 208 insertions, 18 deletions
diff --git a/config.example.toml b/config.example.toml index 2227c5e..eb1be70 100644 --- a/config.example.toml +++ b/config.example.toml @@ -29,6 +29,8 @@ RAGDir = "ragimport" TTS_ENABLED = false TTS_URL = "http://localhost:8880/v1/audio/speech" TTS_SPEED = 1.2 +TTS_PROVIDER = "kokoro" +TTS_LANGUAGE = "en" # extra stt STT_ENABLED = false STT_TYPE = "WHISPER_SERVER" # WHISPER_SERVER or WHISPER_BINARY diff --git a/config/config.go b/config/config.go index 1454b4c..dc769d3 100644 --- a/config/config.go +++ b/config/config.go @@ -48,9 +48,11 @@ type Config struct { OpenRouterToken string `toml:"OpenRouterToken"` OpenRouterModel string `toml:"OpenRouterModel"` // TTS - TTS_URL string `toml:"TTS_URL"` - TTS_ENABLED bool `toml:"TTS_ENABLED"` - TTS_SPEED float32 `toml:"TTS_SPEED"` + TTS_URL string `toml:"TTS_URL"` + TTS_ENABLED bool `toml:"TTS_ENABLED"` + TTS_SPEED float32 `toml:"TTS_SPEED"` + TTS_PROVIDER string `toml:"TTS_PROVIDER"` + TTS_LANGUAGE string `toml:"TTS_LANGUAGE"` // STT STT_TYPE string `toml:"STT_TYPE"` // WHISPER_SERVER, WHISPER_BINARY STT_URL string `toml:"STT_URL"` diff --git a/docs/config.md b/docs/config.md index fa5d42b..f1cac5b 100644 --- a/docs/config.md +++ b/docs/config.md @@ -96,11 +96,21 @@ This document explains how to set up and configure the application using the `co - Enable or disable text-to-speech functionality. #### TTS_URL (`"http://localhost:8880/v1/audio/speech"`) -- The endpoint for TTS API. +- The endpoint for TTS API (used with `kokoro` provider). #### TTS_SPEED (`1.2`) - Playback speed for speech output (1.0 is normal speed). +#### TTS_PROVIDER (`"kokoro"`) +- TTS provider to use. Options: `"kokoro"` or `"google"`. + - `"kokoro"`: Uses Kokoro FastAPI TTS server (requires TTS_URL to be set). Provides high-quality voice synthesis but requires a running Kokoro server. + - `"google"`: Uses Google Translate TTS with gopxl/beep for local playback. No local TTS server is needed, but an internet connection to Google's public TTS API is required; audio is played back locally via gopxl/beep. 
Supports multiple languages via TTS_LANGUAGE setting. + +#### TTS_LANGUAGE (`"en"`) +- Language code for TTS (used with `google` provider). + - Examples: `"en"` (English), `"es"` (Spanish), `"fr"` (French) + - See Google Translate TTS documentation for supported languages. + ### Speech-to-Text (STT) Settings #### STT_ENABLED (`false`) diff --git a/docs/tutorial_rp.md b/docs/tutorial_rp.md index 451aadb..9053ffb 100644 --- a/docs/tutorial_rp.md +++ b/docs/tutorial_rp.md @@ -1,5 +1,7 @@ -after [installing](https://github.com/GrailFinder/gf-lt/tree/master?tab=readme-ov-file#how-to-install) -[set up your config](config.md) +### RP case example + +check the [installation instructions](https://github.com/GrailFinder/gf-lt/tree/master?tab=readme-ov-file#how-to-install) and +[setting up your config](config.md) To roleplay, we would need to create a character card or get one from the web. For this tutorial, we are going to use the default character Seraphina from [SillyTavern (ST)](https://github.com/SillyTavern/SillyTavern/blob/release/default/content/default_Seraphina.png). 
diff --git a/extra/tts.go b/extra/tts.go index c9ad59d..0209072 100644 --- a/extra/tts.go +++ b/extra/tts.go @@ -12,9 +12,12 @@ import ( "io" "log/slog" "net/http" + "os" "strings" "time" + google_translate_tts "github.com/GrailFinder/google-translate-tts" + "github.com/GrailFinder/google-translate-tts/handlers" "github.com/gopxl/beep/v2" "github.com/gopxl/beep/v2/mp3" "github.com/gopxl/beep/v2/speaker" @@ -49,6 +52,14 @@ type KokoroOrator struct { // textBuffer bytes.Buffer } +// Google Translate TTS implementation +type GoogleTranslateOrator struct { + logger *slog.Logger + speech *google_translate_tts.Speech + currentStream *beep.Ctrl + textBuffer strings.Builder +} + func (o *KokoroOrator) stoproutine() { <-TTSDoneChan o.logger.Debug("orator got done signal") @@ -123,18 +134,47 @@ func (o *KokoroOrator) readroutine() { } func NewOrator(log *slog.Logger, cfg *config.Config) Orator { - orator := &KokoroOrator{ - logger: log, - URL: cfg.TTS_URL, - Format: models.AFMP3, - Stream: false, - Speed: cfg.TTS_SPEED, - Language: "a", - Voice: "af_bella(1)+af_sky(1)", - } - go orator.readroutine() - go orator.stoproutine() - return orator + provider := cfg.TTS_PROVIDER + if provider == "" { + provider = "kokoro" + } + + switch strings.ToLower(provider) { + case "google", "google-translate", "google_translate": + language := cfg.TTS_LANGUAGE + if language == "" { + language = "en" + } + + speech := &google_translate_tts.Speech{ + Folder: os.TempDir() + "/gf-lt-tts", // Temporary directory for caching + Language: language, + Proxy: "", // Proxy not supported + Speed: cfg.TTS_SPEED, + Handler: &handlers.Beep{}, + } + + orator := &GoogleTranslateOrator{ + logger: log, + speech: speech, + } + go orator.readroutine() + go orator.stoproutine() + return orator + default: // kokoro + orator := &KokoroOrator{ + logger: log, + URL: cfg.TTS_URL, + Format: models.AFMP3, + Stream: false, + Speed: cfg.TTS_SPEED, + Language: "a", + Voice: "af_bella(1)+af_sky(1)", + } + go 
orator.readroutine() + go orator.stoproutine() + return orator + } } func (o *KokoroOrator) GetLogger() *slog.Logger { @@ -213,3 +253,132 @@ func (o *KokoroOrator) Stop() { o.currentStream.Streamer = nil } } + +func (o *GoogleTranslateOrator) stoproutine() { + <-TTSDoneChan + o.logger.Debug("orator got done signal") + o.Stop() + // drain the channel + for len(TTSTextChan) > 0 { + <-TTSTextChan + } +} + +func (o *GoogleTranslateOrator) readroutine() { + tokenizer, _ := english.NewSentenceTokenizer(nil) + for { + select { + case chunk := <-TTSTextChan: + _, err := o.textBuffer.WriteString(chunk) + if err != nil { + o.logger.Warn("failed to write to stringbuilder", "error", err) + continue + } + text := o.textBuffer.String() + sentences := tokenizer.Tokenize(text) + o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences)) + for i, sentence := range sentences { + if i == len(sentences)-1 { // last sentence + o.textBuffer.Reset() + _, err := o.textBuffer.WriteString(sentence.Text) + if err != nil { + o.logger.Warn("failed to write to stringbuilder", "error", err) + continue + } + continue // if only one (often incomplete) sentence; wait for next chunk + } + o.logger.Debug("calling Speak with sentence", "sent", sentence.Text) + if err := o.Speak(sentence.Text); err != nil { + o.logger.Error("tts failed", "sentence", sentence.Text, "error", err) + } + } + case <-TTSFlushChan: + o.logger.Debug("got flushchan signal start") + // lln is done get the whole message out + if len(TTSTextChan) > 0 { // otherwise might get stuck + for chunk := range TTSTextChan { + _, err := o.textBuffer.WriteString(chunk) + if err != nil { + o.logger.Warn("failed to write to stringbuilder", "error", err) + continue + } + if len(TTSTextChan) == 0 { + break + } + } + } + // INFO: if there is a lot of text it will take some time to make with tts at once + // to avoid this pause, it might be better to keep splitting on sentences + // but keepinig in mind that remainder 
could be ommited by tokenizer + // Flush remaining text + remaining := o.textBuffer.String() + o.textBuffer.Reset() + if remaining != "" { + o.logger.Debug("calling Speak with remainder", "rem", remaining) + if err := o.Speak(remaining); err != nil { + o.logger.Error("tts failed", "sentence", remaining, "error", err) + } + } + } + } +} + +func (o *GoogleTranslateOrator) GetLogger() *slog.Logger { + return o.logger +} + +func (o *GoogleTranslateOrator) Speak(text string) error { + o.logger.Debug("fn: Speak is called", "text-len", len(text)) + + // Generate MP3 data using google-translate-tts + reader, err := o.speech.GenerateSpeech(text) + if err != nil { + o.logger.Error("generate speech failed", "error", err) + return fmt.Errorf("generate speech failed: %w", err) + } + + // Decode the mp3 audio from reader (wrap with NopCloser for io.ReadCloser) + streamer, format, err := mp3.Decode(io.NopCloser(reader)) + if err != nil { + o.logger.Error("mp3 decode failed", "error", err) + return fmt.Errorf("mp3 decode failed: %w", err) + } + defer streamer.Close() + + playbackStreamer := beep.Streamer(streamer) + speed := o.speech.Speed + if speed <= 0 { + speed = 1.0 + } + if speed != 1.0 { + playbackStreamer = beep.ResampleRatio(3, float64(speed), streamer) + } + + // Initialize speaker with the format's sample rate + if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil { + o.logger.Debug("failed to init speaker", "error", err) + } + + done := make(chan bool) + // Create controllable stream and store reference + o.currentStream = &beep.Ctrl{Streamer: beep.Seq(playbackStreamer, beep.Callback(func() { + close(done) + o.currentStream = nil + })), Paused: false} + speaker.Play(o.currentStream) + <-done // wait for playback to complete + return nil +} + +func (o *GoogleTranslateOrator) Stop() { + o.logger.Debug("attempted to stop google translate orator") + speaker.Lock() + defer speaker.Unlock() + if o.currentStream != nil { + 
o.currentStream.Streamer = nil + } + // Also stop the speech handler if possible + if o.speech != nil { + _ = o.speech.Stop() + } +} @@ -4,6 +4,7 @@ go 1.25.1 require ( github.com/BurntSushi/toml v1.5.0 + github.com/GrailFinder/google-translate-tts v0.0.0-00010101000000-000000000000 github.com/GrailFinder/searchagent v0.2.0 github.com/gdamore/tcell/v2 v2.13.2 github.com/glebarez/go-sqlite v1.22.0 @@ -23,6 +24,7 @@ require ( github.com/gdamore/encoding v1.0.1 // indirect github.com/google/uuid v1.6.0 // indirect github.com/hajimehoshi/go-mp3 v0.3.4 // indirect + github.com/hajimehoshi/oto/v2 v2.3.1 // indirect github.com/lucasb-eyer/go-colorful v1.3.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/ncruces/go-strftime v1.0.0 // indirect @@ -39,3 +41,5 @@ require ( modernc.org/memory v1.11.0 // indirect modernc.org/sqlite v1.40.1 // indirect ) + +replace github.com/GrailFinder/google-translate-tts => /home/grail/projects/plays/goplays/google-translate-tts @@ -35,6 +35,7 @@ github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b h1:WEuQWBxel github.com/gordonklaus/portaudio v0.0.0-20250206071425-98a94950218b/go.mod h1:esZFQEUwqC+l76f2R8bIWSwXMaPbp79PppwZ1eJhFco= github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68= github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo= +github.com/hajimehoshi/oto/v2 v2.3.1 h1:qrLKpNus2UfD674oxckKjNJmesp9hMh7u7QCrStB3Rc= github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= |
