diff options
author | Grail Finder <wohilas@gmail.com> | 2025-05-19 09:42:47 +0300 |
---|---|---|
committer | Grail Finder <wohilas@gmail.com> | 2025-05-19 09:42:47 +0300 |
commit | a7e7da6f9965624e4667ecedd23f2eb073ac2f56 (patch) | |
tree | 2046f8ed77ccf0d47fe0a074691f8138f8419d51 | |
parent | 2e5755c28a86c525041cb952764425ad82411ec7 (diff) |
Feat: stop audio [WIP]
-rw-r--r-- | bot.go | 2 | ||||
-rw-r--r-- | config.example.toml | 1 | ||||
-rw-r--r-- | config/config.go | 5 | ||||
-rw-r--r-- | extra/audio.go | 159 | ||||
-rw-r--r-- | models/extra.go | 4 | ||||
-rw-r--r-- | tui.go | 9 |
6 files changed, 98 insertions, 82 deletions
@@ -524,7 +524,7 @@ func init() { choseChunkParser() httpClient = createClient(time.Second * 15) if cfg.TTS_ENABLED { - orator = extra.InitOrator(logger, cfg.TTS_URL) + orator = extra.NewOrator(logger, cfg) } if cfg.STT_ENABLED { asr = extra.NewWhisperSTT(logger, cfg.STT_URL, 16000) diff --git a/config.example.toml b/config.example.toml index 16409f7..229f657 100644 --- a/config.example.toml +++ b/config.example.toml @@ -15,6 +15,7 @@ RAGWorkers = 5 # extra tts TTS_ENABLED = false TTS_URL = "http://localhost:8880/v1/audio/speech" +TTS_SPEED = 1.0 # extra stt STT_ENABLED = false STT_URL = "http://localhost:8081/inference" diff --git a/config/config.go b/config/config.go index ccae96d..e612aa7 100644 --- a/config/config.go +++ b/config/config.go @@ -40,8 +40,9 @@ type Config struct { DeepSeekModel string `toml:"DeepSeekModel"` ApiLinks []string // TTS - TTS_URL string `toml:"TTS_URL"` - TTS_ENABLED bool `toml:"TTS_ENABLED"` + TTS_URL string `toml:"TTS_URL"` + TTS_ENABLED bool `toml:"TTS_ENABLED"` + TTS_SPEED float32 `toml:"TTS_SPEED"` // STT STT_URL string `toml:"STT_URL"` STT_ENABLED bool `toml:"STT_ENABLED"` diff --git a/extra/audio.go b/extra/audio.go index b130bf8..8b0d8f9 100644 --- a/extra/audio.go +++ b/extra/audio.go @@ -2,6 +2,7 @@ package extra import ( "bytes" + "elefant/config" "elefant/models" "encoding/json" "fmt" @@ -18,24 +19,45 @@ import ( ) var ( - TTSTextChan = make(chan string, 1000) + TTSTextChan = make(chan string, 10000) TTSFlushChan = make(chan bool, 1) TTSDoneChan = make(chan bool, 1) ) type Orator interface { Speak(text string) error + Stop() + // pause and resume? + GetSBuilder() strings.Builder GetLogger() *slog.Logger } // impl https://github.com/remsky/Kokoro-FastAPI type KokoroOrator struct { - logger *slog.Logger - URL string - Format models.AudioFormat - Stream bool - Speed int8 - Language string + logger *slog.Logger + URL string + Format models.AudioFormat + Stream bool + Speed float32 + Language string + Voice string + currentStream *beep.Ctrl // Added for playback control + textBuffer strings.Builder +} + +func stoproutine(orator Orator) { + select { + case <-TTSDoneChan: + orator.GetLogger().Info("orator got done signal") + orator.Stop() + // close(TTSTextChan) + // TTSTextChan = make(chan string, 10000) + // drain the channel + for len(TTSTextChan) > 0 { + <-TTSTextChan + } + return + } } func readroutine(orator Orator) { @@ -70,98 +92,58 @@ func readroutine(orator Orator) { break } } + // INFO: if there is a lot of text it will take some time to make with tts at once + // to avoid this pause, it might be better to keep splitting on sentences + // but keepinig in mind that remainder could be ommited by tokenizer // Flush remaining text remaining := remainder.String() - orator.GetLogger().Info("flushing", "rem", remaining) - if remaining != "" { // but nothing is here? - orator.GetLogger().Info("flushing", "remaining", remaining) - if err := orator.Speak(remaining); err != nil { - orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err) - } - } - case <-TTSDoneChan: - // Flush remaining text - if remaining := sentenceBuf.String(); remaining != "" { + remainder.Reset() + if remaining != "" { + // orator.GetLogger().Info("flushing", "remaining", remaining) if err := orator.Speak(remaining); err != nil { orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err) } } - return + // case <-TTSDoneChan: + // orator.GetLogger().Info("orator got done signal") + // orator.Stop() + // // it that the best way to empty channel? + // close(TTSTextChan) + // TTSTextChan = make(chan string, 10000) + // return } } } -func InitOrator(log *slog.Logger, URL string) Orator { +func NewOrator(log *slog.Logger, cfg *config.Config) Orator { orator := &KokoroOrator{ logger: log, - URL: URL, + URL: cfg.TTS_URL, Format: models.AFMP3, Stream: false, - Speed: 1, + Speed: cfg.TTS_SPEED, Language: "a", + Voice: "af_bella(1)+af_sky(1)", } go readroutine(orator) + go stoproutine(orator) return orator } -// type AudioStream struct { -// TextChan chan string // Send text chunks here -// DoneChan chan bool // Close when streaming ends -// } - -// func RunOrator(orator Orator) *AudioStream { -// stream := &AudioStream{ -// TextChan: make(chan string, 1000), -// DoneChan: make(chan bool), -// } -// go func() { -// tokenizer, _ := english.NewSentenceTokenizer(nil) -// var sentenceBuf bytes.Buffer -// for { -// select { -// case chunk := <-stream.TextChan: -// sentenceBuf.WriteString(chunk) -// text := sentenceBuf.String() -// sentences := tokenizer.Tokenize(text) -// for i, sentence := range sentences { -// if i == len(sentences)-1 { -// sentenceBuf.Reset() -// sentenceBuf.WriteString(sentence.Text) -// continue -// } -// // Send complete sentence to TTS -// if err := orator.Speak(sentence.Text); err != nil { -// orator.GetLogger().Error("tts failed", "sentence", sentence.Text, "error", err) -// } -// } -// case <-stream.DoneChan: -// // Flush remaining text -// if remaining := sentenceBuf.String(); remaining != "" { -// if err := orator.Speak(remaining); err != nil { -// orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err) -// } -// } -// return -// } -// } -// }() -// return stream -// } - func (o *KokoroOrator) GetLogger() *slog.Logger { return o.logger } func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) { payload := map[string]interface{}{ - "input": text, - "voice": "af_bella(1)+af_sky(1)", - "response_format": "mp3", - "download_format": "mp3", - "stream": o.Stream, - "speed": o.Speed, - "return_download_link": true, - "lang_code": o.Language, + "input": text, + "voice": o.Voice, + "response_format": o.Format, + "download_format": o.Format, + "stream": o.Stream, + "speed": o.Speed, + // "return_download_link": true, + "lang_code": o.Language, } payloadBytes, err := json.Marshal(payload) if err != nil { @@ -185,6 +167,7 @@ func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) { } func (o *KokoroOrator) Speak(text string) error { + o.logger.Info("fn: Speak is called", "text-len", len(text)) body, err := o.requestSound(text) if err != nil { o.logger.Error("request failed", "error", err) @@ -198,11 +181,33 @@ func (o *KokoroOrator) Speak(text string) error { return fmt.Errorf("mp3 decode failed: %w", err) } defer streamer.Close() - speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)) + // here it spams with errors that speaker cannot be initialized more than once, but how would we deal with many audio records then? + if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil { + o.logger.Debug("failed to init speaker", "error", err) + } done := make(chan bool) - speaker.Play(beep.Seq(streamer, beep.Callback(func() { + // Create controllable stream and store reference + o.currentStream = &beep.Ctrl{Streamer: beep.Seq(streamer, beep.Callback(func() { close(done) - }))) - <-done + o.currentStream = nil + })), Paused: false} + speaker.Play(o.currentStream) + <-done // we hang in this routine; return nil } + +// TODO: stop works; but new stream does not start afterwards +func (o *KokoroOrator) Stop() { + // speaker.Clear() + o.logger.Info("attempted to stop orator", "orator", o) + speaker.Lock() + defer speaker.Unlock() + if o.currentStream != nil { + o.currentStream.Paused = true + o.currentStream.Streamer = nil + } +} + +func (o *KokoroOrator) GetSBuilder() strings.Builder { + return o.textBuffer +} diff --git a/models/extra.go b/models/extra.go index 4e3a0bf..e1ca80f 100644 --- a/models/extra.go +++ b/models/extra.go @@ -3,6 +3,6 @@ package models type AudioFormat string const ( - AFOPUS AudioFormat = "opus" - AFMP3 AudioFormat = "mp3" + AFWav AudioFormat = "wav" + AFMP3 AudioFormat = "mp3" ) @@ -1,6 +1,7 @@ package main import ( + "elefant/extra" "elefant/models" "elefant/pngmeta" "fmt" @@ -708,6 +709,14 @@ func init() { return nil } } + // I need keybind for tts to shut up + if event.Key() == tcell.KeyCtrlA { + textArea.SetText("pressed ctrl+A", true) + if cfg.TTS_ENABLED { + // audioStream.TextChan <- chunk + extra.TTSDoneChan <- true + } + } if event.Key() == tcell.KeyCtrlW { // INFO: continue bot/text message // without new role |