summaryrefslogtreecommitdiff
path: root/extra/audio.go
diff options
context:
space:
mode:
authorGrail Finder <wohilas@gmail.com>2025-05-21 21:16:42 +0300
committerGrail Finder <wohilas@gmail.com>2025-05-21 21:16:42 +0300
commit4a17dd745cbd7354a24b046b76838f7670af51c6 (patch)
tree7365b5d5d3b3ad24252d1a5ea6b1a3c0de0b2557 /extra/audio.go
parentfe4f759173ecd3cff2a6db38878cfd2727b8b49c (diff)
Enha: rename to tts.go; use one buffer;
Diffstat (limited to 'extra/audio.go')
-rw-r--r--extra/audio.go208
1 files changed, 0 insertions, 208 deletions
diff --git a/extra/audio.go b/extra/audio.go
deleted file mode 100644
index 11021f9..0000000
--- a/extra/audio.go
+++ /dev/null
@@ -1,208 +0,0 @@
-package extra
-
-import (
- "bytes"
- "gf-lt/config"
- "gf-lt/models"
- "encoding/json"
- "fmt"
- "io"
- "log/slog"
- "net/http"
- "strings"
- "time"
-
- "github.com/gopxl/beep/v2"
- "github.com/gopxl/beep/v2/mp3"
- "github.com/gopxl/beep/v2/speaker"
- "github.com/neurosnap/sentences/english"
-)
-
-var (
- TTSTextChan = make(chan string, 10000)
- TTSFlushChan = make(chan bool, 1)
- TTSDoneChan = make(chan bool, 1)
-)
-
-type Orator interface {
- Speak(text string) error
- Stop()
- // pause and resume?
- GetSBuilder() strings.Builder
- GetLogger() *slog.Logger
-}
-
-// impl https://github.com/remsky/Kokoro-FastAPI
-type KokoroOrator struct {
- logger *slog.Logger
- URL string
- Format models.AudioFormat
- Stream bool
- Speed float32
- Language string
- Voice string
- currentStream *beep.Ctrl // Added for playback control
- textBuffer strings.Builder
-}
-
-func stoproutine(orator Orator) {
- <-TTSDoneChan
- orator.GetLogger().Info("orator got done signal")
- orator.Stop()
- // drain the channel
- for len(TTSTextChan) > 0 {
- <-TTSTextChan
- }
-}
-
-func readroutine(orator Orator) {
- tokenizer, _ := english.NewSentenceTokenizer(nil)
- var sentenceBuf bytes.Buffer
- var remainder strings.Builder
- for {
- select {
- case chunk := <-TTSTextChan:
- sentenceBuf.WriteString(chunk)
- text := sentenceBuf.String()
- sentences := tokenizer.Tokenize(text)
- for i, sentence := range sentences {
- if i == len(sentences)-1 {
- sentenceBuf.Reset()
- sentenceBuf.WriteString(sentence.Text)
- continue
- }
- // Send complete sentence to TTS
- if err := orator.Speak(sentence.Text); err != nil {
- orator.GetLogger().Error("tts failed", "sentence", sentence.Text, "error", err)
- }
- }
- case <-TTSFlushChan:
- // lln is done get the whole message out
- // FIXME: loses one token
- for chunk := range TTSTextChan {
- // orator.GetLogger().Info("flushing", "chunk", chunk)
- // sentenceBuf.WriteString(chunk)
- remainder.WriteString(chunk) // I get text here
- if len(TTSTextChan) == 0 {
- break
- }
- }
- // INFO: if there is a lot of text it will take some time to make with tts at once
- // to avoid this pause, it might be better to keep splitting on sentences
- // but keepinig in mind that remainder could be ommited by tokenizer
- // Flush remaining text
- remaining := remainder.String()
- remainder.Reset()
- if remaining != "" {
- // orator.GetLogger().Info("flushing", "remaining", remaining)
- if err := orator.Speak(remaining); err != nil {
- orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
- }
- }
- // case <-TTSDoneChan:
- // orator.GetLogger().Info("orator got done signal")
- // orator.Stop()
- // // it that the best way to empty channel?
- // close(TTSTextChan)
- // TTSTextChan = make(chan string, 10000)
- // return
- }
- }
-}
-
-func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
- orator := &KokoroOrator{
- logger: log,
- URL: cfg.TTS_URL,
- Format: models.AFMP3,
- Stream: false,
- Speed: cfg.TTS_SPEED,
- Language: "a",
- Voice: "af_bella(1)+af_sky(1)",
- }
- go readroutine(orator)
- go stoproutine(orator)
- return orator
-}
-
-func (o *KokoroOrator) GetLogger() *slog.Logger {
- return o.logger
-}
-
-func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
- payload := map[string]interface{}{
- "input": text,
- "voice": o.Voice,
- "response_format": o.Format,
- "download_format": o.Format,
- "stream": o.Stream,
- "speed": o.Speed,
- // "return_download_link": true,
- "lang_code": o.Language,
- }
- payloadBytes, err := json.Marshal(payload)
- if err != nil {
- return nil, fmt.Errorf("failed to marshal payload: %w", err)
- }
- req, err := http.NewRequest("POST", o.URL, bytes.NewBuffer(payloadBytes)) //nolint:noctx
- if err != nil {
- return nil, fmt.Errorf("failed to create request: %w", err)
- }
- req.Header.Set("accept", "application/json")
- req.Header.Set("Content-Type", "application/json")
- resp, err := http.DefaultClient.Do(req)
- if err != nil {
- return nil, fmt.Errorf("request failed: %w", err)
- }
- if resp.StatusCode != http.StatusOK {
- defer resp.Body.Close()
- return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
- }
- return resp.Body, nil
-}
-
-func (o *KokoroOrator) Speak(text string) error {
- o.logger.Info("fn: Speak is called", "text-len", len(text))
- body, err := o.requestSound(text)
- if err != nil {
- o.logger.Error("request failed", "error", err)
- return fmt.Errorf("request failed: %w", err)
- }
- defer body.Close()
- // Decode the mp3 audio from response body
- streamer, format, err := mp3.Decode(body)
- if err != nil {
- o.logger.Error("mp3 decode failed", "error", err)
- return fmt.Errorf("mp3 decode failed: %w", err)
- }
- defer streamer.Close()
- // here it spams with errors that speaker cannot be initialized more than once, but how would we deal with many audio records then?
- if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
- o.logger.Debug("failed to init speaker", "error", err)
- }
- done := make(chan bool)
- // Create controllable stream and store reference
- o.currentStream = &beep.Ctrl{Streamer: beep.Seq(streamer, beep.Callback(func() {
- close(done)
- o.currentStream = nil
- })), Paused: false}
- speaker.Play(o.currentStream)
- <-done // we hang in this routine;
- return nil
-}
-
-// TODO: stop works; but new stream does not start afterwards
-func (o *KokoroOrator) Stop() {
- // speaker.Clear()
- o.logger.Info("attempted to stop orator", "orator", o)
- speaker.Lock()
- defer speaker.Unlock()
- if o.currentStream != nil {
- o.currentStream.Paused = true
- o.currentStream.Streamer = nil
- }
-}
-
-func (o *KokoroOrator) GetSBuilder() strings.Builder {
- return o.textBuffer
-}