Feat: stop audio [WIP]

author: Grail Finder <wohilas@gmail.com> 2025-05-19 09:42:47 +0300
committer: Grail Finder <wohilas@gmail.com> 2025-05-19 09:42:47 +0300
commit: a7e7da6f9965624e4667ecedd23f2eb073ac2f56 (patch)
tree: 2046f8ed77ccf0d47fe0a074691f8138f8419d51
parent: 2e5755c28a86c525041cb952764425ad82411ec7 (diff)
6 files changed, 98 insertions, 82 deletions
diff --git a/bot.go b/bot.go
index 716bbd4..6b733b2 100644
--- a/bot.go
+++ b/bot.go
@@ -524,7 +524,7 @@ func init() {
 	choseChunkParser()
 	httpClient = createClient(time.Second * 15)
 	if cfg.TTS_ENABLED {
-		orator = extra.InitOrator(logger, cfg.TTS_URL)
+		orator = extra.NewOrator(logger, cfg)
 	}
 	if cfg.STT_ENABLED {
 		asr = extra.NewWhisperSTT(logger, cfg.STT_URL, 16000)
diff --git a/config.example.toml b/config.example.toml
index 16409f7..229f657 100644
--- a/config.example.toml
+++ b/config.example.toml
@@ -15,6 +15,7 @@ RAGWorkers = 5
 # extra tts
 TTS_ENABLED = false
 TTS_URL = "http://localhost:8880/v1/audio/speech"
+TTS_SPEED = 1.0
 # extra stt
 STT_ENABLED = false
 STT_URL = "http://localhost:8081/inference"
diff --git a/config/config.go b/config/config.go
index ccae96d..e612aa7 100644
--- a/config/config.go
+++ b/config/config.go
@@ -40,8 +40,9 @@ type Config struct {
 	DeepSeekModel         string `toml:"DeepSeekModel"`
 	ApiLinks              []string
 	// TTS
-	TTS_URL     string `toml:"TTS_URL"`
-	TTS_ENABLED bool   `toml:"TTS_ENABLED"`
+	TTS_URL     string  `toml:"TTS_URL"`
+	TTS_ENABLED bool    `toml:"TTS_ENABLED"`
+	TTS_SPEED   float32 `toml:"TTS_SPEED"`
 	// STT
 	STT_URL     string `toml:"STT_URL"`
 	STT_ENABLED bool   `toml:"STT_ENABLED"`
diff --git a/extra/audio.go b/extra/audio.go
index b130bf8..8b0d8f9 100644
--- a/extra/audio.go
+++ b/extra/audio.go
@@ -2,6 +2,7 @@ package extra
 
 import (
 	"bytes"
+	"elefant/config"
 	"elefant/models"
 	"encoding/json"
 	"fmt"
@@ -18,24 +19,45 @@ import (
 )
 
 var (
-	TTSTextChan  = make(chan string, 1000)
+	TTSTextChan  = make(chan string, 10000)
 	TTSFlushChan = make(chan bool, 1)
 	TTSDoneChan  = make(chan bool, 1)
 )
 
 type Orator interface {
 	Speak(text string) error
+	Stop()
+	// pause and resume?
+	GetSBuilder() strings.Builder
 	GetLogger() *slog.Logger
 }
 
 // impl https://github.com/remsky/Kokoro-FastAPI
 type KokoroOrator struct {
-	logger   *slog.Logger
-	URL      string
-	Format   models.AudioFormat
-	Stream   bool
-	Speed    int8
-	Language string
+	logger        *slog.Logger
+	URL           string
+	Format        models.AudioFormat
+	Stream        bool
+	Speed         float32
+	Language      string
+	Voice         string
+	currentStream *beep.Ctrl // Added for playback control
+	textBuffer    strings.Builder
+}
+
+// stoproutine serves every stop signal: it halts playback, then drains queued text without blocking.
+func stoproutine(orator Orator) {
+	for range TTSDoneChan { // keep serving: returning after one signal would leave later stop requests unread
+		orator.GetLogger().Info("orator got done signal")
+		orator.Stop()
+		for drained := false; !drained; {
+			select {
+			case <-TTSTextChan:
+			default:
+				drained = true
+			}
+		}
+	}
+}
 
 func readroutine(orator Orator) {
@@ -70,98 +92,58 @@ func readroutine(orator Orator) {
 					break
 				}
 			}
+			// INFO: if there is a lot of text, synthesizing it all at once with tts takes some time;
+			// to avoid this pause, it might be better to keep splitting on sentences,
+			// but keeping in mind that the remainder could be omitted by the tokenizer
 			// Flush remaining text
 			remaining := remainder.String()
-			orator.GetLogger().Info("flushing", "rem", remaining)
-			if remaining != "" { // but nothing is here?
-				orator.GetLogger().Info("flushing", "remaining", remaining)
-				if err := orator.Speak(remaining); err != nil {
-					orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
-				}
-			}
-		case <-TTSDoneChan:
-			// Flush remaining text
-			if remaining := sentenceBuf.String(); remaining != "" {
+			remainder.Reset()
+			if remaining != "" {
+				// orator.GetLogger().Info("flushing", "remaining", remaining)
 				if err := orator.Speak(remaining); err != nil {
 					orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
 				}
 			}
-			return
+			// case <-TTSDoneChan:
+			// 	orator.GetLogger().Info("orator got done signal")
+			// 	orator.Stop()
+			// 	// is that the best way to empty the channel?
+			// 	close(TTSTextChan)
+			// 	TTSTextChan = make(chan string, 10000)
+			// 	return
 		}
 	}
 }
 
-func InitOrator(log *slog.Logger, URL string) Orator {
+func NewOrator(log *slog.Logger, cfg *config.Config) Orator {
 	orator := &KokoroOrator{
 		logger:   log,
-		URL:      URL,
+		URL:      cfg.TTS_URL,
 		Format:   models.AFMP3,
 		Stream:   false,
-		Speed:    1,
+		Speed:    cfg.TTS_SPEED,
 		Language: "a",
+		Voice:    "af_bella(1)+af_sky(1)",
 	}
 	go readroutine(orator)
+	go stoproutine(orator)
 	return orator
 }
 
-// type AudioStream struct {
-// 	TextChan chan string // Send text chunks here
-// 	DoneChan chan bool   // Close when streaming ends
-// }
-
-// func RunOrator(orator Orator) *AudioStream {
-// 	stream := &AudioStream{
-// 		TextChan: make(chan string, 1000),
-// 		DoneChan: make(chan bool),
-// 	}
-// 	go func() {
-// 		tokenizer, _ := english.NewSentenceTokenizer(nil)
-// 		var sentenceBuf bytes.Buffer
-// 		for {
-// 			select {
-// 			case chunk := <-stream.TextChan:
-// 				sentenceBuf.WriteString(chunk)
-// 				text := sentenceBuf.String()
-// 				sentences := tokenizer.Tokenize(text)
-// 				for i, sentence := range sentences {
-// 					if i == len(sentences)-1 {
-// 						sentenceBuf.Reset()
-// 						sentenceBuf.WriteString(sentence.Text)
-// 						continue
-// 					}
-// 					// Send complete sentence to TTS
-// 					if err := orator.Speak(sentence.Text); err != nil {
-// 						orator.GetLogger().Error("tts failed", "sentence", sentence.Text, "error", err)
-// 					}
-// 				}
-// 			case <-stream.DoneChan:
-// 				// Flush remaining text
-// 				if remaining := sentenceBuf.String(); remaining != "" {
-// 					if err := orator.Speak(remaining); err != nil {
-// 						orator.GetLogger().Error("tts failed", "sentence", remaining, "error", err)
-// 					}
-// 				}
-// 				return
-// 			}
-// 		}
-// 	}()
-// 	return stream
-// }
-
 func (o *KokoroOrator) GetLogger() *slog.Logger {
 	return o.logger
 }
 
 func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
 	payload := map[string]interface{}{
-		"input":                text,
-		"voice":                "af_bella(1)+af_sky(1)",
-		"response_format":      "mp3",
-		"download_format":      "mp3",
-		"stream":               o.Stream,
-		"speed":                o.Speed,
-		"return_download_link": true,
-		"lang_code":            o.Language,
+		"input":           text,
+		"voice":           o.Voice,
+		"response_format": o.Format,
+		"download_format": o.Format,
+		"stream":          o.Stream,
+		"speed":           o.Speed,
+		// "return_download_link": true,
+		"lang_code": o.Language,
 	}
 	payloadBytes, err := json.Marshal(payload)
 	if err != nil {
@@ -185,6 +167,7 @@ func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) {
 }
 
 func (o *KokoroOrator) Speak(text string) error {
+	o.logger.Info("fn: Speak is called", "text-len", len(text))
 	body, err := o.requestSound(text)
 	if err != nil {
 		o.logger.Error("request failed", "error", err)
@@ -198,11 +181,33 @@ func (o *KokoroOrator) Speak(text string) error {
 		return fmt.Errorf("mp3 decode failed: %w", err)
 	}
 	defer streamer.Close()
-	speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10))
+	// here it spams with errors that speaker cannot be initialized more than once, but how would we deal with many audio records then?
+	if err := speaker.Init(format.SampleRate, format.SampleRate.N(time.Second/10)); err != nil {
+		o.logger.Debug("failed to init speaker", "error", err)
+	}
 	done := make(chan bool)
-	speaker.Play(beep.Seq(streamer, beep.Callback(func() {
+	// Create controllable stream and store reference
+	o.currentStream = &beep.Ctrl{Streamer: beep.Seq(streamer, beep.Callback(func() {
 		close(done)
-	})))
-	<-done
+		o.currentStream = nil
+	})), Paused: false}
+	speaker.Play(o.currentStream)
+	<-done // we hang in this routine;
 	return nil
 }
+
+// Stop silences the current playback, if any. TODO: stop works; but new stream does not start afterwards
+func (o *KokoroOrator) Stop() {
+	// speaker.Clear()
+	o.logger.Info("attempted to stop orator", "orator", o)
+	speaker.Lock() // hold the speaker lock while the stream's fields are changed
+	defer speaker.Unlock()
+	if o.currentStream != nil {
+		o.currentStream.Paused = true
+		o.currentStream.Streamer = nil // NOTE(review): this also drops the callback Speak waits on via `done` — likely why Speak never returns after Stop; confirm
+	}
+}
+
+func (o *KokoroOrator) GetSBuilder() strings.Builder { // GetSBuilder returns textBuffer by value, i.e. a copy
+	return o.textBuffer // NOTE(review): strings.Builder docs say not to copy a non-zero Builder (writes to the copy panic) — consider a pointer
+}
diff --git a/models/extra.go b/models/extra.go
index 4e3a0bf..e1ca80f 100644
--- a/models/extra.go
+++ b/models/extra.go
@@ -3,6 +3,6 @@ package models
 type AudioFormat string
 
 const (
-	AFOPUS AudioFormat = "opus"
-	AFMP3  AudioFormat = "mp3"
+	AFWav AudioFormat = "wav"
+	AFMP3 AudioFormat = "mp3"
 )
diff --git a/tui.go b/tui.go
index a2e3ded..df42ea0 100644
--- a/tui.go
+++ b/tui.go
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"elefant/extra"
 	"elefant/models"
 	"elefant/pngmeta"
 	"fmt"
@@ -708,6 +709,14 @@ func init() {
 				return nil
 			}
 		}
+		// I need keybind for tts to shut up
+		if event.Key() == tcell.KeyCtrlA {
+			textArea.SetText("pressed ctrl+A", true)
+			if cfg.TTS_ENABLED {
+				// audioStream.TextChan <- chunk
+				extra.TTSDoneChan <- true
+			}
+		}
 		if event.Key() == tcell.KeyCtrlW {
 			// INFO: continue bot/text message
 			// without new role
author	Grail Finder <wohilas@gmail.com>	2025-05-19 09:42:47 +0300
committer	Grail Finder <wohilas@gmail.com>	2025-05-19 09:42:47 +0300
commit	a7e7da6f9965624e4667ecedd23f2eb073ac2f56 (patch)
tree	2046f8ed77ccf0d47fe0a074691f8138f8419d51
parent	2e5755c28a86c525041cb952764425ad82411ec7 (diff)