diff options
| author | Grail Finder <wohilas@gmail.com> | 2026-03-07 15:41:39 +0300 |
|---|---|---|
| committer | Grail Finder <wohilas@gmail.com> | 2026-03-07 15:41:39 +0300 |
| commit | 0e55e44f624d2839dc51fa293a18b323c497a6b1 (patch) | |
| tree | f5fbaeb112c92456454a3af188c5c89fbdc40a2f /extra/kokoro.go | |
| parent | 014e297ae3497d07b5c46c234a9157db8dfce198 (diff) | |
Enha (kokoro): use ffplay instead of beep (portaudio)
Diffstat (limited to 'extra/kokoro.go')
| -rw-r--r-- | extra/kokoro.go | 277 |
1 files changed, 277 insertions, 0 deletions
diff --git a/extra/kokoro.go b/extra/kokoro.go new file mode 100644 index 0000000..15b173b --- /dev/null +++ b/extra/kokoro.go @@ -0,0 +1,277 @@ +//go:build extra +// +build extra + +package extra + +import ( + "bytes" + "encoding/json" + "fmt" + "gf-lt/models" + "io" + "log/slog" + "net/http" + "os/exec" + "strings" + "sync" + + "github.com/neurosnap/sentences/english" +) + +type KokoroOrator struct { + logger *slog.Logger + mu sync.Mutex + URL string + Format models.AudioFormat + Stream bool + Speed float32 + Language string + Voice string + // fields for playback control + cmd *exec.Cmd + cmdMu sync.Mutex + stopCh chan struct{} + // textBuffer, interrupt etc. remain the same + textBuffer strings.Builder + interrupt bool +} + +func (o *KokoroOrator) GetLogger() *slog.Logger { + return o.logger +} + +// Speak streams audio directly to an external player +func (o *KokoroOrator) Speak(text string) error { + o.logger.Debug("fn: Speak is called", "text-len", len(text)) + // 1. Get the audio stream (still an io.ReadCloser) + body, err := o.requestSound(text) + if err != nil { + return fmt.Errorf("request failed: %w", err) + } + defer body.Close() + // 2. Prepare external player (ffplay as example) + // -i pipe:0 tells ffplay to read from stdin + cmd := exec.Command("ffplay", "-nodisp", "-autoexit", "-i", "pipe:0") + stdin, err := cmd.StdinPipe() + if err != nil { + return fmt.Errorf("failed to get stdin pipe: %w", err) + } + o.cmdMu.Lock() + o.cmd = cmd + o.stopCh = make(chan struct{}) + o.cmdMu.Unlock() + // 3. Start the player + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start ffplay: %w", err) + } + // 4. Copy audio data to stdin in a goroutine + copyErr := make(chan error, 1) + go func() { + _, err := io.Copy(stdin, body) + stdin.Close() // signal EOF to player + copyErr <- err + }() + // 5. Wait for player to finish or stop signal + done := make(chan error, 1) + go func() { + done <- cmd.Wait() + }() + select { + case <-o.stopCh: + // Stop requested: kill the player + if o.cmd != nil && o.cmd.Process != nil { + o.cmd.Process.Kill() + } + <-done // wait for process to exit + return nil + case err := <-done: + // Playback finished normally + return err + case copyErrVal := <-copyErr: + if copyErrVal != nil { + // Copy failed – kill the player + if o.cmd != nil && o.cmd.Process != nil { + o.cmd.Process.Kill() + } + <-done + return copyErrVal + } + return nil + } +} + +// // Stop interrupts ongoing playback +// func (o *KokoroOrator) Stop() { +// o.cmdMu.Lock() +// defer o.cmdMu.Unlock() +// if o.stopCh != nil { +// close(o.stopCh) +// } +// // Also clear the buffer and set interrupt flag as before +// o.mu.Lock() +// o.textBuffer.Reset() +// o.interrupt = true +// o.mu.Unlock() +// } + +func (o *KokoroOrator) requestSound(text string) (io.ReadCloser, error) { + if o.URL == "" { + return nil, fmt.Errorf("TTS URL is empty") + } + payload := map[string]interface{}{ + "input": text, + "voice": o.Voice, + "response_format": o.Format, + "download_format": o.Format, + "stream": o.Stream, + "speed": o.Speed, + // "return_download_link": true, + "lang_code": o.Language, + } + payloadBytes, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("failed to marshal payload: %w", err) + } + req, err := http.NewRequest("POST", o.URL, bytes.NewBuffer(payloadBytes)) //nolint:noctx + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + req.Header.Set("accept", "application/json") + req.Header.Set("Content-Type", "application/json") + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("request failed: %w", err) + } + if resp.StatusCode != http.StatusOK { + defer resp.Body.Close() + return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } + return resp.Body, nil +} + +func (o *KokoroOrator) stoproutine() { + for { + <-TTSDoneChan + o.logger.Debug("orator got done signal") + // 1. Stop any ongoing playback (kills external player, closes stopCh) + o.Stop() + // 2. Drain any pending text chunks + for len(TTSTextChan) > 0 { + <-TTSTextChan + } + // 3. Reset internal state + o.mu.Lock() + o.textBuffer.Reset() + o.interrupt = true + o.mu.Unlock() + } +} + +func (o *KokoroOrator) Stop() { + o.cmdMu.Lock() + defer o.cmdMu.Unlock() + // Signal any running Speak to stop + if o.stopCh != nil { + select { + case <-o.stopCh: // already closed + default: + close(o.stopCh) + } + o.stopCh = nil + } + // Kill the external player process if it's still running + if o.cmd != nil && o.cmd.Process != nil { + o.cmd.Process.Kill() + o.cmd.Wait() // clean up zombie process + o.cmd = nil + } + // Also reset text buffer and interrupt flag (with o.mu) + o.mu.Lock() + o.textBuffer.Reset() + o.interrupt = true + o.mu.Unlock() +} + +func (o *KokoroOrator) readroutine() { + tokenizer, _ := english.NewSentenceTokenizer(nil) + for { + select { + case chunk := <-TTSTextChan: + o.mu.Lock() + o.interrupt = false + _, err := o.textBuffer.WriteString(chunk) + if err != nil { + o.logger.Warn("failed to write to stringbuilder", "error", err) + o.mu.Unlock() + continue + } + text := o.textBuffer.String() + sentences := tokenizer.Tokenize(text) + o.logger.Debug("adding chunk", "chunk", chunk, "text", text, "sen-len", len(sentences)) + if len(sentences) <= 1 { + o.mu.Unlock() + continue + } + completeSentences := sentences[:len(sentences)-1] + remaining := sentences[len(sentences)-1].Text + o.textBuffer.Reset() + o.textBuffer.WriteString(remaining) + o.mu.Unlock() + for _, sentence := range completeSentences { + o.mu.Lock() + interrupted := o.interrupt + o.mu.Unlock() + if interrupted { + return + } + cleanedText := models.CleanText(sentence.Text) + if cleanedText == "" { + continue + } + o.logger.Debug("calling Speak with sentence", "sent", cleanedText) + if err := o.Speak(cleanedText); err != nil { + o.logger.Error("tts failed", "sentence", cleanedText, "error", err) + } + } + case <-TTSFlushChan: + o.logger.Debug("got flushchan signal start") + // lln is done get the whole message out + if len(TTSTextChan) > 0 { // otherwise might get stuck + for chunk := range TTSTextChan { + o.mu.Lock() + _, err := o.textBuffer.WriteString(chunk) + o.mu.Unlock() + if err != nil { + o.logger.Warn("failed to write to stringbuilder", "error", err) + continue + } + if len(TTSTextChan) == 0 { + break + } + } + } + // flush remaining text + o.mu.Lock() + remaining := o.textBuffer.String() + remaining = models.CleanText(remaining) + o.textBuffer.Reset() + o.mu.Unlock() + if remaining == "" { + continue + } + o.logger.Debug("calling Speak with remainder", "rem", remaining) + sentencesRem := tokenizer.Tokenize(remaining) + for _, rs := range sentencesRem { // to avoid dumping large volume of text + o.mu.Lock() + interrupt := o.interrupt + o.mu.Unlock() + if interrupt { + break + } + if err := o.Speak(rs.Text); err != nil { + o.logger.Error("tts failed", "sentence", rs, "error", err) + } + } + } + } +} |
