From 4a581f6c122255bddcb3580539ff24b3c7d7c657 Mon Sep 17 00:00:00 2001
From: Grail Finder
Date: Sun, 9 Nov 2025 11:28:50 +0300
Subject: Chore: stt reworks [WIP]

---
 bot.go                  |  2 +-
 config/config.go        | 12 +++++++++---
 extra/stt.go            | 30 ++++++++++++++++++++----------
 extra/whisper_binary.go | 31 +++++++++++++++++++++++++++++++
 4 files changed, 61 insertions(+), 14 deletions(-)
 create mode 100644 extra/whisper_binary.go

diff --git a/bot.go b/bot.go
index 537df0c..4dbe5af 100644
--- a/bot.go
+++ b/bot.go
@@ -606,6 +606,6 @@ func init() {
 		orator = extra.NewOrator(logger, cfg)
 	}
 	if cfg.STT_ENABLED {
-		asr = extra.NewWhisperSTT(logger, cfg.STT_URL, 16000)
+		asr = extra.NewSTT(logger, cfg)
 	}
 }
diff --git a/config/config.go b/config/config.go
index d73bf28..77873e8 100644
--- a/config/config.go
+++ b/config/config.go
@@ -56,9 +56,14 @@ type Config struct {
 	TTS_ENABLED bool    `toml:"TTS_ENABLED"`
 	TTS_SPEED   float32 `toml:"TTS_SPEED"`
 	// STT
-	STT_URL     string `toml:"STT_URL"`
-	STT_ENABLED bool   `toml:"STT_ENABLED"`
-	DBPATH      string `toml:"DBPATH"`
+	STT_TYPE          string `toml:"STT_TYPE"` // WHISPER_SERVER, WHISPER_BINARY
+	STT_URL           string `toml:"STT_URL"`
+	STT_SR            int    `toml:"STT_SR"`
+	STT_ENABLED       bool   `toml:"STT_ENABLED"`
+	WhisperBinaryPath string `toml:"WhisperBinaryPath"`
+	WhisperModelPath  string `toml:"WhisperModelPath"`
+	STT_LANG          string `toml:"STT_LANG"`
+	DBPATH            string `toml:"DBPATH"`
 }
 
 func LoadConfigOrDefault(fn string) *Config {
@@ -93,6 +98,7 @@ func LoadConfigOrDefault(fn string) *Config {
 		config.TTS_ENABLED = false
 		config.TTS_URL = "http://localhost:8880/v1/audio/speech"
 		config.FetchModelNameAPI = "http://localhost:8080/v1/models"
+		config.STT_SR = 16000
 	}
 	config.CurrentAPI = config.ChatAPI
 	config.APIMap = map[string]string{
diff --git a/extra/stt.go b/extra/stt.go
index ce107b4..ddcc851 100644
--- a/extra/stt.go
+++ b/extra/stt.go
@@ -5,6 +5,7 @@ import (
 	"encoding/binary"
 	"errors"
 	"fmt"
+	"gf-lt/config"
 	"io"
 	"log/slog"
 	"mime/multipart"
@@ -27,7 +28,16 @@ type StreamCloser interface {
 	Close() error
 }
 
-type WhisperSTT struct {
+func NewSTT(logger *slog.Logger, cfg *config.Config) STT {
+	switch cfg.STT_TYPE {
+	case "WHISPER_BINARY":
+	case "WHISPER_SERVER":
+		return NewWhisperServer(logger, cfg)
+	}
+	return NewWhisperServer(logger, cfg)
+}
+
+type WhisperServer struct {
 	logger      *slog.Logger
 	ServerURL   string
 	SampleRate  int
@@ -35,16 +45,16 @@ type WhisperSTT struct {
 	recording   bool
 }
 
-func NewWhisperSTT(logger *slog.Logger, serverURL string, sampleRate int) *WhisperSTT {
-	return &WhisperSTT{
+func NewWhisperServer(logger *slog.Logger, cfg *config.Config) *WhisperServer {
+	return &WhisperServer{
 		logger:      logger,
-		ServerURL:   serverURL,
-		SampleRate:  sampleRate,
+		ServerURL:   cfg.STT_URL,
+		SampleRate:  cfg.STT_SR,
 		AudioBuffer: new(bytes.Buffer),
 	}
 }
 
-func (stt *WhisperSTT) StartRecording() error {
+func (stt *WhisperServer) StartRecording() error {
 	if err := stt.microphoneStream(stt.SampleRate); err != nil {
 		return fmt.Errorf("failed to init microphone: %w", err)
 	}
@@ -52,7 +62,7 @@ func (stt *WhisperSTT) StartRecording() error {
 	return nil
 }
 
-func (stt *WhisperSTT) StopRecording() (string, error) {
+func (stt *WhisperServer) StopRecording() (string, error) {
 	stt.recording = false
 	// wait loop to finish?
 	if stt.AudioBuffer == nil {
@@ -107,7 +117,7 @@ func (stt *WhisperSTT) StopRecording() (string, error) {
 	return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
 }
 
-func (stt *WhisperSTT) writeWavHeader(w io.Writer, dataSize int) {
+func (stt *WhisperServer) writeWavHeader(w io.Writer, dataSize int) {
 	header := make([]byte, 44)
 	copy(header[0:4], "RIFF")
 	binary.LittleEndian.PutUint32(header[4:8], uint32(36+dataSize))
@@ -127,11 +137,11 @@ func (stt *WhisperSTT) writeWavHeader(w io.Writer, dataSize int) {
 	}
 }
 
-func (stt *WhisperSTT) IsRecording() bool {
+func (stt *WhisperServer) IsRecording() bool {
 	return stt.recording
 }
 
-func (stt *WhisperSTT) microphoneStream(sampleRate int) error {
+func (stt *WhisperServer) microphoneStream(sampleRate int) error {
 	if err := portaudio.Initialize(); err != nil {
 		return fmt.Errorf("portaudio init failed: %w", err)
 	}
diff --git a/extra/whisper_binary.go b/extra/whisper_binary.go
new file mode 100644
index 0000000..1002a97
--- /dev/null
+++ b/extra/whisper_binary.go
@@ -0,0 +1,31 @@
+package extra
+
+import (
+	"context"
+	"gf-lt/config"
+	"log/slog"
+	"os/exec"
+	"sync"
+)
+
+type WhisperBinary struct {
+	whisperPath string
+	modelPath   string
+	lang        string
+	ctx         context.Context
+	cancel      context.CancelFunc
+	mu          sync.Mutex
+	running     bool
+	cmd         *exec.Cmd
+}
+
+func NewWhisperBinary(logger *slog.Logger, cfg *config.Config) *WhisperBinary {
+	ctx, cancel := context.WithCancel(context.Background())
+	return &WhisperBinary{
+		whisperPath: cfg.WhisperBinaryPath,
+		modelPath:   cfg.WhisperModelPath,
+		lang:        cfg.STT_LANG,
+		ctx:         ctx,
+		cancel:      cancel,
+	}
+}
-- 
cgit v1.2.3