summary refs log tree commit diff
diff options
context:
space:
mode:
author Grail Finder <wohilas@gmail.com> 2025-05-17 21:23:51 +0300
committer Grail Finder <wohilas@gmail.com> 2025-05-17 21:23:51 +0300
commit 2d56806cfa07a6238b7c6943334ce32096830f9d (patch)
tree ecb6e3fd8ddf06730fff8dc66ff0502b0e0c8851
parent f7d1fbf73c2979220855522574ce3c01aa51e47a (diff)
Feat: stt sketch [WIP]
-rw-r--r-- extra/audio.go 6
-rw-r--r-- extra/stt.go 188
-rw-r--r-- go.mod 6
-rw-r--r-- go.sum 18
4 files changed, 207 insertions, 11 deletions
diff --git a/extra/audio.go b/extra/audio.go
index 531b08b..b130bf8 100644
--- a/extra/audio.go
+++ b/extra/audio.go
@@ -11,9 +11,9 @@ import (
"strings"
"time"
- "github.com/gopxl/beep"
- "github.com/gopxl/beep/mp3"
- "github.com/gopxl/beep/speaker"
+ "github.com/gopxl/beep/v2"
+ "github.com/gopxl/beep/v2/mp3"
+ "github.com/gopxl/beep/v2/speaker"
"github.com/neurosnap/sentences/english"
)
diff --git a/extra/stt.go b/extra/stt.go
new file mode 100644
index 0000000..6456488
--- /dev/null
+++ b/extra/stt.go
@@ -0,0 +1,188 @@
+package extra
+
+import (
+ "bytes"
+ "encoding/json"
+ "errors"
+ "fmt"
+ "io"
+ "log/slog"
+ "net/http"
+ "os"
+ "os/signal"
+
+ "github.com/MarkKremer/microphone/v2"
+ "github.com/gopxl/beep/v2"
+ "github.com/gopxl/beep/v2/wav"
+)
+
+// STT is a speech-to-text backend driven by an explicit start/stop pair:
+// audio is captured between the two calls and the transcription is returned
+// by the stop call.
+type STT interface {
+	// StartRecording begins capturing audio.
+	StartRecording() error
+	// StopRecording ends the capture and returns the transcribed text.
+	StopRecording() (string, error)
+	// IsRecording reports whether a capture is currently in progress.
+	IsRecording() bool
+}
+
+// WhisperSTT is an STT implementation that records from the default
+// microphone and POSTs the WAV-encoded audio to the HTTP endpoint at
+// ServerURL, expecting a JSON reply with a "text" field.
+type WhisperSTT struct {
+	logger     *slog.Logger      // receives errors raised by the capture goroutine
+	ServerURL  string            // endpoint the recorded audio is POSTed to
+	SampleRate beep.SampleRate   // rate used to open the microphone and to encode the WAV
+	Buffer     *bytes.Buffer     // WAV bytes produced by capture; sent as the request body
+	streamer   beep.StreamCloser // microphone stream; set by StartRecording, closed by StopRecording
+	recording  bool              // true between StartRecording and StopRecording
+}
+
+// writeseeker is a minimal in-memory io.WriteSeeker. It is used as the
+// destination for wav.Encode, after which the encoded bytes are read back
+// through Reader.
+//
+// The zero value is an empty buffer positioned at offset 0. It is not safe
+// for concurrent use.
+type writeseeker struct {
+	buf []byte // everything written so far
+	pos int    // offset at which the next Write lands
+}
+
+// Compile-time check that writeseeker keeps satisfying io.WriteSeeker.
+var _ io.WriteSeeker = (*writeseeker)(nil)
+
+// Write copies p into the buffer at the current position, overwriting any
+// bytes already there and growing the buffer as needed, then advances the
+// position by len(p). It always returns len(p) and a nil error.
+func (m *writeseeker) Write(p []byte) (n int, err error) {
+	end := m.pos + len(p)
+	if end > cap(m.buf) {
+		// Over-allocate by len(p) so a run of similarly sized writes does
+		// not reallocate on every call.
+		grown := make([]byte, len(m.buf), end+len(p))
+		copy(grown, m.buf)
+		m.buf = grown
+	}
+	if end > len(m.buf) {
+		// Also covers writing after a seek past the end: the gap is zero
+		// bytes because the backing array is never reused after shrinking.
+		m.buf = m.buf[:end]
+	}
+	copy(m.buf[m.pos:], p)
+	m.pos = end
+	return len(p), nil
+}
+
+// Seek sets the position for the next Write, interpreting offset according
+// to whence (io.SeekStart, io.SeekCurrent or io.SeekEnd), and returns the
+// new absolute position.
+//
+// It returns an error, leaving the position unchanged, when whence is not
+// one of the three io.Seek* constants, when the resulting position would be
+// negative, or when it does not fit in an int. Seeking past the end is
+// allowed; a following Write zero-fills the gap.
+func (m *writeseeker) Seek(offset int64, whence int) (int64, error) {
+	var base int64
+	switch whence {
+	case io.SeekStart:
+		// base stays 0
+	case io.SeekCurrent:
+		base = int64(m.pos)
+	case io.SeekEnd:
+		base = int64(len(m.buf))
+	default:
+		return 0, fmt.Errorf("invalid whence: %d", whence)
+	}
+	newPos := base + offset
+	if newPos < 0 {
+		return 0, errors.New("negative result pos")
+	}
+	// Guard against truncation on platforms where int is 32 bits wide.
+	if int64(int(newPos)) != newPos {
+		return 0, errors.New("result pos out of range")
+	}
+	m.pos = int(newPos)
+	return newPos, nil
+}
+
+// Reader returns an io.Reader over everything written so far, independent of
+// the current write position. Use it, for example, with io.Copy to move the
+// buffered content into an io.Writer.
+func (m *writeseeker) Reader() io.Reader {
+	return bytes.NewReader(m.buf)
+}
+
+// NewWhisperSTT builds a WhisperSTT that logs through logger, uploads
+// recordings to serverURL and records at sampleRate. The returned value
+// starts with an empty Buffer and is not recording.
+func NewWhisperSTT(logger *slog.Logger, serverURL string, sampleRate beep.SampleRate) *WhisperSTT {
+	stt := new(WhisperSTT)
+	stt.logger = logger
+	stt.ServerURL = serverURL
+	stt.SampleRate = sampleRate
+	stt.Buffer = &bytes.Buffer{}
+	return stt
+}
+
+// StartRecording opens the default microphone at stt.SampleRate and starts a
+// goroutine that captures audio into stt.Buffer.
+//
+// Calling it while a recording is already in progress is a no-op that
+// returns nil, mirroring StopRecording's behaviour when nothing is being
+// recorded. Without this guard a second call would replace stt.streamer and
+// the first stream would never be closed.
+//
+// It returns an error when the microphone cannot be initialised or opened;
+// in that case the receiver's state is left untouched.
+func (stt *WhisperSTT) StartRecording() error {
+	if stt.recording {
+		return nil
+	}
+
+	stream, err := microphoneStream(stt.SampleRate)
+	if err != nil {
+		return fmt.Errorf("failed to init microphone: %w", err)
+	}
+
+	// Start from an empty buffer so bytes left over from an earlier
+	// recording (for example after a failed upload) are not prepended to
+	// this one. Tolerate a WhisperSTT built without NewWhisperSTT.
+	if stt.Buffer == nil {
+		stt.Buffer = new(bytes.Buffer)
+	} else {
+		stt.Buffer.Reset()
+	}
+
+	stt.streamer = stream
+	stt.recording = true
+
+	go stt.capture()
+	return nil
+}
+
+// capture collects audio from stt.streamer, encodes it as mono 16-bit WAV at
+// stt.SampleRate and appends the encoded bytes to stt.Buffer. It is started
+// as a goroutine by StartRecording and reports failures through stt.logger
+// because it has no caller to return them to.
+//
+// NOTE(review): StopRecording reads stt.Buffer right after closing the
+// stream and there is no visible synchronisation with this goroutine, so
+// confirm the upload cannot start before the copy below has finished.
+func (stt *WhisperSTT) capture() {
+	// One format value for both the sample buffer and the encoder, so the
+	// two cannot drift apart.
+	format := beep.Format{
+		SampleRate:  stt.SampleRate,
+		NumChannels: 1,
+		Precision:   2,
+	}
+	sink := beep.NewBuffer(format)
+
+	// Append pulls samples from the streamer until it has none left;
+	// presumably that happens once StopRecording closes the stream (TODO
+	// confirm against the microphone package).
+	sink.Append(stt.streamer)
+
+	// wav.Encode is given a seekable destination, hence writeseeker rather
+	// than writing straight into stt.Buffer.
+	var wavBuf writeseeker
+	if err := wav.Encode(&wavBuf, sink.Streamer(0, sink.Len()), format); err != nil {
+		stt.logger.Error("failed to encode WAV", "error", err)
+		// Do not hand a partially written file on to the upload.
+		return
+	}
+	if _, err := io.Copy(stt.Buffer, wavBuf.Reader()); err != nil {
+		stt.logger.Error("failed to copy WAV into buffer", "error", err)
+	}
+}
+
+// StopRecording closes the microphone stream, POSTs the contents of
+// stt.Buffer to stt.ServerURL as audio/wav and returns the "text" field of
+// the JSON reply.
+//
+// When no recording is in progress it returns "", nil without contacting the
+// server. It returns an error when the request cannot be built or sent, when
+// the server answers with anything other than 200 OK, or when the reply is
+// not valid JSON.
+//
+// NOTE(review): the request goes through http.DefaultClient, which has no
+// timeout, so an unresponsive server blocks this call indefinitely.
+func (stt *WhisperSTT) StopRecording() (string, error) {
+	if !stt.recording {
+		return "", nil
+	}
+	stt.recording = false
+
+	// A failed Close does not invalidate what was already recorded, so
+	// report it and carry on with the upload instead of dropping the audio.
+	if err := stt.streamer.Close(); err != nil {
+		stt.logger.Warn("failed to close microphone stream", "error", err)
+	}
+
+	// Send to Whisper.cpp server
+	req, err := http.NewRequest(http.MethodPost, stt.ServerURL, stt.Buffer)
+	if err != nil {
+		return "", fmt.Errorf("failed to create request: %w", err)
+	}
+	req.Header.Set("Content-Type", "audio/wav")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("transcription request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best effort: include a bounded slice of the body because servers
+		// usually explain the failure there. A read error only costs us
+		// that detail, so it is deliberately ignored.
+		detail, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
+		if detail = bytes.TrimSpace(detail); len(detail) > 0 {
+			return "", fmt.Errorf("unexpected status code: %d: %s", resp.StatusCode, detail)
+		}
+		return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+	}
+
+	var result struct {
+		Text string `json:"text"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return "", fmt.Errorf("failed to decode response: %w", err)
+	}
+
+	return result.Text, nil
+}
+
+// IsRecording reports whether a recording is in progress, i.e. the recording
+// flag set by StartRecording has not yet been cleared by StopRecording.
+func (stt *WhisperSTT) IsRecording() bool {
+	return stt.recording
+}
+
+// microphoneStream initialises the microphone package, opens the default
+// input device as a mono stream at sample rate sr, starts it and returns it.
+//
+// It also installs an interrupt handler that stops and closes the stream,
+// terminates the microphone package and exits the process with status 1.
+//
+// It returns an error when initialisation or opening the device fails.
+func microphoneStream(sr beep.SampleRate) (beep.StreamCloser, error) {
+	if err := microphone.Init(); err != nil {
+		return nil, fmt.Errorf("microphone init failed: %w", err)
+	}
+
+	stream, _, err := microphone.OpenDefaultStream(sr, 1) // 1 channel mono
+	if err != nil {
+		// Best-effort cleanup; the open error is the one worth reporting.
+		microphone.Terminate()
+		return nil, fmt.Errorf("failed to open microphone: %w", err)
+	}
+
+	// Handle OS signals to clean up. Only os.Interrupt is registered:
+	// os.Kill (SIGKILL) can never be delivered to a handler, so asking for
+	// it was a no-op that vet/staticcheck flag.
+	//
+	// NOTE(review): this goroutine has no exit path other than a signal, so
+	// every call leaves one more handler behind; confirm that is acceptable
+	// for repeated recordings.
+	sig := make(chan os.Signal, 1)
+	signal.Notify(sig, os.Interrupt)
+	go func() {
+		<-sig
+		stream.Stop()
+		stream.Close()
+		microphone.Terminate()
+		os.Exit(1)
+	}()
+
+	// NOTE(review): any result of Start is not inspected here; check the
+	// microphone package's signature and handle an error if it returns one.
+	stream.Start()
+	return stream, nil
+}
diff --git a/go.mod b/go.mod
index aa00007..2ed8178 100644
--- a/go.mod
+++ b/go.mod
@@ -4,10 +4,11 @@ go 1.23.2
require (
github.com/BurntSushi/toml v1.4.0
+ github.com/MarkKremer/microphone/v2 v2.0.1
github.com/asg017/sqlite-vec-go-bindings v0.1.6
github.com/gdamore/tcell/v2 v2.7.4
github.com/glebarez/go-sqlite v1.22.0
- github.com/gopxl/beep v1.4.1
+ github.com/gopxl/beep/v2 v2.1.0
github.com/jmoiron/sqlx v1.4.0
github.com/ncruces/go-sqlite3 v0.21.3
github.com/neurosnap/sentences v1.1.2
@@ -16,10 +17,11 @@ require (
require (
github.com/dustin/go-humanize v1.0.1 // indirect
- github.com/ebitengine/oto/v3 v3.1.0 // indirect
+ github.com/ebitengine/oto/v3 v3.2.0 // indirect
github.com/ebitengine/purego v0.7.1 // indirect
github.com/gdamore/encoding v1.0.0 // indirect
github.com/google/uuid v1.6.0 // indirect
+ github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 // indirect
github.com/hajimehoshi/go-mp3 v0.3.4 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
diff --git a/go.sum b/go.sum
index ccac93c..3d9fb54 100644
--- a/go.sum
+++ b/go.sum
@@ -2,14 +2,16 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0=
github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho=
+github.com/MarkKremer/microphone/v2 v2.0.1 h1:PWI0MgBu3Nd9CSxdnIjwol8qshstNfywERIMOLD03Zk=
+github.com/MarkKremer/microphone/v2 v2.0.1/go.mod h1:IdM74GKdsZAWVbkgX8xLGAdd4ytzBt7uk5F0brfTZRM=
github.com/asg017/sqlite-vec-go-bindings v0.1.6 h1:Nx0jAzyS38XpkKznJ9xQjFXz2X9tI7KqjwVxV8RNoww=
github.com/asg017/sqlite-vec-go-bindings v0.1.6/go.mod h1:A8+cTt/nKFsYCQF6OgzSNpKZrzNo5gQsXBTfsXHXY0Q=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
-github.com/ebitengine/oto/v3 v3.1.0 h1:9tChG6rizyeR2w3vsygTTTVVJ9QMMyu00m2yBOCch6U=
-github.com/ebitengine/oto/v3 v3.1.0/go.mod h1:IK1QTnlfZK2GIB6ziyECm433hAdTaPpOsGMLhEyEGTg=
+github.com/ebitengine/oto/v3 v3.2.0 h1:FuggTJTSI3/3hEYwZEIN0CZVXYT29ZOdCu+z/f4QjTw=
+github.com/ebitengine/oto/v3 v3.2.0/go.mod h1:dOKXShvy1EQbIXhXPFcKLargdnFqH0RjptecvyAxhyw=
github.com/ebitengine/purego v0.7.1 h1:6/55d26lG3o9VCZX8lping+bZcmShseiqlh2bnUDiPA=
github.com/ebitengine/purego v0.7.1/go.mod h1:ah1In8AOtksoNK6yk5z1HTJeUkC1Ez4Wk2idgGslMwQ=
github.com/gdamore/encoding v1.0.0 h1:+7OoQ1Bc6eTm5niUzBa0Ctsh6JbMW6Ra+YNuAtDBdko=
@@ -24,8 +26,10 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu
github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/gopxl/beep v1.4.1 h1:WqNs9RsDAhG9M3khMyc1FaVY50dTdxG/6S6a3qsUHqE=
-github.com/gopxl/beep v1.4.1/go.mod h1:A1dmiUkuY8kxsvcNJNUBIEcchmiP6eUyCHSxpXl0YO0=
+github.com/gopxl/beep/v2 v2.1.0 h1:Jv95iHw3aNWoAa/J78YyXvOvMHH2ZGeAYD5ug8tVt8c=
+github.com/gopxl/beep/v2 v2.1.0/go.mod h1:sQvj2oSsu8fmmDWH3t0DzIe0OZzTW6/TJEHW4Ku+22o=
+github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5 h1:5AlozfqaVjGYGhms2OsdUyfdJME76E6rx5MdGpjzZpc=
+github.com/gordonklaus/portaudio v0.0.0-20230709114228-aafa478834f5/go.mod h1:WY8R6YKlI2ZI3UyzFk7P6yGSuS+hFwNtEzrexRyD7Es=
github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
github.com/hajimehoshi/oto/v2 v2.3.1/go.mod h1:seWLbgHH7AyUMYKfKYT9pg7PhUu9/SisyJvNTT+ASQo=
@@ -47,6 +51,8 @@ github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt
github.com/ncruces/julianday v1.0.0/go.mod h1:Dusn2KvZrrovOMJuOt0TNXL6tB7U2E8kvza5fFc9G7g=
github.com/neurosnap/sentences v1.1.2 h1:iphYOzx/XckXeBiLIUBkPu2EKMJ+6jDbz/sLJZ7ZoUw=
github.com/neurosnap/sentences v1.1.2/go.mod h1:/pwU4E9XNL21ygMIkOIllv/SMy2ujHwpf8GQPu1YPbQ=
+github.com/orcaman/writerseeker v0.0.0-20200621085525-1d3f536ff85e h1:s2RNOM/IGdY0Y6qfTeUKhDawdHDpK9RGBdx80qN4Ttw=
+github.com/orcaman/writerseeker v0.0.0-20200621085525-1d3f536ff85e/go.mod h1:nBdnFKj15wFbf94Rwfq4m30eAcyY9V/IyKAGQFtqkW0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -59,8 +65,8 @@ github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJ
github.com/rivo/uniseg v0.4.3/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
-github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
-github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tetratelabs/wazero v1.8.2 h1:yIgLR/b2bN31bjxwXHD8a3d+BogigR952csSDdLYEv4=
github.com/tetratelabs/wazero v1.8.2/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=