1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
package extra
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"log/slog"
"net/http"
"os"
"os/signal"
"github.com/MarkKremer/microphone/v2"
"github.com/gopxl/beep/v2"
"github.com/gopxl/beep/v2/wav"
)
// STT is the speech-to-text recorder contract: start a recording, stop it and
// obtain the result, and query whether a recording is active.
// *WhisperSTT in this file provides all three methods.
type STT interface {
// StartRecording begins a recording session. A non-nil error means the
// session could not be started.
StartRecording() error
// StopRecording ends the session and returns a string result plus an error.
// In the WhisperSTT implementation the string is the transcribed text.
StopRecording() (string, error)
// IsRecording reports whether a recording session is currently active.
IsRecording() bool
}
// WhisperSTT records audio from the microphone and sends it to an HTTP
// transcription server. Its methods match the STT interface.
type WhisperSTT struct {
// logger receives error reports from the capture goroutine.
logger *slog.Logger
// ServerURL is the endpoint StopRecording POSTs the recorded audio to.
ServerURL string
// SampleRate is used both to open the microphone and as the WAV sample rate.
SampleRate beep.SampleRate
// Buffer receives the WAV-encoded recording from capture and is used as the
// HTTP request body by StopRecording.
Buffer *bytes.Buffer
// streamer is the microphone stream; set by StartRecording, closed by
// StopRecording.
streamer beep.StreamCloser
// recording is true between StartRecording and StopRecording.
recording bool
}
// writeseeker is an in-memory byte buffer implementing io.Writer and
// io.Seeker, so an encoder that has to seek back and patch bytes it already
// wrote can target memory instead of a file.
type writeseeker struct {
	buf []byte // all bytes written so far
	pos int    // offset at which the next Write lands
}

// Write copies p into the buffer at the current position, overwriting existing
// bytes and growing the buffer when the write runs past its end, then advances
// the position by len(p). If Seek moved the position beyond the end, the gap
// is zero-filled. It always returns len(p) and a nil error.
func (ws *writeseeker) Write(p []byte) (n int, err error) {
	end := ws.pos + len(p)
	if end > len(ws.buf) {
		// append grows capacity geometrically, so a long run of small writes
		// does not reallocate and copy on nearly every call.
		ws.buf = append(ws.buf, make([]byte, end-len(ws.buf))...)
	}
	copy(ws.buf[ws.pos:], p)
	ws.pos = end
	return len(p), nil
}

// Seek sets the position for the next Write, interpreting offset relative to
// the start, the current position, or the end of the buffer according to
// whence (io.SeekStart, io.SeekCurrent, io.SeekEnd). It returns the new
// absolute position. An unknown whence or a resulting negative position is an
// error and leaves the position unchanged.
func (ws *writeseeker) Seek(offset int64, whence int) (int64, error) {
	var base int64
	switch whence {
	case io.SeekStart:
		base = 0
	case io.SeekCurrent:
		base = int64(ws.pos)
	case io.SeekEnd:
		base = int64(len(ws.buf))
	default:
		return 0, errors.New("invalid whence")
	}
	// Do the arithmetic in int64 so a large offset is not truncated before
	// the sign check on platforms where int is 32 bits.
	newPos := base + offset
	if newPos < 0 {
		return 0, errors.New("negative result pos")
	}
	if int64(int(newPos)) != newPos {
		return 0, errors.New("result pos overflows int")
	}
	ws.pos = int(newPos)
	return newPos, nil
}

// Reader returns an io.Reader over the entire buffer, starting at offset 0
// regardless of the current write position. Use it, for example, with io.Copy
// to copy the buffered content to an io.Writer. The reader shares the
// underlying slice rather than copying it.
func (ws *writeseeker) Reader() io.Reader {
	return bytes.NewReader(ws.buf)
}
// NewWhisperSTT builds a WhisperSTT that logs through logger, posts recordings
// to serverURL, and captures audio at sampleRate. The returned value starts
// with a fresh, empty Buffer and is not recording.
func NewWhisperSTT(logger *slog.Logger, serverURL string, sampleRate beep.SampleRate) *WhisperSTT {
	stt := new(WhisperSTT)
	stt.logger = logger
	stt.ServerURL = serverURL
	stt.SampleRate = sampleRate
	stt.Buffer = &bytes.Buffer{}
	return stt
}
// StartRecording opens the microphone and starts capturing audio in a
// background goroutine.
//
// It returns an error if a recording is already in progress (starting a second
// one would overwrite stt.streamer and leak the first stream) or if the
// microphone cannot be opened. On success the audio buffer is emptied first,
// so bytes left over from an earlier session are not sent with this one.
func (stt *WhisperSTT) StartRecording() error {
	if stt.recording {
		return errors.New("recording already in progress")
	}
	stream, err := microphoneStream(stt.SampleRate)
	if err != nil {
		return fmt.Errorf("failed to init microphone: %w", err)
	}
	// capture appends to Buffer, so make sure it exists and starts empty.
	if stt.Buffer == nil {
		stt.Buffer = new(bytes.Buffer)
	} else {
		stt.Buffer.Reset()
	}
	stt.streamer = stream
	stt.recording = true
	go stt.capture()
	return nil
}
// capture drains the microphone streamer into an in-memory beep buffer,
// encodes the result as mono WAV, and appends the encoded bytes to stt.Buffer.
// StartRecording runs it in its own goroutine. Failures are logged, not
// returned; when encoding fails nothing is appended to stt.Buffer.
//
// NOTE(review): nothing signals callers when this goroutine has finished
// writing stt.Buffer — confirm how StopRecording is expected to synchronise
// with it.
func (stt *WhisperSTT) capture() {
	// One format value for both the sample buffer and the WAV encoder so the
	// two cannot drift apart.
	format := beep.Format{
		SampleRate:  stt.SampleRate,
		NumChannels: 1,
		Precision:   2,
	}
	sink := beep.NewBuffer(format)
	// Append pulls samples from the streamer until it is exhausted.
	sink.Append(stt.streamer)

	// wav.Encode needs an io.WriteSeeker, which bytes.Buffer is not, so encode
	// into a writeseeker first and copy the finished bytes afterwards.
	var wavBuf writeseeker
	if err := wav.Encode(&wavBuf, sink.Streamer(0, sink.Len()), format); err != nil {
		stt.logger.Error("failed to encode WAV", "error", err)
		return // do not hand a partial or corrupt WAV to the sender
	}
	if _, err := io.Copy(stt.Buffer, wavBuf.Reader()); err != nil {
		stt.logger.Error("failed to copy WAV data", "error", err)
	}
}
// StopRecording ends the current recording and returns its transcription.
//
// It closes the microphone streamer, POSTs the contents of stt.Buffer to
// stt.ServerURL with Content-Type audio/wav, and returns the "text" field of
// the JSON response. When no recording is in progress it returns "" and a nil
// error. A request failure, a non-200 status, or an undecodable response body
// is returned as an error.
//
// NOTE(review): stt.Buffer is filled by the capture goroutine, and nothing
// here waits for that goroutine after Close — the request may be built from
// an empty or partially written buffer. Confirm and add a completion signal.
// NOTE(review): http.DefaultClient has no timeout, so a stalled server blocks
// this call indefinitely.
func (stt *WhisperSTT) StopRecording() (string, error) {
	if !stt.recording {
		return "", nil
	}
	// A Close failure should not discard audio that was already captured, so
	// it is reported and the transcription still proceeds.
	if err := stt.streamer.Close(); err != nil && stt.logger != nil {
		stt.logger.Warn("failed to close microphone stream", "error", err)
	}
	stt.recording = false

	// Send to Whisper.cpp server
	req, err := http.NewRequest(http.MethodPost, stt.ServerURL, stt.Buffer)
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "audio/wav")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("transcription request failed: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	var result struct {
		Text string `json:"text"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return "", fmt.Errorf("failed to decode response: %w", err)
	}
	return result.Text, nil
}
// IsRecording reports whether a recording session is active, i.e. whether
// StartRecording has succeeded without a matching StopRecording since.
func (stt *WhisperSTT) IsRecording() bool {
	active := stt.recording
	return active
}
// microphoneStream initialises the microphone library, opens the default
// input device as a single-channel stream at sample rate sr, starts it, and
// returns it. The caller is responsible for closing the returned stream.
//
// It also installs an interrupt handler that stops and closes the stream,
// terminates the microphone library, and exits the process with status 1.
//
// NOTE(review): a new handler goroutine is registered on every call and is
// never removed with signal.Stop; SIGTERM is not handled.
func microphoneStream(sr beep.SampleRate) (beep.StreamCloser, error) {
	if err := microphone.Init(); err != nil {
		return nil, fmt.Errorf("microphone init failed: %w", err)
	}
	stream, _, err := microphone.OpenDefaultStream(sr, 1) // 1 channel mono
	if err != nil {
		microphone.Terminate()
		return nil, fmt.Errorf("failed to open microphone: %w", err)
	}

	// Handle OS signals to clean up. Only os.Interrupt is registered: os.Kill
	// (SIGKILL) can never be delivered to a handler, so listing it had no
	// effect.
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, os.Interrupt)
	go func() {
		<-sig
		stream.Stop()
		stream.Close()
		microphone.Terminate()
		os.Exit(1)
	}()

	stream.Start()
	return stream, nil
}
|