summaryrefslogtreecommitdiff
path: root/extra/whisper_server.go
blob: 7532f4a98762a3d02e3845f7a27bc393343ce86b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
//go:build extra
// +build extra

package extra

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"log/slog"
	"mime/multipart"
	"net/http"
	"os/exec"
	"strings"
	"sync"
)

// WhisperServer captures microphone audio through an ffmpeg subprocess and
// posts the recording to the HTTP endpoint at ServerURL, returning the text
// of the response.
//
// It embeds sync.Mutex values and therefore must not be copied after first
// use; always share it by pointer.
type WhisperServer struct {
	// logger receives error reports from the capture goroutine and the
	// upload path.
	logger      *slog.Logger
	// ServerURL is the address the multipart upload is POSTed to.
	ServerURL   string
	// SampleRate is passed to ffmpeg as the "-ar" value for the capture.
	SampleRate  int
	// AudioBuffer accumulates the raw bytes read from ffmpeg's stdout while
	// a recording is in progress; it is reset when a recording starts and
	// again after the data has been copied into the upload body.
	AudioBuffer *bytes.Buffer
	recording   bool          // protected by mu
	mu          sync.Mutex    // protects recording & AudioBuffer
	cmd         *exec.Cmd     // protected by cmdMu
	stopCh      chan struct{} // protected by cmdMu
	cmdMu       sync.Mutex    // protects cmd and stopCh
}

// StartRecording launches ffmpeg to capture mono signed 16-bit little-endian
// PCM from the default ALSA device at stt.SampleRate and starts a goroutine
// that appends ffmpeg's stdout to stt.AudioBuffer.
//
// Calling it while a recording is already in progress is a no-op that returns
// nil. It returns an error if AudioBuffer is nil, if the stdout pipe cannot be
// created, or if ffmpeg fails to start; in all of those cases no recorder
// state is modified.
func (stt *WhisperServer) StartRecording() error {
	stt.mu.Lock()
	defer stt.mu.Unlock()
	if stt.recording {
		return nil
	}
	// Fail before spawning ffmpeg rather than panicking on Reset afterwards.
	if stt.AudioBuffer == nil {
		return errors.New("unexpected nil AudioBuffer")
	}
	// Build ffmpeg command for microphone capture
	args := []string{
		"-f", "alsa",
		"-i", "default",
		"-acodec", "pcm_s16le",
		"-ar", fmt.Sprint(stt.SampleRate),
		"-ac", "1",
		"-f", "s16le",
		"-",
	}
	cmd := exec.Command("ffmpeg", args...)
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return fmt.Errorf("failed to get stdout pipe: %w", err)
	}
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("failed to start ffmpeg: %w", err)
	}
	// Publish the process and its stop channel only once ffmpeg is actually
	// running, so a failed start never leaves stale state behind.
	stopCh := make(chan struct{})
	stt.cmdMu.Lock()
	stt.cmd = cmd
	stt.stopCh = stopCh
	stt.cmdMu.Unlock()
	stt.recording = true
	stt.AudioBuffer.Reset()
	// Read PCM data in a goroutine. It uses the local stopCh instead of the
	// struct field: the field is guarded by cmdMu and is replaced by the next
	// StartRecording, so reading it here would be a data race and could make
	// this reader follow the wrong recording's channel.
	go func() {
		buf := make([]byte, 4096)
		for {
			select {
			case <-stopCh:
				return
			default:
			}
			n, err := stdout.Read(buf)
			if n > 0 {
				stt.mu.Lock()
				// StopRecording closes stopCh while holding mu. If the
				// channel is closed by the time we get the lock, this chunk
				// belongs to a finished recording and must not be appended
				// to a buffer that has since been reset.
				select {
				case <-stopCh:
					stt.mu.Unlock()
					return
				default:
				}
				stt.AudioBuffer.Write(buf[:n])
				stt.mu.Unlock()
			}
			if err != nil {
				if !errors.Is(err, io.EOF) {
					stt.logger.Error("recording read error", "error", err)
				}
				return
			}
		}
	}()
	return nil
}

// StopRecording ends the current recording, uploads the captured audio to
// stt.ServerURL as a multipart form (a "file" part named recording.wav plus
// response_format=text) and returns the cleaned-up response text.
//
// It returns an error if no recording is in progress, if AudioBuffer is nil,
// if the multipart body cannot be built, if the HTTP request fails, or if the
// server answers with a non-2xx status. stt.mu is held for the whole call,
// including the upload, so StartRecording blocks until it returns.
func (stt *WhisperServer) StopRecording() (string, error) {
	stt.mu.Lock()
	defer stt.mu.Unlock()
	if !stt.recording {
		return "", errors.New("not recording")
	}
	stt.recording = false
	// Stop ffmpeg and signal the reader goroutine.
	stt.cmdMu.Lock()
	if stt.cmd != nil && stt.cmd.Process != nil {
		// Best effort: Kill fails if the process already exited, and Wait
		// reports the kill signal as an error; neither is actionable here.
		_ = stt.cmd.Process.Kill()
		_ = stt.cmd.Wait()
	}
	close(stt.stopCh)
	stt.cmdMu.Unlock()
	if stt.AudioBuffer == nil {
		err := errors.New("unexpected nil AudioBuffer")
		stt.logger.Error(err.Error())
		return "", err
	}
	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)
	// Add audio file part
	part, err := writer.CreateFormFile("file", "recording.wav")
	if err != nil {
		stt.logger.Error("fn: StopRecording", "error", err)
		return "", err
	}
	// Stream directly to multipart writer: WAV header + raw data
	dataSize := stt.AudioBuffer.Len()
	stt.writeWavHeader(part, dataSize)
	if _, err := io.Copy(part, stt.AudioBuffer); err != nil {
		stt.logger.Error("fn: StopRecording", "error", err)
		return "", err
	}
	// Reset buffer for next recording
	stt.AudioBuffer.Reset()
	// Add response format field
	if err := writer.WriteField("response_format", "text"); err != nil {
		stt.logger.Error("fn: StopRecording", "error", err)
		return "", err
	}
	// Close writes the terminating boundary; its own error must be the one
	// reported, otherwise a failed close would return ("", nil).
	if err := writer.Close(); err != nil {
		stt.logger.Error("fn: StopRecording", "error", err)
		return "", err
	}
	// Send request
	resp, err := http.Post(stt.ServerURL, writer.FormDataContentType(), body) //nolint:noctx
	if err != nil {
		stt.logger.Error("fn: StopRecording", "error", err)
		return "", err
	}
	defer resp.Body.Close()
	responseTextBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		stt.logger.Error("fn: StopRecording", "error", err)
		return "", err
	}
	// An error page must not be handed back as if it were a transcription.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		err := fmt.Errorf("whisper server returned %s: %s",
			resp.Status, strings.TrimSpace(string(responseTextBytes)))
		stt.logger.Error("fn: StopRecording", "error", err)
		return "", err
	}
	resptext := strings.TrimRight(string(responseTextBytes), "\n")
	// in case there are special tokens like [_BEG_]
	resptext = specialRE.ReplaceAllString(resptext, "")
	return strings.TrimSpace(strings.ReplaceAll(resptext, "\n ", "\n")), nil
}