Add STT and TTS clients
This commit is contained in:
171
internal/stt/stt.go
Normal file
171
internal/stt/stt.go
Normal file
@@ -0,0 +1,171 @@
|
||||
// Package stt provides a client for whisper.cpp speech-to-text.
|
||||
package stt
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"log/slog"
|
||||
)
|
||||
|
||||
// Config holds the configuration for a STT client.
type Config struct {
	// URL is the base URL of the whisper-server, for example
	// "http://localhost:8178". It is required.
	URL string `yaml:"url"`
	// Timeout is the maximum duration for a transcription request, written
	// in time.ParseDuration syntax (for example "45s"). It is optional; when
	// set it must be strictly positive.
	Timeout string `yaml:"timeout"`
}

// Validate checks that required configuration values are present and valid.
//
// It returns an error when URL is empty, or when Timeout is set but is not a
// parseable, strictly positive duration. Zero and negative timeouts are
// rejected because the timeout is applied as a request deadline: such a
// deadline has already expired when the request starts, so every
// transcription would fail.
func (cfg Config) Validate() error {
	if cfg.URL == "" {
		return fmt.Errorf("missing URL")
	}
	// An empty timeout is valid: the caller falls back to its default.
	if cfg.Timeout == "" {
		return nil
	}
	d, err := time.ParseDuration(cfg.Timeout)
	if err != nil {
		return fmt.Errorf("invalid timeout: %w", err)
	}
	if d <= 0 {
		return fmt.Errorf("invalid timeout: %q must be positive", cfg.Timeout)
	}
	return nil
}
|
||||
|
||||
// Client wraps an HTTP client for whisper-server requests.
|
||||
type Client struct {
|
||||
client *http.Client
|
||||
log *slog.Logger
|
||||
url string
|
||||
timeout time.Duration
|
||||
}
|
||||
|
||||
// NewClient creates a new STT client with the provided configuration.
|
||||
func NewClient(cfg Config, log *slog.Logger) (*Client, error) {
|
||||
if err := cfg.Validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid config: %v", err)
|
||||
}
|
||||
|
||||
var timeout = 30 * time.Second
|
||||
if cfg.Timeout != "" {
|
||||
d, err := time.ParseDuration(cfg.Timeout)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse timeout: %v", err)
|
||||
}
|
||||
timeout = d
|
||||
}
|
||||
|
||||
return &Client{
|
||||
client: &http.Client{Timeout: timeout},
|
||||
log: log,
|
||||
url: cfg.URL,
|
||||
timeout: timeout,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// JSON shapes of the whisper.cpp verbose_json response.
type (
	// Output is the top-level JSON document returned by whisper.cpp when
	// the verbose_json response format is requested. Every field except
	// Text is omitted from encoded JSON when it holds its zero value.
	Output struct {
		Task                        string    `json:"task,omitempty"`
		Language                    string    `json:"language,omitempty"`
		Duration                    float64   `json:"duration,omitempty"`
		Text                        string    `json:"text"`
		DetectedLanguage            string    `json:"detected_language,omitempty"`
		DetectedLanguageProbability float64   `json:"detected_language_probability,omitempty"`
		Segments                    []Segment `json:"segments,omitempty"`
	}

	// Segment is one entry of Output.Segments: a piece of transcribed text
	// together with its timing and confidence information.
	Segment struct {
		ID           int     `json:"id"`
		Start        float64 `json:"start"`
		End          float64 `json:"end"`
		Text         string  `json:"text"`
		Temperature  float64 `json:"temperature,omitempty"`
		AvgLogProb   float64 `json:"avg_logprob,omitempty"`
		NoSpeechProb float64 `json:"no_speech_prob,omitempty"`
		Tokens       []int   `json:"tokens,omitempty"`
	}
)
|
||||
|
||||
// Transcribe sends audio to whisper.cpp and returns the transcription output.
|
||||
func (c *Client) Transcribe(ctx context.Context, audio []byte) (*Output, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, c.timeout)
|
||||
defer cancel()
|
||||
|
||||
// Build multipart form.
|
||||
var buf bytes.Buffer
|
||||
w := multipart.NewWriter(&buf)
|
||||
|
||||
fw, err := w.CreateFormFile("file", "audio.webm")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create form file: %w", err)
|
||||
}
|
||||
if _, err := fw.Write(audio); err != nil {
|
||||
return nil, fmt.Errorf("write audio: %w", err)
|
||||
}
|
||||
|
||||
// Request verbose JSON response format to get full output including
|
||||
// detected_language.
|
||||
if err := w.WriteField("response_format", "verbose_json"); err != nil {
|
||||
return nil, fmt.Errorf("write response_format: %w", err)
|
||||
}
|
||||
|
||||
if err := w.Close(); err != nil {
|
||||
return nil, fmt.Errorf("close multipart: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(
|
||||
ctx,
|
||||
http.MethodPost,
|
||||
c.url+"/inference",
|
||||
&buf,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", w.FormDataContentType())
|
||||
|
||||
res, err := c.client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("send request: %w", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != http.StatusOK {
|
||||
body, err := io.ReadAll(res.Body)
|
||||
if err != nil {
|
||||
c.log.ErrorContext(
|
||||
ctx,
|
||||
"failed to read response body",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf(
|
||||
"whisper error %d: %s", res.StatusCode, body,
|
||||
)
|
||||
}
|
||||
|
||||
body, err := io.ReadAll(res.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read response: %w", err)
|
||||
}
|
||||
|
||||
var output Output
|
||||
if err := json.Unmarshal(body, &output); err != nil {
|
||||
return nil, fmt.Errorf("parse response: %w", err)
|
||||
}
|
||||
c.log.DebugContext(ctx, "stt response",
|
||||
slog.String("text", output.Text),
|
||||
slog.String("language", output.Language),
|
||||
slog.String("detected_language", output.DetectedLanguage),
|
||||
slog.Float64("duration", output.Duration),
|
||||
)
|
||||
|
||||
return &output, nil
|
||||
}
|
||||
106
internal/stt/stt_test.go
Normal file
106
internal/stt/stt_test.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package stt
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestConfigValidate(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
cfg Config
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "empty config",
|
||||
cfg: Config{},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "missing URL",
|
||||
cfg: Config{
|
||||
Timeout: "30s",
|
||||
},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "invalid timeout",
|
||||
cfg: Config{
|
||||
URL: "http://localhost:8178",
|
||||
Timeout: "not-a-duration",
|
||||
},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "valid minimal config",
|
||||
cfg: Config{
|
||||
URL: "http://localhost:8178",
|
||||
},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "valid config with timeout",
|
||||
cfg: Config{
|
||||
URL: "http://localhost:8178",
|
||||
Timeout: "60s",
|
||||
},
|
||||
wantErr: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := tt.cfg.Validate()
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf(
|
||||
"Validate() error = %v, wantErr %v",
|
||||
err, tt.wantErr,
|
||||
)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewClient(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
cfg Config
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "invalid config",
|
||||
cfg: Config{},
|
||||
wantErr: true,
|
||||
},
|
||||
{
|
||||
name: "valid config without timeout",
|
||||
cfg: Config{
|
||||
URL: "http://localhost:8178",
|
||||
},
|
||||
wantErr: false,
|
||||
},
|
||||
{
|
||||
name: "valid config with timeout",
|
||||
cfg: Config{
|
||||
URL: "http://localhost:8178",
|
||||
Timeout: "45s",
|
||||
},
|
||||
wantErr: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
client, err := NewClient(tt.cfg, nil)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf(
|
||||
"NewClient() error = %v, wantErr %v",
|
||||
err, tt.wantErr,
|
||||
)
|
||||
return
|
||||
}
|
||||
if !tt.wantErr && client == nil {
|
||||
t.Error("NewClient() returned nil client")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user