// Package service orchestrates the odidere voice assistant server.
|
|
// It coordinates the HTTP server, STT/LLM/TTS clients, and handles
|
|
// graceful shutdown.
|
|
package service
|
|
|
|
import (
|
|
"context"
|
|
"embed"
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"html/template"
|
|
"io/fs"
|
|
"log/slog"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"runtime/debug"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"code.chimeric.al/chimerical/odidere/internal/config"
|
|
"code.chimeric.al/chimerical/odidere/internal/llm"
|
|
"code.chimeric.al/chimerical/odidere/internal/service/templates"
|
|
"code.chimeric.al/chimerical/odidere/internal/stt"
|
|
"code.chimeric.al/chimerical/odidere/internal/tool"
|
|
"code.chimeric.al/chimerical/odidere/internal/tts"
|
|
|
|
"github.com/google/uuid"
|
|
openai "github.com/sashabaranov/go-openai"
|
|
"golang.org/x/sync/semaphore"
|
|
)
|
|
|
|
// static holds the embedded static assets served under /static/.
//
//go:embed all:static/*
var static embed.FS
|
|
|
|
// Service is the main application coordinator.
// It owns the HTTP server and all processing clients.
type Service struct {
	cfg    *config.Config      // Application configuration.
	llm    *llm.Client         // Chat-completion (LLM) client.
	log    *slog.Logger        // Base structured logger.
	mux    *http.ServeMux      // Route multiplexer behind ServeHTTP.
	sem    *semaphore.Weighted // Bounds concurrent voice processing (cfg.Concurrency).
	server *http.Server        // HTTP server; started and stopped by Run.
	stt    *stt.Client         // Speech-to-text client.
	tmpl   *template.Template  // Parsed HTML templates.
	tools  *tool.Registry      // Tool registry exposed to the LLM.
	tts    *tts.Client         // Text-to-speech client.
}
|
|
|
|
// New creates a Service from the provided configuration.
|
|
// It initializes all clients and the HTTP server.
|
|
func New(cfg *config.Config, log *slog.Logger) (*Service, error) {
|
|
var svc = &Service{
|
|
cfg: cfg,
|
|
log: log,
|
|
mux: http.NewServeMux(),
|
|
sem: semaphore.NewWeighted(int64(cfg.Concurrency)),
|
|
}
|
|
|
|
// Setup tool registry.
|
|
registry, err := tool.NewRegistry(cfg.Tools)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("load tools: %v", err)
|
|
}
|
|
svc.tools = registry
|
|
|
|
// Create STT client.
|
|
sttClient, err := stt.NewClient(cfg.STT, log)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create STT client: %v", err)
|
|
}
|
|
svc.stt = sttClient
|
|
|
|
// Create LLM client.
|
|
llmClient, err := llm.NewClient(cfg.LLM, registry, log)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create LLM client: %v", err)
|
|
}
|
|
svc.llm = llmClient
|
|
|
|
// Create TTS client.
|
|
ttsClient, err := tts.NewClient(cfg.TTS, log)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create TTS client: %v", err)
|
|
}
|
|
svc.tts = ttsClient
|
|
|
|
// Parse templates.
|
|
tmpl, err := templates.Parse()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parse templates: %v", err)
|
|
}
|
|
svc.tmpl = tmpl
|
|
|
|
// Setup static file server.
|
|
staticFS, err := fs.Sub(static, "static")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("setup static fs: %v", err)
|
|
}
|
|
|
|
// Register routes.
|
|
svc.mux.HandleFunc("GET /", svc.home)
|
|
svc.mux.HandleFunc("GET /status", svc.status)
|
|
svc.mux.Handle(
|
|
"GET /static/",
|
|
http.StripPrefix(
|
|
"/static/", http.FileServer(http.FS(staticFS)),
|
|
),
|
|
)
|
|
svc.mux.HandleFunc("POST /v1/chat/voice", svc.voice)
|
|
svc.mux.HandleFunc("POST /v1/chat/voice/stream", svc.voiceStream)
|
|
svc.mux.HandleFunc("GET /v1/voices", svc.voices)
|
|
svc.mux.HandleFunc("GET /v1/models", svc.models)
|
|
|
|
svc.server = &http.Server{
|
|
Addr: cfg.Address,
|
|
Handler: svc,
|
|
}
|
|
|
|
return svc, nil
|
|
}
|
|
|
|
// ServeHTTP implements http.Handler. It logs requests, assigns a UUID,
|
|
// sets context values, handles panics, and delegates to the mux.
|
|
func (svc *Service) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
start = time.Now()
|
|
id = uuid.NewString()
|
|
ip = func() string {
|
|
if ip := r.Header.Get("X-Forwarded-For"); ip != "" {
|
|
if idx := strings.Index(ip, ","); idx != -1 {
|
|
return strings.TrimSpace(ip[:idx])
|
|
}
|
|
return ip
|
|
}
|
|
return r.RemoteAddr
|
|
}()
|
|
log = svc.log.With(slog.Group(
|
|
"request",
|
|
slog.String("id", id),
|
|
slog.String("ip", ip),
|
|
slog.String("method", r.Method),
|
|
slog.String("path", r.URL.Path),
|
|
))
|
|
)
|
|
|
|
// Log completion time.
|
|
defer func() {
|
|
log.InfoContext(
|
|
r.Context(),
|
|
"completed",
|
|
slog.Duration("duration", time.Since(start)),
|
|
)
|
|
}()
|
|
|
|
// Panic recovery.
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
log.ErrorContext(
|
|
r.Context(),
|
|
"panic recovered",
|
|
slog.Any("error", err),
|
|
slog.String("stack", string(debug.Stack())),
|
|
)
|
|
http.Error(
|
|
w,
|
|
http.StatusText(
|
|
http.StatusInternalServerError,
|
|
),
|
|
http.StatusInternalServerError,
|
|
)
|
|
}
|
|
}()
|
|
|
|
// Enrich context with request-scoped values.
|
|
ctx := r.Context()
|
|
ctx = context.WithValue(ctx, "log", log)
|
|
ctx = context.WithValue(ctx, "id", id)
|
|
ctx = context.WithValue(ctx, "ip", ip)
|
|
r = r.WithContext(ctx)
|
|
|
|
log.InfoContext(ctx, "handling")
|
|
|
|
// Pass the request on to the multiplexer.
|
|
svc.mux.ServeHTTP(w, r)
|
|
}
|
|
|
|
// Run starts the service and blocks until shutdown.
|
|
// Shutdown is triggered by SIGINT or SIGTERM.
|
|
func (svc *Service) Run(ctx context.Context) error {
|
|
svc.log.Info(
|
|
"starting odidere",
|
|
slog.Int("concurrency", svc.cfg.Concurrency),
|
|
slog.Group(
|
|
"llm",
|
|
slog.String("url", svc.cfg.LLM.URL),
|
|
slog.String("model", svc.cfg.LLM.Model),
|
|
),
|
|
slog.Group(
|
|
"server",
|
|
slog.String("address", svc.cfg.Address),
|
|
),
|
|
slog.Group(
|
|
"stt",
|
|
slog.String("url", svc.cfg.STT.URL),
|
|
),
|
|
slog.Group(
|
|
"tools",
|
|
slog.Int("count", len(svc.tools.List())),
|
|
slog.Any(
|
|
"names",
|
|
strings.Join(svc.tools.List(), ","),
|
|
),
|
|
),
|
|
slog.Group(
|
|
"tts",
|
|
slog.String("url", svc.cfg.TTS.URL),
|
|
slog.String("default_voice", svc.cfg.TTS.Voice),
|
|
),
|
|
slog.Any("shutdown_timeout", svc.cfg.GetShutdownTimeout()),
|
|
)
|
|
|
|
// Setup signal handling for graceful shutdown.
|
|
ctx, cancel := signal.NotifyContext(
|
|
ctx,
|
|
os.Interrupt, syscall.SIGTERM,
|
|
)
|
|
defer cancel()
|
|
|
|
// Start HTTP server in background.
|
|
var errs = make(chan error, 1)
|
|
go func() {
|
|
svc.log.Info(
|
|
"HTTP server listening",
|
|
slog.String("address", svc.cfg.Address),
|
|
)
|
|
if err := svc.server.ListenAndServe(); err != nil &&
|
|
err != http.ErrServerClosed {
|
|
errs <- err
|
|
}
|
|
close(errs)
|
|
}()
|
|
|
|
// Wait for shutdown signal or server error.
|
|
select {
|
|
case <-ctx.Done():
|
|
svc.log.Info("shutdown signal received")
|
|
case err := <-errs:
|
|
if err != nil {
|
|
return fmt.Errorf("server error: %w", err)
|
|
}
|
|
}
|
|
|
|
// Graceful shutdown with timeout.
|
|
shutdownCtx, shutdownCancel := context.WithTimeout(
|
|
context.Background(),
|
|
svc.cfg.GetShutdownTimeout(),
|
|
)
|
|
defer shutdownCancel()
|
|
|
|
svc.log.Info("shutting down HTTP server")
|
|
if err := svc.server.Shutdown(shutdownCtx); err != nil {
|
|
svc.log.Warn(
|
|
"shutdown timeout reached",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
|
|
svc.log.Info("terminating")
|
|
return nil
|
|
}
|
|
|
|
func (svc *Service) home(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
if r.URL.Path != "/" {
|
|
http.NotFound(w, r)
|
|
return
|
|
}
|
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
|
if err := svc.tmpl.ExecuteTemplate(
|
|
w, "index.gohtml", nil,
|
|
); err != nil {
|
|
log.ErrorContext(
|
|
ctx, "template error", slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
http.StatusText(http.StatusInternalServerError),
|
|
http.StatusInternalServerError,
|
|
)
|
|
}
|
|
}
|
|
|
|
// status returns server status.
// It answers 200 OK with an empty body, suitable as a liveness probe.
func (svc *Service) status(w http.ResponseWriter, r *http.Request) {
	w.WriteHeader(http.StatusOK)
}
|
|
|
|
// Request is the incoming request format for chat and voice endpoints.
type Request struct {
	// Audio is base64-encoded audio data (webm) for transcription.
	// Optional; when set, the last message must have role=user.
	Audio string `json:"audio,omitempty"`
	// Messages is the conversation history. Required; must be non-empty.
	Messages []openai.ChatCompletionMessage `json:"messages"`
	// Model is the LLM model ID. If empty, the default model is used.
	Model string `json:"model,omitempty"`
	// Voice is the voice ID for TTS. If empty, a voice may be
	// auto-selected from the detected input language.
	Voice string `json:"voice,omitempty"`
}
|
|
|
|
// Response is the response format for chat and voice endpoints.
type Response struct {
	// Audio is the base64-encoded WAV audio response.
	Audio string `json:"audio,omitempty"`
	// DetectedLanguage is the language detected in the input speech.
	// Empty when no audio was provided or no language was detected.
	DetectedLanguage string `json:"detected_language,omitempty"`
	// Messages is the full list of messages generated during the query,
	// including tool calls and tool results.
	Messages []openai.ChatCompletionMessage `json:"messages,omitempty"`
	// Model is the LLM model used for the response.
	Model string `json:"used_model,omitempty"`
	// Transcription is the transcribed user speech from the input audio.
	Transcription string `json:"transcription,omitempty"`
	// Voice is the voice used for TTS synthesis.
	Voice string `json:"used_voice,omitempty"`
}
|
|
|
|
// voice processes voice requests with audio input/output.
|
|
func (svc *Service) voice(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
// Parse request.
|
|
r.Body = http.MaxBytesReader(w, r.Body, 32<<20)
|
|
var req Request
|
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode request",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid request", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Validate messages.
|
|
if len(req.Messages) == 0 {
|
|
http.Error(w, "messages required", http.StatusBadRequest)
|
|
return
|
|
}
|
|
log.InfoContext(ctx, "messages",
|
|
slog.Any("data", req.Messages),
|
|
)
|
|
|
|
var (
|
|
messages = req.Messages
|
|
transcription string
|
|
detectedLang string
|
|
)
|
|
|
|
// If audio provided, transcribe and append to last message.
|
|
if req.Audio != "" {
|
|
last := &messages[len(messages)-1]
|
|
if last.Role != openai.ChatMessageRoleUser {
|
|
http.Error(
|
|
w,
|
|
"last message must be role=user when audio is provided",
|
|
http.StatusBadRequest,
|
|
)
|
|
return
|
|
}
|
|
|
|
data, err := base64.StdEncoding.DecodeString(req.Audio)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode audio",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid audio", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
output, err := svc.stt.Transcribe(ctx, data)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"STT failed",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"STT error",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
transcription = strings.TrimSpace(output.Text)
|
|
detectedLang = output.DetectedLanguage
|
|
if detectedLang == "" {
|
|
detectedLang = output.Language
|
|
}
|
|
log.InfoContext(
|
|
ctx,
|
|
"transcribed audio",
|
|
slog.String("text", transcription),
|
|
slog.String("language", detectedLang),
|
|
)
|
|
|
|
// Append transcription to last message's content.
|
|
switch {
|
|
// Already using MultiContent, append text part.
|
|
case len(last.MultiContent) > 0:
|
|
last.MultiContent = append(last.MultiContent,
|
|
openai.ChatMessagePart{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
)
|
|
last.Content = ""
|
|
|
|
// Has string content, convert to MultiContent.
|
|
case last.Content != "":
|
|
last.MultiContent = []openai.ChatMessagePart{
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: last.Content,
|
|
},
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
}
|
|
last.Content = ""
|
|
// Empty message, just set content.
|
|
// Clear MultiContent, as required by the API spec.
|
|
default:
|
|
last.Content = transcription
|
|
last.MultiContent = nil
|
|
}
|
|
}
|
|
|
|
// Get LLM response.
|
|
var model = req.Model
|
|
if model == "" {
|
|
model = svc.llm.DefaultModel()
|
|
}
|
|
msgs, err := svc.llm.Query(ctx, messages, model)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"LLM request failed",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "LLM error", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
if len(msgs) == 0 {
|
|
http.Error(
|
|
w,
|
|
"no response from LLM",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
final := msgs[len(msgs)-1]
|
|
log.InfoContext(
|
|
ctx,
|
|
"LLM response",
|
|
slog.String("text", final.Content),
|
|
slog.String("model", model),
|
|
)
|
|
|
|
// Determine voice to use.
|
|
var voice = req.Voice
|
|
if req.Voice == "" && detectedLang != "" {
|
|
if autoVoice := svc.tts.SelectVoice(
|
|
detectedLang,
|
|
); autoVoice != "" {
|
|
voice = autoVoice
|
|
log.InfoContext(ctx, "auto-selected voice",
|
|
slog.String("language", detectedLang),
|
|
slog.String("voice", voice),
|
|
)
|
|
}
|
|
} else if req.Voice == "" {
|
|
log.WarnContext(
|
|
ctx,
|
|
"auto-voice enabled but no language detected",
|
|
)
|
|
}
|
|
|
|
// Generate audio response with selected voice.
|
|
audio, err := svc.tts.Synthesize(ctx, final.Content, voice)
|
|
if err != nil {
|
|
log.ErrorContext(ctx, "TTS failed", slog.Any("error", err))
|
|
http.Error(w, "TTS error", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
if err := json.NewEncoder(w).Encode(Response{
|
|
Audio: base64.StdEncoding.EncodeToString(audio),
|
|
DetectedLanguage: detectedLang,
|
|
Messages: msgs,
|
|
Model: model,
|
|
Transcription: transcription,
|
|
Voice: voice,
|
|
}); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to json encode response",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
}
|
|
|
|
// StreamMessage is the SSE event payload for the streaming voice endpoint.
type StreamMessage struct {
	// Audio is the base64-encoded WAV audio response.
	// Only populated on the final assistant event.
	Audio string `json:"audio,omitempty"`
	// DetectedLanguage is the language detected in the input speech.
	DetectedLanguage string `json:"detected_language,omitempty"`
	// Error is an error message, if any.
	Error string `json:"error,omitempty"`
	// Message is the chat completion message.
	Message openai.ChatCompletionMessage `json:"message"`
	// Model is the LLM model used for the response.
	Model string `json:"model,omitempty"`
	// Transcription is the transcribed user speech from the input audio.
	Transcription string `json:"transcription,omitempty"`
	// Voice is the voice used for TTS synthesis.
	Voice string `json:"voice,omitempty"`
}
|
|
|
|
// voiceStream processes voice requests with streaming SSE output.
|
|
func (svc *Service) voiceStream(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
// Check that the response writer supports flushing.
|
|
flusher, ok := w.(http.Flusher)
|
|
if !ok {
|
|
http.Error(
|
|
w,
|
|
"streaming not supported",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
// Parse request.
|
|
r.Body = http.MaxBytesReader(w, r.Body, 32<<20)
|
|
var req Request
|
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode request",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid request", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Validate messages.
|
|
if len(req.Messages) == 0 {
|
|
http.Error(w, "messages required", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Acquire semaphore.
|
|
if err := svc.sem.Acquire(ctx, 1); err != nil {
|
|
http.Error(
|
|
w,
|
|
"service unavailable",
|
|
http.StatusServiceUnavailable,
|
|
)
|
|
return
|
|
}
|
|
defer svc.sem.Release(1)
|
|
|
|
var (
|
|
messages = req.Messages
|
|
transcription string
|
|
detectedLang string
|
|
)
|
|
|
|
// If audio provided, transcribe and append to last message.
|
|
if req.Audio != "" {
|
|
last := &messages[len(messages)-1]
|
|
if last.Role != openai.ChatMessageRoleUser {
|
|
http.Error(
|
|
w,
|
|
"last message must be role=user when audio is provided",
|
|
http.StatusBadRequest,
|
|
)
|
|
return
|
|
}
|
|
|
|
data, err := base64.StdEncoding.DecodeString(req.Audio)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode audio",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid audio", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
output, err := svc.stt.Transcribe(ctx, data)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"STT failed",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"STT error",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
transcription = strings.TrimSpace(output.Text)
|
|
detectedLang = output.DetectedLanguage
|
|
if detectedLang == "" {
|
|
detectedLang = output.Language
|
|
}
|
|
log.InfoContext(
|
|
ctx,
|
|
"transcribed audio",
|
|
slog.String("text", transcription),
|
|
slog.String("language", detectedLang),
|
|
)
|
|
|
|
// Append transcription to last message's content.
|
|
switch {
|
|
case len(last.MultiContent) > 0:
|
|
last.MultiContent = append(last.MultiContent,
|
|
openai.ChatMessagePart{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
)
|
|
last.Content = ""
|
|
case last.Content != "":
|
|
last.MultiContent = []openai.ChatMessagePart{
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: last.Content,
|
|
},
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
}
|
|
last.Content = ""
|
|
default:
|
|
last.Content = transcription
|
|
last.MultiContent = nil
|
|
}
|
|
}
|
|
|
|
// Set SSE headers.
|
|
w.Header().Set("Cache-Control", "no-cache")
|
|
w.Header().Set("Connection", "keep-alive")
|
|
w.Header().Set("Content-Type", "text/event-stream")
|
|
|
|
// Helper to send an SSE event.
|
|
send := func(msg StreamMessage) {
|
|
data, err := json.Marshal(msg)
|
|
if err != nil {
|
|
log.ErrorContext(ctx, "failed to marshal SSE event",
|
|
slog.Any("error", err),
|
|
)
|
|
return
|
|
}
|
|
fmt.Fprintf(w, "event: message\ndata: %s\n\n", data)
|
|
flusher.Flush()
|
|
}
|
|
|
|
// If audio was transcribed, send user message with transcription.
|
|
if transcription != "" {
|
|
send(StreamMessage{
|
|
Message: openai.ChatCompletionMessage{
|
|
Role: openai.ChatMessageRoleUser,
|
|
Content: transcription,
|
|
},
|
|
Transcription: transcription,
|
|
DetectedLanguage: detectedLang,
|
|
})
|
|
}
|
|
|
|
// Get model.
|
|
var model = req.Model
|
|
if model == "" {
|
|
model = svc.llm.DefaultModel()
|
|
}
|
|
|
|
// Determine voice to use.
|
|
var voice = req.Voice
|
|
if req.Voice == "" && detectedLang != "" {
|
|
if autoVoice := svc.tts.SelectVoice(
|
|
detectedLang,
|
|
); autoVoice != "" {
|
|
voice = autoVoice
|
|
log.InfoContext(ctx, "auto-selected voice",
|
|
slog.String("language", detectedLang),
|
|
slog.String("voice", voice),
|
|
)
|
|
}
|
|
}
|
|
|
|
// Start streaming LLM query.
|
|
var (
|
|
events = make(chan llm.StreamEvent)
|
|
llmErr error
|
|
)
|
|
go func() {
|
|
llmErr = svc.llm.QueryStream(ctx, messages, model, events)
|
|
}()
|
|
|
|
// Consume events and send as SSE.
|
|
var last StreamMessage
|
|
for evt := range events {
|
|
msg := StreamMessage{Message: evt.Message}
|
|
|
|
// Track the last assistant message for TTS.
|
|
if evt.Message.Role == openai.ChatMessageRoleAssistant &&
|
|
len(evt.Message.ToolCalls) == 0 {
|
|
last = msg
|
|
continue
|
|
}
|
|
|
|
send(msg)
|
|
}
|
|
// Check for LLM errors.
|
|
if llmErr != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"LLM stream failed",
|
|
slog.Any("error", llmErr),
|
|
)
|
|
send(StreamMessage{
|
|
Message: openai.ChatCompletionMessage{
|
|
Role: openai.ChatMessageRoleAssistant,
|
|
},
|
|
Error: fmt.Sprintf("LLM error: %v", llmErr),
|
|
})
|
|
return
|
|
}
|
|
|
|
// Synthesize TTS for the final assistant message.
|
|
if last.Message.Content != "" {
|
|
audio, err := svc.tts.Synthesize(
|
|
ctx, last.Message.Content, voice,
|
|
)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx, "TTS failed", slog.Any("error", err),
|
|
)
|
|
last.Error = fmt.Sprintf("TTS error: %v", err)
|
|
} else {
|
|
last.Audio = base64.StdEncoding.EncodeToString(audio)
|
|
}
|
|
}
|
|
last.Model = model
|
|
last.Voice = voice
|
|
send(last)
|
|
}
|
|
|
|
// models returns available LLM models.
|
|
func (svc *Service) models(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
models, err := svc.llm.ListModels(ctx)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to list models",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"failed to list models",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
if err := json.NewEncoder(w).Encode(struct {
|
|
Models []openai.Model `json:"models"`
|
|
DefaultModel string `json:"default_model"`
|
|
}{
|
|
Models: models,
|
|
DefaultModel: svc.llm.DefaultModel(),
|
|
}); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to encode models response",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
}
|
|
|
|
// voices returns available TTS voices.
|
|
func (svc *Service) voices(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
voices, err := svc.tts.ListVoices(ctx)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to list voices",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"failed to list voices",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
if err := json.NewEncoder(w).Encode(map[string][]string{
|
|
"voices": voices,
|
|
}); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to encode voices response",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
}
|