Add service orchestration and web UI
This commit is contained in:
856
internal/service/service.go
Normal file
856
internal/service/service.go
Normal file
@@ -0,0 +1,856 @@
|
||||
// Package service orchestrates the odidere voice assistant server.
|
||||
// It coordinates the HTTP server, STT/LLM/TTS clients, and handles
|
||||
// graceful shutdown.
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"embed"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html/template"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/chimerical-llc/odidere/internal/config"
|
||||
"github.com/chimerical-llc/odidere/internal/llm"
|
||||
"github.com/chimerical-llc/odidere/internal/service/templates"
|
||||
"github.com/chimerical-llc/odidere/internal/stt"
|
||||
"github.com/chimerical-llc/odidere/internal/tool"
|
||||
"github.com/chimerical-llc/odidere/internal/tts"
|
||||
|
||||
"github.com/google/uuid"
|
||||
openai "github.com/sashabaranov/go-openai"
|
||||
"golang.org/x/sync/semaphore"
|
||||
)
|
||||
|
||||
// static holds the embedded web UI assets; they are served under the
// /static/ route (see New, which roots the file server at the static/
// subdirectory via fs.Sub).
//go:embed all:static/*
var static embed.FS
|
||||
|
||||
// Service is the main application coordinator.
// It owns the HTTP server and all processing clients.
type Service struct {
	cfg    *config.Config      // runtime configuration (address, concurrency, client settings)
	llm    *llm.Client         // chat-completion backend; constructed with the tool registry
	log    *slog.Logger        // base structured logger; per-request loggers derive from it
	mux    *http.ServeMux      // route table; ServeHTTP wraps it with logging/recovery
	sem    *semaphore.Weighted // caps concurrent voice pipelines at cfg.Concurrency
	server *http.Server        // HTTP server bound to cfg.Address, handler is the Service itself
	stt    *stt.Client         // speech-to-text backend
	tmpl   *template.Template  // parsed HTML templates for the web UI
	tools  *tool.Registry      // tools exposed to the LLM
	tts    *tts.Client         // text-to-speech backend
}
|
||||
|
||||
// New creates a Service from the provided configuration.
|
||||
// It initializes all clients and the HTTP server.
|
||||
func New(cfg *config.Config, log *slog.Logger) (*Service, error) {
|
||||
var svc = &Service{
|
||||
cfg: cfg,
|
||||
log: log,
|
||||
mux: http.NewServeMux(),
|
||||
sem: semaphore.NewWeighted(int64(cfg.Concurrency)),
|
||||
}
|
||||
|
||||
// Setup tool registry.
|
||||
registry, err := tool.NewRegistry(cfg.Tools)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("load tools: %v", err)
|
||||
}
|
||||
svc.tools = registry
|
||||
|
||||
// Create STT client.
|
||||
sttClient, err := stt.NewClient(cfg.STT, log)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create STT client: %v", err)
|
||||
}
|
||||
svc.stt = sttClient
|
||||
|
||||
// Create LLM client.
|
||||
llmClient, err := llm.NewClient(cfg.LLM, registry, log)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create LLM client: %v", err)
|
||||
}
|
||||
svc.llm = llmClient
|
||||
|
||||
// Create TTS client.
|
||||
ttsClient, err := tts.NewClient(cfg.TTS, log)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create TTS client: %v", err)
|
||||
}
|
||||
svc.tts = ttsClient
|
||||
|
||||
// Parse templates.
|
||||
tmpl, err := templates.Parse()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse templates: %v", err)
|
||||
}
|
||||
svc.tmpl = tmpl
|
||||
|
||||
// Setup static file server.
|
||||
staticFS, err := fs.Sub(static, "static")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("setup static fs: %v", err)
|
||||
}
|
||||
|
||||
// Register routes.
|
||||
svc.mux.HandleFunc("GET /", svc.home)
|
||||
svc.mux.HandleFunc("GET /status", svc.status)
|
||||
svc.mux.Handle(
|
||||
"GET /static/",
|
||||
http.StripPrefix(
|
||||
"/static/", http.FileServer(http.FS(staticFS)),
|
||||
),
|
||||
)
|
||||
svc.mux.HandleFunc("POST /v1/chat/voice", svc.voice)
|
||||
svc.mux.HandleFunc("POST /v1/chat/voice/stream", svc.voiceStream)
|
||||
svc.mux.HandleFunc("GET /v1/voices", svc.voices)
|
||||
svc.mux.HandleFunc("GET /v1/models", svc.models)
|
||||
|
||||
svc.server = &http.Server{
|
||||
Addr: cfg.Address,
|
||||
Handler: svc,
|
||||
}
|
||||
|
||||
return svc, nil
|
||||
}
|
||||
|
||||
// ServeHTTP implements http.Handler. It logs requests, assigns a UUID,
|
||||
// sets context values, handles panics, and delegates to the mux.
|
||||
func (svc *Service) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
var (
|
||||
start = time.Now()
|
||||
id = uuid.NewString()
|
||||
ip = func() string {
|
||||
if ip := r.Header.Get("X-Forwarded-For"); ip != "" {
|
||||
if idx := strings.Index(ip, ","); idx != -1 {
|
||||
return strings.TrimSpace(ip[:idx])
|
||||
}
|
||||
return ip
|
||||
}
|
||||
return r.RemoteAddr
|
||||
}()
|
||||
log = svc.log.With(slog.Group(
|
||||
"request",
|
||||
slog.String("id", id),
|
||||
slog.String("ip", ip),
|
||||
slog.String("method", r.Method),
|
||||
slog.String("path", r.URL.Path),
|
||||
))
|
||||
)
|
||||
|
||||
// Log completion time.
|
||||
defer func() {
|
||||
log.InfoContext(
|
||||
r.Context(),
|
||||
"completed",
|
||||
slog.Duration("duration", time.Since(start)),
|
||||
)
|
||||
}()
|
||||
|
||||
// Panic recovery.
|
||||
defer func() {
|
||||
if err := recover(); err != nil {
|
||||
log.ErrorContext(
|
||||
r.Context(),
|
||||
"panic recovered",
|
||||
slog.Any("error", err),
|
||||
slog.String("stack", string(debug.Stack())),
|
||||
)
|
||||
http.Error(
|
||||
w,
|
||||
http.StatusText(
|
||||
http.StatusInternalServerError,
|
||||
),
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
}
|
||||
}()
|
||||
|
||||
// Enrich context with request-scoped values.
|
||||
ctx := r.Context()
|
||||
ctx = context.WithValue(ctx, "log", log)
|
||||
ctx = context.WithValue(ctx, "id", id)
|
||||
ctx = context.WithValue(ctx, "ip", ip)
|
||||
r = r.WithContext(ctx)
|
||||
|
||||
log.InfoContext(ctx, "handling")
|
||||
|
||||
// Pass the request on to the multiplexer.
|
||||
svc.mux.ServeHTTP(w, r)
|
||||
}
|
||||
|
||||
// Run starts the service and blocks until shutdown.
|
||||
// Shutdown is triggered by SIGINT or SIGTERM.
|
||||
func (svc *Service) Run(ctx context.Context) error {
|
||||
svc.log.Info(
|
||||
"starting odidere",
|
||||
slog.Int("concurrency", svc.cfg.Concurrency),
|
||||
slog.Group(
|
||||
"llm",
|
||||
slog.String("url", svc.cfg.LLM.URL),
|
||||
slog.String("model", svc.cfg.LLM.Model),
|
||||
),
|
||||
slog.Group(
|
||||
"server",
|
||||
slog.String("address", svc.cfg.Address),
|
||||
),
|
||||
slog.Group(
|
||||
"stt",
|
||||
slog.String("url", svc.cfg.STT.URL),
|
||||
),
|
||||
slog.Group(
|
||||
"tools",
|
||||
slog.Int("count", len(svc.tools.List())),
|
||||
slog.Any(
|
||||
"names",
|
||||
strings.Join(svc.tools.List(), ","),
|
||||
),
|
||||
),
|
||||
slog.Group(
|
||||
"tts",
|
||||
slog.String("url", svc.cfg.TTS.URL),
|
||||
slog.String("default_voice", svc.cfg.TTS.Voice),
|
||||
),
|
||||
slog.Any("shutdown_timeout", svc.cfg.GetShutdownTimeout()),
|
||||
)
|
||||
|
||||
// Setup signal handling for graceful shutdown.
|
||||
ctx, cancel := signal.NotifyContext(
|
||||
ctx,
|
||||
os.Interrupt, syscall.SIGTERM,
|
||||
)
|
||||
defer cancel()
|
||||
|
||||
// Start HTTP server in background.
|
||||
var errs = make(chan error, 1)
|
||||
go func() {
|
||||
svc.log.Info(
|
||||
"HTTP server listening",
|
||||
slog.String("address", svc.cfg.Address),
|
||||
)
|
||||
if err := svc.server.ListenAndServe(); err != nil &&
|
||||
err != http.ErrServerClosed {
|
||||
errs <- err
|
||||
}
|
||||
close(errs)
|
||||
}()
|
||||
|
||||
// Wait for shutdown signal or server error.
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
svc.log.Info("shutdown signal received")
|
||||
case err := <-errs:
|
||||
if err != nil {
|
||||
return fmt.Errorf("server error: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Graceful shutdown with timeout.
|
||||
shutdownCtx, shutdownCancel := context.WithTimeout(
|
||||
context.Background(),
|
||||
svc.cfg.GetShutdownTimeout(),
|
||||
)
|
||||
defer shutdownCancel()
|
||||
|
||||
svc.log.Info("shutting down HTTP server")
|
||||
if err := svc.server.Shutdown(shutdownCtx); err != nil {
|
||||
svc.log.Warn(
|
||||
"shutdown timeout reached",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
}
|
||||
|
||||
svc.log.Info("terminating")
|
||||
return nil
|
||||
}
|
||||
|
||||
func (svc *Service) home(w http.ResponseWriter, r *http.Request) {
|
||||
var (
|
||||
ctx = r.Context()
|
||||
log = ctx.Value("log").(*slog.Logger)
|
||||
)
|
||||
|
||||
if r.URL.Path != "/" {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
if err := svc.tmpl.ExecuteTemplate(
|
||||
w, "index.gohtml", nil,
|
||||
); err != nil {
|
||||
log.ErrorContext(
|
||||
ctx, "template error", slog.Any("error", err),
|
||||
)
|
||||
http.Error(
|
||||
w,
|
||||
http.StatusText(http.StatusInternalServerError),
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// status returns server status.
|
||||
func (svc *Service) status(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}
|
||||
|
||||
// Request is the incoming request format for chat and voice endpoints.
type Request struct {
	// Audio is base64-encoded audio data (webm) for transcription.
	// When set, the last message must have role=user; the transcription
	// is appended to that message's content.
	Audio string `json:"audio,omitempty"`
	// Messages is the conversation history. At least one message is
	// required.
	Messages []openai.ChatCompletionMessage `json:"messages"`
	// Model is the LLM model ID. If empty, the default model is used.
	Model string `json:"model,omitempty"`
	// Voice is the voice ID for TTS. If empty, a voice is auto-selected
	// from the detected input language when one is available.
	Voice string `json:"voice,omitempty"`
}
|
||||
|
||||
// Response is the response format for chat and voice endpoints.
type Response struct {
	// Audio is the base64-encoded WAV audio response.
	Audio string `json:"audio,omitempty"`
	// DetectedLanguage is the language detected in the input speech.
	DetectedLanguage string `json:"detected_language,omitempty"`
	// Messages is the full list of messages generated during the query,
	// including tool calls and tool results.
	Messages []openai.ChatCompletionMessage `json:"messages,omitempty"`
	// Model is the LLM model used for the response. Note the JSON name
	// is "used_model", distinct from Request's "model".
	Model string `json:"used_model,omitempty"`
	// Transcription is the transcribed user speech from the input audio.
	Transcription string `json:"transcription,omitempty"`
	// Voice is the voice used for TTS synthesis. Note the JSON name is
	// "used_voice", distinct from Request's "voice".
	Voice string `json:"used_voice,omitempty"`
}
|
||||
|
||||
// voice processes voice requests with audio input/output.
|
||||
func (svc *Service) voice(w http.ResponseWriter, r *http.Request) {
|
||||
var (
|
||||
ctx = r.Context()
|
||||
log = ctx.Value("log").(*slog.Logger)
|
||||
)
|
||||
|
||||
// Parse request.
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 32<<20)
|
||||
var req Request
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to decode request",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(w, "invalid request", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate messages.
|
||||
if len(req.Messages) == 0 {
|
||||
http.Error(w, "messages required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
log.InfoContext(ctx, "messages",
|
||||
slog.Any("data", req.Messages),
|
||||
)
|
||||
|
||||
var (
|
||||
messages = req.Messages
|
||||
transcription string
|
||||
detectedLang string
|
||||
)
|
||||
|
||||
// If audio provided, transcribe and append to last message.
|
||||
if req.Audio != "" {
|
||||
last := &messages[len(messages)-1]
|
||||
if last.Role != openai.ChatMessageRoleUser {
|
||||
http.Error(
|
||||
w,
|
||||
"last message must be role=user when audio is provided",
|
||||
http.StatusBadRequest,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
data, err := base64.StdEncoding.DecodeString(req.Audio)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to decode audio",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(w, "invalid audio", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
output, err := svc.stt.Transcribe(ctx, data)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"STT failed",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(
|
||||
w,
|
||||
"STT error",
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
transcription = strings.TrimSpace(output.Text)
|
||||
detectedLang = output.DetectedLanguage
|
||||
if detectedLang == "" {
|
||||
detectedLang = output.Language
|
||||
}
|
||||
log.InfoContext(
|
||||
ctx,
|
||||
"transcribed audio",
|
||||
slog.String("text", transcription),
|
||||
slog.String("language", detectedLang),
|
||||
)
|
||||
|
||||
// Append transcription to last message's content.
|
||||
switch {
|
||||
// Already using MultiContent, append text part.
|
||||
case len(last.MultiContent) > 0:
|
||||
last.MultiContent = append(last.MultiContent,
|
||||
openai.ChatMessagePart{
|
||||
Type: openai.ChatMessagePartTypeText,
|
||||
Text: transcription,
|
||||
},
|
||||
)
|
||||
last.Content = ""
|
||||
|
||||
// Has string content, convert to MultiContent.
|
||||
case last.Content != "":
|
||||
last.MultiContent = []openai.ChatMessagePart{
|
||||
{
|
||||
Type: openai.ChatMessagePartTypeText,
|
||||
Text: last.Content,
|
||||
},
|
||||
{
|
||||
Type: openai.ChatMessagePartTypeText,
|
||||
Text: transcription,
|
||||
},
|
||||
}
|
||||
last.Content = ""
|
||||
// Empty message, just set content.
|
||||
// Clear MultiContent, as required by the API spec.
|
||||
default:
|
||||
last.Content = transcription
|
||||
last.MultiContent = nil
|
||||
}
|
||||
}
|
||||
|
||||
// Get LLM response.
|
||||
var model = req.Model
|
||||
if model == "" {
|
||||
model = svc.llm.DefaultModel()
|
||||
}
|
||||
msgs, err := svc.llm.Query(ctx, messages, model)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"LLM request failed",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(w, "LLM error", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
if len(msgs) == 0 {
|
||||
http.Error(
|
||||
w,
|
||||
"no response from LLM",
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
return
|
||||
}
|
||||
final := msgs[len(msgs)-1]
|
||||
log.InfoContext(
|
||||
ctx,
|
||||
"LLM response",
|
||||
slog.String("text", final.Content),
|
||||
slog.String("model", model),
|
||||
)
|
||||
|
||||
// Determine voice to use.
|
||||
var voice = req.Voice
|
||||
if req.Voice == "" && detectedLang != "" {
|
||||
if autoVoice := svc.tts.SelectVoice(
|
||||
detectedLang,
|
||||
); autoVoice != "" {
|
||||
voice = autoVoice
|
||||
log.InfoContext(ctx, "auto-selected voice",
|
||||
slog.String("language", detectedLang),
|
||||
slog.String("voice", voice),
|
||||
)
|
||||
}
|
||||
} else if req.Voice == "" {
|
||||
log.WarnContext(
|
||||
ctx,
|
||||
"auto-voice enabled but no language detected",
|
||||
)
|
||||
}
|
||||
|
||||
// Generate audio response with selected voice.
|
||||
audio, err := svc.tts.Synthesize(ctx, final.Content, voice)
|
||||
if err != nil {
|
||||
log.ErrorContext(ctx, "TTS failed", slog.Any("error", err))
|
||||
http.Error(w, "TTS error", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(w).Encode(Response{
|
||||
Audio: base64.StdEncoding.EncodeToString(audio),
|
||||
DetectedLanguage: detectedLang,
|
||||
Messages: msgs,
|
||||
Model: model,
|
||||
Transcription: transcription,
|
||||
Voice: voice,
|
||||
}); err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to json encode response",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// StreamMessage is the SSE event payload for the streaming voice endpoint.
// Each one is sent as an "event: message" frame with the JSON document in
// the data field.
type StreamMessage struct {
	// Audio is the base64-encoded WAV audio response. Only set on the
	// final assistant message.
	Audio string `json:"audio,omitempty"`
	// DetectedLanguage is the language detected in the input speech.
	DetectedLanguage string `json:"detected_language,omitempty"`
	// Error is an error message, if any. A non-empty Error may accompany
	// an otherwise valid Message (e.g. TTS failed after the LLM reply).
	Error string `json:"error,omitempty"`
	// Message is the chat completion message.
	Message openai.ChatCompletionMessage `json:"message"`
	// Model is the LLM model used for the response.
	Model string `json:"model,omitempty"`
	// Transcription is the transcribed user speech from the input audio.
	Transcription string `json:"transcription,omitempty"`
	// Voice is the voice used for TTS synthesis.
	Voice string `json:"voice,omitempty"`
}
|
||||
|
||||
// voiceStream processes voice requests with streaming SSE output.
|
||||
func (svc *Service) voiceStream(w http.ResponseWriter, r *http.Request) {
|
||||
var (
|
||||
ctx = r.Context()
|
||||
log = ctx.Value("log").(*slog.Logger)
|
||||
)
|
||||
|
||||
// Check that the response writer supports flushing.
|
||||
flusher, ok := w.(http.Flusher)
|
||||
if !ok {
|
||||
http.Error(
|
||||
w,
|
||||
"streaming not supported",
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
// Parse request.
|
||||
r.Body = http.MaxBytesReader(w, r.Body, 32<<20)
|
||||
var req Request
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to decode request",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(w, "invalid request", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Validate messages.
|
||||
if len(req.Messages) == 0 {
|
||||
http.Error(w, "messages required", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Acquire semaphore.
|
||||
if err := svc.sem.Acquire(ctx, 1); err != nil {
|
||||
http.Error(
|
||||
w,
|
||||
"service unavailable",
|
||||
http.StatusServiceUnavailable,
|
||||
)
|
||||
return
|
||||
}
|
||||
defer svc.sem.Release(1)
|
||||
|
||||
var (
|
||||
messages = req.Messages
|
||||
transcription string
|
||||
detectedLang string
|
||||
)
|
||||
|
||||
// If audio provided, transcribe and append to last message.
|
||||
if req.Audio != "" {
|
||||
last := &messages[len(messages)-1]
|
||||
if last.Role != openai.ChatMessageRoleUser {
|
||||
http.Error(
|
||||
w,
|
||||
"last message must be role=user when audio is provided",
|
||||
http.StatusBadRequest,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
data, err := base64.StdEncoding.DecodeString(req.Audio)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to decode audio",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(w, "invalid audio", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
output, err := svc.stt.Transcribe(ctx, data)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"STT failed",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(
|
||||
w,
|
||||
"STT error",
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
transcription = strings.TrimSpace(output.Text)
|
||||
detectedLang = output.DetectedLanguage
|
||||
if detectedLang == "" {
|
||||
detectedLang = output.Language
|
||||
}
|
||||
log.InfoContext(
|
||||
ctx,
|
||||
"transcribed audio",
|
||||
slog.String("text", transcription),
|
||||
slog.String("language", detectedLang),
|
||||
)
|
||||
|
||||
// Append transcription to last message's content.
|
||||
switch {
|
||||
case len(last.MultiContent) > 0:
|
||||
last.MultiContent = append(last.MultiContent,
|
||||
openai.ChatMessagePart{
|
||||
Type: openai.ChatMessagePartTypeText,
|
||||
Text: transcription,
|
||||
},
|
||||
)
|
||||
last.Content = ""
|
||||
case last.Content != "":
|
||||
last.MultiContent = []openai.ChatMessagePart{
|
||||
{
|
||||
Type: openai.ChatMessagePartTypeText,
|
||||
Text: last.Content,
|
||||
},
|
||||
{
|
||||
Type: openai.ChatMessagePartTypeText,
|
||||
Text: transcription,
|
||||
},
|
||||
}
|
||||
last.Content = ""
|
||||
default:
|
||||
last.Content = transcription
|
||||
last.MultiContent = nil
|
||||
}
|
||||
}
|
||||
|
||||
// Set SSE headers.
|
||||
w.Header().Set("Cache-Control", "no-cache")
|
||||
w.Header().Set("Connection", "keep-alive")
|
||||
w.Header().Set("Content-Type", "text/event-stream")
|
||||
|
||||
// Helper to send an SSE event.
|
||||
send := func(msg StreamMessage) {
|
||||
data, err := json.Marshal(msg)
|
||||
if err != nil {
|
||||
log.ErrorContext(ctx, "failed to marshal SSE event",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
return
|
||||
}
|
||||
fmt.Fprintf(w, "event: message\ndata: %s\n\n", data)
|
||||
flusher.Flush()
|
||||
}
|
||||
|
||||
// If audio was transcribed, send user message with transcription.
|
||||
if transcription != "" {
|
||||
send(StreamMessage{
|
||||
Message: openai.ChatCompletionMessage{
|
||||
Role: openai.ChatMessageRoleUser,
|
||||
Content: transcription,
|
||||
},
|
||||
Transcription: transcription,
|
||||
DetectedLanguage: detectedLang,
|
||||
})
|
||||
}
|
||||
|
||||
// Get model.
|
||||
var model = req.Model
|
||||
if model == "" {
|
||||
model = svc.llm.DefaultModel()
|
||||
}
|
||||
|
||||
// Determine voice to use.
|
||||
var voice = req.Voice
|
||||
if req.Voice == "" && detectedLang != "" {
|
||||
if autoVoice := svc.tts.SelectVoice(
|
||||
detectedLang,
|
||||
); autoVoice != "" {
|
||||
voice = autoVoice
|
||||
log.InfoContext(ctx, "auto-selected voice",
|
||||
slog.String("language", detectedLang),
|
||||
slog.String("voice", voice),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Start streaming LLM query.
|
||||
var (
|
||||
events = make(chan llm.StreamEvent)
|
||||
llmErr error
|
||||
)
|
||||
go func() {
|
||||
llmErr = svc.llm.QueryStream(ctx, messages, model, events)
|
||||
}()
|
||||
|
||||
// Consume events and send as SSE.
|
||||
var last StreamMessage
|
||||
for evt := range events {
|
||||
msg := StreamMessage{Message: evt.Message}
|
||||
|
||||
// Track the last assistant message for TTS.
|
||||
if evt.Message.Role == openai.ChatMessageRoleAssistant &&
|
||||
len(evt.Message.ToolCalls) == 0 {
|
||||
last = msg
|
||||
continue
|
||||
}
|
||||
|
||||
send(msg)
|
||||
}
|
||||
// Check for LLM errors.
|
||||
if llmErr != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"LLM stream failed",
|
||||
slog.Any("error", llmErr),
|
||||
)
|
||||
send(StreamMessage{
|
||||
Message: openai.ChatCompletionMessage{
|
||||
Role: openai.ChatMessageRoleAssistant,
|
||||
},
|
||||
Error: fmt.Sprintf("LLM error: %v", llmErr),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Synthesize TTS for the final assistant message.
|
||||
if last.Message.Content != "" {
|
||||
audio, err := svc.tts.Synthesize(
|
||||
ctx, last.Message.Content, voice,
|
||||
)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx, "TTS failed", slog.Any("error", err),
|
||||
)
|
||||
last.Error = fmt.Sprintf("TTS error: %v", err)
|
||||
} else {
|
||||
last.Audio = base64.StdEncoding.EncodeToString(audio)
|
||||
}
|
||||
}
|
||||
last.Model = model
|
||||
last.Voice = voice
|
||||
send(last)
|
||||
}
|
||||
|
||||
// models returns available LLM models.
|
||||
func (svc *Service) models(w http.ResponseWriter, r *http.Request) {
|
||||
var (
|
||||
ctx = r.Context()
|
||||
log = ctx.Value("log").(*slog.Logger)
|
||||
)
|
||||
|
||||
models, err := svc.llm.ListModels(ctx)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to list models",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(
|
||||
w,
|
||||
"failed to list models",
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(w).Encode(struct {
|
||||
Models []openai.Model `json:"models"`
|
||||
DefaultModel string `json:"default_model"`
|
||||
}{
|
||||
Models: models,
|
||||
DefaultModel: svc.llm.DefaultModel(),
|
||||
}); err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to encode models response",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// voices returns available TTS voices.
|
||||
func (svc *Service) voices(w http.ResponseWriter, r *http.Request) {
|
||||
var (
|
||||
ctx = r.Context()
|
||||
log = ctx.Value("log").(*slog.Logger)
|
||||
)
|
||||
|
||||
voices, err := svc.tts.ListVoices(ctx)
|
||||
if err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to list voices",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
http.Error(
|
||||
w,
|
||||
"failed to list voices",
|
||||
http.StatusInternalServerError,
|
||||
)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
if err := json.NewEncoder(w).Encode(map[string][]string{
|
||||
"voices": voices,
|
||||
}); err != nil {
|
||||
log.ErrorContext(
|
||||
ctx,
|
||||
"failed to encode voices response",
|
||||
slog.Any("error", err),
|
||||
)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user