// Package service orchestrates the odidere voice assistant server.
|
|
// It coordinates the HTTP server, STT/LLM/TTS clients, and handles
|
|
// graceful shutdown.
|
|
package service
|
|
|
|
import (
|
|
"context"
|
|
"embed"
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"html/template"
|
|
"io/fs"
|
|
"log/slog"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"runtime/debug"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"code.chimeric.al/chimerical/odidere/internal/config"
|
|
"code.chimeric.al/chimerical/odidere/internal/llm"
|
|
"code.chimeric.al/chimerical/odidere/internal/service/templates"
|
|
"code.chimeric.al/chimerical/odidere/internal/stt"
|
|
"code.chimeric.al/chimerical/odidere/internal/tool"
|
|
"code.chimeric.al/chimerical/odidere/internal/tts"
|
|
|
|
"github.com/google/uuid"
|
|
openai "github.com/sashabaranov/go-openai"
|
|
"golang.org/x/sync/semaphore"
|
|
)
|
|
|
|
// static holds the embedded static assets served under /static/.
//
//go:embed all:static/*
var static embed.FS
|
|
|
|
// Service is the main application coordinator.
// It owns the HTTP server and all processing clients.
type Service struct {
	cfg    *config.Config      // Application configuration.
	llm    *llm.Client         // Chat-completion (LLM) client.
	log    *slog.Logger        // Base structured logger.
	mux    *http.ServeMux      // Route multiplexer behind ServeHTTP.
	sem    *semaphore.Weighted // Bounds concurrent voice processing (cfg.Concurrency).
	server *http.Server        // HTTP server; started and stopped by Run.
	stt    *stt.Client         // Speech-to-text client.
	tmpl   *template.Template  // Parsed HTML templates.
	tools  *tool.Registry      // Tool registry exposed to the LLM.
	tts    *tts.Client         // Text-to-speech client.
}
|
|
|
|
// New creates a Service from the provided configuration.
|
|
// It initializes all clients and the HTTP server.
|
|
func New(cfg *config.Config, log *slog.Logger) (*Service, error) {
|
|
var svc = &Service{
|
|
cfg: cfg,
|
|
log: log,
|
|
mux: http.NewServeMux(),
|
|
sem: semaphore.NewWeighted(int64(cfg.Concurrency)),
|
|
}
|
|
|
|
// Setup tool registry.
|
|
registry, err := tool.NewRegistry(cfg.Tools)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("load tools: %v", err)
|
|
}
|
|
svc.tools = registry
|
|
|
|
// Create STT client.
|
|
sttClient, err := stt.NewClient(cfg.STT, log)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create STT client: %v", err)
|
|
}
|
|
svc.stt = sttClient
|
|
|
|
// Create LLM client.
|
|
llmClient, err := llm.NewClient(cfg.LLM, registry, log)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create LLM client: %v", err)
|
|
}
|
|
svc.llm = llmClient
|
|
|
|
// Create TTS client.
|
|
ttsClient, err := tts.NewClient(cfg.TTS, log)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create TTS client: %v", err)
|
|
}
|
|
svc.tts = ttsClient
|
|
|
|
// Parse templates.
|
|
tmpl, err := templates.Parse()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parse templates: %v", err)
|
|
}
|
|
svc.tmpl = tmpl
|
|
|
|
// Setup static file server.
|
|
staticFS, err := fs.Sub(static, "static")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("setup static fs: %v", err)
|
|
}
|
|
|
|
// Register routes.
|
|
svc.mux.HandleFunc("GET /", svc.home)
|
|
svc.mux.HandleFunc("GET /status", svc.status)
|
|
svc.mux.Handle(
|
|
"GET /static/",
|
|
http.StripPrefix(
|
|
"/static/", http.FileServer(http.FS(staticFS)),
|
|
),
|
|
)
|
|
svc.mux.HandleFunc("POST /v1/chat/voice", svc.voice)
|
|
svc.mux.HandleFunc("POST /v1/chat/voice/stream", svc.voiceStream)
|
|
svc.mux.HandleFunc("GET /v1/voices", svc.voices)
|
|
svc.mux.HandleFunc("GET /v1/models", svc.models)
|
|
|
|
svc.server = &http.Server{
|
|
Addr: cfg.Address,
|
|
Handler: svc,
|
|
}
|
|
|
|
return svc, nil
|
|
}
|
|
|
|
// ServeHTTP implements http.Handler. It logs requests, assigns a UUID,
|
|
// sets context values, handles panics, and delegates to the mux.
|
|
func (svc *Service) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
start = time.Now()
|
|
id = uuid.NewString()
|
|
ip = func() string {
|
|
if ip := r.Header.Get("X-Forwarded-For"); ip != "" {
|
|
if idx := strings.Index(ip, ","); idx != -1 {
|
|
return strings.TrimSpace(ip[:idx])
|
|
}
|
|
return ip
|
|
}
|
|
return r.RemoteAddr
|
|
}()
|
|
log = svc.log.With(slog.Group(
|
|
"request",
|
|
slog.String("id", id),
|
|
slog.String("ip", ip),
|
|
slog.String("method", r.Method),
|
|
slog.String("path", r.URL.Path),
|
|
))
|
|
)
|
|
|
|
// Log completion time.
|
|
defer func() {
|
|
log.InfoContext(
|
|
r.Context(),
|
|
"completed",
|
|
slog.Duration("duration", time.Since(start)),
|
|
)
|
|
}()
|
|
|
|
// Panic recovery.
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
log.ErrorContext(
|
|
r.Context(),
|
|
"panic recovered",
|
|
slog.Any("error", err),
|
|
slog.String("stack", string(debug.Stack())),
|
|
)
|
|
http.Error(
|
|
w,
|
|
http.StatusText(
|
|
http.StatusInternalServerError,
|
|
),
|
|
http.StatusInternalServerError,
|
|
)
|
|
}
|
|
}()
|
|
|
|
// Enrich context with request-scoped values.
|
|
ctx := r.Context()
|
|
ctx = context.WithValue(ctx, "log", log)
|
|
ctx = context.WithValue(ctx, "id", id)
|
|
ctx = context.WithValue(ctx, "ip", ip)
|
|
r = r.WithContext(ctx)
|
|
|
|
log.InfoContext(ctx, "handling")
|
|
|
|
// Pass the request on to the multiplexer.
|
|
svc.mux.ServeHTTP(w, r)
|
|
}
|
|
|
|
// Run starts the service and blocks until shutdown.
|
|
// Shutdown is triggered by SIGINT or SIGTERM.
|
|
func (svc *Service) Run(ctx context.Context) error {
|
|
svc.log.Info(
|
|
"starting odidere",
|
|
slog.Int("concurrency", svc.cfg.Concurrency),
|
|
slog.Group(
|
|
"llm",
|
|
slog.String("url", svc.cfg.LLM.URL),
|
|
slog.String("model", svc.cfg.LLM.Model),
|
|
),
|
|
slog.Group(
|
|
"server",
|
|
slog.String("address", svc.cfg.Address),
|
|
),
|
|
slog.Group(
|
|
"stt",
|
|
slog.String("url", svc.cfg.STT.URL),
|
|
),
|
|
slog.Group(
|
|
"tools",
|
|
slog.Int("count", len(svc.tools.List())),
|
|
slog.Any(
|
|
"names",
|
|
strings.Join(svc.tools.List(), ","),
|
|
),
|
|
),
|
|
slog.Group(
|
|
"tts",
|
|
slog.String("url", svc.cfg.TTS.URL),
|
|
slog.String("default_voice", svc.cfg.TTS.Voice),
|
|
),
|
|
slog.Any("shutdown_timeout", svc.cfg.GetShutdownTimeout()),
|
|
)
|
|
|
|
// Setup signal handling for graceful shutdown.
|
|
ctx, cancel := signal.NotifyContext(
|
|
ctx,
|
|
os.Interrupt, syscall.SIGTERM,
|
|
)
|
|
defer cancel()
|
|
|
|
// Start HTTP server in background.
|
|
var errs = make(chan error, 1)
|
|
go func() {
|
|
svc.log.Info(
|
|
"HTTP server listening",
|
|
slog.String("address", svc.cfg.Address),
|
|
)
|
|
if err := svc.server.ListenAndServe(); err != nil &&
|
|
err != http.ErrServerClosed {
|
|
errs <- err
|
|
}
|
|
close(errs)
|
|
}()
|
|
|
|
// Wait for shutdown signal or server error.
|
|
select {
|
|
case <-ctx.Done():
|
|
svc.log.Info("shutdown signal received")
|
|
case err := <-errs:
|
|
if err != nil {
|
|
return fmt.Errorf("server error: %w", err)
|
|
}
|
|
}
|
|
|
|
// Graceful shutdown with timeout.
|
|
shutdownCtx, shutdownCancel := context.WithTimeout(
|
|
context.Background(),
|
|
svc.cfg.GetShutdownTimeout(),
|
|
)
|
|
defer shutdownCancel()
|
|
|
|
svc.log.Info("shutting down HTTP server")
|
|
if err := svc.server.Shutdown(shutdownCtx); err != nil {
|
|
svc.log.Warn(
|
|
"shutdown timeout reached",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
|
|
svc.log.Info("terminating")
|
|
return nil
|
|
}
|
|
|
|
func (svc *Service) home(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
if r.URL.Path != "/" {
|
|
http.NotFound(w, r)
|
|
return
|
|
}
|
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
|
if err := svc.tmpl.ExecuteTemplate(
|
|
w, "index.gohtml", nil,
|
|
); err != nil {
|
|
log.ErrorContext(
|
|
ctx, "template error", slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
http.StatusText(http.StatusInternalServerError),
|
|
http.StatusInternalServerError,
|
|
)
|
|
}
|
|
}
|
|
|
|
// status returns server status.
// It answers 200 OK with an empty body, suitable as a liveness probe.
func (svc *Service) status(w http.ResponseWriter, r *http.Request) {
	w.WriteHeader(http.StatusOK)
}
|
|
|
|
// Request is the incoming request format for chat and voice endpoints.
type Request struct {
	// Audio is base64-encoded audio data (webm) for transcription.
	// Optional; when set, the last message must have role=user.
	Audio string `json:"audio,omitempty"`
	// Messages is the conversation history. Required; must be non-empty.
	Messages []openai.ChatCompletionMessage `json:"messages"`
	// Model is the LLM model ID. If empty, the default model is used.
	Model string `json:"model,omitempty"`
	// Voice is the voice ID for TTS. If empty, a voice may be
	// auto-selected from the detected input language.
	Voice string `json:"voice,omitempty"`
}
|
|
|
|
// Response is the response format for chat and voice endpoints.
type Response struct {
	// Audio is the base64-encoded WAV audio response.
	Audio string `json:"audio,omitempty"`
	// DetectedLanguage is the language detected in the input speech.
	// Empty when no audio was provided or no language was detected.
	DetectedLanguage string `json:"detected_language,omitempty"`
	// Messages is the full list of messages generated during the query,
	// including tool calls and tool results.
	Messages []openai.ChatCompletionMessage `json:"messages,omitempty"`
	// Model is the LLM model used for the response.
	Model string `json:"used_model,omitempty"`
	// Transcription is the transcribed user speech from the input audio.
	Transcription string `json:"transcription,omitempty"`
	// Voice is the voice used for TTS synthesis.
	Voice string `json:"used_voice,omitempty"`
}
|
|
|
|
// voice processes voice requests with audio input/output.
|
|
func (svc *Service) voice(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
// Parse request.
|
|
r.Body = http.MaxBytesReader(w, r.Body, 32<<20)
|
|
var req Request
|
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode request",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid request", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Validate messages.
|
|
if len(req.Messages) == 0 {
|
|
http.Error(w, "messages required", http.StatusBadRequest)
|
|
return
|
|
}
|
|
log.InfoContext(ctx, "messages",
|
|
slog.Any("data", req.Messages),
|
|
)
|
|
|
|
var (
|
|
messages = req.Messages
|
|
transcription string
|
|
detectedLang string
|
|
)
|
|
|
|
// If audio provided, transcribe and append to last message.
|
|
if req.Audio != "" {
|
|
last := &messages[len(messages)-1]
|
|
if last.Role != openai.ChatMessageRoleUser {
|
|
http.Error(
|
|
w,
|
|
"last message must be role=user when audio is provided",
|
|
http.StatusBadRequest,
|
|
)
|
|
return
|
|
}
|
|
|
|
data, err := base64.StdEncoding.DecodeString(req.Audio)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode audio",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid audio", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
output, err := svc.stt.Transcribe(ctx, data)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"STT failed",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"STT error",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
transcription = strings.TrimSpace(output.Text)
|
|
detectedLang = output.DetectedLanguage
|
|
if detectedLang == "" {
|
|
detectedLang = output.Language
|
|
}
|
|
log.InfoContext(
|
|
ctx,
|
|
"transcribed audio",
|
|
slog.String("text", transcription),
|
|
slog.String("language", detectedLang),
|
|
)
|
|
|
|
// Append transcription to last message's content.
|
|
switch {
|
|
// Already using MultiContent, append text part.
|
|
case len(last.MultiContent) > 0:
|
|
last.MultiContent = append(last.MultiContent,
|
|
openai.ChatMessagePart{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
)
|
|
last.Content = ""
|
|
|
|
// Has string content, convert to MultiContent.
|
|
case last.Content != "":
|
|
last.MultiContent = []openai.ChatMessagePart{
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: last.Content,
|
|
},
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
}
|
|
last.Content = ""
|
|
// Empty message, just set content.
|
|
// Clear MultiContent, as required by the API spec.
|
|
default:
|
|
last.Content = transcription
|
|
last.MultiContent = nil
|
|
}
|
|
}
|
|
|
|
// Get LLM response.
|
|
var model = req.Model
|
|
if model == "" {
|
|
model = svc.llm.DefaultModel()
|
|
}
|
|
msgs, err := svc.llm.Query(ctx, messages, model)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"LLM request failed",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "LLM error", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
if len(msgs) == 0 {
|
|
http.Error(
|
|
w,
|
|
"no response from LLM",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
final := msgs[len(msgs)-1]
|
|
log.InfoContext(
|
|
ctx,
|
|
"LLM response",
|
|
slog.String("text", final.Content),
|
|
slog.String("model", model),
|
|
)
|
|
|
|
// Determine voice to use.
|
|
var voice = req.Voice
|
|
if req.Voice == "" && detectedLang != "" {
|
|
if autoVoice := svc.tts.SelectVoice(
|
|
detectedLang,
|
|
); autoVoice != "" {
|
|
voice = autoVoice
|
|
log.InfoContext(ctx, "auto-selected voice",
|
|
slog.String("language", detectedLang),
|
|
slog.String("voice", voice),
|
|
)
|
|
}
|
|
} else if req.Voice == "" {
|
|
log.WarnContext(
|
|
ctx,
|
|
"auto-voice enabled but no language detected",
|
|
)
|
|
}
|
|
|
|
// Generate audio response with selected voice.
|
|
audio, err := svc.tts.Synthesize(ctx, final.Content, voice)
|
|
if err != nil {
|
|
log.ErrorContext(ctx, "TTS failed", slog.Any("error", err))
|
|
http.Error(w, "TTS error", http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
if err := json.NewEncoder(w).Encode(Response{
|
|
Audio: base64.StdEncoding.EncodeToString(audio),
|
|
DetectedLanguage: detectedLang,
|
|
Messages: msgs,
|
|
Model: model,
|
|
Transcription: transcription,
|
|
Voice: voice,
|
|
}); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to json encode response",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
}
|
|
|
|
// StreamMessage is the SSE event payload for the streaming voice endpoint.
type StreamMessage struct {
	// Audio is the base64-encoded WAV audio response.
	// Only populated on the final assistant event.
	Audio string `json:"audio,omitempty"`
	// DetectedLanguage is the language detected in the input speech.
	DetectedLanguage string `json:"detected_language,omitempty"`
	// Error is an error message, if any.
	Error string `json:"error,omitempty"`
	// Message is the chat completion message.
	Message openai.ChatCompletionMessage `json:"message"`
	// Model is the LLM model used for the response.
	Model string `json:"model,omitempty"`
	// Transcription is the transcribed user speech from the input audio.
	Transcription string `json:"transcription,omitempty"`
	// Voice is the voice used for TTS synthesis.
	Voice string `json:"voice,omitempty"`
}
|
|
|
|
// voiceStream processes voice requests with streaming SSE output.
|
|
func (svc *Service) voiceStream(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
// Check that the response writer supports flushing.
|
|
flusher, ok := w.(http.Flusher)
|
|
if !ok {
|
|
http.Error(
|
|
w,
|
|
"streaming not supported",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
// Parse request.
|
|
r.Body = http.MaxBytesReader(w, r.Body, 32<<20)
|
|
var req Request
|
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode request",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid request", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Validate messages.
|
|
if len(req.Messages) == 0 {
|
|
http.Error(w, "messages required", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
// Acquire semaphore.
|
|
if err := svc.sem.Acquire(ctx, 1); err != nil {
|
|
http.Error(
|
|
w,
|
|
"service unavailable",
|
|
http.StatusServiceUnavailable,
|
|
)
|
|
return
|
|
}
|
|
defer svc.sem.Release(1)
|
|
|
|
var (
|
|
messages = req.Messages
|
|
transcription string
|
|
detectedLang string
|
|
)
|
|
|
|
// If audio provided, transcribe and append to last message.
|
|
if req.Audio != "" {
|
|
last := &messages[len(messages)-1]
|
|
if last.Role != openai.ChatMessageRoleUser {
|
|
http.Error(
|
|
w,
|
|
"last message must be role=user when audio is provided",
|
|
http.StatusBadRequest,
|
|
)
|
|
return
|
|
}
|
|
|
|
data, err := base64.StdEncoding.DecodeString(req.Audio)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to decode audio",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(w, "invalid audio", http.StatusBadRequest)
|
|
return
|
|
}
|
|
|
|
output, err := svc.stt.Transcribe(ctx, data)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"STT failed",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"STT error",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
transcription = strings.TrimSpace(output.Text)
|
|
detectedLang = output.DetectedLanguage
|
|
if detectedLang == "" {
|
|
detectedLang = output.Language
|
|
}
|
|
log.InfoContext(
|
|
ctx,
|
|
"transcribed audio",
|
|
slog.String("text", transcription),
|
|
slog.String("language", detectedLang),
|
|
)
|
|
|
|
// Append transcription to last message's content.
|
|
switch {
|
|
case len(last.MultiContent) > 0:
|
|
last.MultiContent = append(last.MultiContent,
|
|
openai.ChatMessagePart{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
)
|
|
last.Content = ""
|
|
case last.Content != "":
|
|
last.MultiContent = []openai.ChatMessagePart{
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: last.Content,
|
|
},
|
|
{
|
|
Type: openai.ChatMessagePartTypeText,
|
|
Text: transcription,
|
|
},
|
|
}
|
|
last.Content = ""
|
|
default:
|
|
last.Content = transcription
|
|
last.MultiContent = nil
|
|
}
|
|
}
|
|
|
|
// Set SSE headers.
|
|
w.Header().Set("Cache-Control", "no-cache")
|
|
w.Header().Set("Connection", "keep-alive")
|
|
w.Header().Set("Content-Type", "text/event-stream")
|
|
|
|
// Helper to send an SSE event.
|
|
send := func(msg StreamMessage) {
|
|
data, err := json.Marshal(msg)
|
|
if err != nil {
|
|
log.ErrorContext(ctx, "failed to marshal SSE event",
|
|
slog.Any("error", err),
|
|
)
|
|
return
|
|
}
|
|
fmt.Fprintf(w, "event: message\ndata: %s\n\n", data)
|
|
flusher.Flush()
|
|
}
|
|
|
|
// If audio was transcribed, send user message with transcription.
|
|
if transcription != "" {
|
|
send(StreamMessage{
|
|
Message: openai.ChatCompletionMessage{
|
|
Role: openai.ChatMessageRoleUser,
|
|
Content: transcription,
|
|
},
|
|
Transcription: transcription,
|
|
DetectedLanguage: detectedLang,
|
|
})
|
|
}
|
|
|
|
// Get model.
|
|
var model = req.Model
|
|
if model == "" {
|
|
model = svc.llm.DefaultModel()
|
|
}
|
|
|
|
// Determine voice to use.
|
|
var voice = req.Voice
|
|
if req.Voice == "" && detectedLang != "" {
|
|
if autoVoice := svc.tts.SelectVoice(
|
|
detectedLang,
|
|
); autoVoice != "" {
|
|
voice = autoVoice
|
|
log.InfoContext(ctx, "auto-selected voice",
|
|
slog.String("language", detectedLang),
|
|
slog.String("voice", voice),
|
|
)
|
|
}
|
|
}
|
|
|
|
// Start streaming LLM query.
|
|
var (
|
|
events = make(chan llm.StreamEvent)
|
|
llmErr error
|
|
)
|
|
go func() {
|
|
llmErr = svc.llm.QueryStream(ctx, messages, model, events)
|
|
}()
|
|
|
|
// Consume events and send as SSE.
|
|
var last StreamMessage
|
|
for evt := range events {
|
|
msg := StreamMessage{Message: evt.Message}
|
|
|
|
// Track the last assistant message for TTS.
|
|
if evt.Message.Role == openai.ChatMessageRoleAssistant &&
|
|
len(evt.Message.ToolCalls) == 0 {
|
|
last = msg
|
|
continue
|
|
}
|
|
|
|
send(msg)
|
|
}
|
|
// Check for LLM errors.
|
|
if llmErr != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"LLM stream failed",
|
|
slog.Any("error", llmErr),
|
|
)
|
|
send(StreamMessage{
|
|
Message: openai.ChatCompletionMessage{
|
|
Role: openai.ChatMessageRoleAssistant,
|
|
},
|
|
Error: fmt.Sprintf("LLM error: %v", llmErr),
|
|
})
|
|
return
|
|
}
|
|
|
|
// Synthesize TTS for the final assistant message.
|
|
if last.Message.Content != "" {
|
|
audio, err := svc.tts.Synthesize(
|
|
ctx, last.Message.Content, voice,
|
|
)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx, "TTS failed", slog.Any("error", err),
|
|
)
|
|
last.Error = fmt.Sprintf("TTS error: %v", err)
|
|
} else {
|
|
last.Audio = base64.StdEncoding.EncodeToString(audio)
|
|
}
|
|
}
|
|
last.Model = model
|
|
last.Voice = voice
|
|
send(last)
|
|
}
|
|
|
|
// models returns available LLM models.
|
|
func (svc *Service) models(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
models, err := svc.llm.ListModels(ctx)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to list models",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"failed to list models",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
if err := json.NewEncoder(w).Encode(struct {
|
|
Models []openai.Model `json:"models"`
|
|
DefaultModel string `json:"default_model"`
|
|
}{
|
|
Models: models,
|
|
DefaultModel: svc.llm.DefaultModel(),
|
|
}); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to encode models response",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
}
|
|
|
|
// voices returns available TTS voices.
|
|
func (svc *Service) voices(w http.ResponseWriter, r *http.Request) {
|
|
var (
|
|
ctx = r.Context()
|
|
log = ctx.Value("log").(*slog.Logger)
|
|
)
|
|
|
|
voices, err := svc.tts.ListVoices(ctx)
|
|
if err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to list voices",
|
|
slog.Any("error", err),
|
|
)
|
|
http.Error(
|
|
w,
|
|
"failed to list voices",
|
|
http.StatusInternalServerError,
|
|
)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
if err := json.NewEncoder(w).Encode(map[string][]string{
|
|
"voices": voices,
|
|
}); err != nil {
|
|
log.ErrorContext(
|
|
ctx,
|
|
"failed to encode voices response",
|
|
slog.Any("error", err),
|
|
)
|
|
}
|
|
}
|