from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form from fastapi.responses import Response from pydantic import BaseModel from time import perf_counter from auth.jwt import get_current_user from services.tts_service import synthesize_wav, warmup_xtts_model, get_xtts_warmup_state from services.stt_service import transcribe_audio_bytes, warmup_whisper_model from services.latency_service import record_latency router = APIRouter() class SpeechSynthesisRequest(BaseModel): text: str voice_gender: str = "female" @router.get("/health") async def speech_health(current_user: dict = Depends(get_current_user)): """Check whether speech route is available for authenticated users.""" _ = current_user state = get_xtts_warmup_state() return { "status": "ok", "service": "speech", "xtts_ready": bool(state.get("is_warm")), } @router.post("/warmup") async def speech_warmup(current_user: dict = Depends(get_current_user)): """Warm XTTS model so first interview playback does not hit cold-start delay.""" _ = current_user xtts_ready = await warmup_xtts_model() await warmup_whisper_model() state = get_xtts_warmup_state() if not xtts_ready: raise HTTPException( status_code=503, detail=f"XTTS warmup failed: {state.get('last_error') or 'unknown error'}", ) return { "status": "ok", "message": "speech model warmed", "xtts_ready": True, } @router.post("/synthesize") async def synthesize_speech( request: SpeechSynthesisRequest, current_user: dict = Depends(get_current_user), ): """Synthesize text to WAV bytes using Coqui TTS models.""" try: wav_bytes = await synthesize_wav(request.text, request.voice_gender) return Response(content=wav_bytes, media_type="audio/wav") except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) except RuntimeError as e: # XTTS may be in cold-start transition; warm once and retry before failing. try: xtts_ready = await warmup_xtts_model() if not xtts_ready: state = get_xtts_warmup_state() raise HTTPException( status_code=503, detail=f"XTTS warmup failed: {state.get('last_error') or str(e)}", ) wav_bytes = await synthesize_wav(request.text, request.voice_gender) return Response(content=wav_bytes, media_type="audio/wav") except HTTPException: raise except Exception: raise HTTPException(status_code=503, detail=str(e)) except Exception as e: # Retry once after explicit warmup even for non-RuntimeError failures. try: xtts_ready = await warmup_xtts_model() if xtts_ready: wav_bytes = await synthesize_wav(request.text, request.voice_gender) return Response(content=wav_bytes, media_type="audio/wav") except Exception: pass state = get_xtts_warmup_state() raise HTTPException( status_code=503, detail=f"Speech synthesis backend unavailable: {state.get('last_error') or str(e)}", ) @router.post("/transcribe") async def transcribe_speech( audio: UploadFile = File(...), language: str = Form("en"), current_user: dict = Depends(get_current_user), ): """Transcribe uploaded interview audio using Whisper model.""" started_at = perf_counter() try: payload = await audio.read() text = await transcribe_audio_bytes( audio_bytes=payload, filename=audio.filename or "speech.webm", language=language, ) elapsed_ms = (perf_counter() - started_at) * 1000.0 await record_latency("stt_ms", elapsed_ms) return {"text": text, "stt_ms": round(elapsed_ms, 2)} except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) except RuntimeError as e: raise HTTPException(status_code=503, detail=str(e)) except Exception as e: raise HTTPException(status_code=500, detail=f"Speech transcription failed: {str(e)}")