Spaces:
Sleeping
Sleeping
File size: 4,270 Bytes
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
from fastapi.responses import Response
from pydantic import BaseModel
from time import perf_counter
from auth.jwt import get_current_user
from services.tts_service import synthesize_wav, warmup_xtts_model, get_xtts_warmup_state
from services.stt_service import transcribe_audio_bytes, warmup_whisper_model
from services.latency_service import record_latency
router = APIRouter()
class SpeechSynthesisRequest(BaseModel):
    """Request body accepted by the speech synthesis endpoint."""

    # Text that will be passed to the TTS service for synthesis.
    text: str

    # Voice selector forwarded to the TTS service; "female" when omitted.
    voice_gender: str = "female"
@router.get("/health")
async def speech_health(current_user: dict = Depends(get_current_user)):
    """Report that the speech route is reachable and whether XTTS is warm.

    ``current_user`` is resolved through the ``get_current_user`` dependency
    and is otherwise unused by this handler.

    Returns:
        A dict with a fixed ``status``/``service`` pair and ``xtts_ready``,
        the boolean value of the warmup state's ``is_warm`` entry.
    """
    warmup_state = get_xtts_warmup_state()
    is_ready = bool(warmup_state.get("is_warm"))
    return {"status": "ok", "service": "speech", "xtts_ready": is_ready}
@router.post("/warmup")
async def speech_warmup(current_user: dict = Depends(get_current_user)):
    """Warm the XTTS and Whisper models ahead of first use.

    ``current_user`` is resolved through the ``get_current_user`` dependency
    and is otherwise unused by this handler.

    Returns:
        A confirmation dict with ``xtts_ready`` set to ``True``.

    Raises:
        HTTPException: 503 when the XTTS warmup call reports failure; the
            detail carries the warmup state's ``last_error`` when present.
    """
    xtts_ok = await warmup_xtts_model()
    # Whisper is warmed regardless of the XTTS outcome; its result is not inspected.
    await warmup_whisper_model()
    warmup_state = get_xtts_warmup_state()

    if xtts_ok:
        return {
            "status": "ok",
            "message": "speech model warmed",
            "xtts_ready": True,
        }

    reason = warmup_state.get("last_error") or "unknown error"
    raise HTTPException(
        status_code=503,
        detail=f"XTTS warmup failed: {reason}",
    )
@router.post("/synthesize")
async def synthesize_speech(
    request: SpeechSynthesisRequest,
    current_user: dict = Depends(get_current_user),
):
    """Synthesize text to WAV bytes using Coqui TTS models.

    The first synthesis attempt is made directly. If it fails with anything
    other than ``ValueError``, the XTTS model is warmed once and synthesis is
    retried before the request is failed.

    Args:
        request: Body carrying the text and the voice gender to synthesize.
        current_user: Resolved through the ``get_current_user`` dependency;
            not otherwise used here.

    Returns:
        A ``Response`` whose body is the WAV bytes, served as ``audio/wav``.

    Raises:
        HTTPException: 400 when the first attempt raises ``ValueError``;
            503 when warmup fails or the retry does not succeed.
    """
    # Local import keeps this handler self-contained without touching the
    # module's import block.
    import logging

    log = logging.getLogger(__name__)

    try:
        wav_bytes = await synthesize_wav(request.text, request.voice_gender)
        return Response(content=wav_bytes, media_type="audio/wav")
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        # XTTS may be in cold-start transition; warm once and retry before failing.
        try:
            xtts_ready = await warmup_xtts_model()
            if not xtts_ready:
                state = get_xtts_warmup_state()
                raise HTTPException(
                    status_code=503,
                    detail=f"XTTS warmup failed: {state.get('last_error') or str(e)}",
                )
            wav_bytes = await synthesize_wav(request.text, request.voice_gender)
            return Response(content=wav_bytes, media_type="audio/wav")
        except HTTPException:
            raise
        except Exception as retry_error:
            # The client still sees the original error text, but the retry
            # failure is recorded instead of being dropped.
            log.warning("Speech synthesis retry after warmup failed", exc_info=True)
            raise HTTPException(status_code=503, detail=str(e)) from retry_error
    except Exception as e:
        # Retry once after explicit warmup even for non-RuntimeError failures.
        retry_error = None
        try:
            xtts_ready = await warmup_xtts_model()
            if xtts_ready:
                wav_bytes = await synthesize_wav(request.text, request.voice_gender)
                return Response(content=wav_bytes, media_type="audio/wav")
        except Exception as exc:
            # Best-effort retry: keep going to the 503 below, but log the
            # failure rather than swallowing it silently.
            retry_error = exc
            log.warning("Speech synthesis retry after warmup failed", exc_info=True)
        state = get_xtts_warmup_state()
        cause = retry_error if retry_error is not None else e
        raise HTTPException(
            status_code=503,
            detail=f"Speech synthesis backend unavailable: {state.get('last_error') or str(e)}",
        ) from cause
@router.post("/transcribe")
async def transcribe_speech(
    audio: UploadFile = File(...),
    language: str = Form("en"),
    current_user: dict = Depends(get_current_user),
):
    """Transcribe an uploaded audio file and report how long it took.

    Args:
        audio: Uploaded audio file; its full contents are read into memory.
        language: Language value forwarded to the transcription service.
        current_user: Resolved through the ``get_current_user`` dependency;
            not otherwise used here.

    Returns:
        A dict with the transcribed ``text`` and ``stt_ms``, the elapsed
        time in milliseconds rounded to two decimals.

    Raises:
        HTTPException: 400 on ``ValueError``, 503 on ``RuntimeError``, and
            500 on any other exception raised while reading, transcribing,
            or recording latency.
    """
    t0 = perf_counter()
    try:
        audio_bytes = await audio.read()
        # Fall back to a default name when the upload carries no filename.
        upload_name = audio.filename or "speech.webm"
        transcript = await transcribe_audio_bytes(
            audio_bytes=audio_bytes,
            filename=upload_name,
            language=language,
        )
        # The timing covers both the upload read and the transcription call.
        duration_ms = (perf_counter() - t0) * 1000.0
        await record_latency("stt_ms", duration_ms)
        return {"text": transcript, "stt_ms": round(duration_ms, 2)}
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e))
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Speech transcription failed: {str(e)}",
        )
|