File size: 4,270 Bytes
0eda9c2
5094515
 
5837391
5094515
 
5837391
0eda9c2
5837391
5094515
 
 
 
 
 
 
 
 
 
 
 
5837391
 
 
 
 
 
 
5094515
 
0eda9c2
 
 
5837391
 
0eda9c2
5837391
 
 
 
 
 
 
 
 
 
 
 
 
0eda9c2
 
5094515
 
 
 
 
 
 
 
 
 
 
 
be9a4dd
 
5837391
 
 
 
 
 
 
be9a4dd
 
5837391
 
 
be9a4dd
5094515
5837391
 
 
 
 
 
 
 
 
 
 
 
 
 
0eda9c2
 
 
 
 
 
 
 
 
5837391
0eda9c2
 
 
 
 
 
 
5837391
 
 
0eda9c2
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import logging
from time import perf_counter

from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
from fastapi.responses import Response
from pydantic import BaseModel

from auth.jwt import get_current_user
from services.tts_service import synthesize_wav, warmup_xtts_model, get_xtts_warmup_state
from services.stt_service import transcribe_audio_bytes, warmup_whisper_model
from services.latency_service import record_latency

router = APIRouter()


class SpeechSynthesisRequest(BaseModel):
    # Request body accepted by the POST /synthesize route in this module.

    # Text to be spoken; passed unchanged as the first argument of synthesize_wav.
    text: str
    # Voice selector passed as the second argument of synthesize_wav.
    # NOTE(review): the set of accepted values is defined by the TTS service,
    # not here -- confirm against services.tts_service.
    voice_gender: str = "female"


@router.get("/health")
async def speech_health(current_user: dict = Depends(get_current_user)):
    """Report that the speech router is reachable and whether XTTS is warm.

    The ``current_user`` dependency is resolved but its value is not used;
    the response only carries a static status plus the ``is_warm`` flag read
    from the XTTS warmup state, coerced to a boolean.
    """
    _ = current_user
    warmup_state = get_xtts_warmup_state()
    # Missing or falsy "is_warm" is reported as False.
    is_ready = True if warmup_state.get("is_warm") else False
    return dict(status="ok", service="speech", xtts_ready=is_ready)


@router.post("/warmup")
async def speech_warmup(current_user: dict = Depends(get_current_user)):
    """Warm the speech models ahead of the first interview playback.

    Awaits the XTTS warmup first and then the Whisper warmup. Only the XTTS
    result decides the outcome: a falsy result is reported as HTTP 503 with
    the ``last_error`` recorded in the XTTS warmup state (or
    ``unknown error`` when none is recorded).
    """
    _ = current_user
    xtts_ready = await warmup_xtts_model()
    # Whisper is warmed regardless of the XTTS outcome; its return value is not inspected.
    await warmup_whisper_model()
    state = get_xtts_warmup_state()

    if xtts_ready:
        return {
            "status": "ok",
            "message": "speech model warmed",
            "xtts_ready": True,
        }

    raise HTTPException(
        status_code=503,
        detail=f"XTTS warmup failed: {state.get('last_error') or 'unknown error'}",
    )


@router.post("/synthesize")
async def synthesize_speech(
    request: SpeechSynthesisRequest,
    current_user: dict = Depends(get_current_user),
):
    """Synthesize text to WAV bytes using Coqui TTS models.

    A first synthesis attempt is made directly. If it fails with anything
    other than ``ValueError``, the XTTS model is warmed once and synthesis is
    retried a single time before giving up.

    Args:
        request: Body carrying the text and the voice selector.
        current_user: Result of the ``get_current_user`` dependency; resolved
            but not otherwise used by this route.

    Returns:
        A ``Response`` with the WAV bytes and media type ``audio/wav``.

    Raises:
        HTTPException: 400 when the first attempt raises ``ValueError``;
            503 when warmup reports not-ready or the retry fails.
    """
    _ = current_user
    log = logging.getLogger(__name__)

    async def _attempt() -> Response:
        # One synthesis attempt wrapped in the HTTP response the route returns.
        wav_bytes = await synthesize_wav(request.text, request.voice_gender)
        return Response(content=wav_bytes, media_type="audio/wav")

    try:
        return await _attempt()
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        # XTTS may be in cold-start transition; warm once and retry before failing.
        try:
            xtts_ready = await warmup_xtts_model()
            if not xtts_ready:
                state = get_xtts_warmup_state()
                raise HTTPException(
                    status_code=503,
                    detail=f"XTTS warmup failed: {state.get('last_error') or str(e)}",
                )
            return await _attempt()
        except HTTPException:
            raise
        except Exception as retry_error:
            # The client still sees the original error text, but the retry
            # failure is logged and chained so it is not lost.
            log.warning(
                "Speech synthesis retry after XTTS warmup failed", exc_info=True
            )
            raise HTTPException(status_code=503, detail=str(e)) from retry_error
    except Exception as e:
        # Retry once after explicit warmup even for non-RuntimeError failures.
        try:
            xtts_ready = await warmup_xtts_model()
            if xtts_ready:
                return await _attempt()
        except Exception:
            # Best-effort retry: keep falling through to the 503 below, but
            # record why the retry failed instead of discarding it silently.
            log.warning(
                "Speech synthesis retry after XTTS warmup failed", exc_info=True
            )

        state = get_xtts_warmup_state()
        raise HTTPException(
            status_code=503,
            detail=f"Speech synthesis backend unavailable: {state.get('last_error') or str(e)}",
        ) from e


@router.post("/transcribe")
async def transcribe_speech(
    audio: UploadFile = File(...),
    language: str = Form("en"),
    current_user: dict = Depends(get_current_user),
):
    """Transcribe an uploaded audio file and report how long it took.

    The upload is read fully into memory and handed to
    ``transcribe_audio_bytes`` together with its filename (``speech.webm``
    when the upload has none) and the requested language. The elapsed
    wall-clock time in milliseconds is recorded under the ``stt_ms`` latency
    key and returned, rounded to two decimals, next to the transcript.

    Raises:
        HTTPException: 400 for ``ValueError``, 503 for ``RuntimeError`` and
            500 for any other exception raised while reading, transcribing
            or recording latency.
    """
    t_start = perf_counter()
    try:
        raw_audio = await audio.read()
        upload_name = audio.filename or "speech.webm"
        transcript = await transcribe_audio_bytes(
            audio_bytes=raw_audio,
            filename=upload_name,
            language=language,
        )
        # Timing covers reading the upload as well as the transcription itself.
        duration_ms = (perf_counter() - t_start) * 1000.0
        await record_latency("stt_ms", duration_ms)
        result = {"text": transcript, "stt_ms": round(duration_ms, 2)}
        return result
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Speech transcription failed: {str(e)}")