Spaces:

sajith-0701
/

interviewbot

Sleeping

App Files Community

sajith-0701 committed on Apr 5

Commit

0eda9c2

1 Parent(s): d50ee26

v2.1

Browse files

XTTS and Whisper are initialized

Files changed (7) hide show

.gitignore +2 -1
backend/main.py +14 -0
backend/requirements.txt +1 -0
backend/routers/speech.py +34 -2
backend/services/interview_service.py +69 -1
backend/services/stt_service.py +110 -0
backend/services/tts_service.py +160 -6

.gitignore CHANGED Viewed

@@ -11,4 +11,5 @@ dist
 inter
 Resume.pdf
 LANGGRAPH_AND_TOOLS.md
-WORKFLOW.md

 inter
 Resume.pdf
 LANGGRAPH_AND_TOOLS.md
+WORKFLOW.md
+voice_name_list_xtts.txt

backend/main.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from contextlib import asynccontextmanager
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -8,6 +9,8 @@ import os
 from config import get_settings
 from database import connect_db, close_db
 from routers import auth, resume, profile, interview, reports, admin, speech
@@ -19,6 +22,17 @@ async def lifespan(app: FastAPI):
     # Startup
     await connect_db()
     os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
     print(f"🚀 Interview Bot API running in {settings.APP_ENV} mode")
     yield
     # Shutdown

 from contextlib import asynccontextmanager
+import asyncio
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 from config import get_settings
 from database import connect_db, close_db
+from services.tts_service import warmup_xtts_model
+from services.stt_service import warmup_whisper_model
 from routers import auth, resume, profile, interview, reports, admin, speech
     # Startup
     await connect_db()
     os.makedirs(settings.UPLOAD_DIR, exist_ok=True)
+    try:
+        await asyncio.wait_for(warmup_xtts_model(), timeout=45)
+        print("XTTS warmup: ready")
+    except Exception as exc:
+        print(f"XTTS warmup skipped: {exc}")
+    try:
+        await asyncio.wait_for(warmup_whisper_model(), timeout=45)
+        print("Whisper warmup: ready")
+    except Exception as exc:
+        print(f"Whisper warmup skipped: {exc}")
     print(f"🚀 Interview Bot API running in {settings.APP_ENV} mode")
     yield
     # Shutdown

backend/requirements.txt CHANGED Viewed

@@ -16,3 +16,4 @@ python-dotenv==1.0.1
 aiofiles==24.1.0
 pypdf==5.4.0
 python-docx==1.1.2

 aiofiles==24.1.0
 pypdf==5.4.0
 python-docx==1.1.2
+faster-whisper==1.0.3

backend/routers/speech.py CHANGED Viewed

@@ -1,9 +1,10 @@
-from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import Response
 from pydantic import BaseModel
 from auth.jwt import get_current_user
-from services.tts_service import synthesize_wav
 router = APIRouter()
@@ -19,6 +20,14 @@ async def speech_health(current_user: dict = Depends(get_current_user)):
     return {"status": "ok", "service": "speech"}
 @router.post("/synthesize")
 async def synthesize_speech(
     request: SpeechSynthesisRequest,
@@ -34,3 +43,26 @@ async def synthesize_speech(
         raise HTTPException(status_code=503, detail=str(e))
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Speech synthesis failed: {str(e)}")

+from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
 from fastapi.responses import Response
 from pydantic import BaseModel
 from auth.jwt import get_current_user
+from services.tts_service import synthesize_wav, warmup_xtts_model
+from services.stt_service import transcribe_audio_bytes, warmup_whisper_model
 router = APIRouter()
     return {"status": "ok", "service": "speech"}
+@router.post("/warmup")
+async def speech_warmup(current_user: dict = Depends(get_current_user)):
+    """Warm the XTTS and Whisper models so the first interview playback or transcription does not hit a cold-start delay."""
+    await warmup_xtts_model()
+    await warmup_whisper_model()
+    return {"status": "ok", "message": "speech model warmed"}
 @router.post("/synthesize")
 async def synthesize_speech(
     request: SpeechSynthesisRequest,
         raise HTTPException(status_code=503, detail=str(e))
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Speech synthesis failed: {str(e)}")
+@router.post("/transcribe")
+async def transcribe_speech(
+    audio: UploadFile = File(...),
+    language: str = Form("en"),
+    current_user: dict = Depends(get_current_user),
+):
+    """Transcribe uploaded interview audio using Whisper model."""
+    try:
+        payload = await audio.read()
+        text = await transcribe_audio_bytes(
+            audio_bytes=payload,
+            filename=audio.filename or "speech.webm",
+            language=language,
+        )
+        return {"text": text}
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    except RuntimeError as e:
+        raise HTTPException(status_code=503, detail=str(e))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Speech transcription failed: {str(e)}")

backend/services/interview_service.py CHANGED Viewed

@@ -2,12 +2,13 @@ import json
 import asyncio
 from bson import ObjectId
 from database import get_db, get_redis
-from models.collections import SESSIONS, JOB_ROLES, SKILLS, QUESTIONS, TOPICS, TOPIC_QUESTIONS, ROLE_REQUIREMENTS, RESUMES
 from utils.helpers import generate_id, utc_now, str_objectid
 from utils.skills import normalize_skill_list, find_matching_skills, find_missing_skills, build_interview_focus_skills
 from services.interview_graph import run_interview_graph
 from utils.gemini import generate_interview_question_batch, analyze_resume_vs_job_description
 from services.job_description_service import get_job_description_for_user
 MAX_QUESTIONS = 20
 SESSION_TTL = 7200  # 2 hours
@@ -43,6 +44,31 @@ def _safe_int(value, default: int = 0) -> int:
         return default
 def _normalize_bank_difficulty(value: str) -> str:
     difficulty = (value or "medium").strip().lower()
     if difficulty not in {"easy", "medium", "hard"}:
@@ -418,6 +444,13 @@ async def _start_topic_interview(user_id: str, topic_id: str) -> dict:
     session_id = generate_id()
     _LOCAL_SUMMARIES[session_id] = ""
     session_doc = {
         "session_id": session_id,
         "user_id": user_id,
@@ -434,6 +467,7 @@ async def _start_topic_interview(user_id: str, topic_id: str) -> dict:
         "metrics_bank_questions": 0,
         "metrics_bank_shortfall": 0,
         "metrics_generation_batches": 0,
         "timer_enabled": timer_enabled,
         "timer_seconds": timer_seconds,
         "started_at": utc_now(),
@@ -459,6 +493,7 @@ async def _start_topic_interview(user_id: str, topic_id: str) -> dict:
         "timer_enabled": str(timer_enabled),
         "timer_seconds": str(timer_seconds or ""),
         "status": "in_progress",
         "metrics_gemini_calls": 0,
         "metrics_gemini_questions": 0,
         "metrics_bank_questions": 0,
@@ -492,6 +527,14 @@ async def _start_topic_interview(user_id: str, topic_id: str) -> dict:
         await redis.expire(f"session:{session_id}:pending_questions", SESSION_TTL)
     first_q_data = await redis.hgetall(f"session:{session_id}:q:{first_id}")
     return {
         "session_id": session_id,
         "interview_type": "topic",
@@ -598,6 +641,13 @@ async def start_interview(
     db = get_db()
     redis = get_redis()
     # Get user skills
     skills_doc = await db[SKILLS].find_one({"user_id": user_id})
     user_skills = skills_doc.get("skills", ["general"]) if skills_doc else ["general"]
@@ -680,6 +730,7 @@ async def start_interview(
         "metrics_bank_questions": initial_bank_questions,
         "metrics_bank_shortfall": initial_bank_shortfall,
         "metrics_generation_batches": 1,
         "started_at": utc_now(),
     }
     await db[SESSIONS].insert_one(session_doc)
@@ -702,6 +753,7 @@ async def start_interview(
         "current_difficulty": last_difficulty,
         "interview_type": "resume",
         "status": "in_progress",
         "metrics_gemini_calls": initial_gemini_calls,
         "metrics_gemini_questions": initial_gemini_questions,
         "metrics_bank_questions": initial_bank_questions,
@@ -720,6 +772,13 @@ async def start_interview(
         await redis.expire(f"session:{session_id}:pending_questions", SESSION_TTL)
     first_q_data = await redis.hgetall(f"session:{session_id}:q:{first_id}")
     return {
         "session_id": session_id,
@@ -885,6 +944,15 @@ async def submit_answer(session_id: str, question_id: str, answer: str) -> dict:
         raise ValueError("Unable to fetch or generate next question")
     q_data = await redis.hgetall(f"session:{session_id}:q:{next_question_id}")
     next_difficulty = q_data.get("difficulty", session.get("current_difficulty", "medium"))
     new_count = question_count + 1
     new_served_count = served_count + 1

 import asyncio
 from bson import ObjectId
 from database import get_db, get_redis
+from models.collections import SESSIONS, USERS, JOB_ROLES, SKILLS, QUESTIONS, TOPICS, TOPIC_QUESTIONS, ROLE_REQUIREMENTS, RESUMES
 from utils.helpers import generate_id, utc_now, str_objectid
 from utils.skills import normalize_skill_list, find_matching_skills, find_missing_skills, build_interview_focus_skills
 from services.interview_graph import run_interview_graph
 from utils.gemini import generate_interview_question_batch, analyze_resume_vs_job_description
 from services.job_description_service import get_job_description_for_user
+from services.tts_service import prefetch_wav
 MAX_QUESTIONS = 20
 SESSION_TTL = 7200  # 2 hours
         return default
+def _normalize_voice_gender(value: str | None) -> str:
+    return "male" if (value or "").strip().lower() == "male" else "female"
+def _consume_prefetch_task_result(task: asyncio.Task) -> None:
+    try:
+        task.result()
+    except Exception:
+        # Prefetch is optional; ignore failures to avoid noisy task warnings.
+        pass
+def _schedule_question_audio_prefetch(questions: list[str], voice_gender: str) -> None:
+    for q in questions:
+        text = (q or "").strip()
+        if not text:
+            continue
+        try:
+            task = asyncio.create_task(prefetch_wav(text, voice_gender))
+            task.add_done_callback(_consume_prefetch_task_result)
+        except Exception:
+            # Best-effort optimization only.
+            pass
 def _normalize_bank_difficulty(value: str) -> str:
     difficulty = (value or "medium").strip().lower()
     if difficulty not in {"easy", "medium", "hard"}:
     session_id = generate_id()
     _LOCAL_SUMMARIES[session_id] = ""
+    user_doc = None
+    try:
+        user_doc = await db[USERS].find_one({"_id": ObjectId(user_id)}, {"speech_settings": 1})
+    except Exception:
+        user_doc = await db[USERS].find_one({"user_id": user_id}, {"speech_settings": 1})
+    speech_voice_gender = _normalize_voice_gender(((user_doc or {}).get("speech_settings") or {}).get("voice_gender"))
     session_doc = {
         "session_id": session_id,
         "user_id": user_id,
         "metrics_bank_questions": 0,
         "metrics_bank_shortfall": 0,
         "metrics_generation_batches": 0,
+        "speech_voice_gender": speech_voice_gender,
         "timer_enabled": timer_enabled,
         "timer_seconds": timer_seconds,
         "started_at": utc_now(),
         "timer_enabled": str(timer_enabled),
         "timer_seconds": str(timer_seconds or ""),
         "status": "in_progress",
+        "speech_voice_gender": speech_voice_gender,
         "metrics_gemini_calls": 0,
         "metrics_gemini_questions": 0,
         "metrics_bank_questions": 0,
         await redis.expire(f"session:{session_id}:pending_questions", SESSION_TTL)
     first_q_data = await redis.hgetall(f"session:{session_id}:q:{first_id}")
+    _schedule_question_audio_prefetch(
+        [
+            first_q_data.get("question", ""),
+            *[q.get("question", "") for q in selected[1:3]],
+        ],
+        speech_voice_gender,
+    )
     return {
         "session_id": session_id,
         "interview_type": "topic",
     db = get_db()
     redis = get_redis()
+    user_doc = None
+    try:
+        user_doc = await db[USERS].find_one({"_id": ObjectId(user_id)}, {"speech_settings": 1})
+    except Exception:
+        user_doc = await db[USERS].find_one({"user_id": user_id}, {"speech_settings": 1})
+    speech_voice_gender = _normalize_voice_gender(((user_doc or {}).get("speech_settings") or {}).get("voice_gender"))
     # Get user skills
     skills_doc = await db[SKILLS].find_one({"user_id": user_id})
     user_skills = skills_doc.get("skills", ["general"]) if skills_doc else ["general"]
         "metrics_bank_questions": initial_bank_questions,
         "metrics_bank_shortfall": initial_bank_shortfall,
         "metrics_generation_batches": 1,
+        "speech_voice_gender": speech_voice_gender,
         "started_at": utc_now(),
     }
     await db[SESSIONS].insert_one(session_doc)
         "current_difficulty": last_difficulty,
         "interview_type": "resume",
         "status": "in_progress",
+        "speech_voice_gender": speech_voice_gender,
         "metrics_gemini_calls": initial_gemini_calls,
         "metrics_gemini_questions": initial_gemini_questions,
         "metrics_bank_questions": initial_bank_questions,
         await redis.expire(f"session:{session_id}:pending_questions", SESSION_TTL)
     first_q_data = await redis.hgetall(f"session:{session_id}:q:{first_id}")
+    _schedule_question_audio_prefetch(
+        [
+            first_q_data.get("question", ""),
+            *[item.get("question", "") for item in initial_batch[1:4]],
+        ],
+        speech_voice_gender,
+    )
     return {
         "session_id": session_id,
         raise ValueError("Unable to fetch or generate next question")
     q_data = await redis.hgetall(f"session:{session_id}:q:{next_question_id}")
+    speech_voice_gender = _normalize_voice_gender(session.get("speech_voice_gender"))
+    # Prefetch the spoken audio for the next question and for the question at the head of the pending queue.
+    prefetch_texts = [q_data.get("question", "")]
+    peek_next_id = await redis.lindex(f"session:{session_id}:pending_questions", 0)
+    if peek_next_id:
+        peek_q = await redis.hgetall(f"session:{session_id}:q:{peek_next_id}")
+        prefetch_texts.append(peek_q.get("question", ""))
+    _schedule_question_audio_prefetch(prefetch_texts, speech_voice_gender)
     next_difficulty = q_data.get("difficulty", session.get("current_difficulty", "medium"))
     new_count = question_count + 1
     new_served_count = served_count + 1

backend/services/stt_service.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import asyncio
+import os
+import tempfile
+# On Windows, ctranslate2 and torch can load separate OpenMP runtimes.
+# Allowing duplicates avoids process aborts during model initialization.
+os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
+_WHISPER_MODEL_CACHE = {}
+_WHISPER_MODEL_LOCK = asyncio.Lock()
+def _resolve_device() -> str:
+    pref = os.getenv("WHISPER_DEVICE", "auto").strip().lower()
+    if pref in {"cpu", "cuda"}:
+        return pref
+    try:
+        import torch
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    except Exception:
+        return "cpu"
+def _resolve_compute_type(device: str) -> str:
+    pref = os.getenv("WHISPER_COMPUTE_TYPE", "auto").strip().lower()
+    if pref and pref != "auto":
+        return pref
+    return "float16" if device == "cuda" else "int8"
+def _resolve_model_size() -> str:
+    return os.getenv("WHISPER_MODEL_SIZE", "base").strip() or "base"
+async def _get_whisper_model():
+    model_size = _resolve_model_size()
+    device = _resolve_device()
+    compute_type = _resolve_compute_type(device)
+    cache_key = f"{model_size}|{device}|{compute_type}"
+    async with _WHISPER_MODEL_LOCK:
+        if cache_key in _WHISPER_MODEL_CACHE:
+            return _WHISPER_MODEL_CACHE[cache_key]
+        def _load_model():
+            try:
+                from faster_whisper import WhisperModel
+            except Exception as exc:
+                raise RuntimeError(
+                    "faster-whisper is not installed in the active Python environment"
+                ) from exc
+            try:
+                return WhisperModel(model_size, device=device, compute_type=compute_type)
+            except Exception:
+                # Keep service resilient if GPU config mismatches runtime.
+                return WhisperModel(model_size, device="cpu", compute_type="int8")
+        model = await asyncio.to_thread(_load_model)
+        _WHISPER_MODEL_CACHE[cache_key] = model
+        return model
+async def warmup_whisper_model() -> None:
+    try:
+        await _get_whisper_model()
+    except Exception:
+        # Best-effort warmup only.
+        pass
+async def transcribe_audio_bytes(audio_bytes: bytes, filename: str = "speech.webm", language: str = "en") -> str:
+    if not audio_bytes:
+        raise ValueError("audio file is required")
+    model = await _get_whisper_model()
+    ext = os.path.splitext(filename or "speech.webm")[1] or ".webm"
+    target_language = (language or "en").strip().lower() or "en"
+    fd, tmp_path = tempfile.mkstemp(suffix=ext)
+    os.close(fd)
+    try:
+        with open(tmp_path, "wb") as f:
+            f.write(audio_bytes)
+        def _transcribe() -> str:
+            segments, _ = model.transcribe(
+                tmp_path,
+                language=target_language,
+                beam_size=1,
+                best_of=1,
+                vad_filter=True,
+                condition_on_previous_text=False,
+                temperature=0.0,
+            )
+            parts = []
+            for seg in segments:
+                text = (seg.text or "").strip()
+                if text:
+                    parts.append(text)
+            return " ".join(parts).strip()
+        text = await asyncio.to_thread(_transcribe)
+        return text
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)

backend/services/tts_service.py CHANGED Viewed

@@ -2,9 +2,28 @@ import asyncio
 import os
 import tempfile
 from typing import Tuple
 _MODEL_CACHE = {}
 _MODEL_LOCK = asyncio.Lock()
 def _select_model(voice_gender: str) -> Tuple[str, str | None]:
@@ -29,7 +48,27 @@ async def _get_tts_model(model_name: str):
                     "Coqui TTS is not installed in the active Python environment"
                 ) from exc
-            # Use CPU by default for compatibility.
             return TTS(model_name=model_name, progress_bar=False, gpu=False)
         model = await asyncio.to_thread(_load_model)
@@ -37,11 +76,78 @@ async def _get_tts_model(model_name: str):
         return model
-async def synthesize_wav(text: str, voice_gender: str = "female") -> bytes:
-    content = (text or "").strip()
-    if not content:
-        raise ValueError("text is required")
     model_name, speaker = _select_model(voice_gender)
     tts = await _get_tts_model(model_name)
@@ -50,7 +156,7 @@ async def synthesize_wav(text: str, voice_gender: str = "female") -> bytes:
     try:
         def _synthesize():
             kwargs = {
-                "text": content,
                 "file_path": tmp_path,
             }
             if speaker:
@@ -63,3 +169,51 @@ async def synthesize_wav(text: str, voice_gender: str = "female") -> bytes:
     finally:
         if os.path.exists(tmp_path):
             os.remove(tmp_path)

 import os
 import tempfile
 from typing import Tuple
+from collections import OrderedDict
 _MODEL_CACHE = {}
 _MODEL_LOCK = asyncio.Lock()
+_AUDIO_CACHE = OrderedDict()
+_AUDIO_CACHE_LOCK = asyncio.Lock()
+XTTS_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
+XTTS_LANGUAGE = "en"
+XTTS_SPEED = 1.2
+MAX_TEXT_LENGTH = 220
+_XTTS_WARM = False
+AUDIO_CACHE_MAX_ITEMS = 300
+# User-approved stable voices:
+# - Female: index 45 => Alexandra Hisakawa
+# - Male: index 21 => Abrahan Mack
+XTTS_SPEAKER_BY_GENDER = {
+    "female": "Alexandra Hisakawa",
+    "male": "Abrahan Mack",
+    "auto": "Alexandra Hisakawa",
+}
 def _select_model(voice_gender: str) -> Tuple[str, str | None]:
                     "Coqui TTS is not installed in the active Python environment"
                 ) from exc
+            gpu_pref = os.getenv("XTTS_USE_GPU", "auto").strip().lower()
+            use_gpu = False
+            if gpu_pref in {"1", "true", "yes", "on"}:
+                use_gpu = True
+            elif gpu_pref in {"0", "false", "no", "off"}:
+                use_gpu = False
+            else:
+                try:
+                    import torch
+                    use_gpu = bool(torch.cuda.is_available())
+                except Exception:
+                    use_gpu = False
+            if use_gpu:
+                try:
+                    return TTS(model_name=model_name, progress_bar=False, gpu=True)
+                except Exception:
+                    # Graceful CPU fallback when CUDA runtime is unavailable/mismatched.
+                    return TTS(model_name=model_name, progress_bar=False, gpu=False)
             return TTS(model_name=model_name, progress_bar=False, gpu=False)
         model = await asyncio.to_thread(_load_model)
         return model
+def _resolve_xtts_speaker(voice_gender: str) -> str:
+    gender = (voice_gender or "female").strip().lower()
+    if gender not in XTTS_SPEAKER_BY_GENDER:
+        gender = "female"
+    return XTTS_SPEAKER_BY_GENDER[gender]
+def _truncate_text(value: str, max_length: int = MAX_TEXT_LENGTH) -> str:
+    content = " ".join((value or "").strip().split())
+    if len(content) <= max_length:
+        return content
+    trimmed = content[:max_length].rstrip()
+    # Keep sentence boundaries cleaner when truncating.
+    for marker in ("?", "!", "."):
+        if marker in trimmed:
+            head = trimmed.rsplit(marker, 1)[0].strip()
+            if len(head) >= max_length // 2:
+                return f"{head}{marker}"
+    return trimmed
+async def warmup_xtts_model() -> None:
+    """Preload XTTS to avoid long cold-start on first interview question."""
+    global _XTTS_WARM
+    if _XTTS_WARM:
+        return
+    try:
+        await _get_tts_model(XTTS_MODEL)
+        _XTTS_WARM = True
+    except Exception:
+        # Keep API startup resilient; synthesis route still has fallbacks.
+        pass
+def _synthesize_xtts_to_file(tts, text: str, speaker: str, file_path: str) -> None:
+    kwargs = {
+        "text": text,
+        "file_path": file_path,
+        "speaker": speaker,
+        "language": XTTS_LANGUAGE,
+    }
+    try:
+        # Faster delivery for interview prompts.
+        tts.tts_to_file(**kwargs, speed=XTTS_SPEED)
+    except TypeError:
+        # Some model/runtime combinations may not expose speed arg.
+        tts.tts_to_file(**kwargs)
+def _build_audio_cache_key(text: str, voice_gender: str) -> str:
+    return f"{(voice_gender or 'female').strip().lower()}::{text.strip()}"
+async def _get_cached_audio(cache_key: str) -> bytes | None:
+    async with _AUDIO_CACHE_LOCK:
+        value = _AUDIO_CACHE.get(cache_key)
+        if value is None:
+            return None
+        # LRU touch.
+        _AUDIO_CACHE.move_to_end(cache_key)
+        return value
+async def _set_cached_audio(cache_key: str, data: bytes) -> None:
+    async with _AUDIO_CACHE_LOCK:
+        _AUDIO_CACHE[cache_key] = data
+        _AUDIO_CACHE.move_to_end(cache_key)
+        while len(_AUDIO_CACHE) > AUDIO_CACHE_MAX_ITEMS:
+            _AUDIO_CACHE.popitem(last=False)
+async def _synthesize_fallback_wav(text: str, voice_gender: str) -> bytes:
     model_name, speaker = _select_model(voice_gender)
     tts = await _get_tts_model(model_name)
     try:
         def _synthesize():
             kwargs = {
+                "text": text,
                 "file_path": tmp_path,
             }
             if speaker:
     finally:
         if os.path.exists(tmp_path):
             os.remove(tmp_path)
+async def prefetch_wav(text: str, voice_gender: str = "female") -> None:
+    """Best-effort speech prefetch to warm audio cache."""
+    try:
+        await synthesize_wav(text, voice_gender)
+    except Exception:
+        # Silent prefetch failure; runtime synth may still succeed later.
+        pass
+async def synthesize_wav(text: str, voice_gender: str = "female") -> bytes:
+    content = _truncate_text(text)
+    if not content:
+        raise ValueError("text is required")
+    normalized_gender = (voice_gender or "female").strip().lower()
+    if normalized_gender not in {"male", "female", "auto"}:
+        normalized_gender = "female"
+    cache_key = _build_audio_cache_key(content, normalized_gender)
+    cached = await _get_cached_audio(cache_key)
+    if cached:
+        return cached
+    speaker = _resolve_xtts_speaker(normalized_gender)
+    tts = await _get_tts_model(XTTS_MODEL)
+    fd, tmp_path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    try:
+        def _synthesize():
+            _synthesize_xtts_to_file(tts, text=content, speaker=speaker, file_path=tmp_path)
+        try:
+            await asyncio.to_thread(_synthesize)
+            with open(tmp_path, "rb") as f:
+                wav = f.read()
+            await _set_cached_audio(cache_key, wav)
+            return wav
+        except Exception:
+            # Keep speech available even if XTTS runtime has temporary issues.
+            wav = await _synthesize_fallback_wav(content, normalized_gender)
+            await _set_cached_audio(cache_key, wav)
+            return wav
+    finally:
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)