Spaces:
Sleeping
Sleeping
File size: 9,331 Bytes
5094515 0eda9c2 5837391 5094515 0eda9c2 be9a4dd 5837391 0eda9c2 5837391 0eda9c2 be9a4dd 0eda9c2 5094515 5837391 5094515 5837391 5094515 0eda9c2 5837391 0eda9c2 5837391 0eda9c2 5837391 0eda9c2 5837391 5094515 0eda9c2 be9a4dd 0eda9c2 be9a4dd 0eda9c2 5837391 0eda9c2 5837391 0eda9c2 5837391 0eda9c2 5837391 0eda9c2 5094515 0eda9c2 5094515 0eda9c2 5094515 0eda9c2 be9a4dd 0eda9c2 be9a4dd 0eda9c2 be9a4dd 0eda9c2 be9a4dd 0eda9c2 be9a4dd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 | import asyncio
import os
import tempfile
from typing import Tuple
from collections import OrderedDict
from functools import wraps
# Loaded TTS model objects keyed by model name; filled by _get_tts_model().
_MODEL_CACHE = {}
# Held by _get_tts_model() while it checks the cache and loads a model.
_MODEL_LOCK = asyncio.Lock()
# Synthesized WAV bytes keyed by "<gender>::<text>"; insertion order is used
# as the LRU order (most recently used entries are moved to the end).
_AUDIO_CACHE = OrderedDict()
# Guards reads and writes of _AUDIO_CACHE.
_AUDIO_CACHE_LOCK = asyncio.Lock()
# Held by synthesize_wav() for the whole synthesis of a cache miss.
_SYNTHESIZE_LOCK = asyncio.Lock()
# Set to True once torch.load has been wrapped by
# _ensure_torch_load_compat_for_xtts(), so the wrap happens at most once.
_TORCH_LOAD_PATCHED = False
# Model name passed to _get_tts_model() for the primary XTTS voice.
XTTS_MODEL = "tts_models/multilingual/multi-dataset/xtts_v2"
# Language value passed to tts_to_file() for XTTS synthesis.
XTTS_LANGUAGE = "en"
# Speed value passed to tts_to_file() for XTTS synthesis.
XTTS_SPEED = 1.2
# Warm-up bookkeeping maintained by warmup_xtts_model() and exposed through
# get_xtts_warmup_state().
_XTTS_WARM = False
_XTTS_LAST_ERROR: str | None = None
# Maximum number of entries kept in _AUDIO_CACHE before the oldest are evicted.
AUDIO_CACHE_MAX_ITEMS = 300
def _resolve_xtts_max_text_length() -> int:
"""0 disables truncation so full question text is spoken."""
try:
return max(0, int(os.getenv("XTTS_MAX_TEXT_LENGTH", "0")))
except Exception:
return 0
XTTS_MAX_TEXT_LENGTH = _resolve_xtts_max_text_length()
# Maps a normalized voice-gender key to the XTTS speaker name used for it.
# User-approved stable voices:
# - Female: index 45 => Alexandra Hisakawa
# - Male: index 21 => Abrahan Mack
# The "auto" key resolves to the same speaker as "female".
XTTS_SPEAKER_BY_GENDER = {
"female": "Alexandra Hisakawa",
"male": "Abrahan Mack",
"auto": "Alexandra Hisakawa",
}
def _resolve_xtts_checkpoint_trust() -> bool:
"""Enable trusted local checkpoint loading compatibility by default."""
value = os.getenv("XTTS_TRUSTED_CHECKPOINTS", "1").strip().lower()
return value in {"1", "true", "yes", "on"}
def _ensure_torch_load_compat_for_xtts() -> None:
    """Wrap ``torch.load`` once so it defaults to ``weights_only=False``.

    Intended for PyTorch 2.6+ when loading trusted XTTS checkpoints. The
    function is a no-op when the wrap was already installed, when
    ``_resolve_xtts_checkpoint_trust()`` returns False, when ``torch`` cannot
    be imported, or when ``torch.load`` is missing or not callable.

    NOTE(security): the wrapper replaces the ``torch.load`` attribute itself,
    so the ``weights_only=False`` default applies to every later
    ``torch.load`` call in this process that does not pass ``weights_only``
    explicitly. Full unpickling must only be used on trusted checkpoint files.
    """
    global _TORCH_LOAD_PATCHED

    if _TORCH_LOAD_PATCHED:
        return
    if not _resolve_xtts_checkpoint_trust():
        return

    try:
        import torch
    except Exception:
        # Without torch there is nothing to patch.
        return

    stock_load = getattr(torch, "load", None)
    if not callable(stock_load):
        return

    @wraps(stock_load)
    def _load_with_full_unpickle(*args, **kwargs):
        # Coqui XTTS checkpoints require full object unpickling on newer PyTorch.
        # Only the default changes: an explicit weights_only from the caller wins.
        kwargs.setdefault("weights_only", False)
        return stock_load(*args, **kwargs)

    torch.load = _load_with_full_unpickle
    _TORCH_LOAD_PATCHED = True
def _select_model(voice_gender: str) -> Tuple[str, str | None]:
gender = (voice_gender or "female").strip().lower()
if gender == "male":
# Multi-speaker model; use a male VCTK speaker token.
return "tts_models/en/vctk/vits", "p226"
# Default female-like English voice model.
return "tts_models/en/ljspeech/tacotron2-DDC", None
async def _get_tts_model(model_name: str):
    """Return the TTS model for ``model_name``, loading it on first use.

    The cache lookup and the load both run while ``_MODEL_LOCK`` is held, and
    the load itself is pushed to a worker thread with ``asyncio.to_thread``.

    Args:
        model_name: Coqui TTS model identifier.

    Returns:
        The cached or freshly loaded TTS object.

    Raises:
        RuntimeError: If ``TTS.api`` cannot be imported.
    """

    def _wants_gpu() -> bool:
        # XTTS_USE_GPU: explicit on/off values win; anything else ("auto")
        # defers to torch.cuda.is_available().
        setting = os.getenv("XTTS_USE_GPU", "auto").strip().lower()
        if setting in {"1", "true", "yes", "on"}:
            return True
        if setting in {"0", "false", "no", "off"}:
            return False
        try:
            import torch

            return bool(torch.cuda.is_available())
        except Exception:
            return False

    def _load_model():
        _ensure_torch_load_compat_for_xtts()
        try:
            from TTS.api import TTS
        except Exception as exc:
            raise RuntimeError(
                "Coqui TTS is not installed in the active Python environment"
            ) from exc

        on_gpu = _wants_gpu()

        # TTS(..., gpu=...) is deprecated upstream. Load once, then move model.
        engine = TTS(model_name=model_name, progress_bar=False)
        if on_gpu:
            try:
                engine.to("cuda")
                return engine
            except Exception:
                # Graceful CPU fallback when CUDA runtime is unavailable/mismatched.
                pass
        try:
            engine.to("cpu")
        except Exception:
            # Best effort: the model is still returned if the move fails.
            pass
        return engine

    async with _MODEL_LOCK:
        if model_name in _MODEL_CACHE:
            return _MODEL_CACHE[model_name]
        loaded = await asyncio.to_thread(_load_model)
        _MODEL_CACHE[model_name] = loaded
        return loaded
def _resolve_xtts_speaker(voice_gender: str) -> str:
    """Map a voice gender to its XTTS speaker name.

    Args:
        voice_gender: Requested gender; trimmed and lower-cased before lookup.
            Falsy or unknown values use the ``"female"`` entry.

    Returns:
        The speaker name from ``XTTS_SPEAKER_BY_GENDER``.
    """
    key = (voice_gender or "female").strip().lower()
    return XTTS_SPEAKER_BY_GENDER.get(key, XTTS_SPEAKER_BY_GENDER["female"])
def _normalize_text_for_speech(value: str, max_length: int = XTTS_MAX_TEXT_LENGTH) -> str:
content = " ".join((value or "").strip().split())
if max_length <= 0:
return content
if len(content) <= max_length:
return content
trimmed = content[:max_length].rstrip()
# Keep sentence boundaries cleaner when truncating.
for marker in ("?", "!", "."):
if marker in trimmed:
head = trimmed.rsplit(marker, 1)[0].strip()
if len(head) >= max_length // 2:
return f"{head}{marker}"
return trimmed
async def warmup_xtts_model() -> bool:
    """Preload XTTS to avoid long cold-start on first interview question.

    Returns:
        True when the model is (or already was) loaded. False when loading
        raised; the error text is then stored in ``_XTTS_LAST_ERROR`` and no
        exception propagates to the caller.
    """
    global _XTTS_WARM, _XTTS_LAST_ERROR

    if not _XTTS_WARM:
        try:
            await _get_tts_model(XTTS_MODEL)
        except Exception as exc:
            # Keep API startup resilient; routes decide whether to surface this.
            _XTTS_LAST_ERROR = str(exc)
            return False
        _XTTS_WARM = True
        _XTTS_LAST_ERROR = None
    return True
def get_xtts_warmup_state() -> dict:
    """Return a snapshot of the XTTS warm-up bookkeeping.

    Returns:
        A dict with ``is_warm`` (the current ``_XTTS_WARM`` flag) and
        ``last_error`` (the current ``_XTTS_LAST_ERROR`` value).
    """
    snapshot = dict(is_warm=_XTTS_WARM, last_error=_XTTS_LAST_ERROR)
    return snapshot
def _synthesize_xtts_to_file(tts, text: str, speaker: str, file_path: str) -> None:
    """Write XTTS speech for ``text`` to ``file_path``.

    Args:
        tts: Object exposing ``tts_to_file(**kwargs)``.
        text: Text to speak.
        speaker: Speaker name passed through to the model.
        file_path: Destination path for the audio file.

    The call is first made with ``speed=XTTS_SPEED``; if that raises
    ``TypeError`` the call is repeated once without the ``speed`` argument.
    """
    base_args = dict(
        text=text,
        file_path=file_path,
        speaker=speaker,
        language=XTTS_LANGUAGE,
    )
    try:
        # Faster delivery for interview prompts.
        tts.tts_to_file(**base_args, speed=XTTS_SPEED)
    except TypeError:
        # Some model/runtime combinations may not expose speed arg.
        tts.tts_to_file(**base_args)
def _build_audio_cache_key(text: str, voice_gender: str) -> str:
return f"{(voice_gender or 'female').strip().lower()}::{text.strip()}"
async def _get_cached_audio(cache_key: str) -> bytes | None:
    """Look up cached audio and mark the entry as most recently used.

    Args:
        cache_key: Key as produced by ``_build_audio_cache_key``.

    Returns:
        The cached bytes, or ``None`` when there is no entry for the key.
    """
    async with _AUDIO_CACHE_LOCK:
        entry = _AUDIO_CACHE.get(cache_key)
        if entry is not None:
            # LRU touch.
            _AUDIO_CACHE.move_to_end(cache_key)
        return entry
async def _set_cached_audio(cache_key: str, data: bytes) -> None:
    """Store audio in the cache and evict the oldest entries beyond the cap.

    Args:
        cache_key: Key as produced by ``_build_audio_cache_key``.
        data: Audio bytes to store.
    """
    async with _AUDIO_CACHE_LOCK:
        _AUDIO_CACHE[cache_key] = data
        # Re-inserting an existing key keeps its old position, so move it to
        # the most-recently-used end explicitly.
        _AUDIO_CACHE.move_to_end(cache_key)
        overflow = len(_AUDIO_CACHE) - AUDIO_CACHE_MAX_ITEMS
        for _ in range(overflow):
            # Drop from the least-recently-used end.
            _AUDIO_CACHE.popitem(last=False)
async def _synthesize_fallback_wav(text: str, voice_gender: str) -> bytes:
    """Synthesize ``text`` with the model chosen by ``_select_model``.

    Args:
        text: Text to speak.
        voice_gender: Gender used to pick the model and optional speaker.

    Returns:
        The bytes of the generated ``.wav`` file. The temporary file is
        removed whether or not synthesis succeeds.
    """
    model_name, speaker = _select_model(voice_gender)
    engine = await _get_tts_model(model_name)

    # mkstemp returns an open descriptor; close it so the model can write to
    # the path itself.
    handle, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(handle)

    def _render() -> None:
        call_args = {"text": text, "file_path": wav_path}
        if speaker:
            call_args["speaker"] = speaker
        engine.tts_to_file(**call_args)

    try:
        await asyncio.to_thread(_render)
        with open(wav_path, "rb") as wav_file:
            return wav_file.read()
    finally:
        if os.path.exists(wav_path):
            os.remove(wav_path)
async def prefetch_wav(text: str, voice_gender: str = "female") -> None:
    """Best-effort speech prefetch to warm audio cache.

    Runs ``synthesize_wav`` and discards both its result and any
    ``Exception`` it raises.

    Args:
        text: Text to synthesize ahead of time.
        voice_gender: Voice gender forwarded to ``synthesize_wav``.
    """
    try:
        await synthesize_wav(text, voice_gender)
    except Exception:
        # Silent prefetch failure; runtime synth may still succeed later.
        return
async def synthesize_wav(text: str, voice_gender: str = "female") -> bytes:
    """Return WAV bytes for ``text``, using the audio cache when possible.

    On a cache miss the XTTS model is tried first; if XTTS synthesis or
    reading its output raises, ``_synthesize_fallback_wav`` is used instead.
    Either result is stored in the audio cache under the same key. Loading
    the XTTS model happens before the fallback guard, so a load failure
    propagates to the caller.

    Args:
        text: Text to speak; normalized with ``_normalize_text_for_speech``.
        voice_gender: ``"male"``, ``"female"`` or ``"auto"`` (case and
            surrounding whitespace ignored); anything else becomes
            ``"female"``.

    Returns:
        The synthesized audio bytes.

    Raises:
        ValueError: If the normalized text is empty.
    """
    spoken_text = _normalize_text_for_speech(text)
    if not spoken_text:
        raise ValueError("text is required")

    gender = (voice_gender or "female").strip().lower()
    if gender not in {"male", "female", "auto"}:
        gender = "female"

    key = _build_audio_cache_key(spoken_text, gender)
    hit = await _get_cached_audio(key)
    if hit:
        return hit

    async with _SYNTHESIZE_LOCK:
        # Recheck cache after waiting for lock in case another request already synthesized it.
        hit = await _get_cached_audio(key)
        if hit:
            return hit

        xtts_speaker = _resolve_xtts_speaker(gender)
        xtts = await _get_tts_model(XTTS_MODEL)

        handle, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(handle)

        def _render() -> None:
            _synthesize_xtts_to_file(
                xtts,
                text=spoken_text,
                speaker=xtts_speaker,
                file_path=wav_path,
            )

        try:
            await asyncio.to_thread(_render)
            with open(wav_path, "rb") as wav_file:
                audio = wav_file.read()
            await _set_cached_audio(key, audio)
            return audio
        except Exception:
            # Keep speech available even if XTTS runtime has temporary issues.
            audio = await _synthesize_fallback_wav(spoken_text, gender)
            await _set_cached_audio(key, audio)
            return audio
        finally:
            # Runs after either path, including when the fallback itself raises.
            if os.path.exists(wav_path):
                os.remove(wav_path)
|