Spaces:
Sleeping
Sleeping
| import asyncio | |
| import os | |
| import tempfile | |
# On Windows, ctranslate2 and torch can load separate OpenMP runtimes.
# Allowing duplicates avoids process aborts during model initialization.
# setdefault() leaves a value that is already present in the environment untouched.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

# Loaded models, keyed by "<model_size>|<device>|<compute_type>"
# (the key format is built in _get_whisper_model).
_WHISPER_MODEL_CACHE = {}
# Held by _get_whisper_model while it consults the cache.
_WHISPER_MODEL_LOCK = asyncio.Lock()
# Set to True by _force_whisper_cpu_mode; once set, _resolve_device returns "cpu".
_WHISPER_RUNTIME_FORCE_CPU = False
# str() of the most recent error handed to _force_whisper_cpu_mode, if any.
_WHISPER_LAST_ERROR: str | None = None
| def _is_cuda_runtime_error(error: Exception) -> bool: | |
| message = str(error or "").strip().lower() | |
| if not message: | |
| return False | |
| markers = ( | |
| "cublas64_12.dll", | |
| "cublas", | |
| "cudnn", | |
| "libcudart", | |
| "cuda", | |
| "ctranslate2", | |
| "failed to load library", | |
| "cannot be loaded", | |
| ) | |
| return any(marker in message for marker in markers) | |
def _force_whisper_cpu_mode(reason: Exception | None = None) -> None:
    """Switch the module into CPU-only mode and forget cached CUDA models.

    Sets the module-level force-CPU flag, records ``str(reason)`` as the last
    error when a reason is supplied, and removes every cache entry whose key
    contains ``"|cuda|"``.

    Args:
        reason: The exception that triggered the switch, if there was one.
    """
    global _WHISPER_RUNTIME_FORCE_CPU, _WHISPER_LAST_ERROR

    if reason is not None:
        _WHISPER_LAST_ERROR = str(reason)
    _WHISPER_RUNTIME_FORCE_CPU = True

    # Collect the keys first so the dict is not mutated while being iterated.
    cuda_keys = [cache_key for cache_key in _WHISPER_MODEL_CACHE if "|cuda|" in cache_key]
    for cache_key in cuda_keys:
        _WHISPER_MODEL_CACHE.pop(cache_key, None)
| def _has_cuda_device_via_ctranslate2() -> bool: | |
| try: | |
| import ctranslate2 | |
| return ctranslate2.get_cuda_device_count() > 0 | |
| except Exception: | |
| return False | |
def _resolve_device() -> str:
    """Pick the inference device name, ``"cpu"`` or ``"cuda"``.

    Order of precedence:
      1. the module-level force-CPU flag, which always yields ``"cpu"``;
      2. the ``WHISPER_DEVICE`` environment variable when it is exactly
         ``cpu`` or ``cuda`` (case-insensitive, surrounding whitespace ignored);
      3. the ctranslate2 CUDA probe;
      4. ``torch.cuda.is_available()``, falling back to ``"cpu"`` if torch
         cannot be imported or queried.
    """
    if _WHISPER_RUNTIME_FORCE_CPU:
        return "cpu"

    requested = os.getenv("WHISPER_DEVICE", "auto").strip().lower()
    if requested == "cpu" or requested == "cuda":
        return requested

    # Prefer ctranslate2 probe first because faster-whisper relies on it.
    if _has_cuda_device_via_ctranslate2():
        return "cuda"

    try:
        import torch

        chosen = "cuda" if torch.cuda.is_available() else "cpu"
    except Exception:
        chosen = "cpu"
    return chosen
| def _resolve_compute_type(device: str) -> str: | |
| pref = os.getenv("WHISPER_COMPUTE_TYPE", "auto").strip().lower() | |
| if pref and pref != "auto": | |
| return pref | |
| return "float16" if device == "cuda" else "int8" | |
| def _resolve_model_size() -> str: | |
| # Fast default for real-time interview UX; can be overridden in env. | |
| return os.getenv("WHISPER_MODEL_SIZE", "small.en").strip() or "small.en" | |
| def _resolve_beam_size() -> int: | |
| try: | |
| return max(1, int(os.getenv("WHISPER_BEAM_SIZE", "1"))) | |
| except Exception: | |
| return 1 | |
| def _resolve_best_of() -> int: | |
| try: | |
| return max(1, int(os.getenv("WHISPER_BEST_OF", "1"))) | |
| except Exception: | |
| return 1 | |
| def _resolve_vad_filter() -> bool: | |
| value = os.getenv("WHISPER_VAD_FILTER", "0").strip().lower() | |
| return value in {"1", "true", "yes", "on"} | |
async def _get_whisper_model():
    """Return a cached faster-whisper model, loading it on first use.

    The model size, device and compute type are resolved from the
    environment and combined into the cache key
    ``"<model_size>|<device>|<compute_type>"``. Loading runs in a worker
    thread via ``asyncio.to_thread`` while the module lock is held, so
    concurrent callers do not load the same model twice.

    If constructing the model with the resolved settings fails, a CPU/int8
    model is used instead; when the failure looks like a CUDA runtime error
    on a CUDA device, the module is additionally switched to CPU-only mode.
    The fallback model is cached under the CPU/int8 key as well as the
    requested key, so later calls that resolve to CPU reuse it rather than
    loading a second copy.

    Returns:
        The loaded ``WhisperModel`` instance.

    Raises:
        RuntimeError: If ``faster_whisper`` cannot be imported.
        Exception: Whatever the CPU/int8 fallback construction raises.
    """
    async with _WHISPER_MODEL_LOCK:
        # Resolve while holding the lock so a caller that waited behind a
        # failed CUDA load sees the forced-CPU flag, not a stale "cuda" choice.
        model_size = _resolve_model_size()
        device = _resolve_device()
        compute_type = _resolve_compute_type(device)
        cache_key = f"{model_size}|{device}|{compute_type}"
        fallback_key = f"{model_size}|cpu|int8"

        cached = _WHISPER_MODEL_CACHE.get(cache_key)
        if cached is not None:
            return cached

        def _load_model():
            """Build the model; return ``(model, key describing what was built)``."""
            try:
                from faster_whisper import WhisperModel
            except Exception as exc:
                raise RuntimeError(
                    "faster-whisper is not installed in the active Python environment"
                ) from exc

            try:
                return (
                    WhisperModel(model_size, device=device, compute_type=compute_type),
                    cache_key,
                )
            except Exception as exc:
                if device == "cuda" and _is_cuda_runtime_error(exc):
                    _force_whisper_cpu_mode(exc)
                # Keep service resilient if GPU config mismatches runtime.
                # Reuse an already-loaded CPU model instead of building another.
                existing = _WHISPER_MODEL_CACHE.get(fallback_key)
                if existing is not None:
                    return existing, fallback_key
                return (
                    WhisperModel(model_size, device="cpu", compute_type="int8"),
                    fallback_key,
                )

        model, loaded_key = await asyncio.to_thread(_load_model)
        # Register under the requested key (so an identical request does not
        # retry the failing load) and under the key of what was really loaded.
        _WHISPER_MODEL_CACHE[cache_key] = model
        _WHISPER_MODEL_CACHE[loaded_key] = model
        return model
async def warmup_whisper_model() -> None:
    """Pre-load the Whisper model so the first transcription is not slowed by loading.

    This is best-effort: any failure is logged as a warning (with traceback)
    and otherwise ignored, so the caller never sees an exception from warmup.
    """
    try:
        await _get_whisper_model()
    except Exception:
        # Best-effort warmup only: report the failure instead of hiding it,
        # but never propagate it to the caller.
        import logging

        logging.getLogger(__name__).warning(
            "Whisper model warmup failed; the model will be loaded on first use",
            exc_info=True,
        )
async def transcribe_audio_bytes(audio_bytes: bytes, filename: str = "speech.webm", language: str = "en") -> str:
    """Transcribe an in-memory audio payload to text.

    The bytes are written to a temporary file whose suffix is taken from
    ``filename`` (``.webm`` when there is none), the cached model transcribes
    that file in a worker thread, and the stripped, non-empty segment texts
    are joined with single spaces. If transcription fails with what looks
    like a CUDA runtime error, the module is switched to CPU-only mode and
    the transcription is retried once with the model returned for that mode.
    The temporary file is removed before returning or raising.

    Args:
        audio_bytes: Raw audio content; must be non-empty.
        filename: Original file name, used only for its extension.
        language: Language code passed to the model; blank or ``None`` means
            ``"en"``.

    Returns:
        The transcript, or an empty string if no segment contained text.

    Raises:
        ValueError: If ``audio_bytes`` is empty.
        RuntimeError: If transcription fails with a non-CUDA error, or fails
            again after the CPU fallback.
    """
    if not audio_bytes:
        raise ValueError("audio file is required")

    model = await _get_whisper_model()
    ext = os.path.splitext(filename or "speech.webm")[1] or ".webm"
    target_language = (language or "en").strip().lower() or "en"
    beam_size = _resolve_beam_size()
    best_of = _resolve_best_of()
    vad_filter = _resolve_vad_filter()

    fd, tmp_path = tempfile.mkstemp(suffix=ext)

    def _transcribe(model_instance) -> str:
        """Run the model on the temp file and join the non-empty segment texts."""
        segments, _ = model_instance.transcribe(
            tmp_path,
            language=target_language,
            beam_size=beam_size,
            best_of=best_of,
            vad_filter=vad_filter,
            condition_on_previous_text=False,
            temperature=0.0,
            without_timestamps=True,
        )
        # Iterating the segments here keeps that work inside the worker thread.
        cleaned = ((seg.text or "").strip() for seg in segments)
        return " ".join(piece for piece in cleaned if piece).strip()

    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and reopening the path; doing so inside the try block
        # guarantees the file is cleaned up if the write fails.
        with os.fdopen(fd, "wb") as f:
            f.write(audio_bytes)

        try:
            text = await asyncio.to_thread(_transcribe, model)
        except Exception as exc:
            if not _is_cuda_runtime_error(exc):
                raise RuntimeError(f"Whisper transcription failed: {str(exc)}") from exc

            # Runtime CUDA failures can occur even after successful model construction.
            _force_whisper_cpu_mode(exc)
            cpu_model = await _get_whisper_model()
            try:
                text = await asyncio.to_thread(_transcribe, cpu_model)
            except Exception as retry_exc:
                raise RuntimeError(
                    f"Whisper transcription failed after CPU fallback: {str(retry_exc)}"
                ) from retry_exc
        return text
    finally:
        # Remove directly rather than exists()-then-remove(), which is racy.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass