Spaces:

Rafii
/

videovoice

Running on Zero

App Files Files Community

Rafii committed on Apr 22

Commit

abc7c46

1 Parent(s): fc71180

deploy: switch to chatterbox requirements @ 68ada45

Browse files

Files changed (1) hide show

steps/s2_transcribe.py +63 -14

steps/s2_transcribe.py CHANGED Viewed

@@ -24,20 +24,56 @@ POLLEN_TRANSCRIBE_MODEL = os.getenv("POLLEN_TRANSCRIBE_MODEL", "whisper-large-v3
 MLX_MODEL = os.getenv("MLX_WHISPER_MODEL", "mlx-community/whisper-large-mlx")
 FASTER_WHISPER_MODEL = os.getenv("FASTER_WHISPER_MODEL", "large-v3")
 OPENAI_WHISPER_MODEL = os.getenv("OPENAI_WHISPER_MODEL", "large-v3")
-if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
-    LOCAL_WHISPER_BACKEND = "mlx-whisper"
-elif torch.cuda.is_available():
-    # PyTorch-based path so @spaces.GPU can intercept the CUDA allocation.
-    # faster-whisper uses CTranslate2 which bypasses PyTorch and breaks ZeroGPU.
-    LOCAL_WHISPER_BACKEND = "openai-whisper-cuda"
-else:
-    LOCAL_WHISPER_BACKEND = "faster-whisper-cpu"
 _FASTER_WHISPER_MODELS = {}
 _OPENAI_WHISPER_MODEL = None
 def _extract_words(raw_words: list[dict]) -> list[dict]:
     """Normalise word timestamps into {word, start, end}."""
     output = []
@@ -279,11 +315,13 @@ def _segments_from_openai_whisper(
 def _segments_from_local_backend(audio_path: str, language: str) -> list[dict]:
-    """Dispatch local whisper backend from startup device detection."""
-    if LOCAL_WHISPER_BACKEND == "mlx-whisper":
         return _segments_from_mlx(audio_path, language)
-    if LOCAL_WHISPER_BACKEND == "openai-whisper-cuda":
         print("[s2] Using openai-whisper backend (cuda)...")
         try:
             return _segments_from_openai_whisper(audio_path, language)
@@ -306,6 +344,8 @@ def transcribe(audio_path: str, language: str = "en") -> list[dict]:
     print(f"[s2] Transcribing {audio_path} (lang={language})...")
     segments = None
     # 1. Try Pollinations API first
     try:
@@ -317,21 +357,30 @@ def transcribe(audio_path: str, language: str = "en") -> list[dict]:
             segments = None
     except Exception as exc:
         print(f"[s2] Pollinations error ({exc}) — falling back to local backend.")
         segments = None
     # 2. Try Local Backend (GPU or CPU)
     if segments is None:
         try:
-            print(f"[s2] Trying local backend ({LOCAL_WHISPER_BACKEND})...")
             segments = _segments_from_local_backend(audio_path, language)
             if segments:
                 print(f"[s2] Local backend returned {len(segments)} segments ✓")
         except Exception as exc:
             print(f"[s2] Local backend error ({exc}).")
             segments = None
     if segments is None:
-        raise RuntimeError("Transcription failed on all available backends.")
     before = len(segments)
     segments = _split_oversized_segments(segments)

 MLX_MODEL = os.getenv("MLX_WHISPER_MODEL", "mlx-community/whisper-large-mlx")
 FASTER_WHISPER_MODEL = os.getenv("FASTER_WHISPER_MODEL", "large-v3")
 OPENAI_WHISPER_MODEL = os.getenv("OPENAI_WHISPER_MODEL", "large-v3")
+LOCAL_WHISPER_BACKEND_ENV = "VIDEOVOICE_WHISPER_BACKEND"
+_VALID_LOCAL_BACKENDS = {
+    "mlx-whisper",
+    "openai-whisper-cuda",
+    "faster-whisper-cpu",
+}
 _FASTER_WHISPER_MODELS = {}
 _OPENAI_WHISPER_MODEL = None
+def _running_on_hf_space() -> bool:
+    """Return True when SPACE_ID, SPACE_HOST or HF_SPACE_ID is set and non-empty.
+
+    These environment variables are treated as the signal that the process is
+    running inside a Hugging Face Space. A variable that is unset or set to an
+    empty string does not count.
+    """
+    return bool(
+        os.getenv("SPACE_ID")
+        or os.getenv("SPACE_HOST")
+        or os.getenv("HF_SPACE_ID")
+    )
+def _get_local_whisper_backend() -> str:
+    """
+    Resolve the local transcription backend lazily.
+    On HF Spaces, default to CPU faster-whisper unless explicitly overridden.
+    ZeroGPU can report CUDA availability outside an active @spaces.GPU call,
+    which makes import-time backend selection unreliable.
+
+    Resolution order (first match wins):
+      1. The VIDEOVOICE_WHISPER_BACKEND environment variable, stripped and
+         lower-cased, when it is non-empty.
+      2. "mlx-whisper" when torch reports the MPS backend as available.
+      3. "faster-whisper-cpu" when _running_on_hf_space() is true.
+      4. "openai-whisper-cuda" when torch reports CUDA as available.
+      5. "faster-whisper-cpu" otherwise.
+
+    Returns:
+        One of the names in _VALID_LOCAL_BACKENDS.
+
+    Raises:
+        ValueError: if the override is set but is not a known backend name.
+    """
+    # An explicit override always wins; whitespace and case are normalised first.
+    override = os.getenv(LOCAL_WHISPER_BACKEND_ENV, "").strip().lower()
+    if override:
+        if override not in _VALID_LOCAL_BACKENDS:
+            raise ValueError(
+                f"Invalid {LOCAL_WHISPER_BACKEND_ENV}={override!r}. "
+                f"Expected one of: {', '.join(sorted(_VALID_LOCAL_BACKENDS))}."
+            )
+        return override
+    # The hasattr guard covers torch builds that do not expose torch.backends.mps.
+    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+        return "mlx-whisper"
+    # Checked before CUDA so that a Space never picks the CUDA path by default.
+    if _running_on_hf_space():
+        return "faster-whisper-cpu"
+    if torch.cuda.is_available():
+        # PyTorch-based path so @spaces.GPU can intercept the CUDA allocation.
+        # faster-whisper uses CTranslate2 which bypasses PyTorch and breaks ZeroGPU.
+        return "openai-whisper-cuda"
+    return "faster-whisper-cpu"
 def _extract_words(raw_words: list[dict]) -> list[dict]:
     """Normalise word timestamps into {word, start, end}."""
     output = []
 def _segments_from_local_backend(audio_path: str, language: str) -> list[dict]:
+    """Dispatch local whisper backend from runtime device detection."""
+    backend = _get_local_whisper_backend()
+    if backend == "mlx-whisper":
         return _segments_from_mlx(audio_path, language)
+    if backend == "openai-whisper-cuda":
         print("[s2] Using openai-whisper backend (cuda)...")
         try:
             return _segments_from_openai_whisper(audio_path, language)
     print(f"[s2] Transcribing {audio_path} (lang={language})...")
     segments = None
+    pollinations_error = None
+    local_error = None
     # 1. Try Pollinations API first
     try:
             segments = None
     except Exception as exc:
         print(f"[s2] Pollinations error ({exc}) — falling back to local backend.")
+        pollinations_error = exc
         segments = None
     # 2. Try Local Backend (GPU or CPU)
     if segments is None:
         try:
+            backend = _get_local_whisper_backend()
+            print(f"[s2] Trying local backend ({backend})...")
             segments = _segments_from_local_backend(audio_path, language)
             if segments:
                 print(f"[s2] Local backend returned {len(segments)} segments ✓")
         except Exception as exc:
             print(f"[s2] Local backend error ({exc}).")
+            local_error = exc
             segments = None
     if segments is None:
+        details = []
+        if pollinations_error is not None:
+            details.append(f"Pollinations: {pollinations_error}")
+        if local_error is not None:
+            details.append(f"Local backend: {local_error}")
+        suffix = f" Details: {' | '.join(details)}" if details else ""
+        raise RuntimeError(f"Transcription failed on all available backends.{suffix}")
     before = len(segments)
     segments = _split_oversized_segments(segments)