Spaces:

Rafii
/

videovoice

Running on Zero

github-actions[bot] committed 11 days ago

Commit

80f0ab9

1 Parent(s): 0b16c0f

deploy: switch to chatterbox requirements @ 21354c9

Files changed (4) hide show

.gitignore CHANGED Viewed

@@ -28,3 +28,4 @@ batch_outputs/
 social_distributor/.venv/
 social_distributor/poster/auth/storage/
 social_distributor/debug_*.png

 social_distributor/.venv/
 social_distributor/poster/auth/storage/
 social_distributor/debug_*.png
+fine_tuning/

pipeline.py CHANGED Viewed

@@ -68,7 +68,7 @@ LANGUAGE_CODES = {
 def run_pipeline(
     video_path: str,
     target_language: str = "Spanish",
-    source_language: str = "en",
     output_path: str | None = None,
     voice_mode: str = "chatterbox",
     preview_event: threading.Event | None = None,
@@ -84,7 +84,9 @@ def run_pipeline(
     Args:
         video_path: Path to the input video file.
         target_language: Target language name (e.g. "Spanish").
-        source_language: ISO-639-1 code of the source language (default "en").
         output_path: Where to save the output video. Auto-generated if None.
         voice_mode: TTS engine to use ("chatterbox" or "omnivoice").
                     In Space deployments, this must match TTS_ENGINE env var.
@@ -328,8 +330,8 @@ def main():
     )
     parser.add_argument(
         "--source-lang",
-        default="en",
-        help="Source language ISO-639-1 code (default: en)",
     )
     parser.add_argument("--output", default=None, help="Output video path")
     parser.add_argument(

 def run_pipeline(
     video_path: str,
     target_language: str = "Spanish",
+    source_language: str = "auto",
     output_path: str | None = None,
     voice_mode: str = "chatterbox",
     preview_event: threading.Event | None = None,
     Args:
         video_path: Path to the input video file.
         target_language: Target language name (e.g. "Spanish").
+        source_language: ISO-639-1 code of the source language, or "auto" for
+            Whisper to auto-detect (default "auto"). Forcing a wrong code makes
+            Whisper silently translate-and-transcribe instead of transcribing.
         output_path: Where to save the output video. Auto-generated if None.
         voice_mode: TTS engine to use ("chatterbox" or "omnivoice").
                     In Space deployments, this must match TTS_ENGINE env var.
     )
     parser.add_argument(
         "--source-lang",
+        default="auto",
+        help="Source language ISO-639-1 code or 'auto' to let Whisper detect (default: auto)",
     )
     parser.add_argument("--output", default=None, help="Output video path")
     parser.add_argument(

server.py CHANGED Viewed

@@ -642,7 +642,7 @@ async def create_job(
     file: Optional[UploadFile] = File(None),
     url: Optional[str] = Form(None),
     target_language: str = Form("Spanish"),
-    source_language: str = Form("en"),
     voice_mode: str = Form("chatterbox"),
     captions: str = Form("true"),
     preserve_music: str = Form("false"),

     file: Optional[UploadFile] = File(None),
     url: Optional[str] = Form(None),
     target_language: str = Form("Spanish"),
+    source_language: str = Form("auto"),
     voice_mode: str = Form("chatterbox"),
     captions: str = Form("true"),
     preserve_music: str = Form("false"),

steps/s2_transcribe.py CHANGED Viewed

@@ -182,13 +182,18 @@ def _segments_from_pollinations(audio_path: str, language: str) -> list[dict]:
     with open(audio_path, "rb") as audio_file:
         files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
         data = {
             "model": POLLEN_TRANSCRIBE_MODEL,
-            "language": language,
             "response_format": "verbose_json",
             "temperature": 0,
             "timestamp_granularities[]": "word",
         }
         response = requests.post(
             POLLINATIONS_URL,
             headers=headers,

     with open(audio_path, "rb") as audio_file:
         files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
+        # When the caller passes "auto" (or empty), omit the `language` field so
+        # Whisper auto-detects. Forcing a wrong language code makes Whisper
+        # silently switch to translate-mode (e.g. Hindi audio + language="en"
+        # produces an English translation, not a Hindi transcript).
         data = {
             "model": POLLEN_TRANSCRIBE_MODEL,
             "response_format": "verbose_json",
             "temperature": 0,
             "timestamp_granularities[]": "word",
         }
+        if language and language.lower() not in ("auto", ""):
+            data["language"] = language
         response = requests.post(
             POLLINATIONS_URL,
             headers=headers,