github-actions[bot] committed on
Commit
80f0ab9
·
1 Parent(s): 0b16c0f

deploy: switch to chatterbox requirements @ 21354c9

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. pipeline.py +6 -4
  3. server.py +1 -1
  4. steps/s2_transcribe.py +6 -1
.gitignore CHANGED
@@ -28,3 +28,4 @@ batch_outputs/
28
  social_distributor/.venv/
29
  social_distributor/poster/auth/storage/
30
  social_distributor/debug_*.png
 
 
28
  social_distributor/.venv/
29
  social_distributor/poster/auth/storage/
30
  social_distributor/debug_*.png
31
+ fine_tuning/
pipeline.py CHANGED
@@ -68,7 +68,7 @@ LANGUAGE_CODES = {
68
  def run_pipeline(
69
  video_path: str,
70
  target_language: str = "Spanish",
71
- source_language: str = "en",
72
  output_path: str | None = None,
73
  voice_mode: str = "chatterbox",
74
  preview_event: threading.Event | None = None,
@@ -84,7 +84,9 @@ def run_pipeline(
84
  Args:
85
  video_path: Path to the input video file.
86
  target_language: Target language name (e.g. "Spanish").
87
- source_language: ISO-639-1 code of the source language (default "en").
 
 
88
  output_path: Where to save the output video. Auto-generated if None.
89
  voice_mode: TTS engine to use ("chatterbox" or "omnivoice").
90
  In Space deployments, this must match TTS_ENGINE env var.
@@ -328,8 +330,8 @@ def main():
328
  )
329
  parser.add_argument(
330
  "--source-lang",
331
- default="en",
332
- help="Source language ISO-639-1 code (default: en)",
333
  )
334
  parser.add_argument("--output", default=None, help="Output video path")
335
  parser.add_argument(
 
68
  def run_pipeline(
69
  video_path: str,
70
  target_language: str = "Spanish",
71
+ source_language: str = "auto",
72
  output_path: str | None = None,
73
  voice_mode: str = "chatterbox",
74
  preview_event: threading.Event | None = None,
 
84
  Args:
85
  video_path: Path to the input video file.
86
  target_language: Target language name (e.g. "Spanish").
87
+ source_language: ISO-639-1 code of the source language, or "auto" for
88
+ Whisper to auto-detect (default "auto"). Forcing a wrong code makes
89
+ Whisper silently translate-and-transcribe instead of transcribing.
90
  output_path: Where to save the output video. Auto-generated if None.
91
  voice_mode: TTS engine to use ("chatterbox" or "omnivoice").
92
  In Space deployments, this must match TTS_ENGINE env var.
 
330
  )
331
  parser.add_argument(
332
  "--source-lang",
333
+ default="auto",
334
+ help="Source language ISO-639-1 code or 'auto' to let Whisper detect (default: auto)",
335
  )
336
  parser.add_argument("--output", default=None, help="Output video path")
337
  parser.add_argument(
server.py CHANGED
@@ -642,7 +642,7 @@ async def create_job(
642
  file: Optional[UploadFile] = File(None),
643
  url: Optional[str] = Form(None),
644
  target_language: str = Form("Spanish"),
645
- source_language: str = Form("en"),
646
  voice_mode: str = Form("chatterbox"),
647
  captions: str = Form("true"),
648
  preserve_music: str = Form("false"),
 
642
  file: Optional[UploadFile] = File(None),
643
  url: Optional[str] = Form(None),
644
  target_language: str = Form("Spanish"),
645
+ source_language: str = Form("auto"),
646
  voice_mode: str = Form("chatterbox"),
647
  captions: str = Form("true"),
648
  preserve_music: str = Form("false"),
steps/s2_transcribe.py CHANGED
@@ -182,13 +182,18 @@ def _segments_from_pollinations(audio_path: str, language: str) -> list[dict]:
182
 
183
  with open(audio_path, "rb") as audio_file:
184
  files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
 
 
 
 
185
  data = {
186
  "model": POLLEN_TRANSCRIBE_MODEL,
187
- "language": language,
188
  "response_format": "verbose_json",
189
  "temperature": 0,
190
  "timestamp_granularities[]": "word",
191
  }
 
 
192
  response = requests.post(
193
  POLLINATIONS_URL,
194
  headers=headers,
 
182
 
183
  with open(audio_path, "rb") as audio_file:
184
  files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
185
+ # When the caller passes "auto" (or empty), omit the `language` field so
186
+ # Whisper auto-detects. Forcing a wrong language code makes Whisper
187
+ # silently switch to translate-mode (e.g. Hindi audio + language="en"
188
+ # produces an English translation, not a Hindi transcript).
189
  data = {
190
  "model": POLLEN_TRANSCRIBE_MODEL,
 
191
  "response_format": "verbose_json",
192
  "temperature": 0,
193
  "timestamp_granularities[]": "word",
194
  }
195
+ if language and language.lower() not in ("auto", ""):
196
+ data["language"] = language
197
  response = requests.post(
198
  POLLINATIONS_URL,
199
  headers=headers,