Spaces:
Running on Zero
Running on Zero
github-actions[bot] committed on
Commit ·
80f0ab9
1
Parent(s): 0b16c0f
deploy: switch to chatterbox requirements @ 21354c9
Browse files
- .gitignore +1 -0
- pipeline.py +6 -4
- server.py +1 -1
- steps/s2_transcribe.py +6 -1
.gitignore
CHANGED
|
@@ -28,3 +28,4 @@ batch_outputs/
|
|
| 28 |
social_distributor/.venv/
|
| 29 |
social_distributor/poster/auth/storage/
|
| 30 |
social_distributor/debug_*.png
|
|
|
|
|
|
| 28 |
social_distributor/.venv/
|
| 29 |
social_distributor/poster/auth/storage/
|
| 30 |
social_distributor/debug_*.png
|
| 31 |
+
fine_tuning/
|
pipeline.py
CHANGED
|
@@ -68,7 +68,7 @@ LANGUAGE_CODES = {
|
|
| 68 |
def run_pipeline(
|
| 69 |
video_path: str,
|
| 70 |
target_language: str = "Spanish",
|
| 71 |
-
source_language: str = "
|
| 72 |
output_path: str | None = None,
|
| 73 |
voice_mode: str = "chatterbox",
|
| 74 |
preview_event: threading.Event | None = None,
|
|
@@ -84,7 +84,9 @@ def run_pipeline(
|
|
| 84 |
Args:
|
| 85 |
video_path: Path to the input video file.
|
| 86 |
target_language: Target language name (e.g. "Spanish").
|
| 87 |
-
source_language: ISO-639-1 code of the source language
|
|
|
|
|
|
|
| 88 |
output_path: Where to save the output video. Auto-generated if None.
|
| 89 |
voice_mode: TTS engine to use ("chatterbox" or "omnivoice").
|
| 90 |
In Space deployments, this must match TTS_ENGINE env var.
|
|
@@ -328,8 +330,8 @@ def main():
|
|
| 328 |
)
|
| 329 |
parser.add_argument(
|
| 330 |
"--source-lang",
|
| 331 |
-
default="
|
| 332 |
-
help="Source language ISO-639-1 code (default:
|
| 333 |
)
|
| 334 |
parser.add_argument("--output", default=None, help="Output video path")
|
| 335 |
parser.add_argument(
|
|
|
|
| 68 |
def run_pipeline(
|
| 69 |
video_path: str,
|
| 70 |
target_language: str = "Spanish",
|
| 71 |
+
source_language: str = "auto",
|
| 72 |
output_path: str | None = None,
|
| 73 |
voice_mode: str = "chatterbox",
|
| 74 |
preview_event: threading.Event | None = None,
|
|
|
|
| 84 |
Args:
|
| 85 |
video_path: Path to the input video file.
|
| 86 |
target_language: Target language name (e.g. "Spanish").
|
| 87 |
+
source_language: ISO-639-1 code of the source language, or "auto" for
|
| 88 |
+
Whisper to auto-detect (default "auto"). Forcing a wrong code makes
|
| 89 |
+
Whisper silently translate-and-transcribe instead of transcribing.
|
| 90 |
output_path: Where to save the output video. Auto-generated if None.
|
| 91 |
voice_mode: TTS engine to use ("chatterbox" or "omnivoice").
|
| 92 |
In Space deployments, this must match TTS_ENGINE env var.
|
|
|
|
| 330 |
)
|
| 331 |
parser.add_argument(
|
| 332 |
"--source-lang",
|
| 333 |
+
default="auto",
|
| 334 |
+
help="Source language ISO-639-1 code or 'auto' to let Whisper detect (default: auto)",
|
| 335 |
)
|
| 336 |
parser.add_argument("--output", default=None, help="Output video path")
|
| 337 |
parser.add_argument(
|
server.py
CHANGED
|
@@ -642,7 +642,7 @@ async def create_job(
|
|
| 642 |
file: Optional[UploadFile] = File(None),
|
| 643 |
url: Optional[str] = Form(None),
|
| 644 |
target_language: str = Form("Spanish"),
|
| 645 |
-
source_language: str = Form("
|
| 646 |
voice_mode: str = Form("chatterbox"),
|
| 647 |
captions: str = Form("true"),
|
| 648 |
preserve_music: str = Form("false"),
|
|
|
|
| 642 |
file: Optional[UploadFile] = File(None),
|
| 643 |
url: Optional[str] = Form(None),
|
| 644 |
target_language: str = Form("Spanish"),
|
| 645 |
+
source_language: str = Form("auto"),
|
| 646 |
voice_mode: str = Form("chatterbox"),
|
| 647 |
captions: str = Form("true"),
|
| 648 |
preserve_music: str = Form("false"),
|
steps/s2_transcribe.py
CHANGED
|
@@ -182,13 +182,18 @@ def _segments_from_pollinations(audio_path: str, language: str) -> list[dict]:
|
|
| 182 |
|
| 183 |
with open(audio_path, "rb") as audio_file:
|
| 184 |
files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
data = {
|
| 186 |
"model": POLLEN_TRANSCRIBE_MODEL,
|
| 187 |
-
"language": language,
|
| 188 |
"response_format": "verbose_json",
|
| 189 |
"temperature": 0,
|
| 190 |
"timestamp_granularities[]": "word",
|
| 191 |
}
|
|
|
|
|
|
|
| 192 |
response = requests.post(
|
| 193 |
POLLINATIONS_URL,
|
| 194 |
headers=headers,
|
|
|
|
| 182 |
|
| 183 |
with open(audio_path, "rb") as audio_file:
|
| 184 |
files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
|
| 185 |
+
# When the caller passes "auto" (or empty), omit the `language` field so
|
| 186 |
+
# Whisper auto-detects. Forcing a wrong language code makes Whisper
|
| 187 |
+
# silently switch to translate-mode (e.g. Hindi audio + language="en"
|
| 188 |
+
# produces an English translation, not a Hindi transcript).
|
| 189 |
data = {
|
| 190 |
"model": POLLEN_TRANSCRIBE_MODEL,
|
|
|
|
| 191 |
"response_format": "verbose_json",
|
| 192 |
"temperature": 0,
|
| 193 |
"timestamp_granularities[]": "word",
|
| 194 |
}
|
| 195 |
+
if language and language.lower() not in ("auto", ""):
|
| 196 |
+
data["language"] = language
|
| 197 |
response = requests.post(
|
| 198 |
POLLINATIONS_URL,
|
| 199 |
headers=headers,
|