Spaces:
Running on Zero
Running on Zero
github-actions[bot] committed on
Commit ·
0422215
1
Parent(s): 10b6cf0
deploy: switch to dramabox requirements @ a95fda4
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .env.example +35 -0
- .gitattributes +14 -35
- .gitignore +31 -0
- CLAUDE.md +13 -0
- LICENSE +21 -0
- README.md +330 -8
- app.py +80 -0
- graphify-out/.graphify_python +1 -0
- graphify-out/.graphify_root +1 -0
- graphify-out/GRAPH_REPORT.md +465 -0
- graphify-out/graph.html +0 -0
- packages.txt +4 -0
- pipeline.py +363 -0
- pyproject.toml +59 -0
- requirements-cbox.txt +51 -0
- requirements-omni.txt +157 -0
- requirements-qwen3.txt +55 -0
- requirements.txt +62 -0
- scripts/prefetch_models.py +47 -0
- server.py +929 -0
- social_distributor/.env.example +16 -0
- social_distributor/.gitignore +8 -0
- social_distributor/README.md +205 -0
- social_distributor/post.py +311 -0
- social_distributor/poster/__init__.py +0 -0
- social_distributor/poster/auth/__init__.py +0 -0
- social_distributor/poster/auth/session.py +111 -0
- social_distributor/poster/caption_gen.py +164 -0
- social_distributor/poster/config.py +88 -0
- social_distributor/poster/creator_extract.py +149 -0
- social_distributor/poster/models.py +29 -0
- social_distributor/poster/platforms/__init__.py +0 -0
- social_distributor/poster/platforms/base.py +57 -0
- social_distributor/poster/platforms/instagram.py +206 -0
- social_distributor/poster/platforms/tiktok.py +155 -0
- social_distributor/poster/platforms/youtube.py +165 -0
- social_distributor/poster/post_log.py +45 -0
- social_distributor/poster/video_loader.py +101 -0
- social_distributor/pyproject.toml +20 -0
- social_distributor/uv.lock +0 -0
- steps/__init__.py +1 -0
- steps/lang/__init__.py +38 -0
- steps/lang/_shared.py +150 -0
- steps/lang/omnivoice_languages.py +652 -0
- steps/lang/qwen3_languages.py +15 -0
- steps/lang/urdu.py +324 -0
- steps/s1_extract_audio.py +68 -0
- steps/s1b_separate.py +152 -0
- steps/s2_transcribe.py +395 -0
- steps/s3_translate.py +195 -0
.env.example
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VideoVoice — Environment Variables
|
| 2 |
+
# Copy this to .env and fill in values
|
| 3 |
+
|
| 4 |
+
# Server port (default 8000)
|
| 5 |
+
PORT=8000
|
| 6 |
+
|
| 7 |
+
# Where per-job artifact folders get written. On HF Spaces this is resolved
|
| 8 |
+
# automatically (/data/jobs with persistent storage, /tmp/videovoice_jobs
|
| 9 |
+
# without). For local dev, set this to ./data so jobs land next to the repo
|
| 10 |
+
# — same layout the old `main` used.
|
| 11 |
+
ARTIFACTS_ROOT=./data
|
| 12 |
+
|
| 13 |
+
# OpenAI API key (for translation step)
|
| 14 |
+
OPENAI_API_KEY=sk-...
|
| 15 |
+
|
| 16 |
+
# Pollinations API key (optional, for Whisper transcription fallback)
|
| 17 |
+
POLLINATIONS_API_KEY=
|
| 18 |
+
POLLEN_TRANSCRIBE_MODEL=whisper-large-v3
|
| 19 |
+
POLLEN_MODEL=gemini-search
|
| 20 |
+
|
| 21 |
+
# Stripe (optional, for paid tiers)
|
| 22 |
+
STRIPE_PUBLISHABLE_KEY=
|
| 23 |
+
STRIPE_SECRET_KEY=
|
| 24 |
+
|
| 25 |
+
# AWS S3 (optional, for cloud storage)
|
| 26 |
+
AWS_ACCESS_KEY_ID=
|
| 27 |
+
AWS_SECRET_ACCESS_KEY=
|
| 28 |
+
AWS_S3_BUCKET=
|
| 29 |
+
AWS_REGION=us-east-1
|
| 30 |
+
|
| 31 |
+
# AWS Bedrock (optional, fallback translator for Urdu)
|
| 32 |
+
AWS_BEDROCK_API_KEY=
|
| 33 |
+
BEDROCK_MODEL=qwen.qwen3-next-80b-a3b
|
| 34 |
+
|
| 35 |
+
HF_TOKEN=
|
.gitattributes
CHANGED
|
@@ -1,35 +1,14 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Files in this repo that are dev-only and must NOT ship to the HF Spaces.
|
| 2 |
+
# `deploy.sh` honors this via `git archive --worktree-attributes`.
|
| 3 |
+
# Rule of thumb: if HF Spaces would never import/execute it, export-ignore it.
|
| 4 |
+
# Do NOT export-ignore server.py — app.py imports from it at runtime on HF.
|
| 5 |
+
|
| 6 |
+
.github/ export-ignore
|
| 7 |
+
SPLIT_STRATEGY.md export-ignore
|
| 8 |
+
deploy.sh export-ignore
|
| 9 |
+
Dockerfile export-ignore
|
| 10 |
+
.dockerignore export-ignore
|
| 11 |
+
social_media_distributor/ export-ignore
|
| 12 |
+
frontend/ export-ignore
|
| 13 |
+
batch_translate.py export-ignore
|
| 14 |
+
client_insta_links.jsonl export-ignore
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.wav
|
| 2 |
+
*.mp4
|
| 3 |
+
*.mov
|
| 4 |
+
*.webp
|
| 5 |
+
*.ass
|
| 6 |
+
*.txt
|
| 7 |
+
!requirements.txt
|
| 8 |
+
!requirements-cbox.txt
|
| 9 |
+
!requirements-omni.txt
|
| 10 |
+
!requirements-qwen3.txt
|
| 11 |
+
!packages.txt
|
| 12 |
+
!SPLIT_STRATEGY.md
|
| 13 |
+
*.DS_Store
|
| 14 |
+
.env
|
| 15 |
+
.venv/
|
| 16 |
+
__pycache__/
|
| 17 |
+
**/__pycache__/
|
| 18 |
+
*.py[cod]
|
| 19 |
+
*$py.class
|
| 20 |
+
*.json
|
| 21 |
+
!data/showcase.json
|
| 22 |
+
tmp/
|
| 23 |
+
uploads/
|
| 24 |
+
outputs/
|
| 25 |
+
data/
|
| 26 |
+
batch_outputs/
|
| 27 |
+
# Subproject runtime artifacts (not for HF Space)
|
| 28 |
+
social_distributor/.venv/
|
| 29 |
+
social_distributor/poster/auth/storage/
|
| 30 |
+
social_distributor/debug_*.png
|
| 31 |
+
fine_tuning/
|
CLAUDE.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Deployment
|
| 2 |
+
|
| 3 |
+
HF Spaces deployment is fully automated via `.github/workflows/deploy-hf.yml`. Pushing to `origin/main` triggers the workflow which runs `./deploy.sh --force` and pushes to all three Spaces (Chatterbox, OmniVoice, Qwen3). Do not run `./deploy.sh` locally after a push — it is redundant. To verify a deploy, use `gh run list --workflow=deploy-hf.yml`.
|
| 4 |
+
|
| 5 |
+
## graphify
|
| 6 |
+
|
| 7 |
+
This project has a graphify knowledge graph at graphify-out/.
|
| 8 |
+
|
| 9 |
+
Rules:
|
| 10 |
+
- Before answering architecture or codebase questions, read graphify-out/GRAPH_REPORT.md for god nodes and community structure
|
| 11 |
+
- If graphify-out/wiki/index.md exists, navigate it instead of reading raw files
|
| 12 |
+
- For cross-module "how does X relate to Y" questions, prefer `graphify query "<question>"`, `graphify path "<A>" "<B>"`, or `graphify explain "<concept>"` over grep — these traverse the graph's EXTRACTED + INFERRED edges instead of scanning files
|
| 13 |
+
- After modifying code files in this session, run `graphify update .` to keep the graph current (AST-only, no API cost)
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 Raafi
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,13 +1,335 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: 👀
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 6.
|
| 8 |
-
python_version: '3.12'
|
| 9 |
app_file: app.py
|
| 10 |
-
|
| 11 |
---
|
| 12 |
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: VideoVoice API
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: gradio
|
| 4 |
+
sdk_version: 6.12.0
|
|
|
|
| 5 |
app_file: app.py
|
| 6 |
+
python_version: "3.10"
|
| 7 |
---
|
| 8 |
|
| 9 |
+
<!--
|
| 10 |
+
ZeroGPU is enabled from the Space Settings UI (not via frontmatter).
|
| 11 |
+
PRO account required. `app.py` mounts the FastAPI pipeline onto Gradio
|
| 12 |
+
so the React client keeps calling `/api/*` over CORS unchanged.
|
| 13 |
+
-->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# VideoVoice
|
| 17 |
+
|
| 18 |
+
**AI-powered short video translation with zero-shot voice cloning.**
|
| 19 |
+
|
| 20 |
+
Translate any short video (≤60s) into 23+ languages while preserving the original speaker's voice. Paste a link to an Instagram Reel or YouTube Short, or upload any video file.
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## How It Works
|
| 25 |
+
|
| 26 |
+
1. **Upload or Paste URL** — Drop a video file or paste a social media link
|
| 27 |
+
2. **AI Translates & Clones** — Our 6-step pipeline transcribes, translates, and synthesizes new speech using a voice clone of the original speaker
|
| 28 |
+
3. **Preview & Download** — Watch your translated video and download in full quality
|
| 29 |
+
|
| 30 |
+
### Pipeline Architecture
|
| 31 |
+
|
| 32 |
+
```
|
| 33 |
+
Video → Extract Audio → Whisper Transcription → LLM Translation
|
| 34 |
+
→ Chatterbox Voice Clone + TTS → Time-Sync → Final Merge
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
| Step | Component | Description |
|
| 38 |
+
|------|-----------|-------------|
|
| 39 |
+
| 1 | FFmpeg | Extract audio track from video |
|
| 40 |
+
| 2 | Whisper Large V3 | Transcribe with word-level timestamps |
|
| 41 |
+
| 3 | GPT-4o-mini | Context-aware subtitle translation |
|
| 42 |
+
| 4 | Chatterbox Multilingual | Zero-shot voice cloning + TTS synthesis |
|
| 43 |
+
| 5 | Dynamic Time-Stretch | Align translated audio to original timing |
|
| 44 |
+
| 6 | FFmpeg | Merge new audio track back into video |
|
| 45 |
+
|
| 46 |
+
---
|
| 47 |
+
|
| 48 |
+
## Running Locally
|
| 49 |
+
|
| 50 |
+
### Prerequisites
|
| 51 |
+
|
| 52 |
+
- Python 3.10+ (`requires-python = ">=3.10,<3.13"`)
|
| 53 |
+
- FFmpeg (`brew install ffmpeg` on macOS, `sudo apt install ffmpeg` on Ubuntu)
|
| 54 |
+
- An OpenAI API key
|
| 55 |
+
|
| 56 |
+
### First-time setup
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
# 1. Install uv (skip if you already have it)
|
| 60 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 61 |
+
|
| 62 |
+
# 2. Clone and enter the repo
|
| 63 |
+
git clone https://github.com/Video-Voice/VideoVoice-be.git
|
| 64 |
+
cd VideoVoice-be
|
| 65 |
+
|
| 66 |
+
# 3. Install deps with the chatterbox TTS engine (default for local dev)
|
| 67 |
+
# Use `--extra omnivoice` instead if you want OmniVoice. The two extras
|
| 68 |
+
# are mutually exclusive — pick one.
|
| 69 |
+
uv sync --extra chatterbox
|
| 70 |
+
|
| 71 |
+
# 4. Configure env vars
|
| 72 |
+
cp .env.example .env
|
| 73 |
+
# Edit .env — at minimum set OPENAI_API_KEY and ARTIFACTS_ROOT=./data
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### One-time: hide the vendored chatterbox folder
|
| 77 |
+
|
| 78 |
+
The repo ships a vendored `./chatterbox/` folder that the HF Chatterbox Space needs (it has ZeroGPU-specific tweaks). Locally we want Python to import the PyPI `chatterbox-tts` package instead, so tell git to ignore the working-tree state for that folder and delete it locally:
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
git ls-files chatterbox/ | xargs git update-index --skip-worktree
|
| 82 |
+
rm -rf chatterbox/
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
HEAD still contains the folder, so HF deploys are unaffected. To reverse, run `git ls-files chatterbox/ | xargs git update-index --no-skip-worktree`, then `git checkout HEAD -- chatterbox/`.
|
| 86 |
+
|
| 87 |
+
### Run the server
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
uv run python server.py
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
Open [http://localhost:8000](http://localhost:8000). `/api/*` are the backend routes; `/` serves the legacy static UI in `frontend/`. If the port is in use, set `PORT=8001`.
|
| 94 |
+
|
| 95 |
+
Per-job artifacts land in `$ARTIFACTS_ROOT/<job_id>/`. With `ARTIFACTS_ROOT=./data` (in `.env`) that's `./data/<job_id>/` next to the repo — same layout the repo has always used.
|
| 96 |
+
|
| 97 |
+
### Run the pipeline headlessly
|
| 98 |
+
|
| 99 |
+
```bash
|
| 100 |
+
uv run python pipeline.py --input data/my_video.mp4 --target-lang Spanish
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
---
|
| 104 |
+
## API Reference
|
| 105 |
+
|
| 106 |
+
The following endpoints are available on the backend (FastAPI/Gradio Server). When running on Hugging Face, replace `localhost:8000` with your Space's API URL (e.g., `https://rafii-videovoice.hf.space`).
|
| 107 |
+
|
| 108 |
+
### Core Endpoints
|
| 109 |
+
|
| 110 |
+
#### `POST /api/jobs`
|
| 111 |
+
Submit a video for translation. You can provide either a local file or a URL.
|
| 112 |
+
|
| 113 |
+
**Form Data:**
|
| 114 |
+
- `file`: (Optional) Video file upload (MP4, MOV, WebM, ≤90MB).
|
| 115 |
+
- `url`: (Optional) Social media URL (Instagram, YouTube, TikTok).
|
| 116 |
+
- `target_language`: (Required) Name of target language (e.g., "Spanish", "Hindi").
|
| 117 |
+
- `source_language`: (Optional) ISO code of the source language (default: "en").
|
| 118 |
+
- `voice_mode`: (Optional) `chatterbox` or `omnivoice` (must match Space engine).
|
| 119 |
+
- `captions`: (Optional) "true" or "false" (default: "true").
|
| 120 |
+
- `preserve_music`: (Optional) "true" or "false" (default: "false").
|
| 121 |
+
|
| 122 |
+
**Example:**
|
| 123 |
+
```bash
|
| 124 |
+
curl -X POST http://localhost:8000/api/jobs \
|
| 125 |
+
-F "file=@my_video.mp4" \
|
| 126 |
+
-F "target_language=French"
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
#### `GET /api/jobs/{job_id}`
|
| 130 |
+
Poll for the real-time status and progress messages of a specific job.
|
| 131 |
+
|
| 132 |
+
**Query Parameters:**
|
| 133 |
+
- `after`: (Optional) Index of the last message received to fetch only new ones.
|
| 134 |
+
|
| 135 |
+
**Example:**
|
| 136 |
+
```bash
|
| 137 |
+
curl "http://localhost:8000/api/jobs/abc123_1?after=5"
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
#### `GET /api/jobs/{job_id}/result`
|
| 141 |
+
Download the final translated video file.
|
| 142 |
+
|
| 143 |
+
**Example:**
|
| 144 |
+
```bash
|
| 145 |
+
curl -O -L http://localhost:8000/api/jobs/abc123_1/result
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
---
|
| 149 |
+
|
| 150 |
+
### Utility & Configuration
|
| 151 |
+
|
| 152 |
+
#### `GET /api/config`
|
| 153 |
+
Fetch server configuration, including supported languages, max file size, and the active TTS engine.
|
| 154 |
+
|
| 155 |
+
#### `GET /api/health`
|
| 156 |
+
Check if the server is alive and see GPU availability/queue depth.
|
| 157 |
+
|
| 158 |
+
#### `GET /api/showcase`
|
| 159 |
+
Retrieve curated "before & after" demo entries defined in `data/showcase.json`.
|
| 160 |
+
|
| 161 |
+
#### `GET /api/demo-videos`
|
| 162 |
+
List all whitelisted demo videos available for streaming from the `outputs/` and `data/` folders.
|
| 163 |
+
|
| 164 |
+
#### `GET /api/demo-videos/{video_id}/stream`
|
| 165 |
+
Stream a specific demo video by its opaque ID.
|
| 166 |
+
|
| 167 |
+
---
|
| 168 |
+
|
| 169 |
+
### Interactive / Preview Endpoints
|
| 170 |
+
|
| 171 |
+
#### `GET /api/jobs/{job_id}/preview/{model_name}`
|
| 172 |
+
Retrieve a short audio snippet of the cloned voice for a specific TTS model before proceeding with full synthesis.
|
| 173 |
+
|
| 174 |
+
#### `POST /api/jobs/{job_id}/select-model`
|
| 175 |
+
Confirm which TTS model to use after listening to previews (used in multi-model workflows).
|
| 176 |
+
|
| 177 |
+
---
|
| 178 |
+
|
| 179 |
+
### ZeroGPU / Gradio Internal API
|
| 180 |
+
|
| 181 |
+
#### `POST /run_pipeline` (Gradio API)
|
| 182 |
+
Internal endpoint used by ZeroGPU to trigger the heavy ML processing logic. Recommended for use via `gradio_client`.
|
| 183 |
+
|
| 184 |
+
**Example (Python):**
|
| 185 |
+
```python
|
| 186 |
+
from gradio_client import Client
|
| 187 |
+
client = Client("Rafii/videovoice")
|
| 188 |
+
client.predict(job_id="abc123_1", api_name="/run_pipeline")
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
---
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
## Testing the API (Hugging Face Spaces)
|
| 195 |
+
|
| 196 |
+
When running on Hugging Face Spaces (using `app.py`), you can test the API using standard HTTP tools or the Gradio Client. Choose the Space corresponding to the desired TTS engine:
|
| 197 |
+
|
| 198 |
+
| TTS Engine | Space URL | API Endpoint |
|
| 199 |
+
|------------|-----------|--------------|
|
| 200 |
+
| **Chatterbox** | `Rafii/videovoice` | `https://rafii-videovoice.hf.space` |
|
| 201 |
+
| **OmniVoice** | `Rafii/videovoice-omni` | `https://rafii-videovoice-omni.hf.space` |
|
| 202 |
+
|
| 203 |
+
### 1. Using `curl` (FastAPI Routes)
|
| 204 |
+
|
| 205 |
+
You can check the health of the API and verify that it's running:
|
| 206 |
+
|
| 207 |
+
```bash
|
| 208 |
+
# Chatterbox Space
|
| 209 |
+
curl https://rafii-videovoice.hf.space/api/health
|
| 210 |
+
|
| 211 |
+
# OmniVoice Space
|
| 212 |
+
curl https://rafii-videovoice-omni.hf.space/api/health
|
| 213 |
+
```
|
| 214 |
+
|
| 215 |
+
To submit a job via the standard API:
|
| 216 |
+
|
| 217 |
+
```bash
|
| 218 |
+
curl -X POST https://rafii-videovoice.hf.space/api/jobs \
|
| 219 |
+
-F "url=https://www.instagram.com/reels/XYZ/" \
|
| 220 |
+
-F "target_language=Spanish"
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
### 2. Using `gradio_client` (Gradio API Routes)
|
| 224 |
+
|
| 225 |
+
The `gradio.Server` endpoints are optimized for ZeroGPU and can be accessed using the Python `gradio_client`:
|
| 226 |
+
|
| 227 |
+
```python
|
| 228 |
+
from gradio_client import Client
|
| 229 |
+
|
| 230 |
+
# Change to "Rafii/videovoice-omni" for OmniVoice
|
| 231 |
+
client = Client("Rafii/videovoice")
|
| 232 |
+
result = client.predict(
|
| 233 |
+
job_id="abc123",
|
| 234 |
+
api_name="/run_pipeline"
|
| 235 |
+
)
|
| 236 |
+
print(result)
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
### 3. Using JavaScript (Frontend)
|
| 240 |
+
|
| 241 |
+
The new `gradio.Server` mode is designed for custom frontends. You can use the `@gradio/client` JS library:
|
| 242 |
+
|
| 243 |
+
```javascript
|
| 244 |
+
import { Client } from "@gradio/client";
|
| 245 |
+
|
| 246 |
+
// Connect to the specific Space
|
| 247 |
+
const client = await Client.connect("Rafii/videovoice");
|
| 248 |
+
const result = await client.predict("/run_pipeline", {
|
| 249 |
+
job_id: "abc123",
|
| 250 |
+
});
|
| 251 |
+
```
|
| 252 |
+
|
| 253 |
+
---
|
| 254 |
+
|
| 255 |
+
## Supported Languages
|
| 256 |
+
|
| 257 |
+
Spanish, French, German, Hindi, Portuguese, Italian, Japanese, Chinese, Arabic, Korean — and more.
|
| 258 |
+
|
| 259 |
+
---
|
| 260 |
+
|
| 261 |
+
## Project Structure
|
| 262 |
+
|
| 263 |
+
```
|
| 264 |
+
VideoVoice/
|
| 265 |
+
├── server.py # FastAPI backend
|
| 266 |
+
├── pipeline.py # Core translation pipeline
|
| 267 |
+
├── steps/ # Pipeline step modules
|
| 268 |
+
│ ├── s1_extract_audio.py
|
| 269 |
+
│ ├── s2_transcribe.py
|
| 270 |
+
│ ├── s3_translate.py
|
| 271 |
+
│ ├── s4_tts.py
|
| 272 |
+
│ ├── s5_sync.py
|
| 273 |
+
│ └── s6_merge.py
|
| 274 |
+
├── frontend/ # Static web UI
|
| 275 |
+
│ ├── index.html
|
| 276 |
+
│ ├── style.css
|
| 277 |
+
│ └── app.js
|
| 278 |
+
├── pyproject.toml # Dependencies & project config
|
| 279 |
+
├── uv.lock # Lockfile (reproducible installs)
|
| 280 |
+
├── .env.example
|
| 281 |
+
└── README.md
|
| 282 |
+
```
|
| 283 |
+
|
| 284 |
+
---
|
| 285 |
+
|
| 286 |
+
## Entrypoints
|
| 287 |
+
|
| 288 |
+
Two entrypoint files exist by design. They run in different contexts but **expose the same `/api/*` routes**:
|
| 289 |
+
|
| 290 |
+
| File | When it runs | What it does |
|
| 291 |
+
|------|-------------|--------------|
|
| 292 |
+
| `server.py` | Local dev (`uv run python server.py`) | Plain FastAPI app — defines every `/api/*` route. |
|
| 293 |
+
| `app.py` | Hugging Face Spaces | Gradio Server that imports `server.py`'s router and wraps it with `@spaces.GPU` for ZeroGPU. |
|
| 294 |
+
|
| 295 |
+
`app.py` depends on `server.py`, so server.py must ship to HF. Do not strip it.
|
| 296 |
+
|
| 297 |
+
## Deployment
|
| 298 |
+
|
| 299 |
+
### Hugging Face Spaces (production)
|
| 300 |
+
|
| 301 |
+
Push to `main` → GitHub Actions runs `.github/workflows/deploy-hf.yml` → both Spaces (`Rafii/videovoice` and `Rafii/videovoice-omni`) redeploy automatically. No manual step.
|
| 302 |
+
|
| 303 |
+
One-time CI setup:
|
| 304 |
+
1. Create an HF access token with write access to both Spaces: https://huggingface.co/settings/tokens
|
| 305 |
+
2. Add it as `HF_TOKEN` under **Settings → Secrets and variables → Actions** in the GitHub repo.
|
| 306 |
+
|
| 307 |
+
Manual fallback (from a local clean checkout with `space` and `space-omni` remotes configured):
|
| 308 |
+
```bash
|
| 309 |
+
./deploy.sh # skips if remote is already at HEAD
|
| 310 |
+
./deploy.sh --force # always redeploy
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
Files filtered out of every Space deploy are listed in `.gitattributes` (`export-ignore`).
|
| 314 |
+
|
| 315 |
+
### Branching
|
| 316 |
+
|
| 317 |
+
`main` is canonical. Use short-lived `feat/<thing>` branches, open a PR, merge, delete. Never maintain a parallel deploy branch — every change on main reaches both Spaces via CI.
|
| 318 |
+
|
| 319 |
+
### AWS (alternative GPU host)
|
| 320 |
+
|
| 321 |
+
```bash
|
| 322 |
+
# On a g4dn.xlarge instance
|
| 323 |
+
sudo apt update && sudo apt install -y ffmpeg
|
| 324 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 325 |
+
uv sync
|
| 326 |
+
uv run python server.py
|
| 327 |
+
```
|
| 328 |
+
|
| 329 |
+
Recommended: use a `systemd` service for auto-restart, CloudFront as the CDN, and S3 for video storage with a 24h auto-delete lifecycle policy.
|
| 330 |
+
|
| 331 |
+
---
|
| 332 |
+
|
| 333 |
+
## License
|
| 334 |
+
|
| 335 |
+
MIT License — see [LICENSE](LICENSE).
|
app.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ZeroGPU-compatible entrypoint using gradio.Server.
|
| 3 |
+
Server extends FastAPI, so all your existing API routes work unchanged.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# 1. Lightweight imports only at top level
|
| 10 |
+
import spaces
|
| 11 |
+
import gradio as gr
|
| 12 |
+
from gradio import Server
|
| 13 |
+
from gradio.data_classes import FileData
|
| 14 |
+
from fastapi import Request
|
| 15 |
+
from slowapi.errors import RateLimitExceeded
|
| 16 |
+
from slowapi import _rate_limit_exceeded_handler
|
| 17 |
+
|
| 18 |
+
TTS_ENGINE = os.getenv("TTS_ENGINE", "chatterbox").lower()
|
| 19 |
+
|
| 20 |
+
# 2. Create Server instead of FastAPI
|
| 21 |
+
# Name it 'demo' so HF Space picks it up automatically
|
| 22 |
+
demo = Server()
|
| 23 |
+
|
| 24 |
+
# -----------------------------------------------------
|
| 25 |
+
# INTEGRATE SERVER.PY ROUTES
|
| 26 |
+
# -----------------------------------------------------
|
| 27 |
+
from server import router, limiter, enforce_content_length_limit
|
| 28 |
+
from tools_api import router as tools_router
|
| 29 |
+
|
| 30 |
+
demo.include_router(router)
|
| 31 |
+
demo.include_router(tools_router)
|
| 32 |
+
demo.state.limiter = limiter
|
| 33 |
+
demo.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
|
| 34 |
+
|
| 35 |
+
# Apply content length middleware to the main app
|
| 36 |
+
@demo.middleware("http")
|
| 37 |
+
async def content_length_middleware(request: Request, call_next):
|
| 38 |
+
return await enforce_content_length_limit(request, call_next)
|
| 39 |
+
|
| 40 |
+
@demo.get("/api/health")
|
| 41 |
+
def health():
|
| 42 |
+
return {"status": "ok", "tts": TTS_ENGINE}
|
| 43 |
+
|
| 44 |
+
# -----------------------------------------------------
|
| 45 |
+
# ZERO GPU FUNCTION — lazy-loads torch/CUDA
|
| 46 |
+
# -----------------------------------------------------
|
| 47 |
+
@spaces.GPU(duration=60)
|
| 48 |
+
def run_pipeline(job_id: str):
|
| 49 |
+
from pipeline import process_job
|
| 50 |
+
return process_job(job_id)
|
| 51 |
+
|
| 52 |
+
# -----------------------------------------------------
|
| 53 |
+
# GRADIO API INTEGRATION (this is what ZeroGPU detects)
|
| 54 |
+
# -----------------------------------------------------
|
| 55 |
+
@demo.api(name="run_pipeline")
|
| 56 |
+
def api_run_pipeline(job_id: str):
|
| 57 |
+
"""
|
| 58 |
+
Exposed through Gradio's API engine.
|
| 59 |
+
ZeroGPU will allocate a GPU when this endpoint is called.
|
| 60 |
+
"""
|
| 61 |
+
return run_pipeline(job_id)
|
| 62 |
+
|
| 63 |
+
# -----------------------------------------------------
|
| 64 |
+
# OPTIONAL: Gradio UI (if you still want a basic UI)
|
| 65 |
+
# -----------------------------------------------------
|
| 66 |
+
with gr.Blocks(title="VideoVoice API") as ui:
|
| 67 |
+
gr.Markdown(f"# VideoVoice API ({TTS_ENGINE.upper()})")
|
| 68 |
+
job_id_box = gr.Textbox(label="Job ID")
|
| 69 |
+
output_box = gr.Textbox(label="Result")
|
| 70 |
+
btn = gr.Button("Run Pipeline")
|
| 71 |
+
btn.click(fn=run_pipeline, inputs=job_id_box, outputs=output_box)
|
| 72 |
+
|
| 73 |
+
# Mount the UI onto the Server instance
|
| 74 |
+
gr.mount_gradio_app(demo, ui, path="/ui")
|
| 75 |
+
|
| 76 |
+
# -----------------------------------------------------
|
| 77 |
+
# ENTRYPOINT
|
| 78 |
+
# -----------------------------------------------------
|
| 79 |
+
if __name__ == "__main__":
|
| 80 |
+
demo.launch(show_error=True)
|
graphify-out/.graphify_python
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/Users/rafa/.local/share/uv/tools/graphifyy/bin/python
|
graphify-out/.graphify_root
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/Users/rafa/MscAi/VideoVoice-be
|
graphify-out/GRAPH_REPORT.md
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Graph Report - VideoVoice-be (2026-05-17)
|
| 2 |
+
|
| 3 |
+
## Corpus Check
|
| 4 |
+
- 60 files · ~254,726 words
|
| 5 |
+
- Verdict: corpus is large enough that graph structure adds value.
|
| 6 |
+
|
| 7 |
+
## Summary
|
| 8 |
+
- 1065 nodes · 1859 edges · 64 communities detected
|
| 9 |
+
- Extraction: 79% EXTRACTED · 21% INFERRED · 0% AMBIGUOUS · INFERRED: 397 edges (avg confidence: 0.62)
|
| 10 |
+
- Token cost: 0 input · 0 output
|
| 11 |
+
|
| 12 |
+
## Community Hubs (Navigation)
|
| 13 |
+
- [[_COMMUNITY_Community 0|Community 0]]
|
| 14 |
+
- [[_COMMUNITY_Community 1|Community 1]]
|
| 15 |
+
- [[_COMMUNITY_Community 2|Community 2]]
|
| 16 |
+
- [[_COMMUNITY_Community 3|Community 3]]
|
| 17 |
+
- [[_COMMUNITY_Community 4|Community 4]]
|
| 18 |
+
- [[_COMMUNITY_Community 5|Community 5]]
|
| 19 |
+
- [[_COMMUNITY_Community 6|Community 6]]
|
| 20 |
+
- [[_COMMUNITY_Community 7|Community 7]]
|
| 21 |
+
- [[_COMMUNITY_Community 8|Community 8]]
|
| 22 |
+
- [[_COMMUNITY_Community 9|Community 9]]
|
| 23 |
+
- [[_COMMUNITY_Community 10|Community 10]]
|
| 24 |
+
- [[_COMMUNITY_Community 11|Community 11]]
|
| 25 |
+
- [[_COMMUNITY_Community 12|Community 12]]
|
| 26 |
+
- [[_COMMUNITY_Community 13|Community 13]]
|
| 27 |
+
- [[_COMMUNITY_Community 14|Community 14]]
|
| 28 |
+
- [[_COMMUNITY_Community 15|Community 15]]
|
| 29 |
+
- [[_COMMUNITY_Community 16|Community 16]]
|
| 30 |
+
- [[_COMMUNITY_Community 17|Community 17]]
|
| 31 |
+
- [[_COMMUNITY_Community 18|Community 18]]
|
| 32 |
+
- [[_COMMUNITY_Community 19|Community 19]]
|
| 33 |
+
- [[_COMMUNITY_Community 20|Community 20]]
|
| 34 |
+
- [[_COMMUNITY_Community 21|Community 21]]
|
| 35 |
+
- [[_COMMUNITY_Community 22|Community 22]]
|
| 36 |
+
- [[_COMMUNITY_Community 23|Community 23]]
|
| 37 |
+
- [[_COMMUNITY_Community 25|Community 25]]
|
| 38 |
+
- [[_COMMUNITY_Community 33|Community 33]]
|
| 39 |
+
- [[_COMMUNITY_Community 34|Community 34]]
|
| 40 |
+
- [[_COMMUNITY_Community 35|Community 35]]
|
| 41 |
+
- [[_COMMUNITY_Community 36|Community 36]]
|
| 42 |
+
- [[_COMMUNITY_Community 37|Community 37]]
|
| 43 |
+
- [[_COMMUNITY_Community 38|Community 38]]
|
| 44 |
+
- [[_COMMUNITY_Community 39|Community 39]]
|
| 45 |
+
- [[_COMMUNITY_Community 40|Community 40]]
|
| 46 |
+
- [[_COMMUNITY_Community 41|Community 41]]
|
| 47 |
+
- [[_COMMUNITY_Community 42|Community 42]]
|
| 48 |
+
- [[_COMMUNITY_Community 43|Community 43]]
|
| 49 |
+
- [[_COMMUNITY_Community 44|Community 44]]
|
| 50 |
+
- [[_COMMUNITY_Community 45|Community 45]]
|
| 51 |
+
- [[_COMMUNITY_Community 46|Community 46]]
|
| 52 |
+
- [[_COMMUNITY_Community 47|Community 47]]
|
| 53 |
+
- [[_COMMUNITY_Community 48|Community 48]]
|
| 54 |
+
- [[_COMMUNITY_Community 49|Community 49]]
|
| 55 |
+
- [[_COMMUNITY_Community 50|Community 50]]
|
| 56 |
+
- [[_COMMUNITY_Community 51|Community 51]]
|
| 57 |
+
- [[_COMMUNITY_Community 52|Community 52]]
|
| 58 |
+
- [[_COMMUNITY_Community 53|Community 53]]
|
| 59 |
+
- [[_COMMUNITY_Community 54|Community 54]]
|
| 60 |
+
- [[_COMMUNITY_Community 55|Community 55]]
|
| 61 |
+
- [[_COMMUNITY_Community 56|Community 56]]
|
| 62 |
+
- [[_COMMUNITY_Community 57|Community 57]]
|
| 63 |
+
- [[_COMMUNITY_Community 58|Community 58]]
|
| 64 |
+
- [[_COMMUNITY_Community 59|Community 59]]
|
| 65 |
+
- [[_COMMUNITY_Community 60|Community 60]]
|
| 66 |
+
- [[_COMMUNITY_Community 61|Community 61]]
|
| 67 |
+
- [[_COMMUNITY_Community 62|Community 62]]
|
| 68 |
+
- [[_COMMUNITY_Community 63|Community 63]]
|
| 69 |
+
- [[_COMMUNITY_Community 64|Community 64]]
|
| 70 |
+
- [[_COMMUNITY_Community 65|Community 65]]
|
| 71 |
+
- [[_COMMUNITY_Community 66|Community 66]]
|
| 72 |
+
- [[_COMMUNITY_Community 67|Community 67]]
|
| 73 |
+
- [[_COMMUNITY_Community 68|Community 68]]
|
| 74 |
+
- [[_COMMUNITY_Community 69|Community 69]]
|
| 75 |
+
- [[_COMMUNITY_Community 70|Community 70]]
|
| 76 |
+
- [[_COMMUNITY_Community 71|Community 71]]
|
| 77 |
+
|
| 78 |
+
## God Nodes (most connected - your core abstractions)
|
| 79 |
+
1. `Qwen3TTSSpeakerEncoderConfig` - 49 edges
|
| 80 |
+
2. `Qwen3TTSTalkerCodePredictorConfig` - 49 edges
|
| 81 |
+
3. `Qwen3TTSTalkerConfig` - 49 edges
|
| 82 |
+
4. `Qwen3TTSConfig` - 48 edges
|
| 83 |
+
5. `Qwen3TTSModel` - 21 edges
|
| 84 |
+
6. `PostResult` - 19 edges
|
| 85 |
+
7. `Qwen3TTSTalkerForConditionalGeneration` - 19 edges
|
| 86 |
+
8. `Qwen3TTSTalkerCodePredictorModelForConditionalGeneration` - 17 edges
|
| 87 |
+
9. `generate()` - 15 edges
|
| 88 |
+
10. `BasePoster` - 14 edges
|
| 89 |
+
|
| 90 |
+
## Surprising Connections (you probably didn't know these)
|
| 91 |
+
- `chatterbox-tts==0.1.7 --no-deps` --semantically_similar_to--> `omnivoice>=0.1.4` [INFERRED] [semantically similar]
|
| 92 |
+
requirements.txt → requirements-omni.txt
|
| 93 |
+
- `gradio==6.8.0` --semantically_similar_to--> `gradio==6.12.0 (omni)` [INFERRED] [semantically similar]
|
| 94 |
+
requirements.txt → requirements-omni.txt
|
| 95 |
+
- `enforce_content_length_limit()` --calls--> `content_length_middleware()` [INFERRED]
|
| 96 |
+
server.py → app.py
|
| 97 |
+
- `run_pipeline()` --calls--> `separate_audio()` [INFERRED]
|
| 98 |
+
pipeline.py → steps/s1b_separate.py
|
| 99 |
+
- `run_pipeline()` --calls--> `transcribe()` [INFERRED]
|
| 100 |
+
pipeline.py → steps/s2_transcribe.py
|
| 101 |
+
|
| 102 |
+
## Hyperedges (group relationships)
|
| 103 |
+
- **Six-step translation pipeline** — [EXTRACTED 1.00]
|
| 104 |
+
- **TTS engine split (env, two reqs files, two spaces, conditional imports)** — [EXTRACTED 1.00]
|
| 105 |
+
- **Live pipeline run (s1b->s2->s3->s4->s5->s6)** — [EXTRACTED 1.00]
|
| 106 |
+
|
| 107 |
+
## Communities
|
| 108 |
+
|
| 109 |
+
### Community 0 - "Community 0"
|
| 110 |
+
Cohesion: 0.04
|
| 111 |
+
Nodes (70): Qwen3TTSConfig, Qwen3TTSSpeakerEncoderConfig, Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig, r""" This is the configuration class to store the configuration of a [`Qwen3, r""" This is the configuration class to store the configuration of a [`Qwen3, This is the configuration class to store the configuration of a [`Qwen3TTSForCon, r""" This is the configuration class to store the configuration of a [`Qwen3 (+62 more)
|
| 112 |
+
|
| 113 |
+
### Community 1 - "Community 1"
|
| 114 |
+
Cohesion: 0.02
|
| 115 |
+
Nodes (118): api_run_pipeline(), content_length_middleware(), ZeroGPU-compatible entrypoint using gradio.Server. Server extends FastAPI, so al, Exposed through Gradio's API engine. ZeroGPU will allocate a GPU when this e, run_pipeline(), BaseHTTPMiddleware, BaseModel, _artifact_reaper_loop() (+110 more)
|
| 116 |
+
|
| 117 |
+
### Community 2 - "Community 2"
|
| 118 |
+
Cohesion: 0.04
|
| 119 |
+
Nodes (38): default(), DistributedGroupResidualVectorQuantization, DistributedResidualVectorQuantization, ema_inplace(), EuclideanCodebook, kmeans(), laplace_smoothing(), postprocess_emb() (+30 more)
|
| 120 |
+
|
| 121 |
+
### Community 3 - "Community 3"
|
| 122 |
+
Cohesion: 0.05
|
| 123 |
+
Nodes (57): ABC, BasePoster, Abstract base class for platform posters., Save a debug screenshot on failure., BasePoster, _build_system_prompt(), _build_user_prompt(), format_caption() (+49 more)
|
| 124 |
+
|
| 125 |
+
### Community 4 - "Community 4"
|
| 126 |
+
Cohesion: 0.06
|
| 127 |
+
Nodes (31): _audio_to_tuple(), _build_choices_and_map(), build_demo(), build_parser(), _collect_gen_kwargs(), _detect_model_kind(), _dtype_from_str(), main() (+23 more)
|
| 128 |
+
|
| 129 |
+
### Community 5 - "Community 5"
|
| 130 |
+
Cohesion: 0.06
|
| 131 |
+
Nodes (59): post(), _assign_words_to_segments(), _extract_words(), _get_faster_whisper_model(), _get_local_whisper_backend(), _get_openai_whisper_model(), _normalise_segments(), Step 3: Transcribe audio with timestamps. Primary local backend (device-depende (+51 more)
|
| 132 |
+
|
| 133 |
+
### Community 6 - "Community 6"
|
| 134 |
+
Cohesion: 0.07
|
| 135 |
+
Nodes (50): forward(), generate(), generate_speaker_prompt(), from_pretrained(), _clip_audio(), _ensure_browser_wav(), _filter_preview_segments(), _free_memory() (+42 more)
|
| 136 |
+
|
| 137 |
+
### Community 7 - "Community 7"
|
| 138 |
+
Cohesion: 0.05
|
| 139 |
+
Nodes (49): FFmpeg concat list (synced TTS), Try-Now app panel, app.js script ref, Comparison table (HeyGen, Rask, ElevenLabs, Synthesia), Hero section + 23+ languages, Frontend index.html, Source/target language selectors, Pricing tiers (Free/Starter/Creator) (+41 more)
|
| 140 |
+
|
| 141 |
+
### Community 8 - "Community 8"
|
| 142 |
+
Cohesion: 0.07
|
| 143 |
+
Nodes (35): _collect_output(), _log_step_done(), main(), pipeline.py — Core pipeline: CLI entrypoint + importable run_pipeline() for Grad, Print duration + separator line for a completed step., Collect all yields and the return value from the generator., Run the full translation pipeline, yielding progress messages. Args:, run_pipeline() (+27 more)
|
| 144 |
+
|
| 145 |
+
### Community 9 - "Community 9"
|
| 146 |
+
Cohesion: 0.09
|
| 147 |
+
Nodes (27): $(), clearFile(), createDemoCard(), detectPlatform(), formatBytes(), formatDemoDate(), formatDemoTitle(), getUsedVideos() (+19 more)
|
| 148 |
+
|
| 149 |
+
### Community 10 - "Community 10"
|
| 150 |
+
Cohesion: 0.09
|
| 151 |
+
Nodes (34): Step 4: Translate segment texts using Pollinations chat completions API (OpenAI-, Translate a batch of segments into target_language., _translate_batch(), bedrock_converse(), bedrock_fallback(), build_client(), log_llm_call(), parse_json_array() (+26 more)
|
| 152 |
+
|
| 153 |
+
### Community 11 - "Community 11"
|
| 154 |
+
Cohesion: 0.08
|
| 155 |
+
Nodes (32): _apply_demucs(), _get_model(), _load_and_normalise(), Step 1b: Separate vocals from accompaniment using Demucs (Python API). In-proce, Lazy-load htdemucs once per process. Module-level semantics; we load on firs, GPU-bound inference call. `mix` shape: [1, channels, time]., Load WAV, resample/remix to match model requirements, z-normalise., Separate vocals from accompaniment using Demucs htdemucs (Python API). Args (+24 more)
|
| 156 |
+
|
| 157 |
+
### Community 12 - "Community 12"
|
| 158 |
+
Cohesion: 0.1
|
| 159 |
+
Nodes (28): tools_api — Standalone endpoints for creator quick tools. Lives alongside the m, audio_cleanup_endpoint(), dramabox_endpoint(), _ext_to_media_type(), APIRouter for /api/tools/* endpoints. Each endpoint is sync request-response (n, Serve a generated artifact. Run dirs auto-expire after RUN_TTL_SECONDS., Manual reap trigger (mostly for testing). Auto-reap runs on a timer., Serve a generated artifact. Run dirs auto-expire after RUN_TTL_SECONDS. (+20 more)
|
| 160 |
+
|
| 161 |
+
### Community 13 - "Community 13"
|
| 162 |
+
Cohesion: 0.12
|
| 163 |
+
Nodes (27): build_for_job(), ensure_transcription(), extract_audio_hq(), extract_reference_audio(), get_audio_duration(), get_device(), load_chatterbox(), main() (+19 more)
|
| 164 |
+
|
| 165 |
+
### Community 14 - "Community 14"
|
| 166 |
+
Cohesion: 0.12
|
| 167 |
+
Nodes (23): build_t3_cond(), main(), prepare_sample(), prepare_sample.py — Turn one dataset.jsonl row into the exact tensors T3.loss(), Build the speaker conditioning (frozen during training)., MTLTokenizer + SOT/EOT padding (mirrors what generate() does internally)., S3Tokenizer on the target dubbed audio → speech tokens (the LABEL). Critica, Turn one dataset row into ready-to-train tensors. (+15 more)
|
| 168 |
+
|
| 169 |
+
### Community 15 - "Community 15"
|
| 170 |
+
Cohesion: 0.13
|
| 171 |
+
Nodes (26): _compress_silences(), _detect_pauses(), _distribute_padding(), _find_tts_silences(), _generate_silence(), _get_wav_duration(), _pad_silence(), _pause_aware_sync() (+18 more)
|
| 172 |
+
|
| 173 |
+
### Community 16 - "Community 16"
|
| 174 |
+
Cohesion: 0.19
|
| 175 |
+
Nodes (18): _burn_in(), _clamp(), _extract_audio(), _force_style_for(), _format_timestamp_srt(), _format_timestamp_vtt(), generate_subtitles(), _is_video() (+10 more)
|
| 176 |
+
|
| 177 |
+
### Community 17 - "Community 17"
|
| 178 |
+
Cohesion: 0.22
|
| 179 |
+
Nodes (12): download_result(), _is_noise(), main(), Batch translate Instagram reels to English via the VideoVoice server API. Usage, Extract the Instagram reel shortcode from a URL, e.g. 'DWn_yPoDsYw'., Submit a single video URL and return the job_id., Return True if a log line is internal noise we don't want in the log., Poll job status until complete or error. Returns final messages and collected lo (+4 more)
|
| 180 |
+
|
| 181 |
+
### Community 18 - "Community 18"
|
| 182 |
+
Cohesion: 0.23
|
| 183 |
+
Nodes (12): evaluate(), load_baseline(), load_with_lora(), main(), pick_held_out_samples(), print_summary(), eval.py — Evaluate the fine-tuned LoRA against the un-tuned baseline. Picks N s, Return overshoot samples (duration_diff > 0.2) — these are NOT in the asymme (+4 more)
|
| 184 |
+
|
| 185 |
+
### Community 19 - "Community 19"
|
| 186 |
+
Cohesion: 0.24
|
| 187 |
+
Nodes (11): extract_creator(), _extract_instagram(), _extract_tiktok(), _extract_youtube(), _load_cache(), Extract original creator @username from video URLs., YouTube: visit video page, extract channel name from meta tags., Extract the @username of the original creator from the video URL. Uses Play (+3 more)
|
| 188 |
+
|
| 189 |
+
### Community 20 - "Community 20"
|
| 190 |
+
Cohesion: 0.27
|
| 191 |
+
Nodes (9): get_fallback_mode(), _get_handler(), get_translation_prompt(), post_translate(), Language-specific handlers for the translation pipeline. Each language that nee, Return a language-specific translation prompt, or the default., Return 'bedrock' or 'google' depending on the language., Run any language-specific post-processing after translation. (+1 more)
|
| 192 |
+
|
| 193 |
+
### Community 21 - "Community 21"
|
| 194 |
+
Cohesion: 0.38
|
| 195 |
+
Nodes (6): _ensure_server(), _generate_impl(), generate_scene(), Dramabox — Resemble AI directable speech engine. Single-Space tool: generates a, Lazy-import the Dramabox model + load checkpoints once. Raises a clean Runti, Run Dramabox on `prompt` and write the resulting WAV under `out_dir`. Retur
|
| 196 |
+
|
| 197 |
+
### Community 22 - "Community 22"
|
| 198 |
+
Cohesion: 0.53
|
| 199 |
+
Nodes (5): main(), _prefetch_chatterbox(), _prefetch_demucs(), _prefetch_faster_whisper(), Prefetch model weights into HF_HOME for faster cold starts on Spaces.
|
| 200 |
+
|
| 201 |
+
### Community 23 - "Community 23"
|
| 202 |
+
Cohesion: 0.33
|
| 203 |
+
Nodes (6): app.py validation, pipeline.py simplified, steps/s4_preview.py, steps/s4_tts.py conditional imports, server.py /api/config, TTS_ENGINE env var
|
| 204 |
+
|
| 205 |
+
### Community 25 - "Community 25"
|
| 206 |
+
Cohesion: 1.0
|
| 207 |
+
Nodes (2): gradio==6.8.0, gradio==6.12.0 (omni)
|
| 208 |
+
|
| 209 |
+
### Community 33 - "Community 33"
|
| 210 |
+
Cohesion: 1.0
|
| 211 |
+
Nodes (1): Load a Qwen3 TTS model and its processor in HuggingFace `from_pretrained` style.
|
| 212 |
+
|
| 213 |
+
### Community 34 - "Community 34"
|
| 214 |
+
Cohesion: 1.0
|
| 215 |
+
Nodes (1): Build voice-clone prompt items from reference audio (and optionally reference te
|
| 216 |
+
|
| 217 |
+
### Community 35 - "Community 35"
|
| 218 |
+
Cohesion: 1.0
|
| 219 |
+
Nodes (1): Voice clone speech using the Base model. You can provide either:
|
| 220 |
+
|
| 221 |
+
### Community 36 - "Community 36"
|
| 222 |
+
Cohesion: 1.0
|
| 223 |
+
Nodes (1): Generate speech with the VoiceDesign model using natural-language style instruct
|
| 224 |
+
|
| 225 |
+
### Community 37 - "Community 37"
|
| 226 |
+
Cohesion: 1.0
|
| 227 |
+
Nodes (1): Generate speech with the CustomVoice model using a predefined speaker id, option
|
| 228 |
+
|
| 229 |
+
### Community 38 - "Community 38"
|
| 230 |
+
Cohesion: 1.0
|
| 231 |
+
Nodes (1): Delete stale per-job artifact directories from ARTIFACTS_ROOT.
|
| 232 |
+
|
| 233 |
+
### Community 39 - "Community 39"
|
| 234 |
+
Cohesion: 1.0
|
| 235 |
+
Nodes (1): Reject oversized uploads before body parsing.
|
| 236 |
+
|
| 237 |
+
### Community 40 - "Community 40"
|
| 238 |
+
Cohesion: 1.0
|
| 239 |
+
Nodes (1): Run the translation pipeline in a background thread, pushing progress to the job
|
| 240 |
+
|
| 241 |
+
### Community 41 - "Community 41"
|
| 242 |
+
Cohesion: 1.0
|
| 243 |
+
Nodes (1): List whitelisted MP4 demo videos from outputs/ and data/.
|
| 244 |
+
|
| 245 |
+
### Community 42 - "Community 42"
|
| 246 |
+
Cohesion: 1.0
|
| 247 |
+
Nodes (1): Return curated showcase entries with resolved streaming URLs.
|
| 248 |
+
|
| 249 |
+
### Community 43 - "Community 43"
|
| 250 |
+
Cohesion: 1.0
|
| 251 |
+
Nodes (1): Submit a video for translation.
|
| 252 |
+
|
| 253 |
+
### Community 44 - "Community 44"
|
| 254 |
+
Cohesion: 1.0
|
| 255 |
+
Nodes (1): Poll endpoint returning new messages since index `after`, plus live wait status.
|
| 256 |
+
|
| 257 |
+
### Community 45 - "Community 45"
|
| 258 |
+
Cohesion: 1.0
|
| 259 |
+
Nodes (1): User selects a TTS model after previewing.
|
| 260 |
+
|
| 261 |
+
### Community 46 - "Community 46"
|
| 262 |
+
Cohesion: 1.0
|
| 263 |
+
Nodes (1): Serve a preview audio WAV file.
|
| 264 |
+
|
| 265 |
+
### Community 47 - "Community 47"
|
| 266 |
+
Cohesion: 1.0
|
| 267 |
+
Nodes (1): Download the translated video.
|
| 268 |
+
|
| 269 |
+
### Community 48 - "Community 48"
|
| 270 |
+
Cohesion: 1.0
|
| 271 |
+
Nodes (1): Create artifact directories and start background cleanup.
|
| 272 |
+
|
| 273 |
+
### Community 49 - "Community 49"
|
| 274 |
+
Cohesion: 1.0
|
| 275 |
+
Nodes (1): Sync TTS audio using pause-aware strategy: compress silences first, then atempo.
|
| 276 |
+
|
| 277 |
+
### Community 50 - "Community 50"
|
| 278 |
+
Cohesion: 1.0
|
| 279 |
+
Nodes (1): Rewrite WAV with silence regions compressed to keep_ratio of their original dura
|
| 280 |
+
|
| 281 |
+
### Community 51 - "Community 51"
|
| 282 |
+
Cohesion: 1.0
|
| 283 |
+
Nodes (1): Insert extra silence distributed across detected pause points.
|
| 284 |
+
|
| 285 |
+
### Community 52 - "Community 52"
|
| 286 |
+
Cohesion: 1.0
|
| 287 |
+
Nodes (1): Generate a silent WAV file of given duration.
|
| 288 |
+
|
| 289 |
+
### Community 53 - "Community 53"
|
| 290 |
+
Cohesion: 1.0
|
| 291 |
+
Nodes (1): Sync each TTS segment to its original timestamp window and stitch into a single
|
| 292 |
+
|
| 293 |
+
### Community 54 - "Community 54"
|
| 294 |
+
Cohesion: 1.0
|
| 295 |
+
Nodes (1): Translate the text of each segment into target_language in batches. Args:
|
| 296 |
+
|
| 297 |
+
### Community 55 - "Community 55"
|
| 298 |
+
Cohesion: 1.0
|
| 299 |
+
Nodes (1): Load + run Chatterbox inside a single GPU-decorated scope. ZeroGPU only int
|
| 300 |
+
|
| 301 |
+
### Community 56 - "Community 56"
|
| 302 |
+
Cohesion: 1.0
|
| 303 |
+
Nodes (1): Remove trailing noise/artifacts after speech ends.
|
| 304 |
+
|
| 305 |
+
### Community 57 - "Community 57"
|
| 306 |
+
Cohesion: 1.0
|
| 307 |
+
Nodes (1): Hard-trim TTS output to orig_dur * headroom, with a short fade-out.
|
| 308 |
+
|
| 309 |
+
### Community 58 - "Community 58"
|
| 310 |
+
Cohesion: 1.0
|
| 311 |
+
Nodes (1): Clip audio to max_sec to prevent excessively slow voice cloning.
|
| 312 |
+
|
| 313 |
+
### Community 59 - "Community 59"
|
| 314 |
+
Cohesion: 1.0
|
| 315 |
+
Nodes (1): Numpy variant of _trim_trailing_noise for engines returning np.ndarray.
|
| 316 |
+
|
| 317 |
+
### Community 60 - "Community 60"
|
| 318 |
+
Cohesion: 1.0
|
| 319 |
+
Nodes (1): Perform full OmniVoice processing (load + generate batch) inside a GPU-decorated
|
| 320 |
+
|
| 321 |
+
### Community 61 - "Community 61"
|
| 322 |
+
Cohesion: 1.0
|
| 323 |
+
Nodes (1): Generate speech for all segments using OmniVoice voice cloning.
|
| 324 |
+
|
| 325 |
+
### Community 62 - "Community 62"
|
| 326 |
+
Cohesion: 1.0
|
| 327 |
+
Nodes (1): Synthesise translated text for each segment using voice cloned from reference au
|
| 328 |
+
|
| 329 |
+
### Community 63 - "Community 63"
|
| 330 |
+
Cohesion: 1.0
|
| 331 |
+
Nodes (1): torch==2.6.0
|
| 332 |
+
|
| 333 |
+
### Community 64 - "Community 64"
|
| 334 |
+
Cohesion: 1.0
|
| 335 |
+
Nodes (1): fastapi
|
| 336 |
+
|
| 337 |
+
### Community 65 - "Community 65"
|
| 338 |
+
Cohesion: 1.0
|
| 339 |
+
Nodes (1): yt-dlp
|
| 340 |
+
|
| 341 |
+
### Community 66 - "Community 66"
|
| 342 |
+
Cohesion: 1.0
|
| 343 |
+
Nodes (1): diffusers==0.29.0
|
| 344 |
+
|
| 345 |
+
### Community 67 - "Community 67"
|
| 346 |
+
Cohesion: 1.0
|
| 347 |
+
Nodes (1): ARTIFACTS_ROOT env
|
| 348 |
+
|
| 349 |
+
### Community 68 - "Community 68"
|
| 350 |
+
Cohesion: 1.0
|
| 351 |
+
Nodes (1): AWS g4dn.xlarge alternative
|
| 352 |
+
|
| 353 |
+
### Community 69 - "Community 69"
|
| 354 |
+
Cohesion: 1.0
|
| 355 |
+
Nodes (1): nodejs (system pkg)
|
| 356 |
+
|
| 357 |
+
### Community 70 - "Community 70"
|
| 358 |
+
Cohesion: 1.0
|
| 359 |
+
Nodes (1): fonts-noto-core / cjk
|
| 360 |
+
|
| 361 |
+
### Community 71 - "Community 71"
|
| 362 |
+
Cohesion: 1.0
|
| 363 |
+
Nodes (1): graphify project rules
|
| 364 |
+
|
| 365 |
+
## Knowledge Gaps
|
| 366 |
+
- **329 isolated node(s):** `server.py — FastAPI backend for VideoVoice. Endpoints: POST /api/jobs`, `Download video from Instagram/YouTube using yt-dlp.`, `Allow only trusted social platforms for yt-dlp.`, `Read media duration from ffprobe.`, `Report CUDA/MPS availability.` (+324 more)
|
| 367 |
+
These have ≤1 connection - possible missing edges or undocumented components.
|
| 368 |
+
- **Thin community `Community 25`** (2 nodes): `gradio==6.8.0`, `gradio==6.12.0 (omni)`
|
| 369 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 370 |
+
- **Thin community `Community 33`** (1 nodes): `Load a Qwen3 TTS model and its processor in HuggingFace `from_pretrained` style.`
|
| 371 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 372 |
+
- **Thin community `Community 34`** (1 nodes): `Build voice-clone prompt items from reference audio (and optionally reference te`
|
| 373 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 374 |
+
- **Thin community `Community 35`** (1 nodes): `Voice clone speech using the Base model. You can provide either:`
|
| 375 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 376 |
+
- **Thin community `Community 36`** (1 nodes): `Generate speech with the VoiceDesign model using natural-language style instruct`
|
| 377 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 378 |
+
- **Thin community `Community 37`** (1 nodes): `Generate speech with the CustomVoice model using a predefined speaker id, option`
|
| 379 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 380 |
+
- **Thin community `Community 38`** (1 nodes): `Delete stale per-job artifact directories from ARTIFACTS_ROOT.`
|
| 381 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 382 |
+
- **Thin community `Community 39`** (1 nodes): `Reject oversized uploads before body parsing.`
|
| 383 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 384 |
+
- **Thin community `Community 40`** (1 nodes): `Run the translation pipeline in a background thread, pushing progress to the job`
|
| 385 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 386 |
+
- **Thin community `Community 41`** (1 nodes): `List whitelisted MP4 demo videos from outputs/ and data/.`
|
| 387 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 388 |
+
- **Thin community `Community 42`** (1 nodes): `Return curated showcase entries with resolved streaming URLs.`
|
| 389 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 390 |
+
- **Thin community `Community 43`** (1 nodes): `Submit a video for translation.`
|
| 391 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 392 |
+
- **Thin community `Community 44`** (1 nodes): `Poll endpoint returning new messages since index `after`, plus live wait status.`
|
| 393 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 394 |
+
- **Thin community `Community 45`** (1 nodes): `User selects a TTS model after previewing.`
|
| 395 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 396 |
+
- **Thin community `Community 46`** (1 nodes): `Serve a preview audio WAV file.`
|
| 397 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 398 |
+
- **Thin community `Community 47`** (1 nodes): `Download the translated video.`
|
| 399 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 400 |
+
- **Thin community `Community 48`** (1 nodes): `Create artifact directories and start background cleanup.`
|
| 401 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 402 |
+
- **Thin community `Community 49`** (1 nodes): `Sync TTS audio using pause-aware strategy: compress silences first, then atempo.`
|
| 403 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 404 |
+
- **Thin community `Community 50`** (1 nodes): `Rewrite WAV with silence regions compressed to keep_ratio of their original dura`
|
| 405 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 406 |
+
- **Thin community `Community 51`** (1 nodes): `Insert extra silence distributed across detected pause points.`
|
| 407 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 408 |
+
- **Thin community `Community 52`** (1 nodes): `Generate a silent WAV file of given duration.`
|
| 409 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 410 |
+
- **Thin community `Community 53`** (1 nodes): `Sync each TTS segment to its original timestamp window and stitch into a single`
|
| 411 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 412 |
+
- **Thin community `Community 54`** (1 nodes): `Translate the text of each segment into target_language in batches. Args:`
|
| 413 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 414 |
+
- **Thin community `Community 55`** (1 nodes): `Load + run Chatterbox inside a single GPU-decorated scope. ZeroGPU only int`
|
| 415 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 416 |
+
- **Thin community `Community 56`** (1 nodes): `Remove trailing noise/artifacts after speech ends.`
|
| 417 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 418 |
+
- **Thin community `Community 57`** (1 nodes): `Hard-trim TTS output to orig_dur * headroom, with a short fade-out.`
|
| 419 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 420 |
+
- **Thin community `Community 58`** (1 nodes): `Clip audio to max_sec to prevent excessively slow voice cloning.`
|
| 421 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 422 |
+
- **Thin community `Community 59`** (1 nodes): `Numpy variant of _trim_trailing_noise for engines returning np.ndarray.`
|
| 423 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 424 |
+
- **Thin community `Community 60`** (1 nodes): `Perform full OmniVoice processing (load + generate batch) inside a GPU-decorated`
|
| 425 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 426 |
+
- **Thin community `Community 61`** (1 nodes): `Generate speech for all segments using OmniVoice voice cloning.`
|
| 427 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 428 |
+
- **Thin community `Community 62`** (1 nodes): `Synthesise translated text for each segment using voice cloned from reference au`
|
| 429 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 430 |
+
- **Thin community `Community 63`** (1 nodes): `torch==2.6.0`
|
| 431 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 432 |
+
- **Thin community `Community 64`** (1 nodes): `fastapi`
|
| 433 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 434 |
+
- **Thin community `Community 65`** (1 nodes): `yt-dlp`
|
| 435 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 436 |
+
- **Thin community `Community 66`** (1 nodes): `diffusers==0.29.0`
|
| 437 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 438 |
+
- **Thin community `Community 67`** (1 nodes): `ARTIFACTS_ROOT env`
|
| 439 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 440 |
+
- **Thin community `Community 68`** (1 nodes): `AWS g4dn.xlarge alternative`
|
| 441 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 442 |
+
- **Thin community `Community 69`** (1 nodes): `nodejs (system pkg)`
|
| 443 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 444 |
+
- **Thin community `Community 70`** (1 nodes): `fonts-noto-core / cjk`
|
| 445 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 446 |
+
- **Thin community `Community 71`** (1 nodes): `graphify project rules`
|
| 447 |
+
Too small to be a meaningful cluster - may be noise or needs more connections extracted.
|
| 448 |
+
|
| 449 |
+
## Suggested Questions
|
| 450 |
+
_Questions this graph is uniquely positioned to answer:_
|
| 451 |
+
|
| 452 |
+
- **Why does `synthesise_segments()` connect `Community 6` to `Community 8`, `Community 11`?**
|
| 453 |
+
_High betweenness centrality (0.324) - this node is a cross-community bridge._
|
| 454 |
+
- **Why does `generate()` connect `Community 6` to `Community 0`, `Community 4`?**
|
| 455 |
+
_High betweenness centrality (0.200) - this node is a cross-community bridge._
|
| 456 |
+
- **Are the 44 inferred relationships involving `Qwen3TTSSpeakerEncoderConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
|
| 457 |
+
_`Qwen3TTSSpeakerEncoderConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
|
| 458 |
+
- **Are the 44 inferred relationships involving `Qwen3TTSTalkerCodePredictorConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
|
| 459 |
+
_`Qwen3TTSTalkerCodePredictorConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
|
| 460 |
+
- **Are the 44 inferred relationships involving `Qwen3TTSTalkerConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
|
| 461 |
+
_`Qwen3TTSTalkerConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
|
| 462 |
+
- **Are the 44 inferred relationships involving `Qwen3TTSConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
|
| 463 |
+
_`Qwen3TTSConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
|
| 464 |
+
- **What connects `server.py — FastAPI backend for VideoVoice. Endpoints: POST /api/jobs`, `Download video from Instagram/YouTube using yt-dlp.`, `Allow only trusted social platforms for yt-dlp.` to the rest of the system?**
|
| 465 |
+
_329 weakly-connected nodes found - possible documentation gaps or missing edges._
|
graphify-out/graph.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
packages.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
| 2 |
+
nodejs
|
| 3 |
+
fonts-noto-core
|
| 4 |
+
fonts-noto-cjk
|
pipeline.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
pipeline.py — Core pipeline: CLI entrypoint + importable run_pipeline() for Gradio.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python pipeline.py --input data/test_video_3.mp4 --target-lang Spanish
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import argparse
|
| 10 |
+
import os
|
| 11 |
+
import io
|
| 12 |
+
import logging
|
| 13 |
+
import os
|
| 14 |
+
import shutil
|
| 15 |
+
import sys
|
| 16 |
+
import threading
|
| 17 |
+
import time
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Generator
|
| 20 |
+
|
| 21 |
+
from steps.s1_extract_audio import extract_audio, extract_audio_hq
|
| 22 |
+
from steps.s2_transcribe import transcribe, POLLEN_TRANSCRIBE_MODEL
|
| 23 |
+
from steps.s3_translate import translate
|
| 24 |
+
from steps.s4_tts import synthesise_segments
|
| 25 |
+
from steps.s5_sync import sync_and_stitch
|
| 26 |
+
from steps.s6_captions import generate_captions
|
| 27 |
+
from steps.s6_merge import merge_audio_video
|
| 28 |
+
|
| 29 |
+
def _log_step_done(label: str, start: float):
    """Report how long a finished step took, followed by a divider line.

    Args:
        label: Short tag for the step; printed inside square brackets.
        start: ``time.time()`` reading taken when the step began.
    """
    took = time.time() - start
    if took < 60:
        # Sub-minute durations are shown as whole seconds only.
        print(f"[{label}] Duration: {int(took)}s")
    else:
        whole_minutes, remainder = divmod(took, 60)
        print(f"[{label}] Duration: {int(whole_minutes)}m {int(remainder)}s")
    print("=" * 40)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Maps the human-readable target-language name to the short language code.
# run_pipeline() looks the target language up here (falling back to "es") and
# passes the code to the TTS step as ``language_id``; main() also uses the keys
# as the allowed values of --target-lang.
LANGUAGE_CODES = {
    "Arabic": "ar",
    "Chinese": "zh",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Finnish": "fi",
    "French": "fr",
    "German": "de",
    "Greek": "el",
    "Hebrew": "he",
    "Hindi": "hi",
    "Italian": "it",
    "Japanese": "ja",
    "Korean": "ko",
    "Malay": "ms",
    "Norwegian": "no",
    "Polish": "pl",
    "Portuguese": "pt",
    "Russian": "ru",
    "Spanish": "es",
    "Swahili": "sw",
    "Swedish": "sv",
    "Turkish": "tr",
    # NOTE(review): Urdu is mapped to the Hindi code "hi", not "ur" —
    # presumably a deliberate stand-in for the TTS engine; confirm.
    "Urdu": "hi",
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def run_pipeline(
    video_path: str,
    target_language: str = "Spanish",
    source_language: str = "auto",
    output_path: str | None = None,
    voice_mode: str = "chatterbox",
    preview_event: threading.Event | None = None,
    job_state: dict | None = None,
    captions: bool = True,
    preserve_music: bool = False,
    data_dir: str | None = None,
    video_link: str | None = None,
) -> Generator[str | dict, None, str]:
    """
    Run the full translation pipeline, yielding progress messages.

    This is a generator: no work happens until it is iterated, and the output
    path is delivered as the generator's return value (``StopIteration.value``).

    Side effects visible in this function: the relative ``tmp`` directory is
    deleted and recreated, and ``sys.stdout`` / ``sys.stderr`` are replaced by
    a tee that also writes to ``tmp/logs.txt`` until the ``finally`` block
    restores them.

    Args:
        video_path: Path to the input video file.
        target_language: Target language name (e.g. "Spanish"). Looked up in
            LANGUAGE_CODES to obtain the TTS language code; names missing
            from that table fall back to "es".
        source_language: ISO-639-1 code of the source language, or "auto" for
            Whisper to auto-detect (default "auto"). Forcing a wrong code makes
            Whisper silently translate-and-transcribe instead of transcribing.
        output_path: Where to save the output video. Auto-generated if None:
            ``<data_dir>/output.mp4`` when data_dir is given, otherwise
            ``output_<input stem>_<target language lower-cased>.mp4``.
        voice_mode: TTS engine to use ("chatterbox" or "omnivoice"); passed to
            the TTS step as ``model_name``.
            In Space deployments, this must match TTS_ENGINE env var; if the
            variable is set and differs, a warning is yielded and the
            variable's value is used instead.
        preview_event: Deprecated - kept for compatibility, but unused in single-engine mode.
        job_state: Shared dict with the server. Not read or written anywhere
            in this function body.
        captions: When true, captions are generated to ``tmp/captions.ass``
            and handed to the merge step.
        preserve_music: When true, a high-quality audio track is extracted and
            split into vocals and accompaniment; the vocals feed transcription
            and TTS, the accompaniment is handed to the merge step, and the
            reported step count grows by one.
        data_dir: Optional directory. Used for the default output path and, at
            the end of the run, receives copies of logs.txt,
            transcription.json, llm_calls.json, tts_manifest.json and
            segment_comparison.json (each only if the file exists).
        video_link: Optional source URL. Stored in transcription.json with its
            query string and fragment removed.

    Yields:
        str: Progress messages for each step.
        dict: Any dict produced by the TTS step is forwarded unchanged, except
            the one keyed "__TTS_RESULT__", which is consumed here to replace
            ``segments``.

    Returns:
        str: Path to the translated output video.
    """
    # Single-engine mode: voice_mode must match TTS_ENGINE if set
    space_engine = os.getenv("TTS_ENGINE")
    if space_engine and voice_mode != space_engine:
        yield f"⚠️ Warning: voice_mode='{voice_mode}' but Space TTS_ENGINE='{space_engine}'. Using {space_engine}.\n"
        voice_mode = space_engine

    # Fixed step count (no more preview_both mode)
    total_steps = 6 + (1 if preserve_music else 0)

    # Prepare output path
    if output_path is None:
        if data_dir:
            output_path = str(Path(data_dir) / "output.mp4")
        else:
            stem = Path(video_path).stem
            output_path = f"output_{stem}_{target_language.lower()}.mp4"

    # Clean tmp dir
    # (relative path: resolved against the current working directory)
    shutil.rmtree("tmp", ignore_errors=True)
    os.makedirs("tmp/audio/source", exist_ok=True)

    # Set up logging to tmp/logs.txt (clean logs only, no torch/chatterbox noise)
    log_path = "tmp/logs.txt"
    _log_file = open(log_path, "w", encoding="utf-8")
    # Remember the real streams so the finally block can put them back.
    _orig_stdout = sys.stdout
    _orig_stderr = sys.stderr

    # Patterns to filter out of log file (still shown in terminal)
    _NOISE = (
        "Sampling:", "sampling", "UserWarning", "FutureWarning", "DeprecationWarning",
        "torch.backends", "torch.functional", "torch.fft", "torchaudio/compliance",
        "sdp_kernel", "LoRACompatible", "pkg_resources", "Fetching",
        "output_attentions", "TRANSFORMERS_VERBOSITY",
        "istft", "stft", "resize_", "inverse_transform",
        "PerthNet", "loaded Perth", "diffusers/models",
        "chatterbox/models/s3gen", "alignment_stream_analyzer",
        "WARNING:chatterbox",
    )

    class _Tee(io.TextIOBase):
        """Write to both the original stream and the log file (filtered)."""
        def __init__(self, original, filter_noise=False):
            self._original = original
            self._filter = filter_noise
        def write(self, s):
            # The terminal always receives the text; only the log copy is filtered.
            self._original.write(s)
            if self._filter and any(p in s for p in _NOISE):
                return len(s)
            # Skip the log once it has been closed by the finally block.
            if not _log_file.closed:
                _log_file.write(s)
                _log_file.flush()
            return len(s)
        def flush(self):
            self._original.flush()
            if not _log_file.closed:
                _log_file.flush()

    sys.stdout = _Tee(_orig_stdout, filter_noise=True)
    sys.stderr = _Tee(_orig_stderr, filter_noise=True)

    try:
        yield f"🎬 Starting pipeline: {video_path} → {target_language}\n"

        # Step 1: Extract audio
        yield f"🔊 Step 1/{total_steps}: Extracting audio...\n"
        _t0 = time.time()
        audio_path = extract_audio(video_path, "tmp/audio/source/extracted_audio.wav")
        yield f" ✓ Audio extracted: {audio_path}\n"

        # Step 1b: Source separation (conditional)
        vocals_path = audio_path  # default: use full mix
        music_path = None
        if preserve_music:
            # Imported only on this branch, so the separation module is not
            # loaded when music preservation is off.
            from steps.s1b_separate import separate_audio

            audio_hq = extract_audio_hq(video_path, "tmp/audio/source/extracted_audio_hq.wav")
            _log_step_done("s1", _t0)

            yield f"🎵 Step 2/{total_steps}: Separating vocals from background music...\n"
            _t0 = time.time()
            vocals_path, music_path = separate_audio(audio_hq, "tmp/audio/source")
            yield f" ✓ Vocals and accompaniment separated\n"
            _log_step_done("s1b", _t0)
        else:
            _log_step_done("s1", _t0)

        # Step offset: steps after separation shift by 1 when preserve_music is on
        step_offset = 1 if preserve_music else 0

        # Step 2: Transcribe
        yield f"📝 Step {2 + step_offset}/{total_steps}: Transcribing (Pollinations Whisper / mlx-whisper)...\n"
        _t0 = time.time()
        segments = transcribe(vocals_path, language=source_language)
        yield f" ✓ {len(segments)} segments transcribed\n"
        for seg in segments:
            yield f" [{seg['start']:.1f}s–{seg['end']:.1f}s] {seg['text']}\n"

        # Dump transcription to tmp for inspection
        import json as _json
        from urllib.parse import urlparse, urlunparse
        with open("tmp/transcription.json", "w", encoding="utf-8") as _tf:
            out_data = {
                "model_provider": "pollinations",
                "model_name": POLLEN_TRANSCRIBE_MODEL,
                "source_language": source_language,
                "audio_path": vocals_path,
                "segment_count": len(segments),
                # End time of the last segment; 0 when nothing was transcribed.
                "total_duration": round(segments[-1]["end"], 2) if segments else 0,
                "segments": [
                    {
                        "index": i,
                        "start": seg["start"],
                        "end": seg["end"],
                        "duration": round(seg["end"] - seg["start"], 2),
                        "text": seg["text"],
                        # Word-level data is included only when the segment carries it.
                        **({"words": seg["words"]} if "words" in seg else {}),
                    }
                    for i, seg in enumerate(segments)
                ],
            }
            if video_link:
                # Drop the query string and fragment before recording the link,
                # and put it first in the resulting JSON object.
                parsed = urlparse(video_link)
                clean_link = urlunparse(parsed._replace(query="", fragment=""))
                out_data = {"video_link": clean_link, **out_data}

            _json.dump(out_data, _tf, indent=2, ensure_ascii=False)

        _log_step_done("s2", _t0)

        # Step 3: Translate
        yield f"🌍 Step {3 + step_offset}/{total_steps}: Translating to {target_language}...\n"
        _t0 = time.time()
        segments = translate(segments, target_language)
        yield f" ✓ Translation complete\n"
        for seg in segments:
            yield f" → {seg['translated_text']}\n"

        # Unknown language names fall back to the Spanish code.
        target_lang_code = LANGUAGE_CODES.get(target_language, "es")
        _log_step_done("s3", _t0)

        # ── Step 4: TTS Synthesis ───────────────────────────────
        model_name = voice_mode  # Uses TTS_ENGINE env var in Space deployments

        yield f"🗣️ Step {4 + step_offset}/{total_steps}: Synthesising speech ({model_name})...\n"
        _t0 = time.time()
        tts_gen = synthesise_segments(
            segments, vocals_path,
            language_id=target_lang_code,
            output_dir="tmp/audio/tts",
            model_name=model_name,
        )
        # The TTS step is itself a generator: forward its messages, but keep
        # the result dict for ourselves instead of yielding it.
        for msg in tts_gen:
            if isinstance(msg, dict) and "__TTS_RESULT__" in msg:
                segments = msg["__TTS_RESULT__"]
            else:
                yield msg

        yield f" ✓ {len(segments)} segments synthesised\n"
        _log_step_done("s4_tts", _t0)

        # Step 5: Sync
        yield f"⏱️ Step {5 + step_offset}/{total_steps}: Syncing audio to original timestamps...\n"
        _t0 = time.time()
        final_audio = sync_and_stitch(segments, "tmp/audio/final_audio.wav", "tmp/audio/tts_synced")
        yield f" ✓ Audio synced: {final_audio}\n"
        _log_step_done("s5", _t0)

        # Captions + Merge
        # (the "s6" timer started here covers caption generation and the merge)
        captions_path = None
        _t0 = time.time()
        if captions:
            captions_path = generate_captions(segments, "tmp/captions.ass", target_language=target_language)
            yield f" ✓ Captions generated: {captions_path}\n"

        # Step 6: Merge
        music_label = " + music" if music_path else ""
        yield f"🎞️ Step {6 + step_offset}/{total_steps}: Merging translated audio{' + captions' if captions_path else ''}{music_label} into video...\n"
        result = merge_audio_video(video_path, final_audio, output_path, captions_path=captions_path, music_path=music_path)
        _log_step_done("s6", _t0)
        yield f"\n✅ Done! Output saved to: {result}\n"

    finally:
        # Always undo the stream redirection and release the log file.
        sys.stdout = _orig_stdout
        sys.stderr = _orig_stderr
        if not _log_file.closed:
            _log_file.close()

    # Export run artefacts next to the output when a data_dir was given.
    if data_dir:
        def _safe_copy(src, dst_name):
            # Files a given run did not produce are silently skipped.
            if os.path.exists(src):
                shutil.copy2(src, os.path.join(data_dir, dst_name))
        _safe_copy(log_path, "logs.txt")
        _safe_copy("tmp/transcription.json", "transcription.json")
        _safe_copy("tmp/llm_calls.json", "llm_calls.json")
        _safe_copy("tmp/audio/tts/tts_manifest.json", "tts_manifest.json")
        _safe_copy("tmp/audio/tts/segment_comparison.json", "segment_comparison.json")

    print(f"[pipeline] Logs saved → {log_path}")

    return result
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _collect_output(gen: Generator) -> tuple[list[str], str]:
    """Drain *gen*, echoing each message, and capture its return value.

    Dict items are skipped (preview sentinels from the deprecated
    preview_both flow); every other item is stored and printed without an
    added newline.

    Returns:
        A ``(messages, output_path)`` pair, where ``output_path`` is the value
        the generator returned (``None`` if it returned nothing).
    """
    collected = []
    final_path = None
    while True:
        try:
            item = next(gen)
        except StopIteration as finished:
            # A generator's ``return`` value travels on StopIteration.value.
            final_path = finished.value
            break
        if not isinstance(item, dict):
            collected.append(item)
            print(item, end="", flush=True)
    return collected, final_path
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def main():
    """Command-line entry point: parse arguments, run the pipeline, print the result path."""
    cli = argparse.ArgumentParser(description="Video Translation Pipeline")
    cli.add_argument("--input", required=True, help="Input video path")
    cli.add_argument(
        "--target-lang",
        choices=list(LANGUAGE_CODES.keys()),
        default="Spanish",
        help="Target language (default: Spanish)",
    )
    cli.add_argument(
        "--source-lang",
        default="auto",
        help="Source language ISO-639-1 code or 'auto' to let Whisper detect (default: auto)",
    )
    cli.add_argument("--output", default=None, help="Output video path")
    cli.add_argument(
        "--voice-mode",
        choices=["chatterbox", "omnivoice", "qwen3"],
        default="chatterbox",
        help="TTS engine to use (default: chatterbox). Must match TTS_ENGINE env var in Space deployments.",
    )
    cli.add_argument(
        "--preserve-music",
        action="store_true",
        help="Separate and preserve background music using Demucs",
    )
    options = cli.parse_args()

    # Map the parsed flags onto run_pipeline's keyword arguments.
    pipeline_kwargs = {
        "video_path": options.input,
        "target_language": options.target_lang,
        "source_language": options.source_lang,
        "output_path": options.output,
        "voice_mode": options.voice_mode,
        "preserve_music": options.preserve_music,
    }
    # _collect_output prints progress as it goes; only the final path is needed here.
    _messages, final_path = _collect_output(run_pipeline(**pipeline_kwargs))
    print(f"\nFinal output: {final_path}")
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "videovoice"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "AI-powered short video translation with zero-shot voice cloning"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
license = "MIT"
|
| 7 |
+
requires-python = ">=3.10,<3.13"
|
| 8 |
+
dependencies = [
|
| 9 |
+
"openai>=2.30.0",
|
| 10 |
+
"requests>=2.33.0",
|
| 11 |
+
"python-dotenv>=1.2.2",
|
| 12 |
+
"pydub>=0.25.1",
|
| 13 |
+
"ffmpeg-python>=0.2.0",
|
| 14 |
+
"mlx-whisper>=0.4.3",
|
| 15 |
+
"tqdm>=4.67.3",
|
| 16 |
+
"fastapi>=0.135.2",
|
| 17 |
+
"uvicorn[standard]>=0.42.0",
|
| 18 |
+
"python-multipart>=0.0.22",
|
| 19 |
+
"yt-dlp>=2026.3.17",
|
| 20 |
+
"sse-starlette>=3.3.4",
|
| 21 |
+
"soundfile>=0.13.1",
|
| 22 |
+
"deep-translator>=1.11.4",
|
| 23 |
+
"demucs>=4.0.1",
|
| 24 |
+
"boto3>=1.42.82",
|
| 25 |
+
"torch==2.6.0",
|
| 26 |
+
"torchaudio==2.6.0",
|
| 27 |
+
"slowapi>=0.1.9",
|
| 28 |
+
"faster-whisper>=1.2.1",
|
| 29 |
+
"spaces>=0.48.3",
|
| 30 |
+
"openai-whisper>=20240930",
|
| 31 |
+
"gradio>=6.12.0",
|
| 32 |
+
"accelerate>=1.12.0",
|
| 33 |
+
"transformers>=4.57.3",
|
| 34 |
+
]
|
| 35 |
+
|
| 36 |
+
[project.optional-dependencies]
|
| 37 |
+
# HF Spaces install from requirements-{cbox,omni}.txt and ignore these.
|
| 38 |
+
# Locally: `uv sync --extra chatterbox` installs the PyPI chatterbox-tts
|
| 39 |
+
# (we skip-worktree the vendored ./chatterbox/ folder so it doesn't shadow
|
| 40 |
+
# the PyPI package). `--extra omnivoice` is heavier and optional.
|
| 41 |
+
chatterbox = ["chatterbox-tts>=0.1.7"]
|
| 42 |
+
omnivoice = ["omnivoice>=0.1.4"]
|
| 43 |
+
|
| 44 |
+
[tool.uv]
|
| 45 |
+
# Declare chatterbox and omnivoice extras as mutually exclusive so uv
|
| 46 |
+
# doesn't try to resolve them into one lockfile view.
|
| 47 |
+
conflicts = [
|
| 48 |
+
[{ extra = "chatterbox" }, { extra = "omnivoice" }],
|
| 49 |
+
]
|
| 50 |
+
override-dependencies = [
|
| 51 |
+
# onnxruntime 1.24.x metadata claims py3.10 support but no 3.10 wheels
|
| 52 |
+
# ship on PyPI — force resolution to the last version that has 3.10 wheels.
|
| 53 |
+
"onnxruntime<1.24",
|
| 54 |
+
# chatterbox-tts==0.1.7 pins gradio==6.8.0, but app.py needs >=6.12.0
|
| 55 |
+
# for gradio.Server. Override so the extras can coexist in a lockfile;
|
| 56 |
+
# gradio is only loaded by app.py (HF), so the local chatterbox install
|
| 57 |
+
# never exercises gradio code.
|
| 58 |
+
"gradio>=6.12.0",
|
| 59 |
+
]
|
requirements-cbox.txt
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
setuptools<70.0.0
|
| 2 |
+
# Core ML
|
| 3 |
+
torch==2.8.0
|
| 4 |
+
torchaudio==2.8.0
|
| 5 |
+
accelerate==1.12.0
|
| 6 |
+
transformers>=4.57.3
|
| 7 |
+
diffusers==0.29.0
|
| 8 |
+
safetensors==0.5.3
|
| 9 |
+
|
| 10 |
+
# Audio processing
|
| 11 |
+
librosa==0.11.0
|
| 12 |
+
soundfile
|
| 13 |
+
pydub
|
| 14 |
+
demucs==4.0.1
|
| 15 |
+
openunmix
|
| 16 |
+
pyloudnorm
|
| 17 |
+
|
| 18 |
+
# Transcription
|
| 19 |
+
faster-whisper
|
| 20 |
+
|
| 21 |
+
# Translation
|
| 22 |
+
deep-translator
|
| 23 |
+
|
| 24 |
+
# TTS
|
| 25 |
+
conformer==0.3.2
|
| 26 |
+
omegaconf
|
| 27 |
+
pykakasi==2.3.0
|
| 28 |
+
resemble-perth>=1.0.0
|
| 29 |
+
s3tokenizer
|
| 30 |
+
spacy-pkuseg
|
| 31 |
+
|
| 32 |
+
# API / server
|
| 33 |
+
fastapi
|
| 34 |
+
uvicorn
|
| 35 |
+
slowapi
|
| 36 |
+
sse-starlette
|
| 37 |
+
python-multipart
|
| 38 |
+
python-dotenv
|
| 39 |
+
pydantic
|
| 40 |
+
|
| 41 |
+
# HuggingFace
|
| 42 |
+
huggingface-hub
|
| 43 |
+
spaces
|
| 44 |
+
|
| 45 |
+
# Utilities
|
| 46 |
+
openai
|
| 47 |
+
boto3
|
| 48 |
+
yt-dlp
|
| 49 |
+
ffmpeg-python
|
| 50 |
+
numpy<2.0.0
|
| 51 |
+
pandas<2.3.0
|
requirements-omni.txt
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Requirements for OmniVoice TTS Space (ZeroGPU / Python 3.10)
|
| 2 |
+
# TTS Engine: OmniVoice (set TTS_ENGINE=omnivoice in Space Secrets)
|
| 3 |
+
#
|
| 4 |
+
# This Space serves only the OmniVoice TTS engine, avoiding dependency
|
| 5 |
+
# conflicts with chatterbox-tts (which pins transformers==5.2.0).
|
| 6 |
+
|
| 7 |
+
accelerate==1.12.0
|
| 8 |
+
aiofiles
|
| 9 |
+
annotated-types
|
| 10 |
+
anyio
|
| 11 |
+
audioread
|
| 12 |
+
av
|
| 13 |
+
beautifulsoup4
|
| 14 |
+
boto3
|
| 15 |
+
botocore
|
| 16 |
+
brotli
|
| 17 |
+
catalogue
|
| 18 |
+
certifi
|
| 19 |
+
cffi
|
| 20 |
+
cfgv
|
| 21 |
+
charset-normalizer
|
| 22 |
+
click
|
| 23 |
+
cloudpickle
|
| 24 |
+
coloredlogs
|
| 25 |
+
conformer
|
| 26 |
+
ctranslate2
|
| 27 |
+
decorator
|
| 28 |
+
deep-translator
|
| 29 |
+
demucs==4.0.1
|
| 30 |
+
deprecated
|
| 31 |
+
diffusers
|
| 32 |
+
distlib
|
| 33 |
+
distro
|
| 34 |
+
dora-search
|
| 35 |
+
einops
|
| 36 |
+
fastapi
|
| 37 |
+
faster-whisper
|
| 38 |
+
ffmpeg-python
|
| 39 |
+
ffmpy
|
| 40 |
+
filelock
|
| 41 |
+
flatbuffers
|
| 42 |
+
fsspec
|
| 43 |
+
future
|
| 44 |
+
gradio==6.12.0
|
| 45 |
+
gradio-client
|
| 46 |
+
h11
|
| 47 |
+
httpcore
|
| 48 |
+
httptools
|
| 49 |
+
httpx
|
| 50 |
+
huggingface-hub
|
| 51 |
+
humanfriendly
|
| 52 |
+
identify
|
| 53 |
+
idna
|
| 54 |
+
importlib-metadata
|
| 55 |
+
jaconv
|
| 56 |
+
jinja2
|
| 57 |
+
jiter
|
| 58 |
+
jmespath
|
| 59 |
+
joblib
|
| 60 |
+
julius
|
| 61 |
+
lameenc
|
| 62 |
+
lazy-loader
|
| 63 |
+
librosa
|
| 64 |
+
limits
|
| 65 |
+
llvmlite
|
| 66 |
+
markdown-it-py
|
| 67 |
+
markupsafe
|
| 68 |
+
mdurl
|
| 69 |
+
ml-dtypes
|
| 70 |
+
mlx; sys_platform == 'darwin'
|
| 71 |
+
mlx-whisper; sys_platform == 'darwin'
|
| 72 |
+
more-itertools
|
| 73 |
+
mpmath
|
| 74 |
+
msgpack
|
| 75 |
+
networkx
|
| 76 |
+
nodeenv
|
| 77 |
+
numba
|
| 78 |
+
numpy<2.0.0
|
| 79 |
+
omegaconf
|
| 80 |
+
onnx
|
| 81 |
+
onnxruntime
|
| 82 |
+
openai
|
| 83 |
+
openai-whisper
|
| 84 |
+
openunmix
|
| 85 |
+
orjson
|
| 86 |
+
packaging
|
| 87 |
+
pandas<2.3.0
|
| 88 |
+
pillow
|
| 89 |
+
platformdirs
|
| 90 |
+
pooch
|
| 91 |
+
pre-commit
|
| 92 |
+
protobuf
|
| 93 |
+
psutil
|
| 94 |
+
pycparser
|
| 95 |
+
pydantic
|
| 96 |
+
pydantic-core
|
| 97 |
+
pydub
|
| 98 |
+
pygments
|
| 99 |
+
pykakasi
|
| 100 |
+
pyloudnorm
|
| 101 |
+
python-dateutil
|
| 102 |
+
python-discovery
|
| 103 |
+
python-dotenv
|
| 104 |
+
python-multipart
|
| 105 |
+
pytz
|
| 106 |
+
pyyaml
|
| 107 |
+
regex
|
| 108 |
+
resemble-perth
|
| 109 |
+
retrying
|
| 110 |
+
rich
|
| 111 |
+
s3tokenizer
|
| 112 |
+
s3transfer
|
| 113 |
+
safehttpx
|
| 114 |
+
safetensors
|
| 115 |
+
scikit-learn
|
| 116 |
+
scipy
|
| 117 |
+
semantic-version
|
| 118 |
+
setuptools
|
| 119 |
+
shellingham
|
| 120 |
+
six
|
| 121 |
+
slowapi
|
| 122 |
+
sniffio
|
| 123 |
+
soundfile
|
| 124 |
+
soupsieve
|
| 125 |
+
soxr
|
| 126 |
+
spaces
|
| 127 |
+
spacy-pkuseg
|
| 128 |
+
srsly
|
| 129 |
+
sse-starlette
|
| 130 |
+
starlette
|
| 131 |
+
submitit
|
| 132 |
+
sympy
|
| 133 |
+
threadpoolctl
|
| 134 |
+
tiktoken
|
| 135 |
+
tokenizers
|
| 136 |
+
tomlkit
|
| 137 |
+
torch==2.8.0
|
| 138 |
+
torchaudio==2.8.0
|
| 139 |
+
tqdm
|
| 140 |
+
transformers>=4.57.3
|
| 141 |
+
treetable
|
| 142 |
+
typer
|
| 143 |
+
typing-extensions
|
| 144 |
+
typing-inspection
|
| 145 |
+
tzdata
|
| 146 |
+
urllib3
|
| 147 |
+
uvicorn
|
| 148 |
+
uvloop; sys_platform != 'win32'
|
| 149 |
+
virtualenv
|
| 150 |
+
watchfiles
|
| 151 |
+
websockets
|
| 152 |
+
wrapt
|
| 153 |
+
yt-dlp
|
| 154 |
+
zipp
|
| 155 |
+
|
| 156 |
+
# OmniVoice TTS
|
| 157 |
+
omnivoice>=0.1.4
|
requirements-qwen3.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Requirements for Qwen3-TTS Space (ZeroGPU / Python 3.10)
|
| 2 |
+
# TTS Engine: Qwen3-TTS Base 1.7B (set TTS_ENGINE=qwen3 in Space Secrets)
|
| 3 |
+
#
|
| 4 |
+
# This Space serves only the Qwen3-TTS engine, mirroring the chatterbox/
|
| 5 |
+
# omnivoice split. Pins are derived from the official Qwen/Qwen3-TTS Space
|
| 6 |
+
# (torch 2.8, transformers 4.57.3) plus the VideoVoice pipeline's
|
| 7 |
+
# transcription/translation/audio dependencies.
|
| 8 |
+
|
| 9 |
+
# ── Qwen3-TTS core (matches Qwen/Qwen3-TTS Space) ────────────
|
| 10 |
+
# NOTE: `qwen_tts` is NOT a PyPI package. The Qwen3TTSModel class is loaded
|
| 11 |
+
# from a vendored `qwen_tts/` directory at the repo root, mirroring the
|
| 12 |
+
# vendored `chatterbox/` folder pattern. Copy that directory from
|
| 13 |
+
# https://huggingface.co/spaces/Qwen/Qwen3-TTS/tree/main/qwen_tts into this
|
| 14 |
+
# repo before deploying.
|
| 15 |
+
torch==2.8.0
|
| 16 |
+
torchaudio==2.8.0
|
| 17 |
+
transformers==4.57.3
|
| 18 |
+
accelerate==1.12.0
|
| 19 |
+
einops
|
| 20 |
+
librosa
|
| 21 |
+
soundfile
|
| 22 |
+
sox
|
| 23 |
+
onnxruntime
|
| 24 |
+
kernels
|
| 25 |
+
spaces
|
| 26 |
+
|
| 27 |
+
# ── VideoVoice pipeline (transcription + translation + IO) ──
|
| 28 |
+
fastapi
|
| 29 |
+
uvicorn
|
| 30 |
+
slowapi
|
| 31 |
+
sse-starlette
|
| 32 |
+
python-multipart
|
| 33 |
+
python-dotenv
|
| 34 |
+
pydantic
|
| 35 |
+
|
| 36 |
+
faster-whisper
|
| 37 |
+
openai-whisper
|
| 38 |
+
mlx; sys_platform == 'darwin'
|
| 39 |
+
mlx-whisper; sys_platform == 'darwin'
|
| 40 |
+
|
| 41 |
+
deep-translator
|
| 42 |
+
openai
|
| 43 |
+
|
| 44 |
+
demucs==4.0.1
|
| 45 |
+
openunmix
|
| 46 |
+
pyloudnorm
|
| 47 |
+
pydub
|
| 48 |
+
ffmpeg-python
|
| 49 |
+
|
| 50 |
+
huggingface-hub
|
| 51 |
+
boto3
|
| 52 |
+
yt-dlp
|
| 53 |
+
gradio==6.12.0
|
| 54 |
+
numpy<2.0.0
|
| 55 |
+
pandas<2.3.0
|
requirements.txt
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Requirements for Dramabox Space (ZeroGPU / Python 3.10)
|
| 2 |
+
# TTS Engine: Resemble Dramabox (set TTS_ENGINE=dramabox in Space Secrets)
|
| 3 |
+
#
|
| 4 |
+
# This Space serves the Dramabox "directable speech" model via the
|
| 5 |
+
# /api/tools/dramabox tools endpoint. The dub pipeline is reachable but
|
| 6 |
+
# rejects voice_mode != "dramabox" (server.py), and the frontend never
|
| 7 |
+
# routes dub requests here.
|
| 8 |
+
#
|
| 9 |
+
# NOTE: The Dramabox inference glue (TTSServer, model_downloader) is NOT
|
| 10 |
+
# a PyPI package. Vendor it from
|
| 11 |
+
# https://huggingface.co/spaces/ResembleAI/Dramabox/tree/main/src
|
| 12 |
+
# into this repo as `dramabox_src/` before deploying. The tools_api/dramabox
|
| 13 |
+
# worker adds that path to sys.path on first request.
|
| 14 |
+
|
| 15 |
+
# ── Dramabox core (verbatim from upstream ResembleAI/Dramabox Space) ──
|
| 16 |
+
torch==2.8.0
|
| 17 |
+
torchaudio==2.8.0
|
| 18 |
+
# pydantic 2.11+ emits bool-shorthand `additionalProperties: True` which
|
| 19 |
+
# crashes gradio_client's get_type. 2.10.6 is the last version emitting
|
| 20 |
+
# the dict form — Dramabox requires this pin.
|
| 21 |
+
pydantic==2.10.6
|
| 22 |
+
safetensors>=0.4.0
|
| 23 |
+
accelerate>=0.25.0
|
| 24 |
+
peft>=0.7.0
|
| 25 |
+
av>=12.0.0
|
| 26 |
+
einops>=0.7.0
|
| 27 |
+
PyYAML>=6.0
|
| 28 |
+
sentencepiece>=0.1.99
|
| 29 |
+
transformers>=4.45.0
|
| 30 |
+
huggingface_hub>=0.20.0,<1.0
|
| 31 |
+
bitsandbytes>=0.45.0
|
| 32 |
+
gradio==5.7.1
|
| 33 |
+
spaces>=0.30.0
|
| 34 |
+
soundfile>=0.12.0
|
| 35 |
+
resemble-perth @ git+https://github.com/resemble-ai/Perth.git@master
|
| 36 |
+
|
| 37 |
+
# ── VideoVoice pipeline (server.py / app.py imports these at startup) ──
|
| 38 |
+
fastapi
|
| 39 |
+
uvicorn
|
| 40 |
+
slowapi
|
| 41 |
+
sse-starlette
|
| 42 |
+
python-multipart
|
| 43 |
+
python-dotenv
|
| 44 |
+
|
| 45 |
+
faster-whisper
|
| 46 |
+
openai-whisper
|
| 47 |
+
mlx; sys_platform == 'darwin'
|
| 48 |
+
mlx-whisper; sys_platform == 'darwin'
|
| 49 |
+
|
| 50 |
+
deep-translator
|
| 51 |
+
openai
|
| 52 |
+
|
| 53 |
+
demucs==4.0.1
|
| 54 |
+
openunmix
|
| 55 |
+
pyloudnorm
|
| 56 |
+
pydub
|
| 57 |
+
ffmpeg-python
|
| 58 |
+
|
| 59 |
+
boto3
|
| 60 |
+
yt-dlp
|
| 61 |
+
numpy<2.0.0
|
| 62 |
+
pandas<2.3.0
|
scripts/prefetch_models.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prefetch model weights into HF_HOME for faster cold starts on Spaces."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def _prefetch_chatterbox() -> None:
    """Instantiate the Chatterbox multilingual TTS model on CPU and discard it."""
    # Imported lazily so the package is only required when this engine is prefetched.
    from chatterbox.mtl_tts import ChatterboxMultilingualTTS as _ChatterboxModel

    print("[prefetch] Chatterbox Multilingual TTS")
    _loaded = _ChatterboxModel.from_pretrained("cpu")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _prefetch_faster_whisper() -> None:
    """Instantiate every configured faster-whisper model on CPU so its weights are fetched.

    ``FASTER_WHISPER_MODELS`` (comma-separated) takes precedence. When it is unset
    or empty, the single name in ``FASTER_WHISPER_MODEL`` (default ``large-v3``)
    is used instead.
    """
    from faster_whisper import WhisperModel

    configured = os.getenv("FASTER_WHISPER_MODELS")
    if not configured:
        names = [os.getenv("FASTER_WHISPER_MODEL", "large-v3")]
    else:
        # Blank entries (e.g. from a trailing comma) are dropped.
        names = [part.strip() for part in configured.split(",") if part.strip()]

    for name in names:
        print(f"[prefetch] faster-whisper {name}")
        WhisperModel(name, device="cpu", compute_type="int8")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _prefetch_demucs() -> None:
    """Load the ``htdemucs`` Demucs model once so its weights are fetched."""
    from demucs.pretrained import get_model

    print("[prefetch] Demucs htdemucs")
    # Return value intentionally discarded; only the load matters here.
    get_model("htdemucs")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def main() -> None:
    """Prefetch all model weights: Chatterbox (only when selected), faster-whisper, Demucs."""
    engine = os.getenv("TTS_ENGINE", "chatterbox").lower()
    print(f"[prefetch] HF_HOME={os.getenv('HF_HOME', '<unset>')}")

    if engine != "chatterbox":
        print(f"[prefetch] skipping chatterbox prefetch for TTS_ENGINE={engine}")
    else:
        _prefetch_chatterbox()

    # These two are fetched regardless of the chosen TTS engine.
    _prefetch_faster_whisper()
    _prefetch_demucs()
    print("[prefetch] done")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
|
| 47 |
+
main()
|
server.py
ADDED
|
@@ -0,0 +1,929 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server.py — FastAPI backend for VideoVoice.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /api/jobs — Submit a video for translation (file upload or URL)
|
| 6 |
+
GET /api/jobs/{id} — SSE stream of pipeline progress
|
| 7 |
+
GET /api/jobs/{id}/result — Download the translated video
|
| 8 |
+
POST /api/jobs/{id}/select-model — Select TTS model after preview
|
| 9 |
+
GET /api/jobs/{id}/preview/{model} — Stream preview audio
|
| 10 |
+
GET /api/demo-videos — List available demo videos (outputs + data)
|
| 11 |
+
GET /api/demo-videos/{video_id}/stream — Stream demo video by ID
|
| 12 |
+
GET /api/showcase — Curated before/after showcase entries
|
| 13 |
+
"""
|
| 14 |
+
import asyncio
|
| 15 |
+
import hashlib
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import subprocess
|
| 19 |
+
import shutil
|
| 20 |
+
import threading
|
| 21 |
+
import time
|
| 22 |
+
import uuid
|
| 23 |
+
import re
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from urllib.parse import urlparse
|
| 26 |
+
from typing import Optional
|
| 27 |
+
|
| 28 |
+
from dotenv import load_dotenv
|
| 29 |
+
from fastapi import FastAPI, APIRouter, File, Form, HTTPException, Request, UploadFile, Header
|
| 30 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 31 |
+
from fastapi import Request
|
| 32 |
+
from fastapi.responses import FileResponse, JSONResponse
|
| 33 |
+
from fastapi.staticfiles import StaticFiles
|
| 34 |
+
from pydantic import BaseModel
|
| 35 |
+
from slowapi import Limiter, _rate_limit_exceeded_handler
|
| 36 |
+
from slowapi.errors import RateLimitExceeded
|
| 37 |
+
from slowapi.middleware import SlowAPIMiddleware
|
| 38 |
+
from slowapi.util import get_remote_address
|
| 39 |
+
from sse_starlette.sse import EventSourceResponse
|
| 40 |
+
|
| 41 |
+
load_dotenv()
|
| 42 |
+
|
| 43 |
+
# TTS_ENGINE controls which TTS backend this Space serves
|
| 44 |
+
TTS_ENGINE = os.getenv("TTS_ENGINE", "chatterbox").lower()
|
| 45 |
+
if TTS_ENGINE not in ("chatterbox", "omnivoice", "qwen3", "dramabox"):
|
| 46 |
+
raise ValueError(f"Invalid TTS_ENGINE: {TTS_ENGINE}. Use 'chatterbox', 'omnivoice', 'qwen3', or 'dramabox'.")
|
| 47 |
+
|
| 48 |
+
# ── Config ────────────────────────────────────────────────
|
| 49 |
+
PORT = int(os.getenv("PORT", "7860"))
|
| 50 |
+
MAX_FILE_SIZE_MB = 90
|
| 51 |
+
MAX_DURATION_SEC = 90
|
| 52 |
+
MAX_UPLOAD_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _default_artifacts_root() -> Path:
|
| 56 |
+
# Prefer /data/jobs when the Space has persistent storage mounted
|
| 57 |
+
# (Docker deploys, or Gradio SDK Spaces with persistent storage enabled).
|
| 58 |
+
# Fall back to /tmp when /data is not writable, which is the case on
|
| 59 |
+
# Zero GPU / Gradio SDK Spaces without the paid persistent-storage add-on.
|
| 60 |
+
preferred = Path("/data/jobs")
|
| 61 |
+
try:
|
| 62 |
+
preferred.parent.mkdir(parents=True, exist_ok=True)
|
| 63 |
+
if os.access(preferred.parent, os.W_OK):
|
| 64 |
+
return preferred
|
| 65 |
+
except (PermissionError, OSError):
|
| 66 |
+
pass
|
| 67 |
+
return Path("/tmp/videovoice_jobs")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
ARTIFACTS_ROOT = Path(os.getenv("ARTIFACTS_ROOT") or _default_artifacts_root())
|
| 71 |
+
ALLOWED_YTDLP_HOSTS = {
|
| 72 |
+
"instagram.com",
|
| 73 |
+
"youtube.com",
|
| 74 |
+
"youtu.be",
|
| 75 |
+
"tiktok.com",
|
| 76 |
+
"vm.tiktok.com",
|
| 77 |
+
}
|
| 78 |
+
PERSISTENT_ARTIFACT_DIRS = {"uploads", "outputs", "data", "tmp", "tools"}
|
| 79 |
+
REAPER_INTERVAL_SECONDS = 10 * 60
|
| 80 |
+
REAPER_MAX_AGE_SECONDS = 2 * 60 * 60
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _parse_allowed_origins(value: str) -> list[str]:
|
| 84 |
+
origins = [origin.strip() for origin in value.split(",") if origin.strip()]
|
| 85 |
+
return origins or ["http://localhost:5173"]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
ALLOWED_ORIGINS = _parse_allowed_origins(
|
| 89 |
+
os.getenv("ALLOWED_ORIGINS", "http://localhost:5173")
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
# ── App ────────────────────────────────────────────────
|
| 93 |
+
router = APIRouter()
|
| 94 |
+
_RATE_LIMIT_ENABLED = os.getenv("DISABLE_RATE_LIMIT", "").lower() not in ("1", "true", "yes")
|
| 95 |
+
limiter = Limiter(key_func=get_remote_address, enabled=_RATE_LIMIT_ENABLED)
|
| 96 |
+
# Note: app.state.limiter, exception handlers, and SlowAPIMiddleware
|
| 97 |
+
# are now configured on the main Server instance in app.py.
|
| 98 |
+
|
| 99 |
+
# ── In-memory job store ────────────────────────────────
|
| 100 |
+
# Structure: { job_id: { status, messages[], result_path, error, created_at,
|
| 101 |
+
# voice_mode, preview_paths, preview_event, selected_model } }
|
| 102 |
+
jobs: dict = {}
|
| 103 |
+
|
| 104 |
+
# ── GPU job queue ─────────────────────────────────────
|
| 105 |
+
# Only 1 GPU job at a time — others wait in FIFO order
|
| 106 |
+
gpu_semaphore = threading.Semaphore(1)
|
| 107 |
+
gpu_queue: list[str] = [] # ordered list of queued job_ids waiting for GPU
|
| 108 |
+
gpu_active: dict = { # the currently running job's live info
|
| 109 |
+
"job_id": None,
|
| 110 |
+
"started_at": None,
|
| 111 |
+
"step": 0,
|
| 112 |
+
"total_steps": 6,
|
| 113 |
+
"step_label": "",
|
| 114 |
+
}
|
| 115 |
+
# Per-step timing history: { step_num: [durations] } — learns real per-step costs
|
| 116 |
+
step_durations: dict[int, list[float]] = {}
|
| 117 |
+
session_active_jobs: dict[str, str] = {}
|
| 118 |
+
artifact_reaper_task: Optional[asyncio.Task] = None
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
UPLOAD_DIR = ARTIFACTS_ROOT / "uploads"
|
| 122 |
+
OUTPUT_DIR = ARTIFACTS_ROOT / "outputs"
|
| 123 |
+
SHOWCASE_DIR = ARTIFACTS_ROOT / "data" / "showcase"
|
| 124 |
+
SHOWCASE_FILE = ARTIFACTS_ROOT / "data" / "showcase.json"
|
| 125 |
+
DEMO_VIDEO_DIRS = {
|
| 126 |
+
"outputs": OUTPUT_DIR,
|
| 127 |
+
"data": ARTIFACTS_ROOT / "data",
|
| 128 |
+
"showcase": SHOWCASE_DIR,
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ── Helpers ────────────────────────────────────────────
|
| 133 |
+
def _download_url(url: str, dest: str) -> str:
    """Download a video with yt-dlp and return the destination path.

    Args:
        url: Source video URL (user supplied).
        dest: Output path/template passed to yt-dlp's ``-o`` option.

    Returns:
        ``dest``, unchanged.

    Raises:
        RuntimeError: If yt-dlp exits with a non-zero status; the message carries
            the first 300 characters of its stderr.
        subprocess.TimeoutExpired: If yt-dlp does not finish within 120 seconds.
    """
    command = [
        "yt-dlp",
        "--no-playlist",
        "--max-filesize", "100M",
        "--js-runtimes", "node",
        "--extractor-args", "youtube:player_client=android,ios,web_safari",
        "-f", "mp4/best[ext=mp4]/best",
        "-o", dest,
        # "--" ends option parsing, so a user-supplied URL that starts with "-"
        # can never be interpreted as a yt-dlp command-line option.
        "--",
        url,
    ]
    result = subprocess.run(command, capture_output=True, text=True, timeout=120)
    if result.returncode != 0:
        raise RuntimeError(f"yt-dlp failed: {result.stderr[:300]}")
    return dest
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _is_allowed_video_host(url: str) -> bool:
|
| 156 |
+
"""Allow only trusted social platforms for yt-dlp."""
|
| 157 |
+
parsed = urlparse(url)
|
| 158 |
+
host = (parsed.hostname or "").lower()
|
| 159 |
+
if not host:
|
| 160 |
+
return False
|
| 161 |
+
|
| 162 |
+
return (
|
| 163 |
+
host in ALLOWED_YTDLP_HOSTS
|
| 164 |
+
or host.endswith(".instagram.com")
|
| 165 |
+
or host.endswith(".youtube.com")
|
| 166 |
+
or host.endswith(".tiktok.com")
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _probe_duration_seconds(path: str) -> float:
    """Return the duration of the media file at ``path`` in seconds, as printed by ffprobe.

    Raises:
        RuntimeError: If ffprobe exits non-zero, or prints something that cannot
            be parsed as a float.
    """
    command = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "csv=p=0",
        path,
    ]
    probe = subprocess.run(command, capture_output=True, text=True, timeout=30)
    if probe.returncode != 0:
        raise RuntimeError(f"ffprobe failed: {probe.stderr[:300]}")

    raw_duration = probe.stdout.strip()
    try:
        return float(raw_duration)
    except ValueError as exc:
        raise RuntimeError("ffprobe returned an invalid duration value") from exc
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def _gpu_available() -> bool:
|
| 194 |
+
"""Report CUDA/MPS availability."""
|
| 195 |
+
try:
|
| 196 |
+
import torch
|
| 197 |
+
|
| 198 |
+
mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
|
| 199 |
+
return bool(torch.cuda.is_available() or mps_available)
|
| 200 |
+
except Exception:
|
| 201 |
+
return False
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def _queue_depth() -> int:
    """Return the number of queued jobs plus one if a job currently holds the GPU."""
    running = 1 if gpu_active["job_id"] else 0
    return len(gpu_queue) + running
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def _is_job_active(job_id: str) -> bool:
    """Return True when the job exists and its status is ``queued`` or ``running``."""
    state = jobs.get(job_id)
    return bool(state) and state.get("status") in {"queued", "running"}
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _release_session_lock(job: dict) -> None:
|
| 218 |
+
session_id = job.get("session_id")
|
| 219 |
+
if not session_id:
|
| 220 |
+
return
|
| 221 |
+
if session_active_jobs.get(session_id) == job.get("job_id"):
|
| 222 |
+
session_active_jobs.pop(session_id, None)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def _demo_video_id(folder: str, filename: str) -> str:
|
| 226 |
+
"""Generate a stable opaque ID for a whitelisted demo video."""
|
| 227 |
+
raw = f"{folder}/{filename}".encode("utf-8")
|
| 228 |
+
return hashlib.sha256(raw).hexdigest()[:20]
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def _collect_demo_videos():
    """Scan every directory in ``DEMO_VIDEO_DIRS`` for ``.mp4`` files.

    Returns:
        A ``(videos, lookup)`` pair: ``videos`` is a list of metadata dicts sorted
        by name, folder and URL (case-insensitively); ``lookup`` maps each opaque
        video ID to the file's path.
    """
    catalogue = []
    path_by_id = {}

    for folder, directory in DEMO_VIDEO_DIRS.items():
        if not (directory.exists() and directory.is_dir()):
            continue

        for candidate in directory.iterdir():
            is_mp4 = candidate.is_file() and candidate.suffix.lower() == ".mp4"
            if not is_mp4:
                continue

            info = candidate.stat()
            video_id = _demo_video_id(folder, candidate.name)
            path_by_id[video_id] = candidate
            catalogue.append(
                {
                    "id": video_id,
                    "name": candidate.name,
                    "url": f"/api/demo-videos/{video_id}/stream",
                    "folder": folder,
                    "size_bytes": info.st_size,
                    "modified_at": int(info.st_mtime),
                }
            )

    def _sort_key(item):
        return (item["name"].lower(), item["folder"].lower(), item["url"].lower())

    catalogue.sort(key=_sort_key)
    return catalogue, path_by_id
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def _queue_status_for(job_id: str) -> str | None:
    """Describe a waiting job's place in the GPU queue, with a rough ETA when available.

    Returns ``None`` when the job is not in ``gpu_queue``.
    """
    try:
        position = gpu_queue.index(job_id) + 1  # 1-based
    except ValueError:
        return None

    if not gpu_active["job_id"]:
        return f"Queue position: {position} — GPU starting up..."

    current_step = gpu_active["step"]
    step_count = gpu_active["total_steps"]
    step_label = gpu_active["step_label"]

    # The ETA sums the mean recorded duration of each step the active job still
    # has to run (15s where there is no history), scaled by the queue position.
    eta_suffix = ""
    if current_step > 0 and step_durations:
        estimate = 0
        for step_number in range(current_step, step_count + 1):
            samples = step_durations.get(step_number, [])
            estimate += (sum(samples) / len(samples)) if samples else 15
        estimate = int(estimate * position)
        if estimate >= 60:
            minutes, seconds = divmod(estimate, 60)
            eta_suffix = f" — ~{minutes}m {seconds:02d}s remaining"
        elif estimate > 0:
            eta_suffix = f" — ~{estimate}s remaining"

    noun = "job" if position == 1 else "jobs"
    if not step_label:
        return f"{position} {noun} ahead (Step {current_step}/{step_count}){eta_suffix}"
    return f"{position} {noun} ahead (Step {current_step}/{step_count} — {step_label}){eta_suffix}"
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _config_languages() -> list[str]:
    """Return the language names (the keys of the pipeline's ``LANGUAGE_CODES``)."""
    from pipeline import LANGUAGE_CODES

    return list(LANGUAGE_CODES)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def _chatterbox_language_options() -> list[dict]:
    """Return ``{"name", "code"}`` dicts built from the pipeline's ``LANGUAGE_CODES``."""
    from pipeline import LANGUAGE_CODES

    options = []
    for language, code in LANGUAGE_CODES.items():
        options.append({"name": language, "code": code})
    return options
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def _omnivoice_language_options() -> list[dict]:
    """Return ``{"name", "code"}`` dicts built from ``OMNIVOICE_LANGUAGE_CODES``."""
    from steps.lang.omnivoice_languages import OMNIVOICE_LANGUAGE_CODES

    options = []
    for language, code in OMNIVOICE_LANGUAGE_CODES.items():
        options.append({"name": language, "code": code})
    return options
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def _qwen3_language_options() -> list[dict]:
    """Return ``{"name", "code"}`` dicts built from ``QWEN3_LANGUAGE_CODES``."""
    from steps.lang.qwen3_languages import QWEN3_LANGUAGE_CODES

    options = []
    for language, code in QWEN3_LANGUAGE_CODES.items():
        options.append({"name": language, "code": code})
    return options
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
async def _artifact_reaper_loop():
    """Periodically delete stale job directories and forget stale finished jobs.

    Each sweep removes every directory directly under ``ARTIFACTS_ROOT`` whose
    name is not in ``PERSISTENT_ARTIFACT_DIRS`` and whose mtime is older than
    ``REAPER_MAX_AGE_SECONDS``. It then drops ``jobs`` entries that are complete
    or errored and were created longer ago than the same threshold. Errors are
    printed and the loop keeps running, sleeping ``REAPER_INTERVAL_SECONDS``
    between sweeps.
    """
    while True:
        try:
            sweep_time = time.time()

            for entry in ARTIFACTS_ROOT.iterdir():
                if entry.is_dir() and entry.name not in PERSISTENT_ARTIFACT_DIRS:
                    if sweep_time - entry.stat().st_mtime > REAPER_MAX_AGE_SECONDS:
                        shutil.rmtree(entry, ignore_errors=True)

            # Collect first, delete afterwards: the dict must not change size
            # while it is being iterated.
            expired = []
            for job_id, state in jobs.items():
                finished = state.get("status") in {"complete", "error"}
                if finished and (sweep_time - state.get("created_at", sweep_time)) > REAPER_MAX_AGE_SECONDS:
                    expired.append(job_id)
            for job_id in expired:
                jobs.pop(job_id, None)
        except Exception as exc:
            print(f"[reaper] cleanup error: {exc}")

        await asyncio.sleep(REAPER_INTERVAL_SECONDS)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
async def enforce_content_length_limit(request: Request, call_next):
    """Reject ``POST /api/jobs`` requests whose declared Content-Length is too large.

    Responds 413 when the header exceeds ``MAX_UPLOAD_BYTES`` and 400 when the
    header is not an integer. Requests without the header, and all other
    routes, are passed through to ``call_next`` untouched.
    """
    is_job_submission = request.method.upper() == "POST" and request.url.path == "/api/jobs"
    declared = request.headers.get("content-length") if is_job_submission else None

    if declared:
        try:
            declared_bytes = int(declared)
        except ValueError:
            return JSONResponse(
                status_code=400,
                content={"detail": "Invalid Content-Length header."},
            )
        if declared_bytes > MAX_UPLOAD_BYTES:
            return JSONResponse(
                status_code=413,
                content={"detail": f"File too large (max {MAX_FILE_SIZE_MB}MB)."},
            )

    return await call_next(request)
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
async def _run_pipeline_async(
    job_id: str, video_path: str, target_lang: str, source_lang: str, voice_mode: str, captions: bool = True, preserve_music: bool = True, video_link: Optional[str] = None
):
    """Queue a job for the GPU, run the translation pipeline, and record progress in ``jobs``.

    The coroutine polls ``gpu_semaphore`` once per second until it is free, then
    drives the ``run_pipeline`` generator in the default executor thread. Each
    string the generator yields is appended to ``job["messages"]`` as a progress
    entry; ``Step X/Y`` messages also update ``gpu_active`` and the per-step
    timing history in ``step_durations``. On success the job is marked
    ``complete`` with ``result_path`` set; on failure it is marked ``error``.

    The GPU semaphore, the queue entry and the session lock are always released,
    including when the wait is cancelled or when memory cleanup fails.

    Args:
        job_id: Key of the job in ``jobs``; also the name of its artifact directory.
        video_path: Input video path, forwarded to ``run_pipeline``.
        target_lang: Target language, forwarded as ``target_language``.
        source_lang: Source language, forwarded as ``source_language``.
        voice_mode: TTS voice mode, forwarded to ``run_pipeline``.
        captions: Forwarded to ``run_pipeline``.
        preserve_music: Forwarded to ``run_pipeline``; also adds one step to the
            expected step count.
        video_link: Forwarded to ``run_pipeline``.
    """
    from pipeline import run_pipeline

    job = jobs[job_id]
    job["status"] = "queued"

    # Join the queue and wait for the GPU without blocking the event loop,
    # refreshing the queue status on every tick.
    gpu_queue.append(job_id)
    try:
        job["_wait_status"] = _queue_status_for(job_id) or "Waiting for GPU..."
        while not gpu_semaphore.acquire(blocking=False):
            job["_wait_status"] = _queue_status_for(job_id) or "Waiting for GPU..."
            await asyncio.sleep(1)
    except BaseException:
        # Cancelled (or failed) while still waiting: the semaphore is NOT held
        # here, so only the queue entry and session lock need to be undone.
        if job_id in gpu_queue:
            gpu_queue.remove(job_id)
        job["_wait_status"] = None
        job["status"] = "error"
        job["messages"].append({"type": "error", "message": "Job aborted while waiting for GPU."})
        _release_session_lock(job)
        raise

    # From here on the semaphore is held; everything runs under try/finally so
    # it is released no matter what fails.
    try:
        # Leave the queue, mark as running
        if job_id in gpu_queue:
            gpu_queue.remove(job_id)
        job["_wait_status"] = None
        job["status"] = "running"

        # The pipeline reports 6 steps, plus one more when music is preserved.
        total_steps = 6 + (1 if preserve_music else 0)
        gpu_active["job_id"] = job_id
        gpu_active["started_at"] = time.time()
        gpu_active["step"] = 0
        gpu_active["total_steps"] = total_steps
        gpu_active["step_label"] = ""

        job["messages"].append({"type": "progress", "message": "GPU acquired — starting pipeline...", "step": 0})
        start = time.time()

        data_dir = str(ARTIFACTS_ROOT / job_id)
        os.makedirs(data_dir, exist_ok=True)
        output_path = str(Path(data_dir) / "output.mp4")

        # Note: preview_both mode removed in single-engine Spaces
        # Each Space only serves one TTS engine (TTS_ENGINE env var)
        preview_event = None

        gen = run_pipeline(
            video_path=video_path,
            target_language=target_lang,
            source_language=source_lang,
            output_path=output_path,
            voice_mode=voice_mode,
            preview_event=preview_event,
            job_state=job,
            captions=captions,
            preserve_music=preserve_music,
            data_dir=data_dir,
            video_link=video_link,
        )

        def _run_gen():
            """Drain the pipeline generator in a worker thread; return its final value."""
            step = 0
            step_start = time.time()

            def _record_finished_step():
                # Keep a rolling window of the last 10 durations per step.
                if step > 0:
                    history = step_durations.setdefault(step, [])
                    history.append(time.time() - step_start)
                    if len(history) > 10:
                        history.pop(0)

            output = None
            try:
                while True:
                    msg = next(gen)

                    # Handle preview-ready sentinel dict
                    if isinstance(msg, dict) and msg.get("__PREVIEW_READY__"):
                        preview_paths = msg["paths"]
                        job["preview_paths"] = preview_paths

                        # Build preview URLs
                        preview_urls = {}
                        for model_name, path in preview_paths.items():
                            if path:
                                preview_urls[model_name] = (
                                    f"/api/jobs/{job_id}/preview/{model_name}"
                                )

                        job["messages"].append({
                            "type": "voice_preview",
                            "step": 4,
                            "previews": preview_urls,
                        })
                        continue

                    # Regular string message
                    if isinstance(msg, str):
                        # Detect step transitions and record per-step timing
                        if "Step" in msg and f"/{total_steps}" in msg:
                            try:
                                new_step = int(
                                    msg.split("Step")[1].split("/")[0].strip()
                                )
                                # Record duration of the step that just ended
                                _record_finished_step()

                                step = new_step
                                step_start = time.time()

                                # Extract step label (text after "Step X/Y: ")
                                label = msg.split(":", 1)[1].strip() if ":" in msg else ""
                                # Remove emoji prefix
                                label = label.lstrip("🔊📝🌍🗣️⏱️🎞️🎧 ")
                                gpu_active["step"] = step
                                gpu_active["step_label"] = label

                            except (ValueError, IndexError):
                                # Not a parsable "Step N/M" message: treat as plain progress.
                                pass

                        job["messages"].append({
                            "type": "progress",
                            "message": msg.strip(),
                            "step": step,
                        })

            except StopIteration as e:
                output = e.value
            except Exception as e:
                # Pipeline crashed — set error status directly from
                # the thread so the frontend sees it immediately,
                # rather than relying on exception propagation through
                # run_in_executor (which can silently swallow errors
                # when stdout/stderr are in a broken state).
                import traceback
                tb = traceback.format_exc()
                print(f"[pipeline] CRASH in job {job_id}: {e}\n{tb}")
                job["status"] = "error"
                job["messages"].append({
                    "type": "error",
                    "message": f"Pipeline crashed: {e}",
                })
                return None

            # Record the final step's duration
            _record_finished_step()
            return output

        loop = asyncio.get_running_loop()
        result_path = await loop.run_in_executor(None, _run_gen)

        # When _run_gen already reported an error, do not mark the job complete.
        if job["status"] != "error":
            elapsed = round(time.time() - start)
            job["status"] = "complete"
            job["result_path"] = result_path or output_path
            job["messages"].append({"type": "complete", "elapsed": elapsed})

    except Exception as e:
        job["status"] = "error"
        job["messages"].append({"type": "error", "message": str(e)})

    finally:
        # Free GPU memory between jobs. This is best-effort: a failure here
        # (torch missing, backend error) must never prevent the semaphore and
        # session lock below from being released.
        try:
            import gc
            import torch
            gc.collect()
            if hasattr(torch, "mps") and torch.backends.mps.is_available():
                torch.mps.empty_cache()
        except Exception as exc:
            print(f"[pipeline] GPU memory cleanup skipped: {exc}")

        gpu_active["job_id"] = None
        gpu_active["started_at"] = None
        gpu_active["step"] = 0
        gpu_active["step_label"] = ""
        if job_id in gpu_queue:
            gpu_queue.remove(job_id)
        _release_session_lock(job)
        gpu_semaphore.release()
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
# ── Routes ─────────────────────────────────────────────
|
| 565 |
+
|
| 566 |
+
@router.get("/api/health")
async def health():
    """Report service status, GPU availability, queue depth and the active job ID."""
    payload = {
        "status": "ok",
        "gpu_available": _gpu_available(),
        "queue_depth": _queue_depth(),
        "active_job_id": gpu_active["job_id"],
    }
    return JSONResponse(payload)
|
| 576 |
+
|
| 577 |
+
|
| 578 |
+
@router.get("/api/config")
async def config():
    """Return upload limits, the per-engine language lists and the configured TTS engine."""
    payload = {
        "max_file_size_mb": MAX_FILE_SIZE_MB,
        "max_duration_sec": MAX_DURATION_SEC,
    }
    payload["languages"] = _config_languages()
    payload["chatterbox_languages"] = _chatterbox_language_options()
    payload["omnivoice_languages"] = _omnivoice_language_options()
    payload["qwen3_languages"] = _qwen3_language_options()
    # A Space serves exactly one engine, so the model list has a single entry.
    payload["tts_models"] = [TTS_ENGINE]
    payload["tts_engine"] = TTS_ENGINE
    return JSONResponse(payload)
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
@router.get("/api/demo-videos")
async def list_demo_videos():
    """Return metadata for every MP4 found in the demo video directories."""
    catalogue = _collect_demo_videos()[0]
    return JSONResponse({"videos": catalogue})
|
| 599 |
+
|
| 600 |
+
|
| 601 |
+
@router.get("/api/demo-videos/{video_id}/stream")
async def stream_demo_video(video_id: str):
    """Serve a demo video looked up by its opaque ID; respond 404 for unknown IDs.

    The client never supplies a filesystem path: the ID is resolved against
    the videos discovered on disk.
    """
    path_by_id = _collect_demo_videos()[1]
    target = path_by_id.get(video_id)
    if not target:
        raise HTTPException(404, "Demo video not found.")

    return FileResponse(str(target), media_type="video/mp4", filename=target.name)
|
| 614 |
+
|
| 615 |
+
|
| 616 |
+
@router.get("/api/showcase")
async def get_showcase():
    """Return curated showcase entries with resolved streaming URLs.

    Reads ``SHOWCASE_FILE`` and, for every ``their_dub`` / ``our_dub`` object
    of type ``"local"`` that has a ``filename``, adds a ``url`` pointing at the
    demo-video streaming endpoint. A missing, unreadable, undecodable or
    unexpectedly shaped file yields an empty showcase list instead of an error.
    """
    empty = {"showcases": []}
    if not SHOWCASE_FILE.exists():
        return JSONResponse(empty)

    try:
        data = json.loads(SHOWCASE_FILE.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return JSONResponse(empty)

    showcases = data.get("showcases", []) if isinstance(data, dict) else None
    if not isinstance(showcases, list):
        return JSONResponse(empty)

    for entry in showcases:
        if not isinstance(entry, dict):
            # Leave malformed entries untouched rather than failing the request.
            continue
        for key in ("their_dub", "our_dub"):
            dub = entry.get(key)
            if isinstance(dub, dict) and dub.get("type") == "local" and dub.get("filename"):
                video_id = _demo_video_id("showcase", dub["filename"])
                dub["url"] = f"/api/demo-videos/{video_id}/stream"

    return JSONResponse({"showcases": showcases})
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
# Strong references to in-flight pipeline tasks. The asyncio event loop keeps
# only weak references to tasks, so a task that nothing else references can be
# garbage-collected before it finishes.
_create_job_tasks = set()


@router.post("/api/jobs")
@limiter.limit("3/hour")
async def create_job(
    request: Request,
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    target_language: str = Form("Spanish"),
    source_language: str = Form("auto"),
    voice_mode: str = Form("chatterbox"),
    captions: str = Form("true"),
    preserve_music: str = Form("false"),
    x_session_id: Optional[str] = Header(default=None, alias="X-Session-Id"),
):
    """Submit a video for translation.

    Accepts an uploaded file or a URL. When both are given, the upload is the
    video that gets processed; the URL is still used to derive the job ID and
    is passed on to the pipeline.

    Args:
        request: Incoming request (required by the rate limiter decorator).
        file: Optional uploaded video.
        url: Optional video URL; its host must pass ``_is_allowed_video_host``.
        target_language: Language to translate into.
        source_language: Source language, or ``"auto"``.
        voice_mode: Requested TTS engine; anything other than ``TTS_ENGINE``
            is replaced by ``TTS_ENGINE``.
        captions: ``"true"`` (case-insensitive) enables captions.
        preserve_music: ``"true"`` (case-insensitive) enables music preservation.
        x_session_id: Optional ``X-Session-Id`` header used to allow only one
            active job per session.

    Returns:
        ``{"job_id": ..., "status": "queued"}`` on success, or a 409 response
        with ``existing_job_id`` when the session already has an active job.

    Raises:
        HTTPException: 400 when neither input is given, the URL host is not
            allowed, the download fails, the duration cannot be probed, or the
            video is longer than ``MAX_DURATION_SEC``.
    """
    if not file and not url:
        raise HTTPException(400, "Provide either a file upload or a URL.")

    if x_session_id:
        existing_job_id = session_active_jobs.get(x_session_id)
        if existing_job_id:
            if _is_job_active(existing_job_id):
                return JSONResponse(
                    status_code=409,
                    content={"existing_job_id": existing_job_id},
                )
            # Stale mapping: the previous job is no longer active.
            session_active_jobs.pop(x_session_id, None)

    # Validate voice_mode - only TTS_ENGINE is valid for this Space
    # "preview_both" is disabled in single-engine mode (no way to choose between engines)
    valid_modes = (TTS_ENGINE,)
    if voice_mode not in valid_modes:
        voice_mode = TTS_ENGINE

    job_id = None
    if url:
        if not _is_allowed_video_host(url):
            raise HTTPException(400, "Unsupported URL host.")

        # Tried in order; the first pattern that matches names the job.
        url_id_patterns = (
            r'/(?:reel|reels|p)/([A-Za-z0-9_-]+)',  # Instagram
            r'(?:v=|youtu\.be/)([\w-]+)',           # YouTube
            r'vm\.tiktok\.com/([\w-]+)',            # TikTok (vm.tiktok.com)
            r'/video/(\d+)',                        # TikTok (standard /video/xxx)
        )
        for pattern in url_id_patterns:
            m = re.search(pattern, url)
            if m:
                job_id = m.group(1)
                break

    if not job_id:
        job_id = str(uuid.uuid4())[:12]

    # Disambiguate against existing artifact directories: id, id_1, id_2, ...
    base_job_id = job_id
    counter = 1
    job_dir = ARTIFACTS_ROOT / job_id
    while job_dir.exists():
        job_id = f"{base_job_id}_{counter}"
        job_dir = ARTIFACTS_ROOT / job_id
        counter += 1

    job_dir.mkdir(parents=True, exist_ok=True)

    video_path = ""

    if file:
        # Save the upload in fixed-size chunks so the whole video is never
        # held in memory at once.
        ext = Path(file.filename or "video.mp4").suffix or ".mp4"
        video_path = str(job_dir / f"input{ext}")
        chunk_size = 1024 * 1024
        try:
            with open(video_path, "wb") as out:
                while True:
                    chunk = await file.read(chunk_size)
                    if not chunk:
                        break
                    out.write(chunk)
        except Exception:
            # Do not leave a half-written job directory behind.
            shutil.rmtree(job_dir, ignore_errors=True)
            raise
    elif url:
        # Download from URL
        video_path = str(job_dir / "input.mp4")
        try:
            _download_url(url, video_path)
        except Exception as e:
            shutil.rmtree(job_dir, ignore_errors=True)
            raise HTTPException(400, f"Failed to download video: {e}") from e

    try:
        duration_seconds = _probe_duration_seconds(video_path)
    except Exception as exc:
        shutil.rmtree(job_dir, ignore_errors=True)
        raise HTTPException(400, f"Could not validate video duration: {exc}") from exc

    if duration_seconds > MAX_DURATION_SEC:
        shutil.rmtree(job_dir, ignore_errors=True)
        raise HTTPException(400, f"Video exceeds {MAX_DURATION_SEC} seconds limit.")

    # Initialize job
    jobs[job_id] = {
        "job_id": job_id,
        "status": "queued",
        "messages": [],
        "result_path": None,
        "error": None,
        "created_at": time.time(),
        "voice_mode": voice_mode,
        "preview_paths": None,
        "preview_event": None,
        "selected_model": None,
        "session_id": x_session_id,
    }
    if x_session_id:
        session_active_jobs[x_session_id] = job_id

    # Start pipeline in background
    enable_captions = captions.lower() == "true"
    enable_music = preserve_music.lower() == "true"
    task = asyncio.create_task(
        _run_pipeline_async(
            job_id,
            video_path,
            target_language,
            source_language,
            voice_mode,
            enable_captions,
            enable_music,
            url,
        )
    )
    # Hold a reference until the task completes, then drop it.
    _create_job_tasks.add(task)
    task.add_done_callback(_create_job_tasks.discard)

    return JSONResponse({"job_id": job_id, "status": "queued"})
|
| 762 |
+
|
| 763 |
+
|
| 764 |
+
@router.get("/api/jobs/{job_id}")
@limiter.limit("20/second")
async def job_status_poll(request: Request, job_id: str, after: int = 0):
    """Poll endpoint returning new messages since index `after`, plus live wait status.

    Args:
        request: Incoming request (required by the rate limiter decorator).
        job_id: ID of the job to poll.
        after: Index of the first message the client has not seen yet. Values
            outside ``[0, len(messages)]`` are clamped into that range.

    Returns:
        JSON with ``messages`` (the unseen tail), ``next`` (the cursor to send
        on the following poll) and ``wait_status``; caching is disabled.

    Raises:
        HTTPException: 404 when the job ID is unknown.
    """
    if job_id not in jobs:
        raise HTTPException(404, "Job not found.")

    job = jobs[job_id]
    all_messages = job["messages"]

    # A negative cursor would slice from the end of the list and yield a bogus
    # `next`; a cursor past the end would stay ahead of the list. Clamp both.
    start = min(max(after, 0), len(all_messages))
    messages = all_messages[start:]

    # Include live wait ETA (updated in-place, not a queued message)
    wait_status = job.get("_wait_status")

    return JSONResponse(
        {"messages": messages, "next": start + len(messages), "wait_status": wait_status},
        headers={"Cache-Control": "no-cache, no-store"},
    )
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
class ModelSelection(BaseModel):
    """Request body for the select-model endpoint."""

    # Name of the TTS model the user picked.
    model: str
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
@router.post("/api/jobs/{job_id}/select-model")
async def select_model(job_id: str, selection: ModelSelection):
    """Record the TTS model the user chose after previewing.

    Stores the choice on the job and, if the job has a ``preview_event``,
    sets it so the waiting pipeline can continue.

    Raises:
        HTTPException: 404 for an unknown job, 400 when the chosen model is
            not ``TTS_ENGINE``.
    """
    job = jobs.get(job_id)
    if not job:
        raise HTTPException(404, "Job not found.")

    chosen = selection.model
    if chosen != TTS_ENGINE:
        raise HTTPException(400, f"Invalid model. This Space only serves {TTS_ENGINE}.")

    job["selected_model"] = chosen

    # Unblock the pipeline, if it is waiting on a preview choice.
    gate = job.get("preview_event")
    if gate:
        gate.set()

    return JSONResponse({"status": "ok", "selected": chosen})
|
| 804 |
+
|
| 805 |
+
|
| 806 |
+
@router.get("/api/jobs/{job_id}/preview/{model_name}")
async def get_preview_audio(job_id: str, model_name: str):
    """Serve the preview WAV generated for a job by the given model.

    Raises:
        HTTPException: 404 for an unknown job, when no previews exist yet, or
            when the preview file for ``model_name`` is missing; 400 when
            ``model_name`` is not ``TTS_ENGINE``.
    """
    job = jobs.get(job_id)
    if not job:
        raise HTTPException(404, "Job not found.")

    if model_name != TTS_ENGINE:
        raise HTTPException(400, f"Invalid model name. This Space serves {TTS_ENGINE} only.")

    previews = job.get("preview_paths")
    if not previews:
        raise HTTPException(404, "Previews not yet generated.")

    wav_path = previews.get(model_name)
    if not (wav_path and Path(wav_path).exists()):
        raise HTTPException(404, f"Preview for '{model_name}' not available.")

    return FileResponse(
        wav_path,
        media_type="audio/wav",
        filename=f"preview_{model_name}.wav",
    )
|
| 829 |
+
|
| 830 |
+
|
| 831 |
+
@router.get("/api/jobs/{job_id}/result")
async def job_result(job_id: str):
    """Download the translated video of a completed job.

    Raises:
        HTTPException: 404 for an unknown job or a missing result file,
            400 when the job's status is not ``"complete"``.
    """
    job = jobs.get(job_id)
    if not job:
        raise HTTPException(404, "Job not found.")

    status = job["status"]
    if status != "complete":
        raise HTTPException(400, f"Job is {status}, not complete.")

    result_path = job["result_path"]
    if not (result_path and Path(result_path).exists()):
        raise HTTPException(404, "Result file not found.")

    return FileResponse(
        result_path,
        media_type="video/mp4",
        filename=f"videovoice_{job_id}.mp4",
    )
|
| 847 |
+
|
| 848 |
+
|
| 849 |
+
@router.on_event("startup")
async def startup_event():
    """Create the artifact directories and start the background reaper task.

    The reaper is not started when the ``DISABLE_CLEANUP`` environment variable
    is ``1``, ``true`` or ``yes`` (case-insensitive), nor when a reaper task is
    already running.
    """
    global artifact_reaper_task

    required_dirs = (
        ARTIFACTS_ROOT,
        UPLOAD_DIR,
        OUTPUT_DIR,
        ARTIFACTS_ROOT / "data",
        ARTIFACTS_ROOT / "tmp",
    )
    for directory in required_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    cleanup_disabled = os.getenv("DISABLE_CLEANUP", "").lower() in ("1", "true", "yes")
    if cleanup_disabled:
        print("[reaper] DISABLE_CLEANUP is set — artifact reaper will not run")
        return

    if artifact_reaper_task is None or artifact_reaper_task.done():
        artifact_reaper_task = asyncio.create_task(_artifact_reaper_loop())
|
| 864 |
+
|
| 865 |
+
|
| 866 |
+
@router.on_event("shutdown")
async def shutdown_event():
    """Cancel the artifact reaper task, if it is still running, and wait for it to stop."""
    global artifact_reaper_task

    reaper = artifact_reaper_task
    if reaper is None or reaper.done():
        return

    reaper.cancel()
    try:
        await reaper
    except asyncio.CancelledError:
        # Expected: this is the cancellation requested just above.
        pass
|
| 875 |
+
|
| 876 |
+
|
| 877 |
+
# ── No-cache headers for dev/tunnel (ensures Cloudflare serves fresh files) ──
|
| 878 |
+
from starlette.middleware.base import BaseHTTPMiddleware
|
| 879 |
+
|
| 880 |
+
# Phase 1.7 marker: remove legacy static middleware when React FE fully owns UI.
|
| 881 |
+
class NoCacheStaticMiddleware(BaseHTTPMiddleware):
    """Add no-cache headers to responses for ``/`` and ``.css``/``.js``/``.html`` paths."""

    async def dispatch(self, request: Request, call_next):
        """Run the downstream handler, then mark matching responses as uncacheable."""
        response = await call_next(request)

        path = request.url.path
        wants_no_cache = path == '/' or path.endswith(('.css', '.js', '.html'))
        if wants_no_cache:
            response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
            response.headers['Pragma'] = 'no-cache'

        return response
|
| 888 |
+
|
| 889 |
+
# Standalone middleware and static mounts removed (now handled in app.py/main app)
|
| 890 |
+
|
| 891 |
+
|
| 892 |
+
# ── Local dev entrypoint ──────────────────────────────
|
| 893 |
+
# On HF Spaces `app.py` creates its own Server and imports this router, so
|
| 894 |
+
# the block below is skipped. Locally, `python server.py` builds a minimal
|
| 895 |
+
# FastAPI wrapper around the router so there's something for uvicorn to run.
|
| 896 |
+
if __name__ == "__main__":
    # Build a minimal standalone FastAPI app around the router for local runs.
    local_app = FastAPI(title="VideoVoice API (local)")
    local_app.state.limiter = limiter
    local_app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

    # Middleware is registered in this exact order: rate limiting, no-cache
    # headers, CORS, then the content-length guard below.
    for middleware_cls in (SlowAPIMiddleware, NoCacheStaticMiddleware):
        local_app.add_middleware(middleware_cls)

    cors_options = {
        "allow_origins": ALLOWED_ORIGINS,
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    local_app.add_middleware(CORSMiddleware, **cors_options)

    @local_app.middleware("http")
    async def _local_content_length(request: Request, call_next):
        """Delegate to the shared content-length limit check."""
        return await enforce_content_length_limit(request, call_next)

    local_app.include_router(router)

    # Tools API — independent of pipeline; safe to include here too.
    from tools_api import router as tools_router
    local_app.include_router(tools_router)

    # Serve the legacy static frontend at / so `python server.py` keeps the
    # old dev UX (open http://localhost:8000 to hit frontend/index.html).
    # The React SPA in production is deployed separately to S3.
    frontend_dir = Path(__file__).parent / "frontend"
    if frontend_dir.exists():
        static_files = StaticFiles(directory=str(frontend_dir), html=True)
        local_app.mount("/", static_files, name="frontend")

    import uvicorn
    port = int(os.getenv("PORT", 8000))
    uvicorn.run(local_app, host="0.0.0.0", port=port)
|
social_distributor/.env.example
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# VideoVoice data directory (default: ../data relative to poster/)
|
| 2 |
+
VIDEOVOICE_DATA_DIR=/path/to/VideoVoice/data
|
| 3 |
+
|
| 4 |
+
# Pollinations LLM (for caption generation)
|
| 5 |
+
POLLEN_MODEL=gemini-search
|
| 6 |
+
POLLEN_API_KEY=pollinations
|
| 7 |
+
|
| 8 |
+
# AWS Bedrock fallback (for caption generation)
|
| 9 |
+
AWS_REGION=us-east-1
|
| 10 |
+
BEDROCK_MODEL=qwen.qwen3-next-80b-a3b
|
| 11 |
+
# AWS_ACCESS_KEY_ID=
|
| 12 |
+
# AWS_SECRET_ACCESS_KEY=
|
| 13 |
+
|
| 14 |
+
# Posting settings
|
| 15 |
+
POST_DELAY=30
|
| 16 |
+
HEADLESS=true
|
social_distributor/.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
poster/auth/storage/
|
| 4 |
+
*.pyc
|
| 5 |
+
.env
|
| 6 |
+
post_history.json
|
| 7 |
+
creator_cache.json
|
| 8 |
+
errors/
|
social_distributor/README.md
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Social Media Distributor
|
| 2 |
+
|
| 3 |
+
Automated social media posting for VideoVoice dubbed videos. Posts AI-dubbed videos to Instagram, TikTok, and YouTube with AI-generated captions.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Multi-platform posting**: Instagram, TikTok, YouTube
|
| 8 |
+
- **AI-generated captions**: Uses Pollinations LLM with AWS Bedrock fallback
|
| 9 |
+
- **Creator handle extraction**: Automatically pulls creator info from source videos
|
| 10 |
+
- **Smart scheduling**: Configurable delays between posts to avoid rate limits
|
| 11 |
+
- **Session management**: Persistent browser sessions (no repeated logins)
|
| 12 |
+
- **Post tracking**: Tracks what was posted to avoid duplicates
|
| 13 |
+
|
| 14 |
+
## Setup
|
| 15 |
+
|
| 16 |
+
### 1. Install Dependencies
|
| 17 |
+
|
| 18 |
+
```bash
|
| 19 |
+
# Using uv (recommended)
|
| 20 |
+
uv sync
|
| 21 |
+
|
| 22 |
+
# Or using pip
|
| 23 |
+
pip install .
|
| 24 |
+
playwright install
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
### 2. Configure Environment
|
| 28 |
+
|
| 29 |
+
Copy the example environment file and edit:
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
cp .env.example .env
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
Edit `.env`:
|
| 36 |
+
```env
|
| 37 |
+
# VideoVoice data directory (where dubbed video folders are)
|
| 38 |
+
VIDEOVOICE_DATA_DIR=/path/to/VideoVoice/data
|
| 39 |
+
|
| 40 |
+
# LLM for caption generation (Pollinations)
|
| 41 |
+
POLLEN_MODEL=gemini-search
|
| 42 |
+
POLLEN_API_KEY=pollinations
|
| 43 |
+
|
| 44 |
+
# Optional: AWS Bedrock fallback
|
| 45 |
+
AWS_REGION=us-east-1
|
| 46 |
+
BEDROCK_MODEL=qwen.qwen3-next-80b-a3b
|
| 47 |
+
AWS_ACCESS_KEY_ID=...
|
| 48 |
+
AWS_SECRET_ACCESS_KEY=...
|
| 49 |
+
|
| 50 |
+
# Posting behavior
|
| 51 |
+
POST_DELAY=30 # Seconds between posts
|
| 52 |
+
HEADLESS=true # Run browser headlessly
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### 3. Login to Platforms
|
| 56 |
+
|
| 57 |
+
You need to authenticate with each platform once. This opens a browser window for you to log in:
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
# Login to Instagram
|
| 61 |
+
python post.py login instagram
|
| 62 |
+
|
| 63 |
+
# Login to TikTok
|
| 64 |
+
python post.py login tiktok
|
| 65 |
+
|
| 66 |
+
# Login to YouTube
|
| 67 |
+
python post.py login youtube
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
Sessions are saved in `poster/auth/storage/` — you won't need to log in again until a session expires.
|
| 71 |
+
|
| 72 |
+
## Usage
|
| 73 |
+
|
| 74 |
+
### Post Videos
|
| 75 |
+
|
| 76 |
+
Post all videos from a data folder:
|
| 77 |
+
|
| 78 |
+
```bash
|
| 79 |
+
# Post to all platforms (default)
|
| 80 |
+
python post.py post /path/to/VideoVoice/data/Dxxxxxxxxx
|
| 81 |
+
|
| 82 |
+
# Post to specific platforms only
|
| 83 |
+
python post.py post /path/to/data/Folder1 -p instagram,tiktok
|
| 84 |
+
|
| 85 |
+
# Post multiple folders
|
| 86 |
+
python post.py post Folder1 Folder2 Folder3
|
| 87 |
+
|
| 88 |
+
# Dry run (generate captions but don't post)
|
| 89 |
+
python post.py post Folder1 --dry-run
|
| 90 |
+
|
| 91 |
+
# Force re-post even if already posted
|
| 92 |
+
python post.py post Folder1 --force
|
| 93 |
+
|
| 94 |
+
# Override language (e.g., if auto-detection is wrong)
|
| 95 |
+
python post.py post Folder1 --lang-override "Urdu"
|
| 96 |
+
|
| 97 |
+
# Customize delay between posts
|
| 98 |
+
python post.py post Folder1 --delay 60
|
| 99 |
+
|
| 100 |
+
# Run with visible browser (for debugging)
|
| 101 |
+
python post.py post Folder1 --no-headless
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
### Preview Captions
|
| 105 |
+
|
| 106 |
+
Generate and preview captions without posting:
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
# Preview captions for all platforms
|
| 110 |
+
python post.py caption /path/to/data/Folder1
|
| 111 |
+
|
| 112 |
+
# Preview for specific platforms
|
| 113 |
+
python post.py caption Folder1 -p youtube
|
| 114 |
+
|
| 115 |
+
# Preview multiple folders
|
| 116 |
+
python post.py caption Folder1 Folder2 Folder3
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
### Check Posting History
|
| 120 |
+
|
| 121 |
+
```bash
|
| 122 |
+
python post.py status
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
Shows a table of all posted videos with timestamps and status.
|
| 126 |
+
|
| 127 |
+
## Command Reference
|
| 128 |
+
|
| 129 |
+
| Command | Description |
|
| 130 |
+
|---------|-------------|
|
| 131 |
+
| `python post.py login <platform>` | Authenticate with a platform |
|
| 132 |
+
| `python post.py post <folders...>` | Post videos to social media |
|
| 133 |
+
| `python post.py caption <folders...>` | Preview generated captions |
|
| 134 |
+
| `python post.py status` | View posting history |
|
| 135 |
+
|
| 136 |
+
### Post Options
|
| 137 |
+
|
| 138 |
+
| Option | Description |
|
| 139 |
+
|--------|-------------|
|
| 140 |
+
| `-p, --platforms` | Comma-separated platforms (default: instagram,tiktok,youtube) |
|
| 141 |
+
| `--force` | Re-post even if already posted |
|
| 142 |
+
| `--dry-run` | Generate captions but don't post |
|
| 143 |
+
| `--delay <seconds>` | Seconds between posts |
|
| 144 |
+
| `--headless / --no-headless` | Run browser headlessly |
|
| 145 |
+
| `--lang-override <name>` | Override target language (e.g., "Urdu") |
|
| 146 |
+
|
| 147 |
+
## How It Works
|
| 148 |
+
|
| 149 |
+
1. **Loads videos** from VideoVoice data folders
|
| 150 |
+
2. **Extracts creator info** from the original video link
|
| 151 |
+
3. **Generates captions** using AI (Pollinations LLM)
|
| 152 |
+
4. **Posts to each platform** with platform-optimized formatting
|
| 153 |
+
5. **Tracks posts** in `post_history.json`
|
| 154 |
+
|
| 155 |
+
## File Structure
|
| 156 |
+
|
| 157 |
+
```
|
| 158 |
+
social_distributor/
|
| 159 |
+
├── post.py # CLI entry point
|
| 160 |
+
├── poster/
|
| 161 |
+
│ ├── auth/
|
| 162 |
+
│ │ ├── session.py # Browser session management
|
| 163 |
+
│ │ └── storage/ # Saved session files
|
| 164 |
+
│ ├── platforms/
|
| 165 |
+
│ │ ├── base.py # Base poster class
|
| 166 |
+
│ │ ├── instagram.py # Instagram automation
|
| 167 |
+
│ │ ├── tiktok.py # TikTok automation
|
| 168 |
+
│ │ └── youtube.py # YouTube automation
|
| 169 |
+
│ ├── caption_gen.py # AI caption generation
|
| 170 |
+
│ ├── creator_extract.py # Creator handle extraction
|
| 171 |
+
│ ├── video_loader.py # Video metadata loading
|
| 172 |
+
│ ├── post_log.py # Post history tracking
|
| 173 |
+
│ ├── config.py # Configuration & constants
|
| 174 |
+
│ └── models.py # Data models
|
| 175 |
+
├── .env # Your environment config
|
| 176 |
+
└── post_history.json # Auto-generated post log
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
## Troubleshooting
|
| 180 |
+
|
| 181 |
+
**Login fails / session expires:**
|
| 182 |
+
```bash
|
| 183 |
+
# Re-login to the platform
|
| 184 |
+
python post.py login instagram
|
| 185 |
+
```
|
| 186 |
+
|
| 187 |
+
**Caption generation fails:**
|
| 188 |
+
- Check your `POLLEN_API_KEY` in `.env`
|
| 189 |
+
- Or configure AWS Bedrock credentials as fallback
|
| 190 |
+
|
| 191 |
+
**Post fails on specific platform:**
|
| 192 |
+
- Use `--no-headless` to see the browser and debug
|
| 193 |
+
- Check `post_history.json` for error messages
|
| 194 |
+
- Platforms may require re-authentication periodically
|
| 195 |
+
|
| 196 |
+
**Videos not found:**
|
| 197 |
+
- Ensure `VIDEOVOICE_DATA_DIR` points to your VideoVoice `data/` folder
|
| 198 |
+
- Folder names should match VideoVoice video IDs (e.g., `Dxxxxxxxxx`)
|
| 199 |
+
|
| 200 |
+
## Notes
|
| 201 |
+
|
| 202 |
+
- Instagram and TikTok use browser automation (Playwright)
|
| 203 |
+
- YouTube posts via web upload (requires logged-in session)
|
| 204 |
+
- First login for each platform opens a real browser window
|
| 205 |
+
- Headless mode runs faster but hides the browser (use `--no-headless` to debug)
|
social_distributor/post.py
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""VideoVoice Social Media Poster — CLI entrypoint."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
import click
|
| 10 |
+
from rich.console import Console
|
| 11 |
+
from rich.table import Table
|
| 12 |
+
|
| 13 |
+
console = Console()
|
| 14 |
+
|
| 15 |
+
ALL_PLATFORMS = ["instagram", "tiktok", "youtube"]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@click.group()
def cli():
    """VideoVoice Social Media Poster — post dubbed videos to Instagram, TikTok, and YouTube."""
    # Intentionally empty: this is only the click group that the subcommands
    # attach to. The docstring above is the group's --help text.
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ── Login command ────────────────────────────────────────────────────────
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@cli.command()
@click.argument("platform", type=click.Choice(ALL_PLATFORMS))
def login(platform: str):
    """Interactively log in to a platform (opens a browser window)."""
    # Deferred import: poster.auth.session is only loaded when this command runs.
    from poster.auth import session as auth_session

    login_coro = auth_session.interactive_login(platform)
    asyncio.run(login_coro)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ── Caption preview command ──────────────────────────────────────────────
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@cli.command()
@click.argument("folders", nargs=-1, required=True)
@click.option("--platforms", "-p", default="instagram,tiktok,youtube", help="Comma-separated platforms")
@click.option("--lang-override", default=None, help="Override target language name (e.g. 'Urdu')")
def caption(folders: tuple[str, ...], platforms: str, lang_override: str | None):
    """Preview generated captions without posting."""
    # The docstring above is the command's --help text, so it is kept short.
    #
    # folders:       video folders to load.
    # platforms:     comma-separated platform names; matched case-insensitively
    #                against ALL_PLATFORMS.
    # lang_override: optional target language name passed to the video loader.
    from poster.caption_gen import format_caption, generate_caption
    from poster.video_loader import load_videos

    # Normalise the list: trim whitespace, lower-case, and drop empty items
    # (e.g. from a trailing comma) so that "YouTube," still takes the
    # youtube branch below.
    target_platforms = [p.strip().lower() for p in platforms.split(",") if p.strip()]
    if not target_platforms:
        raise click.BadParameter("no platforms given", param_hint="--platforms")

    unknown = [p for p in target_platforms if p not in ALL_PLATFORMS]
    if unknown:
        unknown_names = ", ".join(unknown)
        valid_names = ", ".join(ALL_PLATFORMS)
        raise click.BadParameter(
            f"unknown platform(s): {unknown_names} (choose from: {valid_names})",
            param_hint="--platforms",
        )

    videos = load_videos(list(folders), lang_override)

    if not videos:
        console.print("[red]No valid videos found.[/red]")
        return

    for video in videos:
        console.print(f"\n[bold]{'=' * 60}[/bold]")
        console.print(f"[bold]Video:[/bold] {video.video_id}")
        console.print(f"[bold]Source:[/bold] {video.source_language} -> {video.target_language_name}")
        console.print(f"[bold]Link:[/bold] {video.video_link or 'N/A'}")

        for platform in target_platforms:
            console.print(f"\n[cyan]--- {platform.upper()} ---[/cyan]")
            try:
                caption_data = generate_caption(video, platform)
                result = format_caption(caption_data, video, platform)

                if platform == "youtube":
                    # For YouTube the formatted result is a (title, description) pair.
                    title, desc = result
                    console.print(f"[bold]Title:[/bold] {title}")
                    console.print(f"[bold]Description:[/bold]\n{desc}")
                else:
                    console.print(f"[bold]Caption:[/bold]\n{result}")
            except Exception as e:
                # Best effort: report the failure and move on to the next platform.
                console.print(f"[red]Caption generation failed: {e}[/red]")
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# ── Post command ─────────────────────────────────────────────────────────
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@cli.command()
@click.argument("folders", nargs=-1, required=True)
@click.option("--platforms", "-p", default="instagram,tiktok,youtube", help="Comma-separated platforms")
@click.option("--force", is_flag=True, help="Re-post even if already posted")
@click.option("--dry-run", is_flag=True, help="Generate captions but don't post")
@click.option("--delay", default=None, type=int, help="Seconds between posts (default: from env)")
@click.option("--headless/--no-headless", default=None, help="Run browser headlessly")
@click.option("--lang-override", default=None, help="Override target language name (e.g. 'Urdu')")
def post(
    folders: tuple[str, ...],
    platforms: str,
    force: bool,
    dry_run: bool,
    delay: int | None,
    headless: bool | None,
    lang_override: str | None,
):
    """Post dubbed videos to social media platforms."""
    # Thin synchronous wrapper: all the work happens in the async
    # implementation, which receives the options unchanged (folders as a list).
    posting_coro = _post_async(
        list(folders),
        platforms,
        force,
        dry_run,
        delay,
        headless,
        lang_override,
    )
    asyncio.run(posting_coro)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
async def _post_async(
    folders: list[str],
    platforms_str: str,
    force: bool,
    dry_run: bool,
    delay: int | None,
    headless: bool | None,
    lang_override: str | None,
):
    """Post every video found in ``folders`` to each requested platform.

    Args:
        folders: Video folders handed to ``load_videos``.
        platforms_str: Comma-separated platform names (e.g. ``"instagram,tiktok"``).
            Blank entries are ignored and duplicates are collapsed.
        force: Re-post even when the post log already marks the video as posted.
        dry_run: Generate and print captions but do not upload anything.
        delay: Seconds to wait between posts; ``None`` falls back to ``POST_DELAY``.
        headless: Forwarded to ``get_context``; ``None`` lets it pick its default.
        lang_override: Forwarded to ``load_videos``.

    The function prints progress to the console, records every attempted post
    through ``post_log.record`` and finishes by printing a summary table.
    """
    from playwright.async_api import async_playwright

    from poster import post_log
    from poster.auth.session import get_context, has_session
    from poster.caption_gen import format_caption, generate_caption
    from poster.config import POST_DELAY
    from poster.creator_extract import extract_creator
    from poster.models import PostResult
    from poster.platforms.instagram import InstagramPoster
    from poster.platforms.tiktok import TikTokPoster
    from poster.platforms.youtube import YouTubePoster
    from poster.video_loader import load_videos

    # Ignore blank entries ("instagram," or "") and collapse duplicates while
    # keeping the order given by the user; a duplicate would otherwise launch a
    # second browser whose context overwrites the first one in ``contexts``.
    target_platforms = list(
        dict.fromkeys(name.strip() for name in platforms_str.split(",") if name.strip())
    )
    if not target_platforms:
        console.print("[red]No platforms specified.[/red]")
        return
    post_delay = delay if delay is not None else POST_DELAY

    # Validate sessions exist
    for platform in target_platforms:
        if not has_session(platform):
            console.print(
                f"[red]No session for {platform}. "
                f"Run: python post.py login {platform}[/red]"
            )
            return

    # Load videos
    videos = load_videos(folders, lang_override)
    if not videos:
        console.print("[red]No valid videos found.[/red]")
        return

    console.print(f"\n[bold]Posting {len(videos)} video(s) to {', '.join(target_platforms)}[/bold]")
    if dry_run:
        console.print("[yellow]DRY RUN — captions will be generated but nothing will be posted[/yellow]")

    results: list[PostResult] = []
    poster_classes = {
        "instagram": InstagramPoster,
        "tiktok": TikTokPoster,
        "youtube": YouTubePoster,
    }

    async with async_playwright() as pw:
        contexts = {}
        posters = {}
        try:
            # Create browser contexts for each platform
            for platform in target_platforms:
                ctx = await get_context(pw, platform, headless=headless)
                contexts[platform] = ctx
                posters[platform] = poster_classes[platform](ctx)

            # Use first available context for creator extraction
            extract_ctx = next(iter(contexts.values()))

            last_video_index = len(videos) - 1
            last_platform_index = len(target_platforms) - 1

            for i, video in enumerate(videos):
                console.print(f"\n[bold]{'=' * 60}[/bold]")
                console.print(f"[bold]Video {i + 1}/{len(videos)}:[/bold] {video.video_id}")

                # Extract creator handle
                creator_handle = await extract_creator(video.video_link, extract_ctx)

                for j, platform in enumerate(target_platforms):
                    console.print(f"\n[cyan]--- {platform.upper()} ---[/cyan]")

                    # Check if already posted
                    if not force and post_log.is_posted(video.video_id, platform):
                        console.print("[yellow]Already posted — skipping (use --force to re-post)[/yellow]")
                        results.append(PostResult(
                            video_id=video.video_id,
                            platform=platform,
                            status="skipped",
                            timestamp="",
                        ))
                        continue

                    # Generate caption
                    try:
                        caption_data = generate_caption(video, platform, creator_handle)
                        formatted = format_caption(caption_data, video, platform, creator_handle)
                    except Exception as e:
                        console.print(f"[red]Caption generation failed: {e}[/red]")
                        # Surface the failure in the summary table. It is not
                        # written to the post log, so a later run retries it.
                        results.append(PostResult(
                            video_id=video.video_id,
                            platform=platform,
                            status="failed",
                            timestamp="",
                            error=f"Caption generation failed: {e}",
                        ))
                        continue

                    if platform == "youtube":
                        title, description = formatted
                        console.print(f"[dim]Title: {title}[/dim]")
                        console.print(f"[dim]Description: {description[:150]}...[/dim]")
                    else:
                        description = formatted
                        title = None
                        console.print(f"[dim]Caption: {description[:150]}...[/dim]")

                    if dry_run:
                        console.print("[yellow]DRY RUN — skipping actual post[/yellow]")
                        continue

                    # Post
                    poster = posters[platform]
                    if platform == "youtube":
                        result = await poster.post(
                            video.output_path, description,
                            video_id=video.video_id, title=title,
                        )
                    else:
                        result = await poster.post(
                            video.output_path, description,
                            video_id=video.video_id,
                        )

                    result.caption_used = description if isinstance(description, str) else str(description)
                    results.append(result)
                    post_log.record(result)

                    if result.status == "success":
                        console.print(f"[green]Posted to {platform}![/green]")
                    else:
                        console.print(f"[red]Failed: {result.error}[/red]")

                    # Delay between posts; pointless after the very last
                    # (video, platform) pair because nothing follows it.
                    is_final_pair = i == last_video_index and j == last_platform_index
                    if post_delay > 0 and not is_final_pair:
                        console.print(f"[dim]Waiting {post_delay}s before next post...[/dim]")
                        await asyncio.sleep(post_delay)
        finally:
            # Close all browser contexts, also when posting aborted with an
            # error. One browser failing to close must not skip the others.
            for ctx in contexts.values():
                try:
                    await ctx.browser.close()
                except Exception as e:
                    console.print(f"[yellow]Failed to close browser: {e}[/yellow]")

    # Print summary
    _print_summary(results)
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def _print_summary(results: list):
|
| 245 |
+
if not results:
|
| 246 |
+
return
|
| 247 |
+
|
| 248 |
+
table = Table(title="Posting Summary")
|
| 249 |
+
table.add_column("Video", style="bold")
|
| 250 |
+
table.add_column("Platform")
|
| 251 |
+
table.add_column("Status")
|
| 252 |
+
table.add_column("Error")
|
| 253 |
+
|
| 254 |
+
for r in results:
|
| 255 |
+
status_style = {
|
| 256 |
+
"success": "green",
|
| 257 |
+
"failed": "red",
|
| 258 |
+
"skipped": "yellow",
|
| 259 |
+
}.get(r.status, "white")
|
| 260 |
+
|
| 261 |
+
table.add_row(
|
| 262 |
+
r.video_id,
|
| 263 |
+
r.platform,
|
| 264 |
+
f"[{status_style}]{r.status}[/{status_style}]",
|
| 265 |
+
r.error or "",
|
| 266 |
+
)
|
| 267 |
+
|
| 268 |
+
console.print()
|
| 269 |
+
console.print(table)
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
# ── Status command ───────────────────────────────────────────────────────
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
@cli.command()
def status():
    """Show posting history."""
    from poster import post_log

    history = post_log.get_all()
    if not history:
        console.print("[yellow]No posting history yet.[/yellow]")
        return

    table = Table(title="Posting History")
    table.add_column("Video ID", style="bold")
    for heading in ("Platform", "Status", "Timestamp", "Error"):
        table.add_column(heading)

    # Any status other than these two is rendered in white.
    colours = {"success": "green", "failed": "red"}

    # The log maps video id -> platform -> info dict.
    for video_id, per_platform in history.items():
        for platform, info in per_platform.items():
            colour = colours.get(info.get("status", ""), "white")
            status_cell = f"[{colour}]{info.get('status', 'unknown')}[/{colour}]"
            # The first 19 characters of the stored timestamp are shown.
            when = info.get("timestamp", "")[:19]
            problem = info.get("error", "") or ""
            table.add_row(video_id, platform, status_cell, when, problem)

    console.print(table)
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
if __name__ == "__main__":
|
| 311 |
+
cli()
|
social_distributor/poster/__init__.py
ADDED
|
File without changes
|
social_distributor/poster/auth/__init__.py
ADDED
|
File without changes
|
social_distributor/poster/auth/session.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Browser session management — persistent login via Playwright storage state."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from playwright.async_api import BrowserContext, Playwright, async_playwright
|
| 8 |
+
from rich.console import Console
|
| 9 |
+
|
| 10 |
+
from ..config import AUTH_STORAGE_DIR, HEADLESS
|
| 11 |
+
|
| 12 |
+
console = Console()
|
| 13 |
+
|
| 14 |
+
PLATFORM_LOGIN_URLS = {
|
| 15 |
+
"instagram": "https://www.instagram.com/accounts/login/",
|
| 16 |
+
"tiktok": "https://www.tiktok.com/login",
|
| 17 |
+
"youtube": "https://studio.youtube.com/",
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
# Mobile UA for Instagram (required for mobile web Reels upload)
|
| 21 |
+
MOBILE_USER_AGENT = (
|
| 22 |
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
|
| 23 |
+
"AppleWebKit/605.1.15 (KHTML, like Gecko) "
|
| 24 |
+
"Version/17.0 Mobile/15E148 Safari/604.1"
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
DESKTOP_USER_AGENT = (
|
| 28 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
| 29 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
| 30 |
+
"Chrome/120.0.0.0 Safari/537.36"
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _state_path(platform: str) -> Path:
    """Return the storage-state file path for ``platform``.

    The storage directory is created on demand, so the returned path always
    has an existing parent directory.
    """
    storage_dir = AUTH_STORAGE_DIR
    storage_dir.mkdir(parents=True, exist_ok=True)
    filename = f"{platform}_state.json"
    return storage_dir / filename
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def has_session(platform: str) -> bool:
    """Tell whether a saved storage-state file exists for ``platform``."""
    saved_state = _state_path(platform)
    return saved_state.exists()
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
async def interactive_login(platform: str) -> None:
    """Launch a headed browser for the user to log in manually.

    After the user confirms on the terminal, the browser storage state is
    saved to the platform's state file for future use. An unknown platform
    prints an error and returns without opening a browser.

    Args:
        platform: Key of ``PLATFORM_LOGIN_URLS`` ("instagram", "tiktok", "youtube").
    """
    # Local import: this module does not import asyncio at file level.
    import asyncio

    login_url = PLATFORM_LOGIN_URLS.get(platform)
    if not login_url:
        console.print(f"[red]Unknown platform: {platform}[/red]")
        return

    console.print(f"\n[bold]Opening {platform.title()} login page...[/bold]")
    console.print("[yellow]Please log in manually in the browser window.[/yellow]")
    console.print("[yellow]Press Enter here when you're done logging in.[/yellow]\n")

    # Instagram is driven with a mobile profile; the other platforms use desktop.
    use_mobile = platform == "instagram"

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                user_agent=MOBILE_USER_AGENT if use_mobile else DESKTOP_USER_AGENT,
                viewport={"width": 414, "height": 896} if use_mobile else {"width": 1280, "height": 800},
                is_mobile=use_mobile,
                has_touch=use_mobile,
            )

            page = await context.new_page()
            await page.goto(login_url, wait_until="domcontentloaded")

            # Wait for user to finish logging in. input() blocks, so it runs in
            # a worker thread; calling it directly would freeze the event loop
            # that Playwright uses for the whole time the user is logging in.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, input, ">>> Press Enter after you've logged in... ")

            # Save state
            state_file = _state_path(platform)
            await context.storage_state(path=str(state_file))
            console.print(f"[green]Session saved for {platform.title()}![/green]")
        finally:
            # Close the window even when navigation or saving failed.
            await browser.close()
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
async def get_context(
    pw: Playwright,
    platform: str,
    headless: bool | None = None,
) -> BrowserContext:
    """Get a browser context with saved session state.

    Args:
        pw: Running Playwright instance used to launch Chromium.
        platform: Platform whose saved storage state is loaded.
        headless: Whether to run headless; ``None`` uses the ``HEADLESS`` setting.

    Returns:
        A new context on a freshly launched browser. The caller owns the
        browser and closes it through ``context.browser``.

    Raises:
        FileNotFoundError: If no session exists — user must run login first.
    """
    state_file = _state_path(platform)
    if not state_file.exists():
        raise FileNotFoundError(
            f"No saved session for {platform}. Run: python post.py login {platform}"
        )

    if headless is None:
        headless = HEADLESS

    # Instagram is driven with a mobile profile; the other platforms use desktop.
    use_mobile = platform == "instagram"

    browser = await pw.chromium.launch(headless=headless)
    try:
        context = await browser.new_context(
            storage_state=str(state_file),
            user_agent=MOBILE_USER_AGENT if use_mobile else DESKTOP_USER_AGENT,
            viewport={"width": 414, "height": 896} if use_mobile else {"width": 1280, "height": 800},
            is_mobile=use_mobile,
            has_touch=use_mobile,
        )
    except Exception:
        # The caller never receives a handle to this browser, so it has to be
        # closed here before the error (e.g. an unreadable state file) propagates.
        await browser.close()
        raise

    return context
|
social_distributor/poster/caption_gen.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LLM-based caption generation for social media posts."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
from rich.console import Console
|
| 8 |
+
|
| 9 |
+
from .config import (
|
| 10 |
+
INSTAGRAM_CAPTION_LIMIT,
|
| 11 |
+
POLLEN_MODEL,
|
| 12 |
+
TIKTOK_CAPTION_LIMIT,
|
| 13 |
+
YOUTUBE_DESCRIPTION_LIMIT,
|
| 14 |
+
YOUTUBE_TITLE_LIMIT,
|
| 15 |
+
bedrock_converse,
|
| 16 |
+
build_pollinations_client,
|
| 17 |
+
)
|
| 18 |
+
from .models import VideoData
|
| 19 |
+
|
| 20 |
+
console = Console()
|
| 21 |
+
|
| 22 |
+
PLATFORM_LIMITS = {
|
| 23 |
+
"instagram": INSTAGRAM_CAPTION_LIMIT,
|
| 24 |
+
"tiktok": TIKTOK_CAPTION_LIMIT,
|
| 25 |
+
"youtube": YOUTUBE_DESCRIPTION_LIMIT,
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
PLATFORM_HASHTAGS = {
|
| 29 |
+
"instagram": "#Reels #ReelsViral #ExplorePage",
|
| 30 |
+
"tiktok": "#fyp #foryou #foryoupage",
|
| 31 |
+
"youtube": "#Shorts #YouTubeShorts",
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _build_system_prompt(platform: str) -> str:
    """Build the system prompt that instructs the LLM how to write the caption.

    The prompt embeds the platform's character limit (2200 when the platform
    has no entry in ``PLATFORM_LIMITS``), its hashtags, and the JSON shape to
    return: ``title`` + ``description`` for YouTube, ``caption`` otherwise.
    """
    char_limit = PLATFORM_LIMITS.get(platform, 2200)
    platform_label = platform.title()
    platform_tags = PLATFORM_HASHTAGS.get(platform, "")

    if platform == "youtube":
        output_spec = (
            "Return a JSON object with two fields: `title` (under "
            + str(YOUTUBE_TITLE_LIMIT)
            + " chars, punchy) and `description` (the full caption)."
        )
    else:
        output_spec = "Return a JSON object with one field: `caption` (the full caption text)."

    return f"""You are a social media caption writer for VideoVoice, an AI voice-cloning video dubbing tool.

Your job: write a catchy, engaging caption for a dubbed video posted on {platform_label}.

VideoVoice's key differentiator: platform tools give you an option (subtitle overlay). We give you a BRAND NEW video with cloned voice — same speaker, new language. Background music preserved. 23+ languages. "2x Reach, Same Effort."

Rules:
1. Highlight the magic of hearing this content in the target language with the SAME voice (AI voice cloning, not just subtitles)
2. Be conversational, create curiosity, make people want to watch
3. ALWAYS include the original video link to credit the original creator
4. If a creator handle is provided, tag them with @
5. Stay within {char_limit} characters total
6. Include relevant hashtags: #VideoVoice #AIDubbing #VoiceCloning + language-specific + {platform_tags}
7. Write the caption primarily in English

{output_spec}

Example tone: "What's more interesting than hearing the power of English motivation but in the magic of Turkish words? Same voice. Same energy. New language."

IMPORTANT: Return ONLY valid JSON, no markdown fences."""
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _build_user_prompt(video: VideoData, creator_handle: str | None) -> str:
|
| 62 |
+
# Truncate original text to avoid token limits
|
| 63 |
+
original_excerpt = video.original_text[:500]
|
| 64 |
+
translated_excerpt = video.translated_text[:300]
|
| 65 |
+
|
| 66 |
+
parts = [
|
| 67 |
+
f"Source language: {video.source_language}",
|
| 68 |
+
f"Target language: {video.target_language_name} ({video.target_language_code})",
|
| 69 |
+
f"Original transcript (excerpt): {original_excerpt}",
|
| 70 |
+
f"Translated text (excerpt): {translated_excerpt}",
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
if video.video_link:
|
| 74 |
+
parts.append(f"Original video link: {video.video_link}")
|
| 75 |
+
if creator_handle:
|
| 76 |
+
parts.append(f"Original creator: @{creator_handle}")
|
| 77 |
+
|
| 78 |
+
return "\n".join(parts)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _parse_response(raw: str, platform: str) -> dict:
|
| 82 |
+
"""Parse the LLM JSON response, with fallback for markdown fences."""
|
| 83 |
+
raw = raw.strip()
|
| 84 |
+
# Strip markdown code fences if present
|
| 85 |
+
if raw.startswith("```"):
|
| 86 |
+
lines = raw.split("\n")
|
| 87 |
+
lines = [l for l in lines if not l.strip().startswith("```")]
|
| 88 |
+
raw = "\n".join(lines)
|
| 89 |
+
|
| 90 |
+
try:
|
| 91 |
+
return json.loads(raw)
|
| 92 |
+
except json.JSONDecodeError:
|
| 93 |
+
# If JSON parsing fails, treat the whole thing as a caption
|
| 94 |
+
if platform == "youtube":
|
| 95 |
+
return {"title": "Dubbed with AI Voice Cloning", "description": raw}
|
| 96 |
+
return {"caption": raw}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def generate_caption(
    video: VideoData,
    platform: str,
    creator_handle: str | None = None,
) -> dict:
    """Generate a caption with the Pollinations LLM, falling back to Bedrock.

    Returns the parsed model response: a dict with a 'caption' key, or with
    'title' and 'description' for YouTube.

    Raises:
        RuntimeError: When both providers fail; chained from the Bedrock error.
    """
    system_prompt = _build_system_prompt(platform)
    user_prompt = _build_user_prompt(video, creator_handle)
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Primary: Pollinations. Any failure here, including a failure while
    # parsing the reply, falls through to the Bedrock attempt below.
    try:
        completion = build_pollinations_client().chat.completions.create(
            model=POLLEN_MODEL,
            messages=chat_messages,
            temperature=0.7,
        )
        reply = completion.choices[0].message.content
        console.print(f"[green]Caption generated via Pollinations[/green] ({platform})")
        return _parse_response(reply, platform)
    except Exception as pollinations_error:
        console.print(f"[yellow]Pollinations failed: {pollinations_error}. Trying Bedrock...[/yellow]")

    # Fallback: AWS Bedrock
    try:
        reply = bedrock_converse(system_prompt, user_prompt, temperature=0.7)
        console.print(f"[green]Caption generated via Bedrock[/green] ({platform})")
        return _parse_response(reply, platform)
    except Exception as bedrock_error:
        console.print(f"[red]Bedrock also failed: {bedrock_error}[/red]")
        raise RuntimeError(
            f"Caption generation failed for {video.video_id} on {platform}"
        ) from bedrock_error
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def format_caption(
    caption_data: dict,
    video: VideoData,
    platform: str,
    creator_handle: str | None = None,
) -> str | tuple[str, str]:
    """Ensure the final caption always contains the original link and creator credit.

    Args:
        caption_data: Parsed LLM response ('caption', or 'title' + 'description').
        video: Video being posted; only ``video.video_link`` is read here.
        platform: Target platform name.
        creator_handle: Handle of the original creator without the leading "@".

    Returns:
        A string for Instagram/TikTok, or a (title, description) tuple for YouTube.
        Missing, ``None`` or empty values fall back to an empty caption and to
        the title "AI Voice Dubbed".
    """
    is_youtube = platform == "youtube"

    # ``or`` rather than a .get() default: the model may return an explicit
    # JSON null or empty string, which a default value would not replace.
    if is_youtube:
        title = str(caption_data.get("title") or "AI Voice Dubbed")
        desc = caption_data.get("description") or caption_data.get("caption") or ""
    else:
        desc = caption_data.get("caption") or ""
    # Guard the substring checks below against non-string JSON values.
    desc = str(desc)

    # Ensure original link is present
    if video.video_link and video.video_link not in desc:
        desc += f"\n\nOriginal: {video.video_link}"

    # Ensure creator tag is present
    if creator_handle and f"@{creator_handle}" not in desc:
        desc += f"\nCredit: @{creator_handle}"

    if is_youtube:
        return title, desc
    return desc
|
social_distributor/poster/config.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Central configuration — env loading, constants, language maps."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
|
| 9 |
+
load_dotenv(Path(__file__).resolve().parent.parent / ".env")
|
| 10 |
+
|
| 11 |
+
# ── Paths ────────────────────────────────────────────────────────────────
|
| 12 |
+
POSTER_ROOT = Path(__file__).resolve().parent.parent
|
| 13 |
+
VIDEOVOICE_DATA_DIR = Path(
|
| 14 |
+
os.getenv("VIDEOVOICE_DATA_DIR", str(POSTER_ROOT.parent / "data"))
|
| 15 |
+
)
|
| 16 |
+
AUTH_STORAGE_DIR = POSTER_ROOT / "poster" / "auth" / "storage"
|
| 17 |
+
POST_LOG_PATH = POSTER_ROOT / "post_history.json"
|
| 18 |
+
CREATOR_CACHE_PATH = POSTER_ROOT / "creator_cache.json"
|
| 19 |
+
|
| 20 |
+
# ── Pollinations LLM (primary) ───────────────────────────────────────────
|
| 21 |
+
POLLINATIONS_BASE = "https://gen.pollinations.ai/v1"
|
| 22 |
+
POLLEN_MODEL = os.getenv("POLLEN_MODEL", "gemini-search")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def build_pollinations_client() -> OpenAI:
    """Create an OpenAI-compatible client pointed at the Pollinations endpoint.

    The API key is the first non-empty value among the environment variables
    listed below, in that order; without any of them the literal
    "pollinations" is used.
    """
    env_names = ("POLLEN_API_KEY_SECONDARY", "POLLEN_API_KEY", "POLLINATIONS_API_KEY")
    configured = (os.getenv(name) for name in env_names)
    api_key = next((value for value in configured if value), "pollinations")
    return OpenAI(base_url=POLLINATIONS_BASE, api_key=api_key)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ── Bedrock fallback ─────────────────────────────────────────────────────
|
| 36 |
+
BEDROCK_REGION = os.getenv("AWS_REGION", "us-east-1")
|
| 37 |
+
BEDROCK_MODEL = os.getenv("BEDROCK_MODEL", "qwen.qwen3-next-80b-a3b")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def bedrock_converse(system_prompt: str, user_text: str, temperature: float = 0.3) -> str:
    """Send one system + user message pair to Bedrock and return the reply text.

    Args:
        system_prompt: Text passed as the system prompt.
        user_text: Text of the single user message.
        temperature: Sampling temperature passed in ``inferenceConfig``.

    Returns:
        The first text block of the model's reply, stripped of surrounding
        whitespace.

    Raises:
        RuntimeError: If the reply contains no text block.
    """
    import boto3

    client = boto3.client("bedrock-runtime", region_name=BEDROCK_REGION)
    response = client.converse(
        modelId=BEDROCK_MODEL,
        messages=[{"role": "user", "content": [{"text": user_text}]}],
        system=[{"text": system_prompt}],
        inferenceConfig={"temperature": temperature},
    )

    # Content blocks are a union of kinds, so the first block is not
    # guaranteed to carry "text"; take the first block that does.
    for block in response["output"]["message"]["content"]:
        text = block.get("text")
        if text is not None:
            return text.strip()
    raise RuntimeError("Bedrock response contained no text content block")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ── Language code → name (reversed from pipeline.py LANGUAGE_CODES) ──────
|
| 54 |
+
LANGUAGE_CODE_TO_NAME: dict[str, str] = {
|
| 55 |
+
"ar": "Arabic",
|
| 56 |
+
"zh": "Chinese",
|
| 57 |
+
"da": "Danish",
|
| 58 |
+
"nl": "Dutch",
|
| 59 |
+
"en": "English",
|
| 60 |
+
"fi": "Finnish",
|
| 61 |
+
"fr": "French",
|
| 62 |
+
"de": "German",
|
| 63 |
+
"el": "Greek",
|
| 64 |
+
"he": "Hebrew",
|
| 65 |
+
"hi": "Hindi",
|
| 66 |
+
"it": "Italian",
|
| 67 |
+
"ja": "Japanese",
|
| 68 |
+
"ko": "Korean",
|
| 69 |
+
"ms": "Malay",
|
| 70 |
+
"no": "Norwegian",
|
| 71 |
+
"pl": "Polish",
|
| 72 |
+
"pt": "Portuguese",
|
| 73 |
+
"ru": "Russian",
|
| 74 |
+
"es": "Spanish",
|
| 75 |
+
"sw": "Swahili",
|
| 76 |
+
"sv": "Swedish",
|
| 77 |
+
"tr": "Turkish",
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
# ── Platform caption limits ──────────────────────────────────────────────
|
| 81 |
+
INSTAGRAM_CAPTION_LIMIT = 2200
|
| 82 |
+
TIKTOK_CAPTION_LIMIT = 4000
|
| 83 |
+
YOUTUBE_TITLE_LIMIT = 100
|
| 84 |
+
YOUTUBE_DESCRIPTION_LIMIT = 5000
|
| 85 |
+
|
| 86 |
+
# ── Posting settings ─────────────────────────────────────────────────────
|
| 87 |
+
POST_DELAY = int(os.getenv("POST_DELAY", "30"))
|
| 88 |
+
HEADLESS = os.getenv("HEADLESS", "true").lower() == "true"
|
social_distributor/poster/creator_extract.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Extract original creator @username from video URLs."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
from rich.console import Console
|
| 9 |
+
|
| 10 |
+
from .config import CREATOR_CACHE_PATH
|
| 11 |
+
|
| 12 |
+
console = Console()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _load_cache() -> dict[str, str]:
    """Load the video-link -> creator-username cache from disk.

    Returns an empty dict when the cache file does not exist, cannot be read,
    is not valid JSON, or does not contain a JSON object. A damaged cache only
    costs repeated page visits, so it must not abort creator extraction.
    """
    if not CREATOR_CACHE_PATH.exists():
        return {}

    try:
        with open(CREATOR_CACHE_PATH, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        console.print(f"[yellow]Ignoring unreadable creator cache: {e}[/yellow]")
        return {}

    return data if isinstance(data, dict) else {}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _save_cache(cache: dict[str, str]) -> None:
    """Write the creator cache to ``CREATOR_CACHE_PATH`` as indented JSON."""
    with CREATOR_CACHE_PATH.open("w") as handle:
        json.dump(cache, handle, indent=2)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
async def extract_creator(video_link: str | None, browser_context=None) -> str | None:
    """Look up the original creator's username for a video URL.

    A cached answer is returned when available. Otherwise the platform is
    picked from the host name found in the link and the matching extractor
    visits the page through ``browser_context``. Extraction errors are printed
    and swallowed. Only non-empty results are stored in the cache.

    Returns:
        The username without a leading "@", or ``None`` when there is no link
        or nothing could be extracted.
    """
    if not video_link:
        return None

    known = _load_cache()
    if video_link in known:
        return known[video_link]

    handle = None
    try:
        if "instagram.com" in video_link:
            extractor = _extract_instagram
        elif "tiktok.com" in video_link:
            extractor = _extract_tiktok
        elif any(host in video_link for host in ("youtube.com", "youtu.be")):
            extractor = _extract_youtube
        else:
            extractor = None

        if extractor is not None:
            handle = await extractor(video_link, browser_context)
    except Exception as e:
        console.print(f"[yellow]Creator extraction failed: {e}[/yellow]")

    if not handle:
        return handle

    # Clean up username
    handle = handle.strip().lstrip("@")
    known[video_link] = handle
    _save_cache(known)
    console.print(f"[green]Creator found:[/green] @{handle}")
    return handle
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
async def _extract_instagram(url: str, ctx) -> str | None:
|
| 63 |
+
"""Instagram: visit reel, extract username from og:title or page URL."""
|
| 64 |
+
if not ctx:
|
| 65 |
+
return None
|
| 66 |
+
|
| 67 |
+
page = await ctx.new_page()
|
| 68 |
+
try:
|
| 69 |
+
await page.goto(url, wait_until="domcontentloaded", timeout=15000)
|
| 70 |
+
await page.wait_for_timeout(2000)
|
| 71 |
+
|
| 72 |
+
# Try og:title meta tag: "Username on Instagram: ..."
|
| 73 |
+
og_title = await page.query_selector('meta[property="og:title"]')
|
| 74 |
+
if og_title:
|
| 75 |
+
content = await og_title.get_attribute("content")
|
| 76 |
+
if content:
|
| 77 |
+
# Pattern: "Username on Instagram" or "@username"
|
| 78 |
+
match = re.match(r"^@?(\w[\w.]+)", content)
|
| 79 |
+
if match:
|
| 80 |
+
return match.group(1)
|
| 81 |
+
|
| 82 |
+
# Try the final URL which may contain /username/reel/ID
|
| 83 |
+
final_url = page.url
|
| 84 |
+
match = re.search(r"instagram\.com/([^/]+)/reel", final_url)
|
| 85 |
+
if match:
|
| 86 |
+
return match.group(1)
|
| 87 |
+
|
| 88 |
+
finally:
|
| 89 |
+
await page.close()
|
| 90 |
+
|
| 91 |
+
return None
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
async def _extract_tiktok(url: str, ctx) -> str | None:
|
| 95 |
+
"""TikTok: follow redirect from short URL, parse /@username from final URL."""
|
| 96 |
+
if not ctx:
|
| 97 |
+
return None
|
| 98 |
+
|
| 99 |
+
page = await ctx.new_page()
|
| 100 |
+
try:
|
| 101 |
+
await page.goto(url, wait_until="domcontentloaded", timeout=15000)
|
| 102 |
+
await page.wait_for_timeout(2000)
|
| 103 |
+
|
| 104 |
+
final_url = page.url
|
| 105 |
+
match = re.search(r"/@([^/]+)", final_url)
|
| 106 |
+
if match:
|
| 107 |
+
return match.group(1)
|
| 108 |
+
|
| 109 |
+
# Fallback: check meta tags
|
| 110 |
+
og_title = await page.query_selector('meta[property="og:title"]')
|
| 111 |
+
if og_title:
|
| 112 |
+
content = await og_title.get_attribute("content")
|
| 113 |
+
if content:
|
| 114 |
+
match = re.search(r"@(\w[\w.]+)", content)
|
| 115 |
+
if match:
|
| 116 |
+
return match.group(1)
|
| 117 |
+
finally:
|
| 118 |
+
await page.close()
|
| 119 |
+
|
| 120 |
+
return None
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
async def _extract_youtube(url: str, ctx) -> str | None:
|
| 124 |
+
"""YouTube: visit video page, extract channel name from meta tags."""
|
| 125 |
+
if not ctx:
|
| 126 |
+
return None
|
| 127 |
+
|
| 128 |
+
page = await ctx.new_page()
|
| 129 |
+
try:
|
| 130 |
+
await page.goto(url, wait_until="domcontentloaded", timeout=15000)
|
| 131 |
+
await page.wait_for_timeout(2000)
|
| 132 |
+
|
| 133 |
+
# Try link[itemprop="name"] inside the channel section
|
| 134 |
+
author = await page.query_selector('link[itemprop="name"]')
|
| 135 |
+
if author:
|
| 136 |
+
name = await author.get_attribute("content")
|
| 137 |
+
if name:
|
| 138 |
+
return name
|
| 139 |
+
|
| 140 |
+
# Fallback: og:title often has "Video Title - Channel Name"
|
| 141 |
+
og_title = await page.query_selector('meta[property="og:title"]')
|
| 142 |
+
if og_title:
|
| 143 |
+
content = await og_title.get_attribute("content")
|
| 144 |
+
if content and " - " in content:
|
| 145 |
+
return content.rsplit(" - ", 1)[-1].strip()
|
| 146 |
+
finally:
|
| 147 |
+
await page.close()
|
| 148 |
+
|
| 149 |
+
return None
|
social_distributor/poster/models.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for the poster pipeline."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass
class VideoData:
    """One dubbed video together with the texts and metadata used for posting."""

    # Identifier of the video; used in logs and the post history.
    video_id: str
    # Path of the video file handed to the platform posters.
    output_path: str
    # URL of the original video, if known.
    video_link: str | None
    source_language: str
    target_language_code: str
    target_language_name: str
    # Transcript of the original audio and its translation.
    original_text: str
    translated_text: str
    platform_type: str | None  # "instagram" | "tiktok" | "youtube" | None
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class PostResult:
    """Outcome of one attempt to post one video to one platform."""

    video_id: str
    platform: str
    status: str  # "success" | "failed" | "skipped"
    timestamp: str
    # Caption text that was submitted with the post.
    caption_used: str = ""
    # Failure description; None when there is nothing to report.
    error: str | None = None
    # Optional URL associated with the result.
    url: str | None = None
|
social_distributor/poster/platforms/__init__.py
ADDED
|
File without changes
|
social_distributor/poster/platforms/base.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Abstract base class for platform posters."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
import random
|
| 7 |
+
from abc import ABC, abstractmethod
|
| 8 |
+
from datetime import datetime, timezone
|
| 9 |
+
|
| 10 |
+
from playwright.async_api import BrowserContext, Page
|
| 11 |
+
|
| 12 |
+
from ..models import PostResult
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BasePoster(ABC):
    """Shared scaffolding for the Playwright-driven platform posters.

    Concrete posters set ``platform`` and implement :meth:`post` and
    :meth:`is_logged_in`; the underscore helpers below are available to them.
    """

    platform: str = ""

    def __init__(self, context: BrowserContext):
        # Browser context from which subclasses open their pages.
        self.context = context

    @abstractmethod
    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path`` with ``caption`` and return the outcome."""

    @abstractmethod
    async def is_logged_in(self) -> bool:
        """Report whether the browser context holds a logged-in session."""

    async def _human_delay(self, min_s: float = 1.0, max_s: float = 3.0) -> None:
        """Sleep for a random duration drawn uniformly from [min_s, max_s] seconds."""
        pause = random.uniform(min_s, max_s)
        await asyncio.sleep(pause)

    async def _screenshot_on_error(self, page: Page, video_id: str) -> None:
        """Write a PNG of ``page`` into ``POSTER_ROOT / "errors"`` for debugging."""
        from ..config import POSTER_ROOT

        target_dir = POSTER_ROOT / "errors"
        target_dir.mkdir(exist_ok=True)
        stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        # File name carries platform, video id and a UTC timestamp.
        shot = target_dir / f"{self.platform}_{video_id}_{stamp}.png"
        await page.screenshot(path=str(shot))

    def _make_result(
        self,
        video_id: str,
        status: str,
        caption: str = "",
        error: str | None = None,
        url: str | None = None,
    ) -> PostResult:
        """Build a :class:`PostResult` for this platform, stamped with the current UTC time."""
        now_iso = datetime.now(timezone.utc).isoformat()
        return PostResult(
            video_id=video_id,
            platform=self.platform,
            status=status,
            timestamp=now_iso,
            caption_used=caption,
            error=error,
            url=url,
        )
|
social_distributor/poster/platforms/instagram.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Instagram Reel posting via Playwright (mobile web viewport)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from rich.console import Console
|
| 6 |
+
|
| 7 |
+
from .base import BasePoster
|
| 8 |
+
from ..models import PostResult
|
| 9 |
+
|
| 10 |
+
console = Console()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class InstagramPoster(BasePoster):
    """Publish a video as an Instagram Reel by driving instagram.com with Playwright."""

    platform = "instagram"

    _HOME_URL = "https://www.instagram.com/"
    _LOGIN_FIELD = 'input[name="username"]'
    _FILE_INPUT = 'input[type="file"]'
    _POPUP_LABELS = ("Not Now", "Cancel", "Accept All", "Allow All Cookies", "Decline")
    _CREATE_SELECTORS = (
        'svg[aria-label="New post"]',
        'a[href="/create/"]',
        'div[role="menuitem"] svg[aria-label*="New"]',
        'a[href="/create/select/"]',
        '[aria-label="New post"]',
        'svg[aria-label="New Post"]',
    )
    _CAPTION_SELECTORS = (
        'textarea[aria-label*="Write a caption"]',
        'textarea[placeholder*="Write a caption"]',
        'div[contenteditable="true"][role="textbox"]',
        'div[aria-label*="Write a caption"]',
    )
    _SHARED_MARKERS = ", ".join(
        (
            'div:has-text("Your reel has been shared")',
            'div:has-text("Reel shared")',
            'span:has-text("Your reel has been shared")',
            'img[alt="Animated checkmark"]',
        )
    )

    async def is_logged_in(self) -> bool:
        """Return True when the home page shows navigation and no login form.

        Any exception while probing is reported as "not logged in".
        """
        tab = await self.context.new_page()
        try:
            await tab.goto(self._HOME_URL, wait_until="domcontentloaded", timeout=15000)
            await tab.wait_for_timeout(3000)
            if await tab.query_selector(self._LOGIN_FIELD):
                return False
            nav = await tab.query_selector('nav, div[role="navigation"]')
            return nav is not None
        except Exception:
            return False
        finally:
            await tab.close()

    async def _dismiss_popups(self, page) -> None:
        """Click away common Instagram popups (notifications, app switch, cookies).

        Best effort: a failure on one button is ignored and the next is tried.
        """
        for label in self._POPUP_LABELS:
            try:
                button = await page.query_selector(f'button:has-text("{label}")')
                if not button or not await button.is_visible():
                    continue
                await button.click()
                await self._human_delay(0.5, 1)
            except Exception:
                pass

    async def _open_create_dialog(self, page) -> None:
        """Click the first matching "new post" control, else navigate to the create URL."""
        for selector in self._CREATE_SELECTORS:
            control = await page.query_selector(selector)
            if control:
                await control.click()
                return
        await page.goto("https://www.instagram.com/create/select/", wait_until="domcontentloaded")
        await page.wait_for_timeout(2000)

    async def _find_file_input(self, page):
        """Return the upload ``<input type="file">`` handle, or None if absent.

        The input is hidden by default, so the wait targets the "attached"
        state rather than visibility; on timeout the DOM is queried directly.
        """
        try:
            return await page.wait_for_selector(
                self._FILE_INPUT,
                state="attached",
                timeout=15000,
            )
        except Exception:
            return await page.query_selector(self._FILE_INPUT)

    async def _advance_through_editor(self, page) -> None:
        """Press "Next" up to three times (crop, filters, ...), stopping when it disappears."""
        for _ in range(3):
            next_btn = await page.query_selector(
                'button:has-text("Next"), div[role="button"]:has-text("Next")'
            )
            if not next_btn:
                break
            await next_btn.click()
            await self._human_delay(2, 3)
            await self._dismiss_popups(page)

    async def _type_caption(self, page, caption: str) -> bool:
        """Type ``caption`` into the first caption field found; return whether one was found."""
        for selector in self._CAPTION_SELECTORS:
            field_el = await page.query_selector(selector)
            if not field_el:
                continue
            await field_el.click()
            await self._human_delay(0.5, 1)
            await page.keyboard.type(caption, delay=10)
            return True
        return False

    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path`` as a Reel with ``caption``.

        Reads ``video_id`` from ``kwargs`` (default "unknown"). Always returns
        a PostResult; unexpected exceptions become a "failed" result carrying
        the exception text. The page is closed in every case.
        """
        video_id = kwargs.get("video_id", "unknown")
        page = await self.context.new_page()

        try:
            console.print("[cyan]Instagram:[/cyan] Navigating to Instagram...")
            await page.goto(self._HOME_URL, wait_until="domcontentloaded", timeout=20000)
            await page.wait_for_timeout(3000)

            await self._dismiss_popups(page)

            if await page.query_selector(self._LOGIN_FIELD):
                return self._make_result(video_id, "failed", caption, error="Not logged in — session expired")

            await self._human_delay(1, 2)

            console.print("[cyan]Instagram:[/cyan] Opening create dialog...")
            await self._open_create_dialog(page)

            await self._human_delay(2, 3)
            await self._dismiss_popups(page)

            console.print("[cyan]Instagram:[/cyan] Uploading video...")
            file_input = await self._find_file_input(page)
            if not file_input:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="File input not found in DOM")

            # Unhide the input via JS as a safety measure, then set the file.
            await page.evaluate(
                """el => {
                    el.style.display = 'block';
                    el.style.opacity = '1';
                    el.style.visibility = 'visible';
                }""",
                file_input,
            )
            await file_input.set_input_files(video_path)

            await self._human_delay(3, 5)

            # Instagram may show an aspect ratio / crop screen; pick the Reel tab if present.
            reel_tab = await page.query_selector('div:has-text("Reel"), button:has-text("Reel")')
            if reel_tab:
                await reel_tab.click()
                await self._human_delay(1, 2)

            await self._advance_through_editor(page)

            console.print("[cyan]Instagram:[/cyan] Adding caption...")
            if not await self._type_caption(page, caption):
                console.print("[yellow]Instagram: Could not find caption field[/yellow]")

            await self._human_delay(2, 3)

            console.print("[cyan]Instagram:[/cyan] Sharing...")
            share_btn = await page.query_selector(
                'button:has-text("Share"), div[role="button"]:has-text("Share")'
            )
            if not share_btn:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Could not find Share button")
            await share_btn.click()

            console.print("[cyan]Instagram:[/cyan] Waiting for upload to complete...")
            await page.wait_for_timeout(10000)

            try:
                await page.wait_for_selector(self._SHARED_MARKERS, timeout=60000)
                console.print("[green]Instagram: Reel shared successfully![/green]")
                return self._make_result(video_id, "success", caption)
            except Exception:
                # No confirmation seen. A URL without "/create" (which includes
                # the bare home URL) is treated as "probably posted".
                if "/create" not in page.url:
                    console.print("[green]Instagram: Likely posted (redirected to feed)[/green]")
                    return self._make_result(video_id, "success", caption)

                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Share confirmation not detected")

        except Exception as exc:
            try:
                await self._screenshot_on_error(page, video_id)
            except Exception:
                pass
            return self._make_result(video_id, "failed", caption, error=str(exc))
        finally:
            await page.close()
|
social_distributor/poster/platforms/tiktok.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""TikTok video posting via Playwright (tiktok.com/upload)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from rich.console import Console
|
| 6 |
+
|
| 7 |
+
from .base import BasePoster
|
| 8 |
+
from ..models import PostResult
|
| 9 |
+
|
| 10 |
+
console = Console()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class TikTokPoster(BasePoster):
    """Post a video to TikTok by driving tiktok.com/upload with Playwright."""

    platform = "tiktok"

    @staticmethod
    def _select_all_shortcut() -> str:
        """Return the select-all key chord for the machine running the browser.

        "Meta+a" selects all only on macOS; elsewhere the chord is "Control+a".
        NOTE(review): assumes the browser runs on the same host as this code.
        """
        import sys  # local import: the file-level imports are outside this block

        return "Meta+a" if sys.platform == "darwin" else "Control+a"

    async def is_logged_in(self) -> bool:
        """Return True when the upload page loads without a login redirect.

        Any exception while probing is reported as "not logged in".
        """
        page = await self.context.new_page()
        try:
            await page.goto("https://www.tiktok.com/upload", wait_until="domcontentloaded", timeout=15000)
            await page.wait_for_timeout(3000)
            # If redirected to login, we're not logged in.
            if "/login" in page.url:
                return False
            # The upload page exposes a file input when a session is active.
            upload_area = await page.query_selector('input[type="file"]')
            return upload_area is not None
        except Exception:
            return False
        finally:
            await page.close()

    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path`` with ``caption``.

        Reads ``video_id`` from ``kwargs`` (default "unknown"). Always returns
        a PostResult; unexpected exceptions become a "failed" result carrying
        the exception text. The page is closed in every case.
        """
        video_id = kwargs.get("video_id", "unknown")
        page = await self.context.new_page()

        try:
            console.print("[cyan]TikTok:[/cyan] Navigating to upload page...")
            await page.goto("https://www.tiktok.com/upload", wait_until="domcontentloaded", timeout=20000)
            await page.wait_for_timeout(3000)

            if "/login" in page.url:
                return self._make_result(video_id, "failed", caption, error="Not logged in — session expired")

            # Upload video via file input. Upload inputs are commonly hidden,
            # so wait for the element to be attached rather than visible;
            # set_input_files() works on hidden inputs.
            console.print("[cyan]TikTok:[/cyan] Uploading video...")
            file_input = await page.wait_for_selector(
                'input[type="file"]',
                state="attached",
                timeout=10000,
            )
            await file_input.set_input_files(video_path)

            await self._human_delay(3, 5)

            # Wait for the editor/preview that appears once the upload is
            # processed; on timeout give it a further fixed grace period.
            try:
                await page.wait_for_selector(
                    'div[class*="editor"], div[class*="preview"], div[class*="video-card"]',
                    timeout=60000,
                )
            except Exception:
                console.print("[yellow]TikTok: Waiting for upload processing...[/yellow]")
                await page.wait_for_timeout(10000)

            await self._human_delay(2, 4)

            # Fill in the caption (a contenteditable div; several selectors are tried).
            console.print("[cyan]TikTok:[/cyan] Adding caption...")
            caption_selectors = [
                'div[contenteditable="true"]',
                'div[data-placeholder*="caption"]',
                'div[class*="caption"] div[contenteditable="true"]',
                '.public-DraftEditor-content',
            ]

            caption_editor = None
            for selector in caption_selectors:
                caption_editor = await page.query_selector(selector)
                if caption_editor:
                    break

            if caption_editor:
                await caption_editor.click()
                await self._human_delay(0.5, 1)
                # Clear existing text, using the select-all chord of the host OS.
                await page.keyboard.press(self._select_all_shortcut())
                await page.keyboard.press("Backspace")
                await self._human_delay(0.3, 0.5)
                await page.keyboard.type(caption, delay=10)
            else:
                console.print("[yellow]TikTok: Could not find caption editor[/yellow]")

            await self._human_delay(2, 3)

            # Click the Post button: first enabled match wins.
            console.print("[cyan]TikTok:[/cyan] Posting...")
            post_button_selectors = [
                'button:has-text("Post")',
                'button[class*="post-button"]',
                'div[class*="btn-post"] button',
            ]

            posted = False
            for selector in post_button_selectors:
                btn = await page.query_selector(selector)
                if btn and await btn.is_enabled():
                    await btn.click()
                    posted = True
                    break

            if not posted:
                # Fallback: locate the button by accessible role and name.
                try:
                    await page.get_by_role("button", name="Post").click()
                    posted = True
                except Exception:
                    pass

            if not posted:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Could not find Post button")

            console.print("[cyan]TikTok:[/cyan] Waiting for upload to complete...")
            await page.wait_for_timeout(10000)

            # Success is either a confirmation element or leaving the upload URL.
            success = False
            try:
                await page.wait_for_selector(
                    'div:has-text("uploaded"), div:has-text("Your video"), div[class*="success"]',
                    timeout=30000,
                )
                success = True
            except Exception:
                if "/upload" not in page.url:
                    success = True

            if success:
                console.print("[green]TikTok: Posted successfully![/green]")
                return self._make_result(video_id, "success", caption)

            await self._screenshot_on_error(page, video_id)
            return self._make_result(video_id, "failed", caption, error="Upload may not have completed")

        except Exception as e:
            try:
                await self._screenshot_on_error(page, video_id)
            except Exception:
                pass
            return self._make_result(video_id, "failed", caption, error=str(e))
        finally:
            await page.close()
|
social_distributor/poster/platforms/youtube.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""YouTube Shorts posting via Playwright (studio.youtube.com)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from rich.console import Console
|
| 6 |
+
|
| 7 |
+
from .base import BasePoster
|
| 8 |
+
from ..models import PostResult
|
| 9 |
+
|
| 10 |
+
console = Console()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class YouTubePoster(BasePoster):
    """Publish a video through YouTube Studio by driving studio.youtube.com with Playwright."""

    platform = "youtube"

    @staticmethod
    def _select_all_shortcut() -> str:
        """Return the select-all key chord for the machine running the browser.

        "Meta+a" selects all only on macOS; elsewhere the chord is "Control+a".
        NOTE(review): assumes the browser runs on the same host as this code.
        """
        import sys  # local import: the file-level imports are outside this block

        return "Meta+a" if sys.platform == "darwin" else "Control+a"

    async def is_logged_in(self) -> bool:
        """Return True when Studio loads without a Google sign-in redirect and shows Create.

        Any exception while probing is reported as "not logged in".
        """
        page = await self.context.new_page()
        try:
            await page.goto("https://studio.youtube.com/", wait_until="domcontentloaded", timeout=15000)
            await page.wait_for_timeout(3000)
            if "accounts.google.com" in page.url:
                return False
            # Look for the Create button in YouTube Studio.
            create_btn = await page.query_selector('#create-icon, button[aria-label="Create"]')
            return create_btn is not None
        except Exception:
            return False
        finally:
            await page.close()

    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path``, using ``caption`` as the description.

        Reads ``video_id`` (default "unknown") and ``title`` (default
        "AI Voice Dubbed", truncated to 100 characters) from ``kwargs``.
        Always returns a PostResult; unexpected exceptions become a "failed"
        result carrying the exception text. The page is closed in every case.
        """
        video_id = kwargs.get("video_id", "unknown")
        title = kwargs.get("title", "AI Voice Dubbed")
        page = await self.context.new_page()

        try:
            console.print("[cyan]YouTube:[/cyan] Navigating to YouTube Studio...")
            await page.goto("https://studio.youtube.com/", wait_until="domcontentloaded", timeout=20000)
            await page.wait_for_timeout(3000)

            if "accounts.google.com" in page.url:
                return self._make_result(video_id, "failed", caption, error="Not logged in — session expired")

            # Click Create button.
            console.print("[cyan]YouTube:[/cyan] Opening upload dialog...")
            create_btn = await page.wait_for_selector(
                '#create-icon, button[aria-label="Create"]', timeout=10000
            )
            await create_btn.click()
            await self._human_delay(1, 2)

            # Click "Upload videos".
            upload_option = await page.wait_for_selector(
                'tp-yt-paper-item:has-text("Upload videos"), #text-item-0', timeout=5000
            )
            await upload_option.click()
            await self._human_delay(1, 2)

            # Upload video file. Upload inputs are commonly hidden, so wait
            # for the element to be attached rather than visible;
            # set_input_files() works on hidden inputs.
            console.print("[cyan]YouTube:[/cyan] Uploading video...")
            file_input = await page.wait_for_selector(
                'input[type="file"]',
                state="attached",
                timeout=10000,
            )
            await file_input.set_input_files(video_path)

            await self._human_delay(3, 5)

            # Wait for the details form; on timeout allow a further grace period.
            try:
                await page.wait_for_selector(
                    '#textbox[aria-label*="title"], div[id="textbox"]',
                    timeout=60000,
                )
            except Exception:
                console.print("[yellow]YouTube: Waiting for upload form...[/yellow]")
                await page.wait_for_timeout(10000)

            await self._human_delay(1, 2)

            # Fill in title: select what is already in the box so typing replaces it.
            console.print("[cyan]YouTube:[/cyan] Setting title and description...")
            title_input = await page.query_selector('#textbox[aria-label*="title"]')
            if title_input:
                await title_input.click()
                await page.keyboard.press(self._select_all_shortcut())
                await page.keyboard.type(title[:100], delay=10)

            await self._human_delay(1, 2)

            # Fill in description.
            desc_input = await page.query_selector(
                '#textbox[aria-label*="description"], '
                'div[aria-label*="Tell viewers about your video"]'
            )
            if desc_input:
                await desc_input.click()
                await page.keyboard.type(caption, delay=5)

            await self._human_delay(1, 2)

            # Handle "Made for kids": select "No, it's not made for kids".
            not_for_kids = await page.query_selector(
                'tp-yt-paper-radio-button[name="NOT_MADE_FOR_KIDS"], '
                '#radioLabel:has-text("not made for kids")'
            )
            if not_for_kids:
                await not_for_kids.click()
                await self._human_delay(0.5, 1)

            # Click Next through the wizard steps (Elements, Checks, Visibility).
            for step_name in ["Elements", "Checks", "Visibility"]:
                console.print(f"[cyan]YouTube:[/cyan] Step: {step_name}...")
                next_btn = await page.query_selector('#next-button, button:has-text("Next")')
                if next_btn:
                    await next_btn.click()
                    await self._human_delay(2, 3)

            # Set visibility to Public.
            public_radio = await page.query_selector(
                'tp-yt-paper-radio-button[name="PUBLIC"], '
                '#radioLabel:has-text("Public")'
            )
            if public_radio:
                await public_radio.click()
                await self._human_delay(1, 2)

            # Click Publish / Done.
            console.print("[cyan]YouTube:[/cyan] Publishing...")
            publish_btn = await page.query_selector(
                '#done-button, button:has-text("Publish"), button:has-text("Done")'
            )
            if publish_btn:
                await publish_btn.click()
            else:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Could not find Publish button")

            # Wait for publish confirmation.
            await page.wait_for_timeout(10000)

            # Success is a "Video published" element or a link containing "youtu".
            try:
                await page.wait_for_selector(
                    'div:has-text("Video published"), a[href*="youtu"]',
                    timeout=30000,
                )
                # Try to extract the video URL; it stays None when no link is found.
                link_el = await page.query_selector('a[href*="youtu.be"], a[href*="youtube.com/watch"]')
                video_url = None
                if link_el:
                    video_url = await link_el.get_attribute("href")

                console.print("[green]YouTube: Published successfully![/green]")
                return self._make_result(video_id, "success", caption, url=video_url)
            except Exception:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Publish confirmation not detected")

        except Exception as e:
            try:
                await self._screenshot_on_error(page, video_id)
            except Exception:
                pass
            return self._make_result(video_id, "failed", caption, error=str(e))
        finally:
            await page.close()
|
social_distributor/poster/post_log.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""JSON-based posting history for deduplication."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
|
| 8 |
+
from .config import POST_LOG_PATH
|
| 9 |
+
from .models import PostResult
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _load() -> dict:
    """Return the parsed posting log, or an empty dict when the log file does not exist."""
    if not POST_LOG_PATH.exists():
        return {}
    with open(POST_LOG_PATH) as handle:
        history = json.load(handle)
    return history
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _save(data: dict) -> None:
    """Write ``data`` to the posting log as indented JSON, replacing the old log.

    The JSON is first written to a sibling ``<name>.tmp`` file and then moved
    over the real log, so a serialisation error or an interrupted write cannot
    leave a truncated log behind (which would make every later load fail).

    Raises whatever ``open`` / ``json.dump`` raise; the existing log is left
    untouched in that case.
    """
    tmp_path = POST_LOG_PATH.with_name(POST_LOG_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=2)
        # Swap the finished file into place in a single rename.
        tmp_path.replace(POST_LOG_PATH)
    except BaseException:
        # Do not leave a half-written temp file next to the log.
        tmp_path.unlink(missing_ok=True)
        raise
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def is_posted(video_id: str, platform: str) -> bool:
    """Return True when the log records a "success" status for this video on this platform."""
    per_video = _load().get(video_id, {})
    logged = per_video.get(platform, {})
    return logged.get("status") == "success"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def record(result: PostResult) -> None:
    """Store ``result`` in the log under its video id and platform, then save the log.

    An existing entry for the same video/platform pair is overwritten.
    """
    history = _load()
    entry = {
        "status": result.status,
        "timestamp": result.timestamp,
        "caption": result.caption_used,
        "error": result.error,
        "url": result.url,
    }
    # setdefault creates the per-video mapping on first use.
    history.setdefault(result.video_id, {})[result.platform] = entry
    _save(history)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def get_all() -> dict:
    """Return the whole posting log as loaded from disk."""
    history = _load()
    return history
|
social_distributor/poster/video_loader.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Load and validate VideoVoice data folders into VideoData objects."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
from rich.console import Console
|
| 10 |
+
|
| 11 |
+
from .config import LANGUAGE_CODE_TO_NAME, VIDEOVOICE_DATA_DIR
|
| 12 |
+
from .models import VideoData
|
| 13 |
+
|
| 14 |
+
console = Console()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _detect_platform(video_link: str | None) -> str | None:
    """Guess the source platform from a video URL.

    Checks run in this order and the first match wins: a ``/reel/`` or
    ``/reels/`` path segment means "instagram", "tiktok.com" means "tiktok",
    "youtube.com" or "youtu.be" means "youtube". Returns None for an empty or
    missing link and for any link that matches none of them.
    """
    if not video_link:
        return None

    if re.search(r"/reels?/", video_link):
        platform = "instagram"
    elif "tiktok.com" in video_link:
        platform = "tiktok"
    elif any(host in video_link for host in ("youtube.com", "youtu.be")):
        platform = "youtube"
    else:
        platform = None
    return platform
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _read_json(path: Path):
    """Parse ``path`` as UTF-8 encoded JSON and return the decoded value."""
    # Explicit UTF-8: the files carry transcript text, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def load_video(folder_name: str, lang_override: str | None = None) -> VideoData | None:
    """Load a single video folder. Returns None if the folder is invalid.

    A folder is valid when it exists under ``VIDEOVOICE_DATA_DIR`` and holds
    ``output.mp4`` and a readable ``transcription.json`` containing a JSON
    object. ``segment_comparison.json`` is optional; when missing, the target
    language code defaults to "en" and the translated text to "". When it is
    present but unreadable the folder is treated as invalid.

    Args:
        folder_name: Name of the folder inside ``VIDEOVOICE_DATA_DIR``; also
            used as the video id.
        lang_override: Target language name to use instead of the one looked
            up from the detected language code.

    Returns:
        The populated VideoData, or None (after printing the reason).
    """
    folder = VIDEOVOICE_DATA_DIR / folder_name
    if not folder.is_dir():
        console.print(f"[red]Folder not found:[/red] {folder}")
        return None

    output_mp4 = folder / "output.mp4"
    if not output_mp4.exists():
        console.print(f"[red]No output.mp4 in:[/red] {folder_name}")
        return None

    # Read transcription.json
    transcription_path = folder / "transcription.json"
    if not transcription_path.exists():
        console.print(f"[red]No transcription.json in:[/red] {folder_name}")
        return None

    try:
        transcription = _read_json(transcription_path)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        transcription = None
    if not isinstance(transcription, dict):
        console.print(f"[red]Invalid transcription.json in:[/red] {folder_name}")
        return None

    video_link = transcription.get("video_link")
    source_language = transcription.get("source_language", "en")
    # "or" guards keep a null "segments" / "text" value from breaking the join.
    original_text = " ".join(
        seg.get("text") or "" for seg in transcription.get("segments") or []
    )

    # Read segment_comparison.json
    seg_comp_path = folder / "segment_comparison.json"
    target_lang_code = "en"
    translated_text = ""

    if seg_comp_path.exists():
        try:
            segments = _read_json(seg_comp_path)
        except (OSError, ValueError):
            console.print(f"[red]Invalid segment_comparison.json in:[/red] {folder_name}")
            return None
        if segments and isinstance(segments, list):
            target_lang_code = segments[0].get("language_id") or "en"
            # Prefer the text actually sent to TTS, then the plain translation.
            translated_text = " ".join(
                seg.get("tts_text") or seg.get("translated_text") or ""
                for seg in segments
            )

    target_lang_name = lang_override or LANGUAGE_CODE_TO_NAME.get(
        target_lang_code, target_lang_code
    )

    return VideoData(
        video_id=folder_name,
        output_path=str(output_mp4),
        video_link=video_link,
        source_language=source_language,
        target_language_code=target_lang_code,
        target_language_name=target_lang_name,
        original_text=original_text,
        translated_text=translated_text,
        platform_type=_detect_platform(video_link),
    )
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def load_videos(
    folder_names: list[str], lang_override: str | None = None
) -> list[VideoData]:
    """Load each named video folder and return the ones that loaded.

    Every name is passed to ``load_video`` together with ``lang_override``.
    Names for which ``load_video`` returns a falsy value are left out of the
    result; a confirmation line is printed for each one that is kept.
    """
    loaded = []
    for folder_name in folder_names:
        item = load_video(folder_name, lang_override)
        if not item:
            # load_video gave nothing usable for this folder; skip it.
            continue
        loaded.append(item)
        console.print(
            f"[green]Loaded:[/green] {folder_name} "
            f"({item.source_language} -> {item.target_language_name})"
        )
    return loaded
|
social_distributor/pyproject.toml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "videovoice-poster"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Automated social media posting for VideoVoice dubbed videos"
|
| 5 |
+
requires-python = ">=3.10"
|
| 6 |
+
dependencies = [
|
| 7 |
+
"playwright>=1.40",
|
| 8 |
+
"openai>=1.0",
|
| 9 |
+
"boto3>=1.34",
|
| 10 |
+
"click>=8.0",
|
| 11 |
+
"rich>=13.0",
|
| 12 |
+
"python-dotenv>=1.0",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[tool.hatch.build.targets.wheel]
|
| 16 |
+
packages = ["poster"]
|
| 17 |
+
|
| 18 |
+
[build-system]
|
| 19 |
+
requires = ["hatchling"]
|
| 20 |
+
build-backend = "hatchling.build"
|
social_distributor/uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
steps/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Video Translation Pipeline — steps package
|
steps/lang/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Language-specific handlers for the translation pipeline.
|
| 2 |
+
|
| 3 |
+
Each language that needs special handling gets its own module (e.g. urdu.py).
|
| 4 |
+
This package provides a simple dispatcher so s3_translate.py stays language-agnostic.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _get_handler(target_language: str):
|
| 9 |
+
"""Lazy-import language handler module if it exists."""
|
| 10 |
+
lang = target_language.lower()
|
| 11 |
+
if lang == "urdu":
|
| 12 |
+
from . import urdu
|
| 13 |
+
return urdu
|
| 14 |
+
return None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_translation_prompt(target_language: str, default_prompt: str) -> str:
    """Pick the translation prompt to use for ``target_language``.

    Returns the handler module's ``get_translation_prompt()`` result when a
    handler exists and defines that hook; otherwise returns ``default_prompt``.
    """
    module = _get_handler(target_language)
    if not module or not hasattr(module, 'get_translation_prompt'):
        return default_prompt
    return module.get_translation_prompt()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_fallback_mode(target_language: str) -> str:
    """Name the fallback translator to use for ``target_language``.

    Returns the handler module's ``get_fallback_mode()`` result when a handler
    exists and defines that hook; otherwise returns ``"google"``.
    """
    module = _get_handler(target_language)
    if not module or not hasattr(module, 'get_fallback_mode'):
        return "google"
    return module.get_fallback_mode()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def post_translate(segments: list[dict], target_language: str) -> list[dict]:
    """Apply the language handler's post-translation step, if there is one.

    Returns the handler module's ``post_translate(segments)`` result when a
    handler exists and defines that hook; otherwise returns ``segments``
    unchanged.
    """
    module = _get_handler(target_language)
    if not module or not hasattr(module, 'post_translate'):
        return segments
    return module.post_translate(segments)
|
steps/lang/_shared.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared utilities for language-specific translation handlers."""
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
from datetime import datetime, timezone
|
| 6 |
+
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
POLLINATIONS_BASE = "https://gen.pollinations.ai/v1"
|
| 13 |
+
MODEL = os.getenv("POLLEN_MODEL", "openai-large")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def build_client() -> OpenAI:
    """Create an OpenAI-compatible client that talks to ``POLLINATIONS_BASE``.

    The API key is the first non-empty value among the environment variables
    ``POLLEN_API_KEY_SECONDARY``, ``POLLEN_API_KEY`` and
    ``POLLINATIONS_API_KEY`` (checked in that order); when none is set the
    literal ``"pollinations"`` is used.
    """
    key_vars = ("POLLEN_API_KEY_SECONDARY", "POLLEN_API_KEY", "POLLINATIONS_API_KEY")
    # map() is lazy, so later variables are only read when earlier ones are empty.
    api_key = next((value for value in map(os.getenv, key_vars) if value), "pollinations")
    return OpenAI(base_url=POLLINATIONS_BASE, api_key=api_key)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
_LLM_LOG_PATH = "tmp/llm_calls.json"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def log_llm_call(
    step: str,
    provider: str,
    model: str,
    system_prompt: str,
    user_prompt: str,
    response: str,
    temperature: float,
) -> None:
    """Append one LLM call record to the JSON log at ``_LLM_LOG_PATH``.

    The log file holds a single JSON array. It is read, the new record is
    appended, and the whole array is written back. A missing file, a file that
    is not valid JSON, or a file whose top-level value is not a list is
    treated as an empty log.

    Args:
        step: Pipeline step label stored with the record.
        provider: Name of the LLM provider.
        model: Model identifier that served the call.
        system_prompt: System prompt sent to the model.
        user_prompt: User prompt sent to the model.
        response: Text returned by the model.
        temperature: Sampling temperature used for the call.

    Raises:
        OSError: If the log directory or file cannot be written.
    """
    entry = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "step": step,
        "provider": provider,
        "model": model,
        "temperature": temperature,
        "system_prompt": system_prompt,
        "user_prompt": user_prompt,
        "response": response,
    }

    try:
        with open(_LLM_LOG_PATH, "r", encoding="utf-8") as f:
            calls = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        calls = []
    if not isinstance(calls, list):
        # Valid JSON but not an array (e.g. "{}" or "null"): .append() would
        # fail, so start a fresh log just as for a corrupt file.
        calls = []

    calls.append(entry)

    os.makedirs(os.path.dirname(_LLM_LOG_PATH) or ".", exist_ok=True)
    # Write to a sibling temp file and swap it in, so a failure while
    # serialising does not truncate the existing log.
    tmp_path = f"{_LLM_LOG_PATH}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(calls, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, _LLM_LOG_PATH)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _as_item_list(parsed):
    """Turn a decoded JSON value into an item list, or ``None`` for scalars."""
    if isinstance(parsed, dict):
        return list(parsed.values())
    if isinstance(parsed, list):
        # A non-empty nested list contributes its first element; everything
        # else is stringified.
        return [item[0] if isinstance(item, list) and len(item) > 0 else str(item) for item in parsed]
    return None


def parse_json_array(raw: str) -> list:
    """Parse a JSON array from LLM output, tolerating markdown fences etc.

    Three candidates are tried in order: the whole stripped text, the widest
    ``[...]`` span, and the widest ``{...}`` span. The first candidate that
    decodes to a list or dict wins. A dict is converted to the list of its
    values. A candidate that is not valid JSON, or that decodes to a scalar,
    is skipped so that the later candidates still get a chance.

    Args:
        raw: Raw text returned by the model.

    Returns:
        The parsed items as a list.

    Raises:
        ValueError: If no candidate decodes to a list or dict.
    """
    raw = raw.strip()

    candidates = [raw]
    for pattern in (r'\[.*\]', r'\{.*\}'):
        match = re.search(pattern, raw, re.DOTALL)
        if match:
            candidates.append(match.group())

    last_error = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as err:
            last_error = err
            continue
        items = _as_item_list(parsed)
        if items is not None:
            return items

    raise ValueError(f"Could not parse JSON array from LLM response:\n{raw[:200]}") from last_error
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def bedrock_converse(system_prompt: str, user_text: str, temperature: float = 0.1, step: str = "bedrock") -> str:
    """Send one system + user prompt pair to Bedrock and return the reply text.

    The region comes from ``AWS_REGION`` (default ``us-east-1``) and the model
    from ``BEDROCK_MODEL`` (default ``qwen.qwen3-next-80b-a3b``). The text of
    the first content block of the reply is stripped, recorded through
    ``log_llm_call`` under ``step``, and returned.
    """
    import boto3

    model_id = os.getenv("BEDROCK_MODEL", "qwen.qwen3-next-80b-a3b")
    runtime = boto3.client(
        "bedrock-runtime",
        region_name=os.getenv("AWS_REGION", "us-east-1"),
    )

    request = {
        "modelId": model_id,
        "messages": [{"role": "user", "content": [{"text": user_text}]}],
        "system": [{"text": system_prompt}],
        "inferenceConfig": {"temperature": temperature},
    }
    reply = runtime.converse(**request)
    text = reply["output"]["message"]["content"][0]["text"].strip()

    log_llm_call(
        step=step,
        provider="bedrock",
        model=model_id,
        system_prompt=system_prompt,
        user_prompt=user_text,
        response=text,
        temperature=temperature,
    )

    return text
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def bedrock_fallback(segments: list[dict], numbered: str, system_prompt: str, max_retries: int = 2) -> list[dict]:
    """Fallback translator using AWS Bedrock.

    The system prompt is extended with an instruction to return exactly one
    item per segment. The call is repeated when the reply cannot be parsed or
    has the wrong number of items.

    Args:
        segments: Source segments; each result is a copy with
            ``translated_text`` added.
        numbered: The user prompt text sent to the model.
        system_prompt: Base system prompt for the translation.
        max_retries: Total number of attempts. Values below 1 are treated as
            a single attempt.

    Returns:
        One dict per input segment, in order, with ``translated_text`` set.

    Raises:
        ValueError: If the last attempt is unparseable or still has the wrong
            number of items.
    """
    expected = len(segments)
    strict_prompt = (
        system_prompt
        + f"\n\nCRITICAL: You MUST return exactly {expected} items in the JSON array "
        "— one per input line. Do NOT merge, skip, or split any lines."
    )

    print(f"[lang] Bedrock fallback: translating {expected} segments")

    # Always make at least one call; otherwise translated_list would be unbound.
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        raw = bedrock_converse(strict_prompt, numbered, step="s3_translate_bedrock")
        try:
            translated_list = parse_json_array(raw)
        except ValueError:
            # Also covers json.JSONDecodeError, which subclasses ValueError.
            print(f"[lang] Bedrock returned unparseable output (attempt {attempt}/{attempts})")
            if attempt == attempts:
                raise
            continue

        if len(translated_list) == expected:
            break

        print(f"[lang] Bedrock returned {len(translated_list)}/{expected} items (attempt {attempt}/{attempts})")
        if attempt == attempts:
            raise ValueError(
                f"Bedrock translation returned {len(translated_list)} items but expected {expected} after {attempts} attempts"
            )

    # Strip a leading "1." / "1)" / "1-" the model may have echoed back.
    # str() guards against non-string items, on which re.sub would raise.
    cleaned = [re.sub(r'^\d+[\.\)\-]\s*', '', str(t)) for t in translated_list]
    result = [{**seg, "translated_text": t} for seg, t in zip(segments, cleaned)]
    print("[lang] Bedrock fallback translation complete ✓")
    return result
|
steps/lang/omnivoice_languages.py
ADDED
|
@@ -0,0 +1,652 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AUTO-GENERATED from k2-fsa/OmniVoice omnivoice/utils/lang_map.py
|
| 2 |
+
# Source: https://github.com/k2-fsa/OmniVoice/blob/master/omnivoice/utils/lang_map.py
|
| 3 |
+
"""Omnivoice-supported languages (display name -> Omnivoice language id)."""
|
| 4 |
+
|
| 5 |
+
OMNIVOICE_LANGUAGE_CODES: dict[str, str] = {
|
| 6 |
+
"Abadi": "kbt",
|
| 7 |
+
"Abkhazian": "ab",
|
| 8 |
+
"Abron": "abr",
|
| 9 |
+
"Abua": "abn",
|
| 10 |
+
"Adamawa Fulfulde": "fub",
|
| 11 |
+
"Adyghe": "ady",
|
| 12 |
+
"Afade": "aal",
|
| 13 |
+
"Afrikaans": "af",
|
| 14 |
+
"Agwagwune": "yay",
|
| 15 |
+
"Aja (Benin)": "ajg",
|
| 16 |
+
"Akebu": "keu",
|
| 17 |
+
"Alago": "ala",
|
| 18 |
+
"Albanian": "sq",
|
| 19 |
+
"Algerian Arabic": "arq",
|
| 20 |
+
"Algerian Saharan Arabic": "aao",
|
| 21 |
+
"Ambo-Pasco Quechua": "qva",
|
| 22 |
+
"Ambonese Malay": "abs",
|
| 23 |
+
"Amdo Tibetan": "adx",
|
| 24 |
+
"Amharic": "am",
|
| 25 |
+
"Anaang": "anw",
|
| 26 |
+
"Angika": "anp",
|
| 27 |
+
"Antankarana Malagasy": "xmv",
|
| 28 |
+
"Aragonese": "an",
|
| 29 |
+
"Arbëreshë Albanian": "aae",
|
| 30 |
+
"Arequipa-La Unión Quechua": "qxu",
|
| 31 |
+
"Armenian": "hy",
|
| 32 |
+
"Ashe": "ahs",
|
| 33 |
+
"Ashéninka Perené": "prq",
|
| 34 |
+
"Askopan": "eiv",
|
| 35 |
+
"Assamese": "as",
|
| 36 |
+
"Asturian": "ast",
|
| 37 |
+
"Atayal": "tay",
|
| 38 |
+
"Awak": "awo",
|
| 39 |
+
"Ayacucho Quechua": "quy",
|
| 40 |
+
"Azerbaijani": "az",
|
| 41 |
+
"Baatonum": "bba",
|
| 42 |
+
"Bacama": "bcy",
|
| 43 |
+
"Bade": "bde",
|
| 44 |
+
"Bafia": "ksf",
|
| 45 |
+
"Bafut": "bfd",
|
| 46 |
+
"Bagirmi Fulfulde": "fui",
|
| 47 |
+
"Bago-Kusuntu": "bqg",
|
| 48 |
+
"Baharna Arabic": "abv",
|
| 49 |
+
"Bakoko": "bkh",
|
| 50 |
+
"Balanta-Ganja": "bjt",
|
| 51 |
+
"Balti": "bft",
|
| 52 |
+
"Bamenyam": "bce",
|
| 53 |
+
"Bamun": "bax",
|
| 54 |
+
"Bangwinji": "bsj",
|
| 55 |
+
"Banjar": "bjn",
|
| 56 |
+
"Bankon": "abb",
|
| 57 |
+
"Baoulé": "bci",
|
| 58 |
+
"Bara Malagasy": "bhr",
|
| 59 |
+
"Barok": "bjk",
|
| 60 |
+
"Basa (Cameroon)": "bas",
|
| 61 |
+
"Basa (Nigeria)": "bzw",
|
| 62 |
+
"Bashkir": "ba",
|
| 63 |
+
"Basque": "eu",
|
| 64 |
+
"Batak Mandailing": "btm",
|
| 65 |
+
"Batanga": "bnm",
|
| 66 |
+
"Bateri": "btv",
|
| 67 |
+
"Bats": "bbl",
|
| 68 |
+
"Bayot": "bda",
|
| 69 |
+
"Bebele": "beb",
|
| 70 |
+
"Belarusian": "be",
|
| 71 |
+
"Bengali": "bn",
|
| 72 |
+
"Betawi": "bew",
|
| 73 |
+
"Bhili": "bhb",
|
| 74 |
+
"Bhojpuri": "bho",
|
| 75 |
+
"Bilur": "bxf",
|
| 76 |
+
"Bima": "bhp",
|
| 77 |
+
"Bodo": "brx",
|
| 78 |
+
"Boghom": "bux",
|
| 79 |
+
"Bokyi": "bky",
|
| 80 |
+
"Bomu": "bmq",
|
| 81 |
+
"Bondei": "bou",
|
| 82 |
+
"Borgu Fulfulde": "fue",
|
| 83 |
+
"Bosnian": "bs",
|
| 84 |
+
"Brahui": "brh",
|
| 85 |
+
"Braj": "bra",
|
| 86 |
+
"Breton": "br",
|
| 87 |
+
"Buduma": "bdm",
|
| 88 |
+
"Buginese": "bug",
|
| 89 |
+
"Bukharic": "bhh",
|
| 90 |
+
"Bulgarian": "bg",
|
| 91 |
+
"Bulu (Cameroon)": "bum",
|
| 92 |
+
"Bundeli": "bns",
|
| 93 |
+
"Bunun": "bnn",
|
| 94 |
+
"Bura-Pabir": "bwr",
|
| 95 |
+
"Burak": "bys",
|
| 96 |
+
"Burmese": "my",
|
| 97 |
+
"Burushaski": "bsk",
|
| 98 |
+
"Cacaloxtepec Mixtec": "miu",
|
| 99 |
+
"Cajatambo North Lima Quechua": "qvl",
|
| 100 |
+
"Cakfem-Mushere": "cky",
|
| 101 |
+
"Cameroon Pidgin": "wes",
|
| 102 |
+
"Campidanese Sardinian": "sro",
|
| 103 |
+
"Cantonese": "yue",
|
| 104 |
+
"Catalan": "ca",
|
| 105 |
+
"Cebuano": "ceb",
|
| 106 |
+
"Cen": "cen",
|
| 107 |
+
"Central Kurdish": "ckb",
|
| 108 |
+
"Central Nahuatl": "nhn",
|
| 109 |
+
"Central Pame": "pbs",
|
| 110 |
+
"Central Pashto": "pst",
|
| 111 |
+
"Central Puebla Nahuatl": "ncx",
|
| 112 |
+
"Central Tarahumara": "tar",
|
| 113 |
+
"Central Yupik": "esu",
|
| 114 |
+
"Central-Eastern Niger Fulfulde": "fuq",
|
| 115 |
+
"Chadian Arabic": "shu",
|
| 116 |
+
"Chichewa": "ny",
|
| 117 |
+
"Chichicapan Zapotec": "zpv",
|
| 118 |
+
"Chiga": "cgg",
|
| 119 |
+
"Chimalapa Zoque": "zoh",
|
| 120 |
+
"Chimborazo Highland Quichua": "qug",
|
| 121 |
+
"Chinese": "zh",
|
| 122 |
+
"Chiquián Ancash Quechua": "qxa",
|
| 123 |
+
"Chitwania Tharu": "the",
|
| 124 |
+
"Chokwe": "cjk",
|
| 125 |
+
"Chuvash": "cv",
|
| 126 |
+
"Cibak": "ckl",
|
| 127 |
+
"Coastal Konjo": "kjc",
|
| 128 |
+
"Copainalá Zoque": "zoc",
|
| 129 |
+
"Cornish": "kw",
|
| 130 |
+
"Corongo Ancash Quechua": "qwa",
|
| 131 |
+
"Croatian": "hr",
|
| 132 |
+
"Cross River Mbembe": "mfn",
|
| 133 |
+
"Cuyamecalco Mixtec": "xtu",
|
| 134 |
+
"Czech": "cs",
|
| 135 |
+
"Dadiya": "dbd",
|
| 136 |
+
"Dagbani": "dag",
|
| 137 |
+
"Dameli": "dml",
|
| 138 |
+
"Danish": "da",
|
| 139 |
+
"Dargwa": "dar",
|
| 140 |
+
"Dazaga": "dzg",
|
| 141 |
+
"Deccan": "dcc",
|
| 142 |
+
"Degema": "deg",
|
| 143 |
+
"Dera (Nigeria)": "kna",
|
| 144 |
+
"Dghwede": "dgh",
|
| 145 |
+
"Dhatki": "mki",
|
| 146 |
+
"Dhivehi": "dv",
|
| 147 |
+
"Dhofari Arabic": "adf",
|
| 148 |
+
"Dijim-Bwilim": "cfa",
|
| 149 |
+
"Dogri": "dgo",
|
| 150 |
+
"Domaaki": "dmk",
|
| 151 |
+
"Dotyali": "dty",
|
| 152 |
+
"Duala": "dua",
|
| 153 |
+
"Dutch": "nl",
|
| 154 |
+
"DũYa": "ldb",
|
| 155 |
+
"Dyula": "dyu",
|
| 156 |
+
"Eastern Balochi": "bgp",
|
| 157 |
+
"Eastern Bolivian Guaraní": "gui",
|
| 158 |
+
"Eastern Egyptian Bedawi Arabic": "avl",
|
| 159 |
+
"Eastern Krahn": "kqo",
|
| 160 |
+
"Eastern Mari": "mhr",
|
| 161 |
+
"Eastern Yiddish": "ydd",
|
| 162 |
+
"Ebrié": "ebr",
|
| 163 |
+
"Eggon": "ego",
|
| 164 |
+
"Egyptian Arabic": "arz",
|
| 165 |
+
"Ejagham": "etu",
|
| 166 |
+
"Eleme": "elm",
|
| 167 |
+
"Eloyi": "afo",
|
| 168 |
+
"Embu": "ebu",
|
| 169 |
+
"English": "en",
|
| 170 |
+
"Erzya": "myv",
|
| 171 |
+
"Esan": "ish",
|
| 172 |
+
"Esperanto": "eo",
|
| 173 |
+
"Estonian": "et",
|
| 174 |
+
"Eton (Cameroon)": "eto",
|
| 175 |
+
"Ewondo": "ewo",
|
| 176 |
+
"Extremaduran": "ext",
|
| 177 |
+
"Fang (Equatorial Guinea)": "fan",
|
| 178 |
+
"Fanti": "fat",
|
| 179 |
+
"Farefare": "gur",
|
| 180 |
+
"Fe'fe'": "fmp",
|
| 181 |
+
"Filipino": "fil",
|
| 182 |
+
"Filomena Mata-Coahuitlán Totonac": "tlp",
|
| 183 |
+
"Finnish": "fi",
|
| 184 |
+
"Fipa": "fip",
|
| 185 |
+
"French": "fr",
|
| 186 |
+
"Fulah": "ff",
|
| 187 |
+
"Galician": "gl",
|
| 188 |
+
"Gambian Wolof": "wof",
|
| 189 |
+
"Ganda": "lg",
|
| 190 |
+
"Garhwali": "gbm",
|
| 191 |
+
"Gawar-Bati": "gwt",
|
| 192 |
+
"Gawri": "gwc",
|
| 193 |
+
"Gbagyi": "gbr",
|
| 194 |
+
"Gbari": "gby",
|
| 195 |
+
"Geji": "gyz",
|
| 196 |
+
"Gen": "gej",
|
| 197 |
+
"Georgian": "ka",
|
| 198 |
+
"German": "de",
|
| 199 |
+
"Geser-Gorom": "ges",
|
| 200 |
+
"Gheg Albanian": "aln",
|
| 201 |
+
"Ghomálá'": "bbj",
|
| 202 |
+
"Gidar": "gid",
|
| 203 |
+
"Glavda": "glw",
|
| 204 |
+
"Goan Konkani": "gom",
|
| 205 |
+
"Goaria": "gig",
|
| 206 |
+
"Goemai": "ank",
|
| 207 |
+
"Gola": "gol",
|
| 208 |
+
"Greek": "el",
|
| 209 |
+
"Guarani": "gn",
|
| 210 |
+
"Guduf-Gava": "gdf",
|
| 211 |
+
"Guerrero Amuzgo": "amu",
|
| 212 |
+
"Gujarati": "gu",
|
| 213 |
+
"Gujari": "gju",
|
| 214 |
+
"Gulf Arabic": "afb",
|
| 215 |
+
"Gurgula": "ggg",
|
| 216 |
+
"Gusii": "guz",
|
| 217 |
+
"Gusilay": "gsl",
|
| 218 |
+
"Gweno": "gwe",
|
| 219 |
+
"Güilá Zapotec": "ztu",
|
| 220 |
+
"Hadothi": "hoj",
|
| 221 |
+
"Hahon": "hah",
|
| 222 |
+
"Haitian": "ht",
|
| 223 |
+
"Hakha Chin": "cnh",
|
| 224 |
+
"Hakö": "hao",
|
| 225 |
+
"Halia": "hla",
|
| 226 |
+
"Hausa": "ha",
|
| 227 |
+
"Hawaiian": "haw",
|
| 228 |
+
"Hazaragi": "haz",
|
| 229 |
+
"Hebrew": "he",
|
| 230 |
+
"Hemba": "hem",
|
| 231 |
+
"Herero": "hz",
|
| 232 |
+
"Highland Konjo": "kjk",
|
| 233 |
+
"Hijazi Arabic": "acw",
|
| 234 |
+
"Hindi": "hi",
|
| 235 |
+
"Huarijio": "var",
|
| 236 |
+
"Huautla Mazatec": "mau",
|
| 237 |
+
"Huaxcaleca Nahuatl": "nhq",
|
| 238 |
+
"Huba": "hbb",
|
| 239 |
+
"Huitepec Mixtec": "mxs",
|
| 240 |
+
"Hula": "hul",
|
| 241 |
+
"Hungarian": "hu",
|
| 242 |
+
"Hunjara-Kaina Ke": "hkk",
|
| 243 |
+
"Hwana": "hwo",
|
| 244 |
+
"Ibibio": "ibb",
|
| 245 |
+
"Icelandic": "is",
|
| 246 |
+
"Idakho-Isukha-Tiriki": "ida",
|
| 247 |
+
"Idoma": "idu",
|
| 248 |
+
"Igbo": "ig",
|
| 249 |
+
"Igo": "ahl",
|
| 250 |
+
"Ikposo": "kpo",
|
| 251 |
+
"Ikwere": "ikw",
|
| 252 |
+
"Imbabura Highland Quichua": "qvi",
|
| 253 |
+
"Indonesian": "id",
|
| 254 |
+
"Indus Kohistani": "mvy",
|
| 255 |
+
"Interlingua (International Auxiliary Language Association)": "ia",
|
| 256 |
+
"Inupiaq": "ik",
|
| 257 |
+
"Irish": "ga",
|
| 258 |
+
"Iron Ossetic": "os",
|
| 259 |
+
"Isekiri": "its",
|
| 260 |
+
"Isoko": "iso",
|
| 261 |
+
"Italian": "it",
|
| 262 |
+
"Ito": "itw",
|
| 263 |
+
"Itzá": "itz",
|
| 264 |
+
"Ixtayutla Mixtec": "vmj",
|
| 265 |
+
"Izon": "ijc",
|
| 266 |
+
"Jambi Malay": "jax",
|
| 267 |
+
"Japanese": "ja",
|
| 268 |
+
"Jaqaru": "jqr",
|
| 269 |
+
"Jauja Wanca Quechua": "qxw",
|
| 270 |
+
"Jaunsari": "jns",
|
| 271 |
+
"Javanese": "jv",
|
| 272 |
+
"Jiba": "juo",
|
| 273 |
+
"Jju": "kaj",
|
| 274 |
+
"Judeo-Moroccan Arabic": "aju",
|
| 275 |
+
"Juxtlahuaca Mixtec": "vmc",
|
| 276 |
+
"Kabardian": "kbd",
|
| 277 |
+
"Kabras": "lkb",
|
| 278 |
+
"Kabuverdianu": "kea",
|
| 279 |
+
"Kabyle": "kab",
|
| 280 |
+
"Kachi Koli": "gjk",
|
| 281 |
+
"Kairak": "ckr",
|
| 282 |
+
"Kalabari": "ijn",
|
| 283 |
+
"Kalasha": "kls",
|
| 284 |
+
"Kalenjin": "kln",
|
| 285 |
+
"Kalkoti": "xka",
|
| 286 |
+
"Kamba": "kam",
|
| 287 |
+
"Kamo": "kcq",
|
| 288 |
+
"Kanauji": "bjj",
|
| 289 |
+
"Kanembu": "kbl",
|
| 290 |
+
"Kannada": "kn",
|
| 291 |
+
"Karekare": "kai",
|
| 292 |
+
"Kashmiri": "ks",
|
| 293 |
+
"Kathoriya Tharu": "tkt",
|
| 294 |
+
"Kati": "bsh",
|
| 295 |
+
"Kazakh": "kk",
|
| 296 |
+
"Keiyo": "eyo",
|
| 297 |
+
"Khams Tibetan": "khg",
|
| 298 |
+
"Khana": "ogo",
|
| 299 |
+
"Khetrani": "xhe",
|
| 300 |
+
"Khmer": "km",
|
| 301 |
+
"Khowar": "khw",
|
| 302 |
+
"Kinga": "zga",
|
| 303 |
+
"Kinnauri": "kfk",
|
| 304 |
+
"Kinyarwanda": "rw",
|
| 305 |
+
"Kirghiz": "ky",
|
| 306 |
+
"Kirya-Konzəl": "fkk",
|
| 307 |
+
"Kochila Tharu": "thq",
|
| 308 |
+
"Kohistani Shina": "plk",
|
| 309 |
+
"Kohumono": "bcs",
|
| 310 |
+
"Kok Borok": "trp",
|
| 311 |
+
"Kol (Papua New Guinea)": "kol",
|
| 312 |
+
"Kom (Cameroon)": "bkm",
|
| 313 |
+
"Koma": "kmy",
|
| 314 |
+
"Konkani": "knn",
|
| 315 |
+
"Konzo": "koo",
|
| 316 |
+
"Korean": "ko",
|
| 317 |
+
"Korwa": "kfp",
|
| 318 |
+
"Kota (India)": "kfe",
|
| 319 |
+
"Koti": "eko",
|
| 320 |
+
"Kuanua": "ksd",
|
| 321 |
+
"Kuanyama": "kj",
|
| 322 |
+
"Kui (India)": "uki",
|
| 323 |
+
"Kulung (Nigeria)": "bbu",
|
| 324 |
+
"Kuot": "kto",
|
| 325 |
+
"Kushi": "kuh",
|
| 326 |
+
"Kwambi": "kwm",
|
| 327 |
+
"Kwasio": "nmg",
|
| 328 |
+
"Lala-Roba": "lla",
|
| 329 |
+
"Lamang": "hia",
|
| 330 |
+
"Lao": "lo",
|
| 331 |
+
"Larike-Wakasihu": "alo",
|
| 332 |
+
"Lasi": "lss",
|
| 333 |
+
"Latgalian": "ltg",
|
| 334 |
+
"Latvian": "lv",
|
| 335 |
+
"Levantine Arabic": "apc",
|
| 336 |
+
"Liana-Seti": "ste",
|
| 337 |
+
"Liberia Kpelle": "xpe",
|
| 338 |
+
"Liberian English": "lir",
|
| 339 |
+
"Libyan Arabic": "ayl",
|
| 340 |
+
"Ligurian": "lij",
|
| 341 |
+
"Lijili": "mgi",
|
| 342 |
+
"Lingala": "ln",
|
| 343 |
+
"Lithuanian": "lt",
|
| 344 |
+
"Loarki": "lrk",
|
| 345 |
+
"Logooli": "rag",
|
| 346 |
+
"Logudorese Sardinian": "src",
|
| 347 |
+
"Loja Highland Quichua": "qvj",
|
| 348 |
+
"Loloda": "loa",
|
| 349 |
+
"Longuda": "lnu",
|
| 350 |
+
"Loxicha Zapotec": "ztp",
|
| 351 |
+
"Luba-Lulua": "lua",
|
| 352 |
+
"Luo": "luo",
|
| 353 |
+
"Lushai": "lus",
|
| 354 |
+
"Luxembourgish": "lb",
|
| 355 |
+
"Maasina Fulfulde": "ffm",
|
| 356 |
+
"Maba (Chad)": "mde",
|
| 357 |
+
"Macedo-Romanian": "rup",
|
| 358 |
+
"Macedonian": "mk",
|
| 359 |
+
"Mada (Cameroon)": "mxu",
|
| 360 |
+
"Mafa": "maf",
|
| 361 |
+
"Maithili": "mai",
|
| 362 |
+
"Malay": "ms",
|
| 363 |
+
"Malayalam": "ml",
|
| 364 |
+
"Mali": "gcc",
|
| 365 |
+
"Malinaltepec Me'phaa": "tcf",
|
| 366 |
+
"Maltese": "mt",
|
| 367 |
+
"Mandara": "tbf",
|
| 368 |
+
"Mandjak": "mfv",
|
| 369 |
+
"Manggarai": "mqy",
|
| 370 |
+
"Manipuri": "mni",
|
| 371 |
+
"Mansoanka": "msw",
|
| 372 |
+
"Manx": "gv",
|
| 373 |
+
"Maori": "mi",
|
| 374 |
+
"Marathi": "mr",
|
| 375 |
+
"Marghi Central": "mrt",
|
| 376 |
+
"Marghi South": "mfm",
|
| 377 |
+
"Maria (India)": "mrr",
|
| 378 |
+
"Marwari (Pakistan)": "mve",
|
| 379 |
+
"Masana": "mcn",
|
| 380 |
+
"Masikoro Malagasy": "msh",
|
| 381 |
+
"Matsés": "mcf",
|
| 382 |
+
"Mazaltepec Zapotec": "zpy",
|
| 383 |
+
"Mazatlán Mazatec": "vmz",
|
| 384 |
+
"Mazatlán Mixe": "mzl",
|
| 385 |
+
"Mbe": "mfo",
|
| 386 |
+
"Mbo (Cameroon)": "mbo",
|
| 387 |
+
"Mbum": "mdd",
|
| 388 |
+
"Medumba": "byv",
|
| 389 |
+
"Mekeo": "mek",
|
| 390 |
+
"Meru": "mer",
|
| 391 |
+
"Mesopotamian Arabic": "acm",
|
| 392 |
+
"Mewari": "mtr",
|
| 393 |
+
"Min Nan Chinese": "nan",
|
| 394 |
+
"Mingrelian": "xmf",
|
| 395 |
+
"Mitlatongo Mixtec": "vmm",
|
| 396 |
+
"Miya": "mkf",
|
| 397 |
+
"Mokpwe": "bri",
|
| 398 |
+
"Moksha": "mdf",
|
| 399 |
+
"Mom Jango": "ver",
|
| 400 |
+
"Mongolian": "mn",
|
| 401 |
+
"Moroccan Arabic": "ary",
|
| 402 |
+
"Motu": "meu",
|
| 403 |
+
"Mpiemo": "mcx",
|
| 404 |
+
"Mpumpong": "mgg",
|
| 405 |
+
"Mundang": "mua",
|
| 406 |
+
"Mungaka": "mhk",
|
| 407 |
+
"Musey": "mse",
|
| 408 |
+
"Musgu": "mug",
|
| 409 |
+
"Musi": "mui",
|
| 410 |
+
"Naba": "mne",
|
| 411 |
+
"Najdi Arabic": "ars",
|
| 412 |
+
"Nalik": "nal",
|
| 413 |
+
"Nawdm": "nmz",
|
| 414 |
+
"Ndonga": "ng",
|
| 415 |
+
"Neapolitan": "nap",
|
| 416 |
+
"Nepali": "npi",
|
| 417 |
+
"Ngamo": "nbh",
|
| 418 |
+
"Ngas": "anc",
|
| 419 |
+
"Ngiemboon": "nnh",
|
| 420 |
+
"Ngizim": "ngi",
|
| 421 |
+
"Ngomba": "jgo",
|
| 422 |
+
"Ngombale": "nla",
|
| 423 |
+
"Nigerian Fulfulde": "fuv",
|
| 424 |
+
"Nigerian Pidgin": "pcm",
|
| 425 |
+
"Nimadi": "noe",
|
| 426 |
+
"Nobiin": "fia",
|
| 427 |
+
"North Mesopotamian Arabic": "ayp",
|
| 428 |
+
"North Moluccan Malay": "max",
|
| 429 |
+
"Northern Betsimisaraka Malagasy": "bmm",
|
| 430 |
+
"Northern Hindko": "hno",
|
| 431 |
+
"Northern Kurdish": "kmr",
|
| 432 |
+
"Northern Pame": "pmq",
|
| 433 |
+
"Northern Pashto": "pbu",
|
| 434 |
+
"Northern Uzbek": "uzn",
|
| 435 |
+
"Northwest Gbaya": "gya",
|
| 436 |
+
"Norwegian": "no",
|
| 437 |
+
"Norwegian Bokmål": "nb",
|
| 438 |
+
"Norwegian Nynorsk": "nn",
|
| 439 |
+
"Notsi": "ncf",
|
| 440 |
+
"Nyankpa": "yes",
|
| 441 |
+
"Nyungwe": "nyu",
|
| 442 |
+
"Nzanyi": "nja",
|
| 443 |
+
"Nüpode Huitoto": "hux",
|
| 444 |
+
"Occitan": "oc",
|
| 445 |
+
"Od": "odk",
|
| 446 |
+
"Odia": "ory",
|
| 447 |
+
"Odual": "odu",
|
| 448 |
+
"Omani Arabic": "acx",
|
| 449 |
+
"Orizaba Nahuatl": "nlv",
|
| 450 |
+
"Orma": "orc",
|
| 451 |
+
"Ormuri": "oru",
|
| 452 |
+
"Oromo": "om",
|
| 453 |
+
"Pahari-Potwari": "phr",
|
| 454 |
+
"Paiwan": "pwn",
|
| 455 |
+
"Panjabi": "pa",
|
| 456 |
+
"Papuan Malay": "pmy",
|
| 457 |
+
"Parkari Koli": "kvx",
|
| 458 |
+
"Pedi": "nso",
|
| 459 |
+
"Pero": "pip",
|
| 460 |
+
"Persian": "fa",
|
| 461 |
+
"Petats": "pex",
|
| 462 |
+
"Phalura": "phl",
|
| 463 |
+
"Piemontese": "pms",
|
| 464 |
+
"Piya-Kwonci": "piy",
|
| 465 |
+
"Plateau Malagasy": "plt",
|
| 466 |
+
"Polish": "pl",
|
| 467 |
+
"Poqomam": "poc",
|
| 468 |
+
"Portuguese": "pt",
|
| 469 |
+
"Pulaar": "fuc",
|
| 470 |
+
"Pular": "fuf",
|
| 471 |
+
"Puno Quechua": "qxp",
|
| 472 |
+
"Pushto": "ps",
|
| 473 |
+
"Pökoot": "pko",
|
| 474 |
+
"Qaqet": "byx",
|
| 475 |
+
"Quiotepec Chinantec": "chq",
|
| 476 |
+
"Rana Tharu": "thr",
|
| 477 |
+
"Rangi": "lag",
|
| 478 |
+
"Rapoisi": "kyx",
|
| 479 |
+
"Ratahan": "rth",
|
| 480 |
+
"Rayón Zoque": "zor",
|
| 481 |
+
"Romanian": "ro",
|
| 482 |
+
"Romansh": "rm",
|
| 483 |
+
"Rombo": "rof",
|
| 484 |
+
"Rotokas": "roo",
|
| 485 |
+
"Rukai": "dru",
|
| 486 |
+
"Russian": "ru",
|
| 487 |
+
"Sacapulteco": "quv",
|
| 488 |
+
"Saidi Arabic": "aec",
|
| 489 |
+
"Sakalava Malagasy": "skg",
|
| 490 |
+
"Sakizaya": "szy",
|
| 491 |
+
"Saleman": "sau",
|
| 492 |
+
"Samba Daka": "ccg",
|
| 493 |
+
"Samba Leko": "ndi",
|
| 494 |
+
"San Felipe Otlaltepec Popoloca": "pow",
|
| 495 |
+
"San Francisco Del Mar Huave": "hue",
|
| 496 |
+
"San Juan Atzingo Popoloca": "poe",
|
| 497 |
+
"San Martín Itunyoso Triqui": "trq",
|
| 498 |
+
"San Miguel El Grande Mixtec": "mig",
|
| 499 |
+
"Sansi": "ssi",
|
| 500 |
+
"Sanskrit": "sa",
|
| 501 |
+
"Santa Ana de Tusi Pasco Quechua": "qxt",
|
| 502 |
+
"Santa Catarina Albarradas Zapotec": "ztn",
|
| 503 |
+
"Santali": "sat",
|
| 504 |
+
"Santiago del Estero Quichua": "qus",
|
| 505 |
+
"Saposa": "sps",
|
| 506 |
+
"Saraiki": "skr",
|
| 507 |
+
"Sardinian": "sc",
|
| 508 |
+
"Saya": "say",
|
| 509 |
+
"Sediq": "trv",
|
| 510 |
+
"Serbian": "sr",
|
| 511 |
+
"Seri": "sei",
|
| 512 |
+
"Shina": "scl",
|
| 513 |
+
"Shona": "sn",
|
| 514 |
+
"Siar-Lak": "sjr",
|
| 515 |
+
"Sibe": "nco",
|
| 516 |
+
"Sicilian": "scn",
|
| 517 |
+
"Sihuas Ancash Quechua": "qws",
|
| 518 |
+
"Sikkimese": "sip",
|
| 519 |
+
"Sinaugoro": "snc",
|
| 520 |
+
"Sindhi": "sd",
|
| 521 |
+
"Sindhi Bhil": "sbn",
|
| 522 |
+
"Sinhala": "si",
|
| 523 |
+
"Sinicahua Mixtec": "xti",
|
| 524 |
+
"Sipacapense": "qum",
|
| 525 |
+
"Siwai": "siw",
|
| 526 |
+
"Slovak": "sk",
|
| 527 |
+
"Slovenian": "sl",
|
| 528 |
+
"Solos": "sol",
|
| 529 |
+
"Somali": "so",
|
| 530 |
+
"Soninke": "snk",
|
| 531 |
+
"South Giziga": "giz",
|
| 532 |
+
"South Ucayali Ashéninka": "cpy",
|
| 533 |
+
"Southeastern Nochixtlán Mixtec": "mxy",
|
| 534 |
+
"Southern Betsimisaraka Malagasy": "bzc",
|
| 535 |
+
"Southern Pashto": "pbt",
|
| 536 |
+
"Southern Pastaza Quechua": "qup",
|
| 537 |
+
"Soyaltepec Mazatec": "vmp",
|
| 538 |
+
"Spanish": "es",
|
| 539 |
+
"Standard Arabic": "arb",
|
| 540 |
+
"Standard Moroccan Tamazight": "zgh",
|
| 541 |
+
"Sudanese Arabic": "apd",
|
| 542 |
+
"Sulka": "sua",
|
| 543 |
+
"Svan": "sva",
|
| 544 |
+
"Swahili": "sw",
|
| 545 |
+
"Swedish": "sv",
|
| 546 |
+
"Tae'": "rob",
|
| 547 |
+
"Tahaggart Tamahaq": "thv",
|
| 548 |
+
"Taita": "dav",
|
| 549 |
+
"Tajik": "tg",
|
| 550 |
+
"Tamil": "ta",
|
| 551 |
+
"Tandroy-Mahafaly Malagasy": "tdx",
|
| 552 |
+
"Tangale": "tan",
|
| 553 |
+
"Tanosy Malagasy": "txy",
|
| 554 |
+
"Tarok": "yer",
|
| 555 |
+
"Tatar": "tt",
|
| 556 |
+
"Tedaga": "tuq",
|
| 557 |
+
"Telugu": "te",
|
| 558 |
+
"Tem": "kdh",
|
| 559 |
+
"Teop": "tio",
|
| 560 |
+
"Tepeuxila Cuicatec": "cux",
|
| 561 |
+
"Tepinapa Chinantec": "cte",
|
| 562 |
+
"Tera": "ttr",
|
| 563 |
+
"Terei": "buo",
|
| 564 |
+
"Termanu": "twu",
|
| 565 |
+
"Tesaka Malagasy": "tkg",
|
| 566 |
+
"Tetelcingo Nahuatl": "nhg",
|
| 567 |
+
"Teutila Cuicatec": "cut",
|
| 568 |
+
"Thai": "th",
|
| 569 |
+
"Tibetan": "bo",
|
| 570 |
+
"Tidaá Mixtec": "mtx",
|
| 571 |
+
"Tidore": "tvo",
|
| 572 |
+
"Tigak": "tgc",
|
| 573 |
+
"Tigre": "tig",
|
| 574 |
+
"Tigrinya": "ti",
|
| 575 |
+
"Tilquiapan Zapotec": "zts",
|
| 576 |
+
"Tinputz": "tpz",
|
| 577 |
+
"Tlacoapa Me'phaa": "tpl",
|
| 578 |
+
"Tlacoatzintepec Chinantec": "ctl",
|
| 579 |
+
"Tlingit": "tli",
|
| 580 |
+
"Toki Pona": "tok",
|
| 581 |
+
"Tomoip": "tqp",
|
| 582 |
+
"Tondano": "tdn",
|
| 583 |
+
"Tonsea": "txs",
|
| 584 |
+
"Tooro": "ttj",
|
| 585 |
+
"Torau": "ttu",
|
| 586 |
+
"Torwali": "trw",
|
| 587 |
+
"Tsimihety Malagasy": "xmw",
|
| 588 |
+
"Tsotso": "lto",
|
| 589 |
+
"Tswana": "tn",
|
| 590 |
+
"Tugen": "tuy",
|
| 591 |
+
"Tuki": "bag",
|
| 592 |
+
"Tula": "tul",
|
| 593 |
+
"Tulu": "tcy",
|
| 594 |
+
"Tunen": "tvu",
|
| 595 |
+
"Tungag": "lcm",
|
| 596 |
+
"Tunisian Arabic": "aeb",
|
| 597 |
+
"Tupuri": "tui",
|
| 598 |
+
"Turkana": "tuv",
|
| 599 |
+
"Turkish": "tr",
|
| 600 |
+
"Turkmen": "tk",
|
| 601 |
+
"Tututepec Mixtec": "mtu",
|
| 602 |
+
"Twi": "tw",
|
| 603 |
+
"Ubaghara": "byc",
|
| 604 |
+
"Uighur": "ug",
|
| 605 |
+
"Ukrainian": "uk",
|
| 606 |
+
"Umbundu": "umb",
|
| 607 |
+
"Upper Sorbian": "hsb",
|
| 608 |
+
"Urdu": "ur",
|
| 609 |
+
"Ushojo": "ush",
|
| 610 |
+
"Uzbek": "uz",
|
| 611 |
+
"Vai": "vai",
|
| 612 |
+
"Vietnamese": "vi",
|
| 613 |
+
"Votic": "vot",
|
| 614 |
+
"Võro": "vro",
|
| 615 |
+
"Waci Gbe": "wci",
|
| 616 |
+
"Wadiyara Koli": "kxp",
|
| 617 |
+
"Waja": "wja",
|
| 618 |
+
"Wakhi": "wbl",
|
| 619 |
+
"Wanga": "lwg",
|
| 620 |
+
"Wapan": "juk",
|
| 621 |
+
"Warji": "wji",
|
| 622 |
+
"Welsh": "cy",
|
| 623 |
+
"Wemale": "weo",
|
| 624 |
+
"Western Frisian": "fy",
|
| 625 |
+
"Western Highland Purepecha": "pua",
|
| 626 |
+
"Western Juxtlahuaca Mixtec": "jmx",
|
| 627 |
+
"Western Maninkakan": "mlq",
|
| 628 |
+
"Western Mari": "mrj",
|
| 629 |
+
"Western Niger Fulfulde": "fuh",
|
| 630 |
+
"Western Panjabi": "pnb",
|
| 631 |
+
"Wolof": "wo",
|
| 632 |
+
"Wuzlam": "udl",
|
| 633 |
+
"Xanaguía Zapotec": "ztg",
|
| 634 |
+
"Xhosa": "xh",
|
| 635 |
+
"Yace": "ekr",
|
| 636 |
+
"Yakut": "sah",
|
| 637 |
+
"Yalahatan": "jal",
|
| 638 |
+
"Yanahuanca Pasco Quechua": "qur",
|
| 639 |
+
"Yangben": "yav",
|
| 640 |
+
"Yaqui": "yaq",
|
| 641 |
+
"Yauyos Quechua": "qux",
|
| 642 |
+
"Yekhee": "ets",
|
| 643 |
+
"Yiddish": "yi",
|
| 644 |
+
"Yidgha": "ydg",
|
| 645 |
+
"Yoruba": "yo",
|
| 646 |
+
"Yutanduchi Mixtec": "mab",
|
| 647 |
+
"Zacatlán-Ahuacatlán-Tepetzintla Nahuatl": "nhi",
|
| 648 |
+
"Zarma": "dje",
|
| 649 |
+
"Zaza": "zza",
|
| 650 |
+
"Zulu": "zu",
|
| 651 |
+
"Ömie": "aom",
|
| 652 |
+
}
|
steps/lang/qwen3_languages.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Qwen3-TTS supported target languages.
|
| 2 |
+
# Source: https://huggingface.co/spaces/Qwen/Qwen3-TTS (LANGUAGES constant in app.py)
|
| 3 |
+
"""Qwen3-supported languages (display name -> ISO-639-1 code)."""
|
| 4 |
+
|
| 5 |
+
# Display name -> language code for each Qwen3-TTS target language.
# Insertion order (alphabetical by display name) is preserved by dict().
QWEN3_LANGUAGE_CODES: dict[str, str] = dict(
    Chinese="zh",
    English="en",
    French="fr",
    German="de",
    Japanese="ja",
    Korean="ko",
    Portuguese="pt",
    Russian="ru",
    Spanish="es",
)
|
steps/lang/urdu.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Urdu-specific translation handlers.
|
| 2 |
+
|
| 3 |
+
Handles:
|
| 4 |
+
- Urdu-specific translation prompt (Nastaliq script, spoken Urdu vocabulary)
|
| 5 |
+
- Urdu → Devanagari transliteration for TTS (Chatterbox needs Devanagari)
|
| 6 |
+
- Devanagari → Urdu script conversion for captions
|
| 7 |
+
"""
|
| 8 |
+
import json
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
from ._shared import build_client, parse_json_array, bedrock_converse, MODEL, log_llm_call
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# ── Public dispatcher hooks ──────────────────────────────────────────────────
|
| 15 |
+
|
| 16 |
+
def get_translation_prompt() -> str:
    """Build the system prompt used when translating numbered lines into Urdu.

    The prompt is assembled from five sections (role, language rules,
    duration constraint, TTS compatibility, output format) and returned as
    one string.
    """
    role = (
        "You are a professional voice-over translator for commonly spoken Urdu. "
        "Translate the following numbered lines into Urdu (Nastaliq/Arabic script).\n\n"
    )
    language_rules = (
        "LANGUAGE RULES:\n"
        "- Use ONLY everyday spoken Urdu — the kind heard on Pakistani news, dramas, and streets.\n"
        "- Use Urdu, Persian, and Arabic-origin vocabulary ONLY. "
        "NEVER use Sanskrit-origin Hindi words (e.g. use محبت not پیار, زندگی not جیون, "
        "وقت not سمے, لیکن not پرنتو, اگر not یدی).\n"
        "- Keep it natural and conversational, not literary or formal.\n"
        "- NEVER insert English words, interjections, or filler sounds (Oh, Ah, Hmm, Well, So). "
        "Translate ALL such expressions into Urdu equivalents.\n\n"
    )
    duration_constraint = (
        "CRITICAL — DURATION CONSTRAINT:\n"
        "Each line shows its spoken duration in brackets (e.g. [4.6s]). "
        "The translation will be spoken by TTS and MUST fit within that duration.\n"
        "STRICT RULE: Your translation MUST have FEWER words than the original English. "
        "If the English has 10 words, aim for 7-8 Urdu words maximum.\n"
        "Every word must earn its place — if removing a word doesn't lose core meaning, remove it. "
        "Paraphrase aggressively. Use shorter synonyms. Merge clauses. "
        "A concise translation that fits the time is ALWAYS better than a complete one that overflows.\n\n"
    )
    tts_compatibility = (
        "TTS COMPATIBILITY — IMPORTANT:\n"
        "The TTS model struggles with long sentences that have multiple commas or clauses. "
        "Restructure into short, direct sentences — but the TOTAL text must still fit the duration shown in brackets. "
        "Do NOT add extra words or content when restructuring. The goal is simpler phrasing, not more text.\n"
        "Each output line is still ONE item in the array (one per input line). "
        "You may use multiple short sentences within that single line, but it must all fit the original duration.\n\n"
    )
    output_format = (
        "Write ONLY in Urdu script (Nastaliq/Arabic script). "
        "Return ONLY a JSON array of translated strings, in order, no extra text. "
        "Do NOT include the duration prefix or numbering in the output — only the translated text itself. "
        'Example input: 1. [3.0s] Hello\n2. [2.5s] Goodbye '
        'Example output: ["سلام", "خدا حافظ"]'
    )
    return "".join(
        (role, language_rules, duration_constraint, tts_compatibility, output_format)
    )
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def get_fallback_mode() -> str:
    """Name the fallback translation backend for Urdu.

    Returns the constant "bedrock": Urdu falls back to Bedrock rather than
    Google Translate.
    """
    fallback = "bedrock"
    return fallback
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# English interjections/fillers that may leak into the Urdu translation.
# The trailing \b is required: without it the pattern would also eat the first
# letters of longer words ("Sony" -> "ny", "Ahmed" -> "med", "Umbrella" -> "brella").
# After the filler word, one optional punctuation mark and any following
# whitespace are removed as well.
_ENGLISH_FILLERS = re.compile(
    r'\b(Oh|Ah|Hmm|Well|So|Right|Okay|OK|Um|Uh|Hey|Wow|Ooh|Aah)\b[\.\!\,]?\s*',
    re.IGNORECASE,
)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def post_translate(segments: list[dict]) -> list[dict]:
    """Apply Urdu-specific clean-up to freshly translated segments.

    For every segment, English filler words matched by ``_ENGLISH_FILLERS``
    are removed from 'translated_text' (missing text is treated as "") and
    the result is stripped and written back in place. The segments are then
    handed to ``transliterate_to_devanagari``, whose return value is returned.
    """
    for segment in segments:
        original = segment.get("translated_text", "")
        segment["translated_text"] = _ENGLISH_FILLERS.sub("", original).strip()

    return transliterate_to_devanagari(segments)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ── Transliteration: Urdu → Devanagari (for TTS) ────────────────────────────
|
| 79 |
+
|
| 80 |
+
# Character-level Urdu -> Devanagari table. Alif ('ا') is intentionally absent:
# its rendering depends on position and is handled in _urdu_to_rough_devanagari.
_URDU_TO_DEVA = {
    'آ': 'आ', 'ب': 'ब', 'پ': 'प', 'ت': 'त', 'ٹ': 'ट', 'ث': 'स',
    'ج': 'ज', 'چ': 'च', 'ح': 'ह', 'خ': 'ख़', 'د': 'द', 'ڈ': 'ड',
    'ذ': 'ज़', 'ر': 'र', 'ڑ': 'ड़', 'ز': 'ज़', 'ژ': 'झ', 'س': 'स',
    'ش': 'श', 'ص': 'स', 'ض': 'ज़', 'ط': 'त', 'ظ': 'ज़', 'ع': 'अ',
    'غ': 'ग़', 'ف': 'फ़', 'ق': 'क़', 'ک': 'क', 'ك': 'क', 'گ': 'ग',
    'ل': 'ल', 'م': 'म', 'ن': 'न', 'ں': 'ं', 'و': 'व', 'ہ': 'ह',
    'ه': 'ह', 'ھ': '्ह', 'ی': 'य', 'ي': 'य', 'ے': 'े', 'ئ': 'इ',
    'َ': 'ा', 'ِ': 'ि', 'ُ': 'ु', 'ٰ': 'ा', 'ّ': '्', 'ً': 'न',
    'ٔ': '', 'ء': '', 'ؓ': '', '۔': '।', '،': ',', '؟': '?', '؛': ';',
}


def _urdu_to_rough_devanagari(text: str) -> str:
    """Map Urdu text to rough Devanagari one character at a time.

    Characters found in ``_URDU_TO_DEVA`` are replaced; anything else is
    passed through unchanged. Short vowels come out missing or wrong because
    Urdu script does not mark them explicitly.

    Alif is context dependent: at the start of a word it becomes the
    independent vowel 'अ', elsewhere the matra 'ा'. "Start of a word" means
    the start of the string, or directly after any whitespace (including
    newlines and tabs) or any punctuation character, so a matra is never
    emitted without a preceding letter to attach to.

    Args:
        text: Urdu-script input; may be empty.

    Returns:
        The rough Devanagari rendering of ``text``.
    """
    # Local import keeps the block self-contained.
    import unicodedata

    pieces = []
    prev = None
    for ch in text:
        if ch == 'ا':
            word_initial = (
                prev is None
                or prev.isspace()
                or unicodedata.category(prev).startswith('P')
            )
            pieces.append('अ' if word_initial else 'ा')
        else:
            pieces.append(_URDU_TO_DEVA.get(ch, ch))
        prev = ch

    # Fix a common edge case: ئ + ے (e.g., in بروئے) should read as 'ए'.
    return ''.join(pieces).replace('इे', 'ए')
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _polish_devanagari_vowels(client, model, numbered, expected_count, max_attempts=2):
    """Ask a chat-completions LLM to repair vowels in rough Devanagari text.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model: Model name passed through to the client.
        numbered: User message containing the numbered Urdu/rough pairs.
        expected_count: Number of strings the reply array must contain.
        max_attempts: How many times to call the model before giving up.

    Returns:
        The list parsed from the model reply when it is valid JSON, has
        ``expected_count`` items, and its first three items contain no
        characters in U+0600..U+06FF; otherwise ``None`` once all attempts
        are used up. Any exception raised during an attempt is printed and
        counts as a failed attempt.
    """
    system_text = (
        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari.\n\n"
        "STRICT RULES:\n"
        "- Do NOT change, replace, or translate ANY word. Keep every single Urdu word exactly.\n"
        "- Only add or fix vowel matras (ा ि ी ु ू े ै ो ौ ं ँ)\n"
        "- Add nuqta dots where needed: क़ ख़ ग़ ज़ फ़\n"
        "- Add halant (्) for conjuncts where needed\n\n"
        "EXAMPLES:\n"
        "Urdu: محبت | rough: महबत | fixed: मोहब्बत\n"
        "Urdu: استعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
        "Urdu: حکمت | rough: हकमत | fixed: हिकमत\n"
        "Urdu: طاقت | rough: ताक़त | fixed: ताक़त\n"
        "Urdu: ہمدردی | rough: हमदरदय | fixed: हमदर्दी\n"
        "Urdu: پیروی | rough: पयरवय | fixed: पैरवी\n"
        "Urdu: کریم | rough: करयम | fixed: करीम\n\n"
        "Return ONLY a JSON array of corrected Devanagari strings, in order, one per input."
    )

    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_text},
                    {"role": "user", "content": numbered},
                ],
                temperature=0.1,
            )
            reply = completion.choices[0].message.content.strip()
            log_llm_call(
                step="urdu_vowel_polish", provider="pollinations", model=model,
                system_prompt=system_text, user_prompt=numbered,
                response=reply, temperature=0.1,
            )

            try:
                candidates = parse_json_array(reply)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Attempt {attempt}: Could not parse response as JSON")
                continue

            if len(candidates) != expected_count:
                print(f"[urdu] Attempt {attempt}: Got {len(candidates)} items, expected {expected_count}")
                continue

            # Cheap sanity check on the first few items: any Arabic-block
            # character means the model answered in Urdu script instead.
            preview = " ".join(candidates[:3])
            if any('\u0600' <= ch <= '\u06FF' for ch in preview):
                print(f"[urdu] Attempt {attempt}: Output still contains Urdu script — retrying")
                system_text = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_text
                continue

            return candidates

        except Exception as e:
            print(f"[urdu] LLM error on attempt {attempt}: {e}")

    return None
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def transliterate_to_devanagari(segments: list[dict]) -> list[dict]:
    """Convert Urdu script translations to Devanagari for TTS.

    Hybrid approach: a deterministic character mapping produces rough
    Devanagari, then an LLM is asked to fix the vowels only. Providers are
    tried in order: the client from ``build_client()`` first, then Bedrock.
    If neither yields a usable result, the rough mapping is used as-is.

    Args:
        segments: Segment dicts; each segment's 'translated_text' (default "")
            is read as the Urdu source text.

    Returns:
        The same ``segments`` list, with 'tts_text' set on every segment.
        An empty list is returned unchanged.
    """
    if not segments:
        return segments

    print("[urdu] Starting Hybrid Urdu → Devanagari conversion...")

    # Step 1: Deterministic mapping to rough Devanagari
    rough_texts = [
        _urdu_to_rough_devanagari(seg.get("translated_text", ""))
        for seg in segments
    ]

    expected = len(segments)
    numbered = "\n".join(
        f"{i + 1}. Urdu: {seg.get('translated_text', '')}\n Rough: {rough_texts[i]}"
        for i, seg in enumerate(segments)
    )

    # Try Pollinations
    client = build_client()
    polished_list = _polish_devanagari_vowels(client, MODEL, numbered, expected)

    if polished_list:
        for seg, deva_text in zip(segments, polished_list):
            seg["tts_text"] = deva_text
        print("[urdu] Urdu → Devanagari hybrid transliteration complete ✓")
        return segments

    print("[urdu] Pollinations Polish failed — trying Bedrock fallback...")

    # Bedrock Fallback
    try:
        system_prompt = (
            "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
            "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
            "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari. Do NOT change/replace/translate ANY word.\n\n"
            "EXAMPLES:\nمحبت | rough: महबत | fixed: मोहब्बत\nاستعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
            # The last example's input must be pure Urdu script, like the others.
            "حکمت | rough: हकमत | fixed: हिकमत\nہمدردی | rough: हमदरदय | fixed: हमदर्दी\n\n"
            "Return ONLY a JSON array of corrected Devanagari strings."
        )

        for attempt in range(1, 3):
            raw = bedrock_converse(system_prompt, numbered, step="urdu_vowel_polish_bedrock")

            try:
                polished_list = parse_json_array(raw)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Bedrock attempt {attempt}: Could not parse response")
                continue

            if len(polished_list) != expected:
                print(f"[urdu] Bedrock attempt {attempt}: Got {len(polished_list)} items, expected {expected}")
                continue

            # Quick check on the first few items for leftover Arabic-block characters.
            sample = " ".join(polished_list[:3])
            bad_chars = sum(1 for ch in sample if '\u0600' <= ch <= '\u06FF')
            if bad_chars > 0:
                print(f"[urdu] Bedrock attempt {attempt}: Output contains Urdu script — retrying")
                system_prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_prompt
                continue

            for seg, deva_text in zip(segments, polished_list):
                seg["tts_text"] = deva_text
            print("[urdu] Urdu → Devanagari transliteration (Bedrock) complete ✓")
            return segments

    except Exception as e:
        # Best-effort: any Bedrock failure falls through to the rough mapping.
        print(f"[urdu] WARNING: Bedrock fallback failed ({e})")

    print("[urdu] WARNING: All polishing failed. Falling back to rough Devanagari.")
    for seg, r_text in zip(segments, rough_texts):
        seg["tts_text"] = r_text
    return segments
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ── Transliteration: Devanagari → Urdu script (for captions) ────────────────
|
| 258 |
+
|
| 259 |
+
def transliterate_to_urdu_script(segments: list[dict]) -> list[dict]:
    """Convert Devanagari Urdu translations to Urdu (Nastaliq/Arabic) script for subtitles.

    The client from ``build_client()`` is tried first. Bedrock is tried only
    if that attempt raises. On success every segment gets a 'caption_text'
    field. If a provider returns the wrong number of items, or both providers
    raise, the segments are returned without 'caption_text'.
    """
    if not segments:
        return segments

    numbered = "\n".join(
        f"{idx}. {seg.get('translated_text', '')}"
        for idx, seg in enumerate(segments, start=1)
    )

    system_prompt = (
        "You are a script converter. Convert the following Devanagari Urdu text into Urdu script (Nastaliq/Arabic script). "
        "This is NOT translation — the language is already Urdu, just written in Devanagari. "
        "Convert it to proper Urdu script preserving every word exactly.\n\n"
        "Return ONLY a JSON array of converted strings, in order, no extra text. "
        "Do NOT include numbering in the output."
    )

    def _attach_captions(converted, source_label, done_suffix):
        # Shared tail of both provider paths: validate the count, then store captions.
        if len(converted) != len(segments):
            print(f"[urdu] WARNING: {source_label} returned {len(converted)} items, expected {len(segments)}. Using Devanagari for captions")
            return segments
        for seg, caption in zip(segments, converted):
            seg["caption_text"] = caption
        print(f"[urdu] Urdu script transliteration{done_suffix} complete ✓")
        return segments

    client = build_client()
    try:
        completion = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": numbered},
            ],
            temperature=0.1,
        )
        reply = completion.choices[0].message.content.strip()
        log_llm_call(
            step="urdu_script_convert", provider="pollinations", model=MODEL,
            system_prompt=system_prompt, user_prompt=numbered,
            response=reply, temperature=0.1,
        )
        return _attach_captions(parse_json_array(reply), "Urdu script", "")

    except Exception as e:
        print(f"[urdu] Pollinations transliteration failed ({e}) — trying Bedrock...")

    try:
        reply = bedrock_converse(system_prompt, numbered, step="urdu_script_convert_bedrock")
        return _attach_captions(parse_json_array(reply), "Bedrock Urdu script", " (Bedrock)")

    except Exception as e2:
        print(f"[urdu] WARNING: Bedrock transliteration also failed ({e2}), using Devanagari for captions")
        return segments
|
steps/s1_extract_audio.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Step 1-2: Extract audio track from input video.
|
| 3 |
+
Outputs a 16 kHz mono WAV suitable for Whisper + Chatterbox.
|
| 4 |
+
"""
|
| 5 |
+
import subprocess
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _ffmpeg_extract_wav(
    video_path: str,
    output_path: str,
    sample_rate: int,
    channels: int,
    failure_message: str,
) -> None:
    """Run ffmpeg to write the audio track of ``video_path`` as 16-bit PCM WAV.

    Creates the parent directory of ``output_path`` if needed and overwrites
    an existing output file (``-y``).

    Raises:
        RuntimeError: If ffmpeg exits non-zero; the message is
            ``failure_message`` followed by ffmpeg's stderr.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",                       # no video
        "-acodec", "pcm_s16le",      # PCM 16-bit
        "-ar", str(sample_rate),
        "-ac", str(channels),
        output_path,
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"{failure_message}:\n{result.stderr}")


def extract_audio(video_path: str, output_path: str = "tmp/audio/source/extracted_audio.wav") -> str:
    """
    Extract a 16 kHz mono WAV from a video using ffmpeg.

    Args:
        video_path: Path to the input video file.
        output_path: Where to save the extracted audio (WAV).

    Returns:
        ``output_path`` exactly as given (it is not resolved to an absolute path).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    # 16 kHz mono is the Whisper-standard input format.
    _ffmpeg_extract_wav(
        video_path, output_path,
        sample_rate=16000, channels=1,
        failure_message="FFmpeg audio extraction failed",
    )

    print(f"[s1] Audio extracted → {output_path}")
    return output_path


def extract_audio_hq(video_path: str, output_path: str = "tmp/audio/source/extracted_audio_hq.wav") -> str:
    """
    Extract high-quality 44.1 kHz stereo audio for source separation (Demucs).

    Args:
        video_path: Path to the input video file.
        output_path: Where to save the HQ audio (WAV).

    Returns:
        ``output_path`` exactly as given (it is not resolved to an absolute path).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    _ffmpeg_extract_wav(
        video_path, output_path,
        sample_rate=44100, channels=2,
        failure_message="FFmpeg HQ audio extraction failed",
    )

    print(f"[s1] HQ audio extracted → {output_path}")
    return output_path
|
steps/s1b_separate.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Step 1b: Separate vocals from accompaniment using Demucs (Python API).
|
| 3 |
+
|
| 4 |
+
In-process inference so ZeroGPU can intercept the GPU allocation via
|
| 5 |
+
`@spaces.GPU`. Works on CUDA, MPS, and CPU without code changes.
|
| 6 |
+
Only runs when preserve_music=True.
|
| 7 |
+
"""
|
| 8 |
+
import shutil
|
| 9 |
+
import subprocess
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torchaudio
|
| 14 |
+
|
| 15 |
+
import spaces
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
_MODEL = None
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _select_device() -> str:
    """Return the preferred torch device name: "cuda", else "mps", else "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    # Older torch builds may not expose torch.backends.mps at all.
    mps_backend = getattr(torch.backends, "mps", None)
    mps_ready = mps_backend is not None and mps_backend.is_available()
    return "mps" if mps_ready else "cpu"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _get_model():
    """Return the process-wide htdemucs model, loading it on first use.

    The demucs import is deferred to the first call so importing this module
    stays cheap. The model is put in eval mode, placed on CPU, and cached in
    the module-level ``_MODEL``.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    from demucs.pretrained import get_model

    print("[s1b] Loading htdemucs on cpu...")
    loaded = get_model("htdemucs")
    loaded.eval()
    loaded.to("cpu")
    _MODEL = loaded
    return _MODEL
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@spaces.GPU(duration=120)
def _apply_demucs(mix: torch.Tensor, device: str) -> torch.Tensor:
    """Run Demucs inference on ``mix`` (shape [1, channels, time]) on ``device``.

    The cached model is moved to ``device`` if its parameters live elsewhere.
    The separated sources are returned on CPU.
    """
    from demucs.apply import apply_model

    model = _get_model()
    current_device = next(model.parameters()).device.type
    if current_device != device:
        print(f"[s1b] Moving htdemucs to {device} inside GPU scope...")
        model = model.to(device)

    with torch.no_grad():
        # apply_model returns [batch, sources, channels, time]
        separated = apply_model(
            model,
            mix.to(device),
            shifts=1,
            split=True,
            overlap=0.25,
            device=device,
        )
    return separated.cpu()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _load_and_normalise(audio_hq_path: str, target_sr: int, target_ch: int) -> tuple[torch.Tensor, float, float]:
    """Load a WAV, match the model's sample rate/channels, and z-normalise it.

    Mono input is duplicated when ``target_ch`` is 2. Input with more channels
    than ``target_ch`` is truncated to the first ``target_ch`` channels.

    Returns:
        ``(normalised, mean, std)`` where ``normalised`` has a leading batch
        dimension added, and ``mean``/``std`` are the Python floats used for
        normalisation (``std`` is clamped to at least 1e-8).
    """
    waveform, source_sr = torchaudio.load(audio_hq_path)

    if source_sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, source_sr, target_sr)

    n_channels = waveform.shape[0]
    if n_channels == 1 and target_ch == 2:
        waveform = waveform.repeat(2, 1)
    elif n_channels > target_ch:
        waveform = waveform[:target_ch]

    mean = waveform.mean()
    std = waveform.std().clamp_min(1e-8)
    normalised = (waveform - mean) / std
    return normalised.unsqueeze(0), mean.item(), std.item()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def separate_audio(
    audio_hq_path: str,
    output_dir: str = "tmp",
) -> tuple[str, str]:
    """
    Split an audio file into vocals and accompaniment with Demucs htdemucs.

    Writes ``vocals.wav``, ``accompaniment.wav`` and a 16 kHz mono
    ``vocals_16k.wav`` into ``output_dir``, then removes a leftover
    ``demucs`` sub-directory there if one exists.

    Args:
        audio_hq_path: Path to input audio (any sample rate / channels).
        output_dir: Directory to write output stems.

    Returns:
        (vocals_16k_path, accompaniment_path)

    Raises:
        RuntimeError: If the model has no 'vocals' source, or the ffmpeg
            resample step exits non-zero.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    model = _get_model()
    device = _select_device()
    target_sr = model.samplerate
    target_ch = model.audio_channels
    source_names = list(model.sources)

    print(f"[s1b] Running Demucs htdemucs on {device} (Python API)...")
    mix, mean, std = _load_and_normalise(audio_hq_path, target_sr, target_ch)

    # Undo the z-normalisation, then drop the batch dim → [num_sources, channels, time]
    stems = (_apply_demucs(mix, device) * std + mean)[0]

    try:
        vocals_idx = source_names.index("vocals")
    except ValueError as e:
        raise RuntimeError(f"htdemucs is missing 'vocals' source: {source_names}") from e

    vocals = stems[vocals_idx]
    # Accompaniment = every non-vocal stem summed together.
    no_vocals = sum(
        stem for idx, stem in enumerate(stems) if idx != vocals_idx
    )

    vocals_path = str(out_dir / "vocals.wav")
    accompaniment_path = str(out_dir / "accompaniment.wav")
    vocals_16k_path = str(out_dir / "vocals_16k.wav")

    torchaudio.save(vocals_path, vocals, target_sr)
    torchaudio.save(accompaniment_path, no_vocals, target_sr)
    print(f"[s1b] Vocals saved → {vocals_path}")
    print(f"[s1b] Accompaniment saved → {accompaniment_path}")

    # Resample vocals to 16 kHz mono for Whisper/TTS via ffmpeg
    # (torchaudio resample works but ffmpeg is more predictable for downstream)
    resample_cmd = ["ffmpeg", "-y", "-i", vocals_path, "-ar", "16000", "-ac", "1", vocals_16k_path]
    completed = subprocess.run(resample_cmd, capture_output=True, text=True)
    if completed.returncode != 0:
        raise RuntimeError(f"FFmpeg vocals resample failed:\n{completed.stderr}")

    print(f"[s1b] Vocals (16 kHz) saved → {vocals_16k_path}")

    # Leftover cleanup for any previously-shelled-out demucs runs
    stale_dir = out_dir / "demucs"
    if stale_dir.exists():
        shutil.rmtree(str(stale_dir), ignore_errors=True)

    return vocals_16k_path, accompaniment_path
|
steps/s2_transcribe.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Step 3: Transcribe audio with timestamps.
|
| 3 |
+
|
| 4 |
+
Primary local backend (device-dependent):
|
| 5 |
+
- Apple MPS: mlx-whisper
|
| 6 |
+
- CUDA: openai-whisper (PyTorch); HF Spaces default to CPU faster-whisper
|
| 7 |
+
- CPU: faster-whisper
|
| 8 |
+
|
| 9 |
+
Outermost fallback:
|
| 10 |
+
- Pollinations Whisper API (verbose_json)
|
| 11 |
+
"""
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
import requests
|
| 15 |
+
import torch
|
| 16 |
+
from dotenv import load_dotenv
|
| 17 |
+
|
| 18 |
+
import spaces
|
| 19 |
+
|
| 20 |
+
load_dotenv()
|
| 21 |
+
|
| 22 |
+
# Remote fallback endpoint and the model name sent to it.
POLLINATIONS_URL = "https://gen.pollinations.ai/v1/audio/transcriptions"
POLLEN_TRANSCRIBE_MODEL = os.environ.get("POLLEN_TRANSCRIBE_MODEL", "whisper-large-v3")

# Per-backend local model names, each overridable through the environment.
MLX_MODEL = os.environ.get("MLX_WHISPER_MODEL", "mlx-community/whisper-large-mlx")
FASTER_WHISPER_MODEL = os.environ.get("FASTER_WHISPER_MODEL", "large-v3")
OPENAI_WHISPER_MODEL = os.environ.get("OPENAI_WHISPER_MODEL", "large-v3")

# Name of the env var that forces a specific local backend, and its accepted values.
LOCAL_WHISPER_BACKEND_ENV = "VIDEOVOICE_WHISPER_BACKEND"
_VALID_LOCAL_BACKENDS = {
    "mlx-whisper",
    "openai-whisper-cuda",
    "faster-whisper-cpu",
}

# Module-level caches for loaded models; both start empty.
_FASTER_WHISPER_MODELS = {}
_OPENAI_WHISPER_MODEL = None
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _running_on_hf_space() -> bool:
|
| 39 |
+
return bool(
|
| 40 |
+
os.getenv("SPACE_ID")
|
| 41 |
+
or os.getenv("SPACE_HOST")
|
| 42 |
+
or os.getenv("HF_SPACE_ID")
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _get_local_whisper_backend() -> str:
    """
    Pick the local transcription backend at call time.

    An explicit override from the environment wins once validated. Otherwise
    MPS selects mlx-whisper, HF Spaces selects CPU faster-whisper, CUDA selects
    openai-whisper, and anything else falls back to CPU faster-whisper.

    ZeroGPU can report CUDA availability outside an active @spaces.GPU call,
    which is why the decision is made lazily rather than at import time.

    Raises:
        ValueError: if the override variable holds an unknown backend name.
    """
    forced = os.getenv(LOCAL_WHISPER_BACKEND_ENV, "").strip().lower()
    if forced and forced not in _VALID_LOCAL_BACKENDS:
        raise ValueError(
            f"Invalid {LOCAL_WHISPER_BACKEND_ENV}={forced!r}. "
            f"Expected one of: {', '.join(sorted(_VALID_LOCAL_BACKENDS))}."
        )
    if forced:
        return forced

    mps_ready = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    if mps_ready:
        return "mlx-whisper"

    # PyTorch-based path so @spaces.GPU can intercept the CUDA allocation.
    # faster-whisper uses CTranslate2 which bypasses PyTorch and breaks ZeroGPU.
    # HF Spaces is checked first so CUDA is never probed there.
    if not _running_on_hf_space() and torch.cuda.is_available():
        return "openai-whisper-cuda"

    return "faster-whisper-cpu"
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _extract_words(raw_words: list[dict]) -> list[dict]:
|
| 78 |
+
"""Normalise word timestamps into {word, start, end}."""
|
| 79 |
+
output = []
|
| 80 |
+
for raw in raw_words or []:
|
| 81 |
+
start = raw.get("start")
|
| 82 |
+
end = raw.get("end")
|
| 83 |
+
if start is None or end is None:
|
| 84 |
+
continue
|
| 85 |
+
output.append(
|
| 86 |
+
{
|
| 87 |
+
"word": str(raw.get("word", "")).strip(),
|
| 88 |
+
"start": float(start),
|
| 89 |
+
"end": float(end),
|
| 90 |
+
}
|
| 91 |
+
)
|
| 92 |
+
return output
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _normalise_segments(segments: list[dict]) -> list[dict]:
    """
    Return segments in the canonical ``{start, end, text, words}`` schema.

    Segments missing a start or end timestamp are dropped. Timestamps are
    coerced to float, text is stringified and stripped, and word entries are
    normalised through ``_extract_words``.

    A falsy ``segments`` (``None`` or empty) yields an empty list, matching how
    ``_extract_words`` treats a missing word list.
    """
    output = []
    for seg in segments or []:
        start = seg.get("start")
        end = seg.get("end")
        if start is None or end is None:
            continue
        output.append(
            {
                "start": float(start),
                "end": float(end),
                "text": str(seg.get("text", "")).strip(),
                "words": _extract_words(seg.get("words", [])),
            }
        )
    return output
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Max duration (seconds) before a segment is considered oversized and needs splitting.
|
| 116 |
+
_MAX_SEGMENT_DURATION = 15.0
|
| 117 |
+
# Preferred pause gap (seconds) to use as a split point.
|
| 118 |
+
_PAUSE_THRESHOLD = 0.4
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def _split_oversized_segments(segments: list[dict]) -> list[dict]:
|
| 122 |
+
"""Split segments longer than _MAX_SEGMENT_DURATION using word timings."""
|
| 123 |
+
output = []
|
| 124 |
+
for seg in segments:
|
| 125 |
+
duration = seg["end"] - seg["start"]
|
| 126 |
+
words = seg.get("words", [])
|
| 127 |
+
real_words = [w for w in words if w["word"]]
|
| 128 |
+
|
| 129 |
+
if duration <= _MAX_SEGMENT_DURATION or len(real_words) < 2:
|
| 130 |
+
output.append(seg)
|
| 131 |
+
continue
|
| 132 |
+
|
| 133 |
+
chunks = []
|
| 134 |
+
chunk_start_idx = 0
|
| 135 |
+
chunk_start_time = real_words[0]["start"]
|
| 136 |
+
|
| 137 |
+
for i in range(len(real_words) - 1):
|
| 138 |
+
elapsed = real_words[i]["end"] - chunk_start_time
|
| 139 |
+
gap = real_words[i + 1]["start"] - real_words[i]["end"]
|
| 140 |
+
should_split = (
|
| 141 |
+
(elapsed >= _MAX_SEGMENT_DURATION and gap >= 0.15)
|
| 142 |
+
or (elapsed >= _MAX_SEGMENT_DURATION * 0.5 and gap >= _PAUSE_THRESHOLD)
|
| 143 |
+
)
|
| 144 |
+
if should_split:
|
| 145 |
+
chunks.append(real_words[chunk_start_idx : i + 1])
|
| 146 |
+
chunk_start_idx = i + 1
|
| 147 |
+
chunk_start_time = real_words[i + 1]["start"]
|
| 148 |
+
|
| 149 |
+
if chunk_start_idx < len(real_words):
|
| 150 |
+
chunks.append(real_words[chunk_start_idx:])
|
| 151 |
+
|
| 152 |
+
for chunk_words in chunks:
|
| 153 |
+
output.append(
|
| 154 |
+
{
|
| 155 |
+
"start": chunk_words[0]["start"],
|
| 156 |
+
"end": chunk_words[-1]["end"],
|
| 157 |
+
"text": " ".join(w["word"] for w in chunk_words).strip(),
|
| 158 |
+
"words": chunk_words,
|
| 159 |
+
}
|
| 160 |
+
)
|
| 161 |
+
|
| 162 |
+
return output
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _assign_words_to_segments(segments: list[dict], words: list[dict]) -> None:
    """
    Distribute a top-level word list into segments, in place.

    Each normalised word goes to the first segment whose ``[start, end]`` range
    contains the word's midpoint. Using the midpoint rather than requiring the
    whole word to lie inside the segment keeps words that straddle a segment
    boundary or overshoot it by a rounding error. Words whose midpoint falls
    in no segment are dropped. Every segment's ``words`` list is replaced.
    """
    normalised = _extract_words(words)
    for seg in segments:
        seg["words"] = []

    for word in normalised:
        midpoint = (word["start"] + word["end"]) / 2.0
        for seg in segments:
            if seg["start"] <= midpoint <= seg["end"]:
                seg["words"].append(word)
                break
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def _segments_from_pollinations(audio_path: str, language: str) -> list[dict]:
    """
    Call the Pollinations Whisper API and return canonical segments.

    Args:
        audio_path: Path of the audio file to upload.
        language: Language code, or "auto"/empty to let Whisper auto-detect.

    Returns:
        Normalised segments; word timings come from the per-segment data when
        present, otherwise from the response's top-level ``words`` list.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    api_key = (
        os.getenv("POLLEN_API_KEY_SECONDARY")
        or os.getenv("POLLEN_API_KEY")
        or os.getenv("POLLINATIONS_API_KEY", "")
    )
    headers = {"Authorization": f"Bearer {api_key}"}

    data = {
        "model": POLLEN_TRANSCRIBE_MODEL,
        "response_format": "verbose_json",
        "temperature": 0,
        # Ask for both granularities: with "word" alone an OpenAI-compatible
        # endpoint may omit `segments`, leaving nothing to attach words to.
        # requests sends a list value as repeated form fields.
        "timestamp_granularities[]": ["segment", "word"],
    }
    # When the caller passes "auto" (or empty), omit the `language` field so
    # Whisper auto-detects. Forcing a wrong language code makes Whisper
    # silently switch to translate-mode (e.g. Hindi audio + language="en"
    # produces an English translation, not a Hindi transcript).
    if language and language.lower() not in ("auto", ""):
        data["language"] = language

    with open(audio_path, "rb") as audio_file:
        files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
        response = requests.post(
            POLLINATIONS_URL,
            headers=headers,
            files=files,
            data=data,
            timeout=120,
        )

    response.raise_for_status()
    result = response.json()

    # `or []` also covers a JSON null for "segments".
    segments = _normalise_segments(result.get("segments") or [])
    if not any(seg["words"] for seg in segments) and result.get("words"):
        _assign_words_to_segments(segments, result["words"])

    return _normalise_segments(segments)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def _segments_from_mlx(audio_path: str, language: str) -> list[dict]:
    """
    Run mlx-whisper locally and return canonical segments.

    ``language`` of "auto" (any case) or an empty value requests
    auto-detection, matching the Pollinations path.

    Raises:
        ImportError: if mlx-whisper is not installed.
    """
    print("[s2] Using mlx-whisper backend...")
    try:
        import mlx_whisper
    except ImportError as exc:
        raise ImportError("mlx-whisper is not installed. Run: uv add mlx-whisper") from exc

    auto_detect = not language or language.lower() == "auto"
    result = mlx_whisper.transcribe(
        audio_path,
        path_or_hf_repo=MLX_MODEL,
        language=None if auto_detect else language,
        word_timestamps=True,
    )
    return _normalise_segments(result.get("segments", []))
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def _get_faster_whisper_model(device: str, compute_type: str):
    """Return the faster-whisper model for (device, compute_type), loading it on first use."""
    from faster_whisper import WhisperModel

    cache_key = (device, compute_type)
    try:
        return _FASTER_WHISPER_MODELS[cache_key]
    except KeyError:
        loaded = WhisperModel(
            FASTER_WHISPER_MODEL,
            device=device,
            compute_type=compute_type,
        )
        _FASTER_WHISPER_MODELS[cache_key] = loaded
        return loaded
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def _segments_from_faster_whisper_impl(
    audio_path: str,
    language: str,
    device: str,
    compute_type: str,
) -> list[dict]:
    """
    Transcribe with faster-whisper and return canonical segments.

    ``language`` of "auto" (any case) or an empty value requests
    auto-detection, matching the Pollinations path. Words missing a start or
    end timestamp are skipped.
    """
    model = _get_faster_whisper_model(device=device, compute_type=compute_type)
    auto_detect = not language or language.lower() == "auto"
    segments, _ = model.transcribe(
        audio_path,
        language=None if auto_detect else language,
        word_timestamps=True,
    )

    output = []
    for seg in segments:
        words = [
            {
                "word": str(word.word or "").strip(),
                "start": float(word.start),
                "end": float(word.end),
            }
            for word in seg.words or []
            if word.start is not None and word.end is not None
        ]
        output.append(
            {
                "start": float(seg.start),
                "end": float(seg.end),
                "text": str(seg.text or "").strip(),
                "words": words,
            }
        )
    return output
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def _segments_from_faster_whisper_cpu(audio_path: str, language: str) -> list[dict]:
    """Run faster-whisper on CPU with int8 compute (no GPU decorator, so outside the ZeroGPU budget)."""
    return _segments_from_faster_whisper_impl(
        audio_path=audio_path,
        language=language,
        device="cpu",
        compute_type="int8",
    )
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def _get_openai_whisper_model():
    """
    Return the process-wide openai-whisper model, loading it on first call.

    The model is placed on CUDA when torch reports it available, else on CPU.

    Raises:
        ImportError: if openai-whisper is not installed.
    """
    global _OPENAI_WHISPER_MODEL
    if _OPENAI_WHISPER_MODEL is not None:
        return _OPENAI_WHISPER_MODEL

    try:
        import whisper as openai_whisper
    except ImportError as exc:
        raise ImportError("openai-whisper is not installed") from exc

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    print(f"[s2] Loading openai-whisper ({OPENAI_WHISPER_MODEL}) on {target_device}...")
    _OPENAI_WHISPER_MODEL = openai_whisper.load_model(
        OPENAI_WHISPER_MODEL, device=target_device
    )
    return _OPENAI_WHISPER_MODEL
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
@spaces.GPU(duration=60)
def _segments_from_openai_whisper(
    audio_path: str,
    language: str,
) -> list[dict]:
    """
    GPU-decorated openai-whisper execution (PyTorch-native, ZeroGPU-compatible).

    ``language`` of "auto" (any case) or an empty value requests
    auto-detection, matching the Pollinations path.
    """
    model = _get_openai_whisper_model()
    auto_detect = not language or language.lower() == "auto"
    result = model.transcribe(
        audio_path,
        language=None if auto_detect else language,
        word_timestamps=True,
        verbose=False,
    )
    return _normalise_segments(result.get("segments", []))
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def _segments_from_local_backend(audio_path: str, language: str) -> list[dict]:
    """Run whichever local whisper backend is selected at call time."""
    selected = _get_local_whisper_backend()

    if selected == "mlx-whisper":
        return _segments_from_mlx(audio_path, language)

    if selected != "openai-whisper-cuda":
        print("[s2] Using faster-whisper backend (cpu)...")
        return _segments_from_faster_whisper_cpu(audio_path, language)

    print("[s2] Using openai-whisper backend (cuda)...")
    try:
        return _segments_from_openai_whisper(audio_path, language)
    except ImportError:
        # openai-whisper missing: degrade to the CPU backend instead of failing.
        print("[s2] openai-whisper unavailable; falling back to faster-whisper (cpu).")
        return _segments_from_faster_whisper_cpu(audio_path, language)
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def transcribe(audio_path: str, language: str = "en") -> list[dict]:
    """
    Transcribe audio and return segments in the canonical schema.

    Backends are tried in order:
      1. Pollinations API; an empty result counts as a miss.
      2. The local backend chosen by ``_get_local_whisper_backend``.

    Oversized segments are split before the result is returned.

    Raises:
        RuntimeError: if no backend produced a result; the message lists the
            error captured from each backend that raised.
    """
    print(f"[s2] Transcribing {audio_path} (lang={language})...")

    failures = []
    result = None

    # 1. Remote API first.
    try:
        print("[s2] Trying Pollinations API...")
        remote = _segments_from_pollinations(audio_path, language)
    except Exception as exc:
        print(f"[s2] Pollinations error ({exc}) — falling back to local backend.")
        failures.append(f"Pollinations: {exc}")
    else:
        if remote:
            print(f"[s2] Pollinations returned {len(remote)} segments ✓")
            result = remote

    # 2. Local backend (GPU or CPU) when the remote call gave nothing.
    if result is None:
        try:
            backend_name = _get_local_whisper_backend()
            print(f"[s2] Trying local backend ({backend_name})...")
            result = _segments_from_local_backend(audio_path, language)
        except Exception as exc:
            print(f"[s2] Local backend error ({exc}).")
            failures.append(f"Local backend: {exc}")
            result = None
        else:
            if result:
                print(f"[s2] Local backend returned {len(result)} segments ✓")

    if result is None:
        detail_text = f" Details: {' | '.join(failures)}" if failures else ""
        raise RuntimeError(f"Transcription failed on all available backends.{detail_text}")

    count_before = len(result)
    result = _split_oversized_segments(result)
    if len(result) != count_before:
        print(f"[s2] Split {count_before} oversized segment(s) → {len(result)} segments")

    return _normalise_segments(result)
|
steps/s3_translate.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Step 4: Translate segment texts using Pollinations chat completions API
|
| 3 |
+
(OpenAI-compatible endpoint, no extra API key needed beyond POLLEN_API_KEY).
|
| 4 |
+
"""
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
from .lang._shared import build_client, bedrock_fallback, parse_json_array, MODEL, log_llm_call
|
| 8 |
+
from .lang import get_translation_prompt, get_fallback_mode, post_translate
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _translate_batch(segments: list[dict], target_language: str) -> list[dict]:
    """
    Translate a batch of segments into target_language.

    The batch is sent as one numbered prompt to the chat-completions client.
    If that path raises for any reason (including a wrong item count after
    the retries), a fallback is used: ``bedrock_fallback`` when
    ``get_fallback_mode`` returns "bedrock", otherwise Google Translate one
    segment at a time.

    Args:
        segments: Dicts read for their 'start', 'end' and 'text' keys.
        target_language: Language name interpolated into the prompt and,
            lowercased, passed to GoogleTranslator.

    Returns:
        Copies of the input segments with 'translated_text' added. An empty
        input is returned as-is.
    """
    if not segments:
        return segments

    # Build single-shot batch: include duration so the LLM can match spoken length
    numbered = "\n".join(
        f"{i+1}. [{s['end'] - s['start']:.1f}s] {s['text']}"
        for i, s in enumerate(segments)
    )

    # Default prompt (generic, works for most languages)
    default_prompt = (
        f"You are a voice-over dubbing writer — not a translator. "
        f"Your job is to write what a native {target_language} speaker would *actually say out loud* "
        f"in a casual, natural conversation. Forget the source words. Capture the meaning, tone, and energy.\n\n"

        f"INPUT FORMAT:\n"
        f"Numbered lines with a spoken duration in brackets, e.g.: 1. [4.6s] Hello there\n\n"

        f"OUTPUT FORMAT:\n"
        f"A JSON array of {target_language} strings — one per input line, in order. "
        f"No numbering, no brackets, no extra text.\n"
        f'Shape: ["<first line translated into {target_language}>", "<second line translated into {target_language}>"]\n\n'

        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        f"SCORING RUBRIC — evaluate every line against these before outputting:\n"
        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        f"[1] NATURALNESS — weight: HIGH\n"
        f" Would a native speaker actually say this in real life?\n"
        f" ✗ Fail: dictionary phrasing, formal register, textbook grammar\n"
        f" ✓ Pass: contractions, colloquial rhythm, everyday vocabulary\n"
        f" Ask yourself: 'Would I hear this in a TV show or on the street?' If no → rewrite.\n\n"

        f"[2] SPOKEN FIT — weight: CRITICAL\n"
        f" The line will be read by TTS within the duration shown in brackets.\n"
        f" Fewer words is almost always safer. Aim for 70–80% of the original word count.\n"
        f" ✗ Fail: translation is longer or same length as the English\n"
        f" ✓ Pass: shorter, with no loss of core meaning or emotional tone\n"
        f" Trick: cut filler, merge ideas, use contractions and short-form spoken words.\n\n"

        f"[3] TTS READABILITY — weight: HIGH\n"
        f" Long sentences with multiple commas trip up TTS engines.\n"
        f" ✗ Fail: 'She met him, her true love, on a rainy evening, in the city she once fled.'\n"
        f" ✓ Pass: 'She met him on a rainy evening. Her true love. In the city she once fled.'\n"
        f" Short beats. Natural pauses. Each sentence punches clean.\n\n"

        f"[4] EMOTIONAL REGISTER — weight: HIGH\n"
        f" Match the tone of the original: casual, urgent, tender, funny, sarcastic — whatever it is.\n"
        f" ✗ Fail: a sarcastic line becomes polite; a tender moment becomes clinical\n"
        f" ✓ Pass: the emotional texture is preserved even if the words are completely different\n\n"

        f"[5] TRANSLATION PURITY — weight: MEDIUM\n"
        f" Every word in the output must be {target_language}. No words from the original "
        f"language should leak through.\n"
        f" This includes: filler words (Oh, Hmm, Well, So, Right when not native to "
        f"{target_language}), names used as exclamations, brand-style interjections. "
        f"Find the {target_language} equivalent every time.\n\n"

        f"[6] WORD-FOR-WORD TRAP — weight: HIGH (avoid this)\n"
        f" Do NOT translate word by word. No one speaks that way.\n"
        f" ✗ Fail: a literal one-to-one rendering that preserves the source word order\n"
        f" ✓ Pass: a restructured line that reads naturally in {target_language} "
        f"while keeping the same meaning\n"
        f" Restructure freely. {target_language} has its own natural word order — use it.\n\n"

        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        f"BEFORE RETURNING OUTPUT:\n"
        f"For each line, silently run this checklist:\n"
        f" □ Would a native speaker say this naturally out loud?\n"
        f" □ Is it shorter than the English original?\n"
        f" □ Are there any commas that create awkward TTS pauses? → break into short sentences\n"
        f" □ Does the emotional tone match?\n"
        f" □ Are there any English words hiding in the output?\n"
        f"If any box fails → rewrite that line. Then output.\n"
        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        f"Return ONLY the JSON array. No preamble, no explanation, no duration prefixes."
    )

    # Let language-specific handler override the prompt if needed
    system_prompt = get_translation_prompt(target_language, default_prompt)

    # The item-count rule is appended to whichever prompt was selected.
    expected = len(segments)
    strict_prompt = (
        system_prompt
        + f"\n\nCRITICAL: You MUST return exactly {expected} items in the JSON array "
        f"— one per input line. Do NOT merge, skip, or split any lines."
    )

    client = build_client()
    max_retries = 2
    try:
        # Retries only cover a wrong item count; any exception raised inside
        # the loop goes straight to the fallback below.
        for attempt in range(1, max_retries + 1):
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": strict_prompt},
                    {"role": "user", "content": numbered},
                ],
                temperature=0.2,
            )

            raw = response.choices[0].message.content.strip()
            log_llm_call(
                step="s3_translate", provider="pollinations", model=MODEL,
                system_prompt=strict_prompt, user_prompt=numbered,
                response=raw, temperature=0.2,
            )
            # presumably returns a list of strings — confirm against parse_json_array
            translated_list = parse_json_array(raw)

            if len(translated_list) == expected:
                break

            print(f"[s3] Pollinations returned {len(translated_list)}/{expected} items (attempt {attempt}/{max_retries})")
            if attempt == max_retries:
                raise ValueError(
                    f"Translation returned {len(translated_list)} items but expected {expected} after {max_retries} attempts"
                )

        # Strip any leading "1." / "2)" / "3-" numbering the model echoed back.
        # NOTE(review): this also strips a legitimate leading number such as
        # "2.5 million" or "3-4 days" — confirm whether that is acceptable.
        cleaned = [re.sub(r'^\d+[\.\)\-]\s*', '', t) for t in translated_list]

        result = []
        for seg, translated_text in zip(segments, cleaned):
            result.append({**seg, "translated_text": translated_text})

        print(f"[s3] Translating via Pollinations complete ✓")
        return result

    except Exception as e:
        print(f"[s3] Pollinations translation error ({e}) — using fallback.")

        # Language-specific fallback routing
        if get_fallback_mode(target_language) == "bedrock":
            return bedrock_fallback(segments, numbered, system_prompt)

        # Default: Google Translate
        from deep_translator import GoogleTranslator
        try:
            translator = GoogleTranslator(source='auto', target=target_language.lower())
        except Exception as e2:
            print(f"[s3] Fallback failed to init translator ({e2})")
            raise

        # One request per segment; an error on any segment propagates to the caller.
        result = []
        for seg in segments:
            translated_text = translator.translate(seg["text"])
            result.append({**seg, "translated_text": translated_text})

        print(f"[s3] Translation via fallback complete ✓")
        return result
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def translate(segments: list[dict], target_language: str) -> list[dict]:
    """
    Translate every segment's text into target_language, working in batches.

    Args:
        segments: List of {start, end, text} dicts.
        target_language: Full language name, e.g. "Spanish", "French", "Hindi".

    Returns:
        The segments with 'translated_text' added to each, after the
        language-specific post-processing hook has run (which may add further
        fields such as 'tts_text'). An empty input is returned unchanged.
    """
    if not segments:
        return segments

    print(f"[s3] Translating {len(segments)} segments → {target_language} (in batches)...")

    BATCH_SIZE = 15
    total = len(segments)
    announce_batches = total > BATCH_SIZE
    translated = []

    for batch_no, offset in enumerate(range(0, total, BATCH_SIZE), start=1):
        chunk = segments[offset:offset + BATCH_SIZE]
        if announce_batches:
            print(f"[s3] Processing batch {batch_no} ({len(chunk)} items)...")
        translated.extend(_translate_batch(chunk, target_language))

    # Run language-specific post-processing (e.g., Urdu transliteration)
    return post_translate(translated, target_language)
|