github-actions[bot] committed on
Commit
0422215
·
1 Parent(s): 10b6cf0

deploy: switch to dramabox requirements @ a95fda4

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.example +35 -0
  2. .gitattributes +14 -35
  3. .gitignore +31 -0
  4. CLAUDE.md +13 -0
  5. LICENSE +21 -0
  6. README.md +330 -8
  7. app.py +80 -0
  8. graphify-out/.graphify_python +1 -0
  9. graphify-out/.graphify_root +1 -0
  10. graphify-out/GRAPH_REPORT.md +465 -0
  11. graphify-out/graph.html +0 -0
  12. packages.txt +4 -0
  13. pipeline.py +363 -0
  14. pyproject.toml +59 -0
  15. requirements-cbox.txt +51 -0
  16. requirements-omni.txt +157 -0
  17. requirements-qwen3.txt +55 -0
  18. requirements.txt +62 -0
  19. scripts/prefetch_models.py +47 -0
  20. server.py +929 -0
  21. social_distributor/.env.example +16 -0
  22. social_distributor/.gitignore +8 -0
  23. social_distributor/README.md +205 -0
  24. social_distributor/post.py +311 -0
  25. social_distributor/poster/__init__.py +0 -0
  26. social_distributor/poster/auth/__init__.py +0 -0
  27. social_distributor/poster/auth/session.py +111 -0
  28. social_distributor/poster/caption_gen.py +164 -0
  29. social_distributor/poster/config.py +88 -0
  30. social_distributor/poster/creator_extract.py +149 -0
  31. social_distributor/poster/models.py +29 -0
  32. social_distributor/poster/platforms/__init__.py +0 -0
  33. social_distributor/poster/platforms/base.py +57 -0
  34. social_distributor/poster/platforms/instagram.py +206 -0
  35. social_distributor/poster/platforms/tiktok.py +155 -0
  36. social_distributor/poster/platforms/youtube.py +165 -0
  37. social_distributor/poster/post_log.py +45 -0
  38. social_distributor/poster/video_loader.py +101 -0
  39. social_distributor/pyproject.toml +20 -0
  40. social_distributor/uv.lock +0 -0
  41. steps/__init__.py +1 -0
  42. steps/lang/__init__.py +38 -0
  43. steps/lang/_shared.py +150 -0
  44. steps/lang/omnivoice_languages.py +652 -0
  45. steps/lang/qwen3_languages.py +15 -0
  46. steps/lang/urdu.py +324 -0
  47. steps/s1_extract_audio.py +68 -0
  48. steps/s1b_separate.py +152 -0
  49. steps/s2_transcribe.py +395 -0
  50. steps/s3_translate.py +195 -0
.env.example ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VideoVoice — Environment Variables
2
+ # Copy this to .env and fill in values
3
+
4
+ # Server port (default 8000)
5
+ PORT=8000
6
+
7
+ # Where per-job artifact folders get written. On HF Spaces this is resolved
8
+ # automatically (/data/jobs with persistent storage, /tmp/videovoice_jobs
9
+ # without). For local dev, set this to ./data so jobs land next to the repo
10
+ # — same layout the old `main` used.
11
+ ARTIFACTS_ROOT=./data
12
+
13
+ # OpenAI API key (for translation step)
14
+ OPENAI_API_KEY=sk-...
15
+
16
+ # Pollinations API key (optional, for Whisper transcription fallback)
17
+ POLLINATIONS_API_KEY=
18
+ POLLEN_TRANSCRIBE_MODEL=whisper-large-v3
19
+ POLLEN_MODEL=gemini-search
20
+
21
+ # Stripe (optional, for paid tiers)
22
+ STRIPE_PUBLISHABLE_KEY=
23
+ STRIPE_SECRET_KEY=
24
+
25
+ # AWS S3 (optional, for cloud storage)
26
+ AWS_ACCESS_KEY_ID=
27
+ AWS_SECRET_ACCESS_KEY=
28
+ AWS_S3_BUCKET=
29
+ AWS_REGION=us-east-1
30
+
31
+ # AWS Bedrock (optional, fallback translator for Urdu)
32
+ AWS_BEDROCK_API_KEY=
33
+ BEDROCK_MODEL=qwen.qwen3-next-80b-a3b
34
+
35
+ HF_TOKEN=
.gitattributes CHANGED
@@ -1,35 +1,14 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Files in this repo that are dev-only and must NOT ship to the HF Spaces.
2
+ # `deploy.sh` honors this via `git archive --worktree-attributes`.
3
+ # Rule of thumb: if HF Spaces would never import/execute it, export-ignore it.
4
+ # Do NOT export-ignore server.py — app.py imports from it at runtime on HF.
5
+
6
+ .github/ export-ignore
7
+ SPLIT_STRATEGY.md export-ignore
8
+ deploy.sh export-ignore
9
+ Dockerfile export-ignore
10
+ .dockerignore export-ignore
11
+ social_media_distributor/ export-ignore
12
+ frontend/ export-ignore
13
+ batch_translate.py export-ignore
14
+ client_insta_links.jsonl export-ignore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.wav
2
+ *.mp4
3
+ *.mov
4
+ *.webp
5
+ *.ass
6
+ *.txt
7
+ !requirements.txt
8
+ !requirements-cbox.txt
9
+ !requirements-omni.txt
10
+ !requirements-qwen3.txt
11
+ !packages.txt
12
+ !SPLIT_STRATEGY.md
13
+ *.DS_Store
14
+ .env
15
+ .venv/
16
+ __pycache__/
17
+ **/__pycache__/
18
+ *.py[cod]
19
+ *$py.class
20
+ *.json
21
+ !data/showcase.json
22
+ tmp/
23
+ uploads/
24
+ outputs/
25
+ data/
26
+ batch_outputs/
27
+ # Subproject runtime artifacts (not for HF Space)
28
+ social_distributor/.venv/
29
+ social_distributor/poster/auth/storage/
30
+ social_distributor/debug_*.png
31
+ fine_tuning/
CLAUDE.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Deployment
2
+
3
+ HF Spaces deployment is fully automated via `.github/workflows/deploy-hf.yml`. Pushing to `origin/main` triggers the workflow which runs `./deploy.sh --force` and pushes to all three Spaces (Chatterbox, OmniVoice, Qwen3). Do not run `./deploy.sh` locally after a push — it is redundant. To verify a deploy, use `gh run list --workflow=deploy-hf.yml`.
4
+
5
+ ## graphify
6
+
7
+ This project has a graphify knowledge graph at graphify-out/.
8
+
9
+ Rules:
10
+ - Before answering architecture or codebase questions, read graphify-out/GRAPH_REPORT.md for god nodes and community structure
11
+ - If graphify-out/wiki/index.md exists, navigate it instead of reading raw files
12
+ - For cross-module "how does X relate to Y" questions, prefer `graphify query "<question>"`, `graphify path "<A>" "<B>"`, or `graphify explain "<concept>"` over grep — these traverse the graph's EXTRACTED + INFERRED edges instead of scanning files
13
+ - After modifying code files in this session, run `graphify update .` to keep the graph current (AST-only, no API cost)
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Raafi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,335 @@
1
  ---
2
- title: Videovoice Dramabox
3
- emoji: 👀
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.12'
9
  app_file: app.py
10
- pinned: false
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: VideoVoice API
 
 
 
3
  sdk: gradio
4
+ sdk_version: 6.12.0
 
5
  app_file: app.py
6
+ python_version: "3.10"
7
  ---
8
 
9
+ <!--
10
+ ZeroGPU is enabled from the Space Settings UI (not via frontmatter).
11
+ PRO account required. `app.py` mounts the FastAPI pipeline onto Gradio
12
+ so the React client keeps calling `/api/*` over CORS unchanged.
13
+ -->
14
+
15
+
16
+ # VideoVoice
17
+
18
+ **AI-powered short video translation with zero-shot voice cloning.**
19
+
20
+ Translate any short video (≤60s) into 23+ languages while preserving the original speaker's voice. Paste an Instagram Reel, YouTube Short, or upload any video file.
21
+
22
+ ---
23
+
24
+ ## How It Works
25
+
26
+ 1. **Upload or Paste URL** — Drop a video file or paste a social media link
27
+ 2. **AI Translates & Clones** — Our 6-step pipeline transcribes, translates, and synthesizes new speech using a voice clone of the original speaker
28
+ 3. **Preview & Download** — Watch your translated video and download in full quality
29
+
30
+ ### Pipeline Architecture
31
+
32
+ ```
33
+ Video → Extract Audio → Whisper Transcription → LLM Translation
34
+ → Chatterbox Voice Clone + TTS → Time-Sync → Final Merge
35
+ ```
36
+
37
+ | Step | Component | Description |
38
+ |------|-----------|-------------|
39
+ | 1 | FFmpeg | Extract audio track from video |
40
+ | 2 | Whisper Large V3 | Transcribe with word-level timestamps |
41
+ | 3 | GPT-4o-mini | Context-aware subtitle translation |
42
+ | 4 | Chatterbox Multilingual | Zero-shot voice cloning + TTS synthesis |
43
+ | 5 | Dynamic Time-Stretch | Align translated audio to original timing |
44
+ | 6 | FFmpeg | Merge new audio track back into video |
45
+
46
+ ---
47
+
48
+ ## Running Locally
49
+
50
+ ### Prerequisites
51
+
52
+ - Python 3.10+ (`requires-python = ">=3.10,<3.13"`)
53
+ - FFmpeg (`brew install ffmpeg` on macOS, `sudo apt install ffmpeg` on Ubuntu)
54
+ - An OpenAI API key
55
+
56
+ ### First-time setup
57
+
58
+ ```bash
59
+ # 1. Install uv (skip if you already have it)
60
+ curl -LsSf https://astral.sh/uv/install.sh | sh
61
+
62
+ # 2. Clone and enter the repo
63
+ git clone https://github.com/Video-Voice/VideoVoice-be.git
64
+ cd VideoVoice-be
65
+
66
+ # 3. Install deps with the chatterbox TTS engine (default for local dev)
67
+ # Use `--extra omnivoice` instead if you want OmniVoice. The two extras
68
+ # are mutually exclusive — pick one.
69
+ uv sync --extra chatterbox
70
+
71
+ # 4. Configure env vars
72
+ cp .env.example .env
73
+ # Edit .env — at minimum set OPENAI_API_KEY and ARTIFACTS_ROOT=./data
74
+ ```
75
+
76
+ ### One-time: hide the vendored chatterbox folder
77
+
78
+ The repo ships a vendored `./chatterbox/` folder that the HF Chatterbox Space needs (it has ZeroGPU-specific tweaks). Locally we want Python to import the PyPI `chatterbox-tts` package instead, so tell git to ignore the working-tree state for that folder and delete it locally:
79
+
80
+ ```bash
81
+ git ls-files chatterbox/ | xargs git update-index --skip-worktree
82
+ rm -rf chatterbox/
83
+ ```
84
+
85
+ HEAD still contains the folder, so HF deploys are unaffected. To reverse, run `git ls-files chatterbox/ | xargs git update-index --no-skip-worktree`, then `git checkout HEAD -- chatterbox/`.
86
+
87
+ ### Run the server
88
+
89
+ ```bash
90
+ uv run python server.py
91
+ ```
92
+
93
+ Open [http://localhost:8000](http://localhost:8000). `/api/*` are the backend routes; `/` serves the legacy static UI in `frontend/`. If the port is in use, set `PORT=8001`.
94
+
95
+ Per-job artifacts land in `$ARTIFACTS_ROOT/<job_id>/`. With `ARTIFACTS_ROOT=./data` (in `.env`) that's `./data/<job_id>/` next to the repo — same layout the repo has always used.
96
+
97
+ ### Run the pipeline headlessly
98
+
99
+ ```bash
100
+ uv run python pipeline.py --input data/my_video.mp4 --target-lang Spanish
101
+ ```
102
+
103
+ ---
104
+ ## API Reference
105
+
106
+ The following endpoints are available on the backend (FastAPI/Gradio Server). When running on Hugging Face, replace `localhost:8000` with your Space's API URL (e.g., `https://rafii-videovoice.hf.space`).
107
+
108
+ ### Core Endpoints
109
+
110
+ #### `POST /api/jobs`
111
+ Submit a video for translation. You can provide either a local file or a URL.
112
+
113
+ **Form Data:**
114
+ - `file`: (Optional) Video file upload (MP4, MOV, WebM, ≤90MB).
115
+ - `url`: (Optional) Social media URL (Instagram, YouTube, TikTok).
116
+ - `target_language`: (Required) Name of target language (e.g., "Spanish", "Hindi").
117
+ - `source_language`: (Optional) ISO code of source (default: "en").
118
+ - `voice_mode`: (Optional) `chatterbox` or `omnivoice` (must match Space engine).
119
+ - `captions`: (Optional) "true" or "false" (default: "true").
120
+ - `preserve_music`: (Optional) "true" or "false" (default: "false").
121
+
122
+ **Example:**
123
+ ```bash
124
+ curl -X POST http://localhost:8000/api/jobs \
125
+ -F "file=@my_video.mp4" \
126
+ -F "target_language=French"
127
+ ```
128
+
129
+ #### `GET /api/jobs/{job_id}`
130
+ Poll for the real-time status and progress messages of a specific job.
131
+
132
+ **Query Parameters:**
133
+ - `after`: (Optional) Index of the last message received to fetch only new ones.
134
+
135
+ **Example:**
136
+ ```bash
137
+ curl "http://localhost:8000/api/jobs/abc123_1?after=5"
138
+ ```
139
+
140
+ #### `GET /api/jobs/{job_id}/result`
141
+ Download the final translated video file.
142
+
143
+ **Example:**
144
+ ```bash
145
+ curl -O -L http://localhost:8000/api/jobs/abc123_1/result
146
+ ```
147
+
148
+ ---
149
+
150
+ ### Utility & Configuration
151
+
152
+ #### `GET /api/config`
153
+ Fetch server configuration, including supported languages, max file size, and the active TTS engine.
154
+
155
+ #### `GET /api/health`
156
+ Check if the server is alive and see GPU availability/queue depth.
157
+
158
+ #### `GET /api/showcase`
159
+ Retrieve curated "before & after" demo entries defined in `data/showcase.json`.
160
+
161
+ #### `GET /api/demo-videos`
162
+ List all whitelisted demo videos available for streaming from the `outputs/` and `data/` folders.
163
+
164
+ #### `GET /api/demo-videos/{video_id}/stream`
165
+ Stream a specific demo video by its opaque ID.
166
+
167
+ ---
168
+
169
+ ### Interactive / Preview Endpoints
170
+
171
+ #### `GET /api/jobs/{job_id}/preview/{model_name}`
172
+ Retrieve a short audio snippet of the cloned voice for a specific TTS model before proceeding with full synthesis.
173
+
174
+ #### `POST /api/jobs/{job_id}/select-model`
175
+ Confirm which TTS model to use after listening to previews (used in multi-model workflows).
176
+
177
+ ---
178
+
179
+ ### ZeroGPU / Gradio Internal API
180
+
181
+ #### `POST /run_pipeline` (Gradio API)
182
+ Internal endpoint used by ZeroGPU to trigger the heavy ML processing logic. Recommended for use via `gradio_client`.
183
+
184
+ **Example (Python):**
185
+ ```python
186
+ from gradio_client import Client
187
+ client = Client("Rafii/videovoice")
188
+ client.predict(job_id="abc123_1", api_name="/run_pipeline")
189
+ ```
190
+
191
+ ---
192
+
193
+
194
+ ## Testing the API (Hugging Face Spaces)
195
+
196
+ When running on Hugging Face Spaces (using `app.py`), you can test the API using standard HTTP tools or the Gradio Client. Choose the Space corresponding to the desired TTS engine:
197
+
198
+ | TTS Engine | Space URL | API Endpoint |
199
+ |------------|-----------|--------------|
200
+ | **Chatterbox** | `Rafii/videovoice` | `https://rafii-videovoice.hf.space` |
201
+ | **OmniVoice** | `Rafii/videovoice-omni` | `https://rafii-videovoice-omni.hf.space` |
202
+
203
+ ### 1. Using `curl` (FastAPI Routes)
204
+
205
+ You can check the health of the API and verify that it's running:
206
+
207
+ ```bash
208
+ # Chatterbox Space
209
+ curl https://rafii-videovoice.hf.space/api/health
210
+
211
+ # OmniVoice Space
212
+ curl https://rafii-videovoice-omni.hf.space/api/health
213
+ ```
214
+
215
+ To submit a job via the standard API:
216
+
217
+ ```bash
218
+ curl -X POST https://rafii-videovoice.hf.space/api/jobs \
219
+ -F "url=https://www.instagram.com/reels/XYZ/" \
220
+ -F "target_language=Spanish"
221
+ ```
222
+
223
+ ### 2. Using `gradio_client` (Gradio API Routes)
224
+
225
+ The `gradio.Server` endpoints are optimized for ZeroGPU and can be accessed using the Python `gradio_client`:
226
+
227
+ ```python
228
+ from gradio_client import Client
229
+
230
+ # Change to "Rafii/videovoice-omni" for OmniVoice
231
+ client = Client("Rafii/videovoice")
232
+ result = client.predict(
233
+ job_id="abc123",
234
+ api_name="/run_pipeline"
235
+ )
236
+ print(result)
237
+ ```
238
+
239
+ ### 3. Using JavaScript (Frontend)
240
+
241
+ The new `gradio.Server` mode is designed for custom frontends. You can use the `@gradio/client` JS library:
242
+
243
+ ```javascript
244
+ import { Client } from "@gradio/client";
245
+
246
+ // Connect to the specific Space
247
+ const client = await Client.connect("Rafii/videovoice");
248
+ const result = await client.predict("/run_pipeline", {
249
+ job_id: "abc123",
250
+ });
251
+ ```
252
+
253
+ ---
254
+
255
+ ## Supported Languages
256
+
257
+ Spanish, French, German, Hindi, Portuguese, Italian, Japanese, Chinese, Arabic, Korean — and more.
258
+
259
+ ---
260
+
261
+ ## Project Structure
262
+
263
+ ```
264
+ VideoVoice/
265
+ ├── server.py # FastAPI backend
266
+ ├── pipeline.py # Core translation pipeline
267
+ ├── steps/ # Pipeline step modules
268
+ │ ├── s1_extract_audio.py
269
+ │ ├── s2_transcribe.py
270
+ │ ├── s3_translate.py
271
+ │ ├── s4_tts.py
272
+ │ ├── s5_sync.py
273
+ │ └── s6_merge.py
274
+ ├── frontend/ # Static web UI
275
+ │ ├── index.html
276
+ │ ├── style.css
277
+ │ └── app.js
278
+ ├── pyproject.toml # Dependencies & project config
279
+ ├── uv.lock # Lockfile (reproducible installs)
280
+ ├── .env.example
281
+ └── README.md
282
+ ```
283
+
284
+ ---
285
+
286
+ ## Entrypoints
287
+
288
+ Two entrypoint files exist intentionally. They run in different contexts but **ship the same code**:
289
+
290
+ | File | When it runs | What it does |
291
+ |------|-------------|--------------|
292
+ | `server.py` | Local dev (`uv run python server.py`) | Plain FastAPI app — defines every `/api/*` route. |
293
+ | `app.py` | Hugging Face Spaces | Gradio Server that imports `server.py`'s router and wraps it with `@spaces.GPU` for ZeroGPU. |
294
+
295
+ `app.py` depends on `server.py`, so `server.py` must ship to HF. Do not strip it.
296
+
297
+ ## Deployment
298
+
299
+ ### Hugging Face Spaces (production)
300
+
301
+ Push to `main` → GitHub Actions runs `.github/workflows/deploy-hf.yml` → both Spaces (`Rafii/videovoice` and `Rafii/videovoice-omni`) redeploy automatically. No manual step.
302
+
303
+ One-time CI setup:
304
+ 1. Create an HF access token with write access to both Spaces: https://huggingface.co/settings/tokens
305
+ 2. Add it as `HF_TOKEN` under **Settings → Secrets and variables → Actions** in the GitHub repo.
306
+
307
+ Manual fallback (from a local clean checkout with `space` and `space-omni` remotes configured):
308
+ ```bash
309
+ ./deploy.sh # skips if remote is already at HEAD
310
+ ./deploy.sh --force # always redeploy
311
+ ```
312
+
313
+ Files filtered out of every Space deploy are listed in `.gitattributes` (`export-ignore`).
314
+
315
+ ### Branching
316
+
317
+ `main` is canonical. Use short-lived `feat/<thing>` branches, open a PR, merge, delete. Never maintain a parallel deploy branch — every change on main reaches both Spaces via CI.
318
+
319
+ ### AWS (alternative GPU host)
320
+
321
+ ```bash
322
+ # On a g4dn.xlarge instance
323
+ sudo apt update && sudo apt install -y ffmpeg
324
+ curl -LsSf https://astral.sh/uv/install.sh | sh
325
+ uv sync
326
+ uv run python server.py
327
+ ```
328
+
329
+ Recommended: use a `systemd` service for auto-restart, CloudFront as the CDN, and S3 for video storage with a 24h auto-delete lifecycle policy.
330
+
331
+ ---
332
+
333
+ ## License
334
+
335
+ MIT License — see [LICENSE](LICENSE).
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZeroGPU-compatible entrypoint using gradio.Server.
3
+ Server extends FastAPI, so all your existing API routes work unchanged.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import os
8
+
9
+ # 1. Lightweight imports only at top level
10
+ import spaces
11
+ import gradio as gr
12
+ from gradio import Server
13
+ from gradio.data_classes import FileData
14
+ from fastapi import Request
15
+ from slowapi.errors import RateLimitExceeded
16
+ from slowapi import _rate_limit_exceeded_handler
17
+
18
+ TTS_ENGINE = os.getenv("TTS_ENGINE", "chatterbox").lower()
19
+
20
+ # 2. Create Server instead of FastAPI
21
+ # Name it 'demo' so HF Space picks it up automatically
22
+ demo = Server()
23
+
24
+ # -----------------------------------------------------
25
+ # INTEGRATE SERVER.PY ROUTES
26
+ # -----------------------------------------------------
27
+ from server import router, limiter, enforce_content_length_limit
28
+ from tools_api import router as tools_router
29
+
30
+ demo.include_router(router)
31
+ demo.include_router(tools_router)
32
+ demo.state.limiter = limiter
33
+ demo.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
34
+
35
+ # Apply content length middleware to the main app
36
+ @demo.middleware("http")
37
+ async def content_length_middleware(request: Request, call_next):
38
+ return await enforce_content_length_limit(request, call_next)
39
+
40
+ @demo.get("/api/health")
41
+ def health():
42
+ return {"status": "ok", "tts": TTS_ENGINE}
43
+
44
+ # -----------------------------------------------------
45
+ # ZERO GPU FUNCTION — lazy-loads torch/CUDA
46
+ # -----------------------------------------------------
47
+ @spaces.GPU(duration=60)
48
+ def run_pipeline(job_id: str):
49
+ from pipeline import process_job
50
+ return process_job(job_id)
51
+
52
+ # -----------------------------------------------------
53
+ # GRADIO API INTEGRATION (this is what ZeroGPU detects)
54
+ # -----------------------------------------------------
55
+ @demo.api(name="run_pipeline")
56
+ def api_run_pipeline(job_id: str):
57
+ """
58
+ Exposed through Gradio's API engine.
59
+ ZeroGPU will allocate a GPU when this endpoint is called.
60
+ """
61
+ return run_pipeline(job_id)
62
+
63
+ # -----------------------------------------------------
64
+ # OPTIONAL: Gradio UI (if you still want a basic UI)
65
+ # -----------------------------------------------------
66
+ with gr.Blocks(title="VideoVoice API") as ui:
67
+ gr.Markdown(f"# VideoVoice API ({TTS_ENGINE.upper()})")
68
+ job_id_box = gr.Textbox(label="Job ID")
69
+ output_box = gr.Textbox(label="Result")
70
+ btn = gr.Button("Run Pipeline")
71
+ btn.click(fn=run_pipeline, inputs=job_id_box, outputs=output_box)
72
+
73
+ # Mount the UI onto the Server instance
74
+ gr.mount_gradio_app(demo, ui, path="/ui")
75
+
76
+ # -----------------------------------------------------
77
+ # ENTRYPOINT
78
+ # -----------------------------------------------------
79
+ if __name__ == "__main__":
80
+ demo.launch(show_error=True)
graphify-out/.graphify_python ADDED
@@ -0,0 +1 @@
 
 
1
+ /Users/rafa/.local/share/uv/tools/graphifyy/bin/python
graphify-out/.graphify_root ADDED
@@ -0,0 +1 @@
 
 
1
+ /Users/rafa/MscAi/VideoVoice-be
graphify-out/GRAPH_REPORT.md ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Graph Report - VideoVoice-be (2026-05-17)
2
+
3
+ ## Corpus Check
4
+ - 60 files · ~254,726 words
5
+ - Verdict: corpus is large enough that graph structure adds value.
6
+
7
+ ## Summary
8
+ - 1065 nodes · 1859 edges · 64 communities detected
9
+ - Extraction: 79% EXTRACTED · 21% INFERRED · 0% AMBIGUOUS · INFERRED: 397 edges (avg confidence: 0.62)
10
+ - Token cost: 0 input · 0 output
11
+
12
+ ## Community Hubs (Navigation)
13
+ - [[_COMMUNITY_Community 0|Community 0]]
14
+ - [[_COMMUNITY_Community 1|Community 1]]
15
+ - [[_COMMUNITY_Community 2|Community 2]]
16
+ - [[_COMMUNITY_Community 3|Community 3]]
17
+ - [[_COMMUNITY_Community 4|Community 4]]
18
+ - [[_COMMUNITY_Community 5|Community 5]]
19
+ - [[_COMMUNITY_Community 6|Community 6]]
20
+ - [[_COMMUNITY_Community 7|Community 7]]
21
+ - [[_COMMUNITY_Community 8|Community 8]]
22
+ - [[_COMMUNITY_Community 9|Community 9]]
23
+ - [[_COMMUNITY_Community 10|Community 10]]
24
+ - [[_COMMUNITY_Community 11|Community 11]]
25
+ - [[_COMMUNITY_Community 12|Community 12]]
26
+ - [[_COMMUNITY_Community 13|Community 13]]
27
+ - [[_COMMUNITY_Community 14|Community 14]]
28
+ - [[_COMMUNITY_Community 15|Community 15]]
29
+ - [[_COMMUNITY_Community 16|Community 16]]
30
+ - [[_COMMUNITY_Community 17|Community 17]]
31
+ - [[_COMMUNITY_Community 18|Community 18]]
32
+ - [[_COMMUNITY_Community 19|Community 19]]
33
+ - [[_COMMUNITY_Community 20|Community 20]]
34
+ - [[_COMMUNITY_Community 21|Community 21]]
35
+ - [[_COMMUNITY_Community 22|Community 22]]
36
+ - [[_COMMUNITY_Community 23|Community 23]]
37
+ - [[_COMMUNITY_Community 25|Community 25]]
38
+ - [[_COMMUNITY_Community 33|Community 33]]
39
+ - [[_COMMUNITY_Community 34|Community 34]]
40
+ - [[_COMMUNITY_Community 35|Community 35]]
41
+ - [[_COMMUNITY_Community 36|Community 36]]
42
+ - [[_COMMUNITY_Community 37|Community 37]]
43
+ - [[_COMMUNITY_Community 38|Community 38]]
44
+ - [[_COMMUNITY_Community 39|Community 39]]
45
+ - [[_COMMUNITY_Community 40|Community 40]]
46
+ - [[_COMMUNITY_Community 41|Community 41]]
47
+ - [[_COMMUNITY_Community 42|Community 42]]
48
+ - [[_COMMUNITY_Community 43|Community 43]]
49
+ - [[_COMMUNITY_Community 44|Community 44]]
50
+ - [[_COMMUNITY_Community 45|Community 45]]
51
+ - [[_COMMUNITY_Community 46|Community 46]]
52
+ - [[_COMMUNITY_Community 47|Community 47]]
53
+ - [[_COMMUNITY_Community 48|Community 48]]
54
+ - [[_COMMUNITY_Community 49|Community 49]]
55
+ - [[_COMMUNITY_Community 50|Community 50]]
56
+ - [[_COMMUNITY_Community 51|Community 51]]
57
+ - [[_COMMUNITY_Community 52|Community 52]]
58
+ - [[_COMMUNITY_Community 53|Community 53]]
59
+ - [[_COMMUNITY_Community 54|Community 54]]
60
+ - [[_COMMUNITY_Community 55|Community 55]]
61
+ - [[_COMMUNITY_Community 56|Community 56]]
62
+ - [[_COMMUNITY_Community 57|Community 57]]
63
+ - [[_COMMUNITY_Community 58|Community 58]]
64
+ - [[_COMMUNITY_Community 59|Community 59]]
65
+ - [[_COMMUNITY_Community 60|Community 60]]
66
+ - [[_COMMUNITY_Community 61|Community 61]]
67
+ - [[_COMMUNITY_Community 62|Community 62]]
68
+ - [[_COMMUNITY_Community 63|Community 63]]
69
+ - [[_COMMUNITY_Community 64|Community 64]]
70
+ - [[_COMMUNITY_Community 65|Community 65]]
71
+ - [[_COMMUNITY_Community 66|Community 66]]
72
+ - [[_COMMUNITY_Community 67|Community 67]]
73
+ - [[_COMMUNITY_Community 68|Community 68]]
74
+ - [[_COMMUNITY_Community 69|Community 69]]
75
+ - [[_COMMUNITY_Community 70|Community 70]]
76
+ - [[_COMMUNITY_Community 71|Community 71]]
77
+
78
+ ## God Nodes (most connected - your core abstractions)
79
+ 1. `Qwen3TTSSpeakerEncoderConfig` - 49 edges
80
+ 2. `Qwen3TTSTalkerCodePredictorConfig` - 49 edges
81
+ 3. `Qwen3TTSTalkerConfig` - 49 edges
82
+ 4. `Qwen3TTSConfig` - 48 edges
83
+ 5. `Qwen3TTSModel` - 21 edges
84
+ 6. `PostResult` - 19 edges
85
+ 7. `Qwen3TTSTalkerForConditionalGeneration` - 19 edges
86
+ 8. `Qwen3TTSTalkerCodePredictorModelForConditionalGeneration` - 17 edges
87
+ 9. `generate()` - 15 edges
88
+ 10. `BasePoster` - 14 edges
89
+
90
+ ## Surprising Connections (you probably didn't know these)
91
+ - `chatterbox-tts==0.1.7 --no-deps` --semantically_similar_to--> `omnivoice>=0.1.4` [INFERRED] [semantically similar]
92
+ requirements.txt → requirements-omni.txt
93
+ - `gradio==6.8.0` --semantically_similar_to--> `gradio==6.12.0 (omni)` [INFERRED] [semantically similar]
94
+ requirements.txt → requirements-omni.txt
95
+ - `enforce_content_length_limit()` --calls--> `content_length_middleware()` [INFERRED]
96
+ server.py → app.py
97
+ - `run_pipeline()` --calls--> `separate_audio()` [INFERRED]
98
+ pipeline.py → steps/s1b_separate.py
99
+ - `run_pipeline()` --calls--> `transcribe()` [INFERRED]
100
+ pipeline.py → steps/s2_transcribe.py
101
+
102
+ ## Hyperedges (group relationships)
103
+ - **Six-step translation pipeline** — [EXTRACTED 1.00]
104
+ - **TTS engine split (env, two reqs files, two spaces, conditional imports)** — [EXTRACTED 1.00]
105
+ - **Live pipeline run (s1b->s2->s3->s4->s5->s6)** — [EXTRACTED 1.00]
106
+
107
+ ## Communities
108
+
109
+ ### Community 0 - "Community 0"
110
+ Cohesion: 0.04
111
+ Nodes (70): Qwen3TTSConfig, Qwen3TTSSpeakerEncoderConfig, Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig, r""" This is the configuration class to store the configuration of a [`Qwen3, r""" This is the configuration class to store the configuration of a [`Qwen3, This is the configuration class to store the configuration of a [`Qwen3TTSForCon, r""" This is the configuration class to store the configuration of a [`Qwen3 (+62 more)
112
+
113
+ ### Community 1 - "Community 1"
114
+ Cohesion: 0.02
115
+ Nodes (118): api_run_pipeline(), content_length_middleware(), ZeroGPU-compatible entrypoint using gradio.Server. Server extends FastAPI, so al, Exposed through Gradio's API engine. ZeroGPU will allocate a GPU when this e, run_pipeline(), BaseHTTPMiddleware, BaseModel, _artifact_reaper_loop() (+110 more)
116
+
117
+ ### Community 2 - "Community 2"
118
+ Cohesion: 0.04
119
+ Nodes (38): default(), DistributedGroupResidualVectorQuantization, DistributedResidualVectorQuantization, ema_inplace(), EuclideanCodebook, kmeans(), laplace_smoothing(), postprocess_emb() (+30 more)
120
+
121
+ ### Community 3 - "Community 3"
122
+ Cohesion: 0.05
123
+ Nodes (57): ABC, BasePoster, Abstract base class for platform posters., Save a debug screenshot on failure., BasePoster, _build_system_prompt(), _build_user_prompt(), format_caption() (+49 more)
124
+
125
+ ### Community 4 - "Community 4"
126
+ Cohesion: 0.06
127
+ Nodes (31): _audio_to_tuple(), _build_choices_and_map(), build_demo(), build_parser(), _collect_gen_kwargs(), _detect_model_kind(), _dtype_from_str(), main() (+23 more)
128
+
129
+ ### Community 5 - "Community 5"
130
+ Cohesion: 0.06
131
+ Nodes (59): post(), _assign_words_to_segments(), _extract_words(), _get_faster_whisper_model(), _get_local_whisper_backend(), _get_openai_whisper_model(), _normalise_segments(), Step 3: Transcribe audio with timestamps. Primary local backend (device-depende (+51 more)
132
+
133
+ ### Community 6 - "Community 6"
134
+ Cohesion: 0.07
135
+ Nodes (50): forward(), generate(), generate_speaker_prompt(), from_pretrained(), _clip_audio(), _ensure_browser_wav(), _filter_preview_segments(), _free_memory() (+42 more)
136
+
137
+ ### Community 7 - "Community 7"
138
+ Cohesion: 0.05
139
+ Nodes (49): FFmpeg concat list (synced TTS), Try-Now app panel, app.js script ref, Comparison table (HeyGen, Rask, ElevenLabs, Synthesia), Hero section + 23+ languages, Frontend index.html, Source/target language selectors, Pricing tiers (Free/Starter/Creator) (+41 more)
140
+
141
+ ### Community 8 - "Community 8"
142
+ Cohesion: 0.07
143
+ Nodes (35): _collect_output(), _log_step_done(), main(), pipeline.py — Core pipeline: CLI entrypoint + importable run_pipeline() for Grad, Print duration + separator line for a completed step., Collect all yields and the return value from the generator., Run the full translation pipeline, yielding progress messages. Args:, run_pipeline() (+27 more)
144
+
145
+ ### Community 9 - "Community 9"
146
+ Cohesion: 0.09
147
+ Nodes (27): $(), clearFile(), createDemoCard(), detectPlatform(), formatBytes(), formatDemoDate(), formatDemoTitle(), getUsedVideos() (+19 more)
148
+
149
+ ### Community 10 - "Community 10"
150
+ Cohesion: 0.09
151
+ Nodes (34): Step 4: Translate segment texts using Pollinations chat completions API (OpenAI-, Translate a batch of segments into target_language., _translate_batch(), bedrock_converse(), bedrock_fallback(), build_client(), log_llm_call(), parse_json_array() (+26 more)
152
+
153
+ ### Community 11 - "Community 11"
154
+ Cohesion: 0.08
155
+ Nodes (32): _apply_demucs(), _get_model(), _load_and_normalise(), Step 1b: Separate vocals from accompaniment using Demucs (Python API). In-proce, Lazy-load htdemucs once per process. Module-level semantics; we load on firs, GPU-bound inference call. `mix` shape: [1, channels, time]., Load WAV, resample/remix to match model requirements, z-normalise., Separate vocals from accompaniment using Demucs htdemucs (Python API). Args (+24 more)
156
+
157
+ ### Community 12 - "Community 12"
158
+ Cohesion: 0.1
159
+ Nodes (28): tools_api — Standalone endpoints for creator quick tools. Lives alongside the m, audio_cleanup_endpoint(), dramabox_endpoint(), _ext_to_media_type(), APIRouter for /api/tools/* endpoints. Each endpoint is sync request-response (n, Serve a generated artifact. Run dirs auto-expire after RUN_TTL_SECONDS., Manual reap trigger (mostly for testing). Auto-reap runs on a timer., Serve a generated artifact. Run dirs auto-expire after RUN_TTL_SECONDS. (+20 more)
160
+
161
+ ### Community 13 - "Community 13"
162
+ Cohesion: 0.12
163
+ Nodes (27): build_for_job(), ensure_transcription(), extract_audio_hq(), extract_reference_audio(), get_audio_duration(), get_device(), load_chatterbox(), main() (+19 more)
164
+
165
+ ### Community 14 - "Community 14"
166
+ Cohesion: 0.12
167
+ Nodes (23): build_t3_cond(), main(), prepare_sample(), prepare_sample.py — Turn one dataset.jsonl row into the exact tensors T3.loss(), Build the speaker conditioning (frozen during training)., MTLTokenizer + SOT/EOT padding (mirrors what generate() does internally)., S3Tokenizer on the target dubbed audio → speech tokens (the LABEL). Critica, Turn one dataset row into ready-to-train tensors. (+15 more)
168
+
169
+ ### Community 15 - "Community 15"
170
+ Cohesion: 0.13
171
+ Nodes (26): _compress_silences(), _detect_pauses(), _distribute_padding(), _find_tts_silences(), _generate_silence(), _get_wav_duration(), _pad_silence(), _pause_aware_sync() (+18 more)
172
+
173
+ ### Community 16 - "Community 16"
174
+ Cohesion: 0.19
175
+ Nodes (18): _burn_in(), _clamp(), _extract_audio(), _force_style_for(), _format_timestamp_srt(), _format_timestamp_vtt(), generate_subtitles(), _is_video() (+10 more)
176
+
177
+ ### Community 17 - "Community 17"
178
+ Cohesion: 0.22
179
+ Nodes (12): download_result(), _is_noise(), main(), Batch translate Instagram reels to English via the VideoVoice server API. Usage, Extract the Instagram reel shortcode from a URL, e.g. 'DWn_yPoDsYw'., Submit a single video URL and return the job_id., Return True if a log line is internal noise we don't want in the log., Poll job status until complete or error. Returns final messages and collected lo (+4 more)
180
+
181
+ ### Community 18 - "Community 18"
182
+ Cohesion: 0.23
183
+ Nodes (12): evaluate(), load_baseline(), load_with_lora(), main(), pick_held_out_samples(), print_summary(), eval.py — Evaluate the fine-tuned LoRA against the un-tuned baseline. Picks N s, Return overshoot samples (duration_diff > 0.2) — these are NOT in the asymme (+4 more)
184
+
185
+ ### Community 19 - "Community 19"
186
+ Cohesion: 0.24
187
+ Nodes (11): extract_creator(), _extract_instagram(), _extract_tiktok(), _extract_youtube(), _load_cache(), Extract original creator @username from video URLs., YouTube: visit video page, extract channel name from meta tags., Extract the @username of the original creator from the video URL. Uses Play (+3 more)
188
+
189
+ ### Community 20 - "Community 20"
190
+ Cohesion: 0.27
191
+ Nodes (9): get_fallback_mode(), _get_handler(), get_translation_prompt(), post_translate(), Language-specific handlers for the translation pipeline. Each language that nee, Return a language-specific translation prompt, or the default., Return 'bedrock' or 'google' depending on the language., Run any language-specific post-processing after translation. (+1 more)
192
+
193
+ ### Community 21 - "Community 21"
194
+ Cohesion: 0.38
195
+ Nodes (6): _ensure_server(), _generate_impl(), generate_scene(), Dramabox — Resemble AI directable speech engine. Single-Space tool: generates a, Lazy-import the Dramabox model + load checkpoints once. Raises a clean Runti, Run Dramabox on `prompt` and write the resulting WAV under `out_dir`. Retur
196
+
197
+ ### Community 22 - "Community 22"
198
+ Cohesion: 0.53
199
+ Nodes (5): main(), _prefetch_chatterbox(), _prefetch_demucs(), _prefetch_faster_whisper(), Prefetch model weights into HF_HOME for faster cold starts on Spaces.
200
+
201
+ ### Community 23 - "Community 23"
202
+ Cohesion: 0.33
203
+ Nodes (6): app.py validation, pipeline.py simplified, steps/s4_preview.py, steps/s4_tts.py conditional imports, server.py /api/config, TTS_ENGINE env var
204
+
205
+ ### Community 25 - "Community 25"
206
+ Cohesion: 1.0
207
+ Nodes (2): gradio==6.8.0, gradio==6.12.0 (omni)
208
+
209
+ ### Community 33 - "Community 33"
210
+ Cohesion: 1.0
211
+ Nodes (1): Load a Qwen3 TTS model and its processor in HuggingFace `from_pretrained` style.
212
+
213
+ ### Community 34 - "Community 34"
214
+ Cohesion: 1.0
215
+ Nodes (1): Build voice-clone prompt items from reference audio (and optionally reference te
216
+
217
+ ### Community 35 - "Community 35"
218
+ Cohesion: 1.0
219
+ Nodes (1): Voice clone speech using the Base model. You can provide either:
220
+
221
+ ### Community 36 - "Community 36"
222
+ Cohesion: 1.0
223
+ Nodes (1): Generate speech with the VoiceDesign model using natural-language style instruct
224
+
225
+ ### Community 37 - "Community 37"
226
+ Cohesion: 1.0
227
+ Nodes (1): Generate speech with the CustomVoice model using a predefined speaker id, option
228
+
229
+ ### Community 38 - "Community 38"
230
+ Cohesion: 1.0
231
+ Nodes (1): Delete stale per-job artifact directories from ARTIFACTS_ROOT.
232
+
233
+ ### Community 39 - "Community 39"
234
+ Cohesion: 1.0
235
+ Nodes (1): Reject oversized uploads before body parsing.
236
+
237
+ ### Community 40 - "Community 40"
238
+ Cohesion: 1.0
239
+ Nodes (1): Run the translation pipeline in a background thread, pushing progress to the job
240
+
241
+ ### Community 41 - "Community 41"
242
+ Cohesion: 1.0
243
+ Nodes (1): List whitelisted MP4 demo videos from outputs/ and data/.
244
+
245
+ ### Community 42 - "Community 42"
246
+ Cohesion: 1.0
247
+ Nodes (1): Return curated showcase entries with resolved streaming URLs.
248
+
249
+ ### Community 43 - "Community 43"
250
+ Cohesion: 1.0
251
+ Nodes (1): Submit a video for translation.
252
+
253
+ ### Community 44 - "Community 44"
254
+ Cohesion: 1.0
255
+ Nodes (1): Poll endpoint returning new messages since index `after`, plus live wait status.
256
+
257
+ ### Community 45 - "Community 45"
258
+ Cohesion: 1.0
259
+ Nodes (1): User selects a TTS model after previewing.
260
+
261
+ ### Community 46 - "Community 46"
262
+ Cohesion: 1.0
263
+ Nodes (1): Serve a preview audio WAV file.
264
+
265
+ ### Community 47 - "Community 47"
266
+ Cohesion: 1.0
267
+ Nodes (1): Download the translated video.
268
+
269
+ ### Community 48 - "Community 48"
270
+ Cohesion: 1.0
271
+ Nodes (1): Create artifact directories and start background cleanup.
272
+
273
+ ### Community 49 - "Community 49"
274
+ Cohesion: 1.0
275
+ Nodes (1): Sync TTS audio using pause-aware strategy: compress silences first, then atempo.
276
+
277
+ ### Community 50 - "Community 50"
278
+ Cohesion: 1.0
279
+ Nodes (1): Rewrite WAV with silence regions compressed to keep_ratio of their original dura
280
+
281
+ ### Community 51 - "Community 51"
282
+ Cohesion: 1.0
283
+ Nodes (1): Insert extra silence distributed across detected pause points.
284
+
285
+ ### Community 52 - "Community 52"
286
+ Cohesion: 1.0
287
+ Nodes (1): Generate a silent WAV file of given duration.
288
+
289
+ ### Community 53 - "Community 53"
290
+ Cohesion: 1.0
291
+ Nodes (1): Sync each TTS segment to its original timestamp window and stitch into a single
292
+
293
+ ### Community 54 - "Community 54"
294
+ Cohesion: 1.0
295
+ Nodes (1): Translate the text of each segment into target_language in batches. Args:
296
+
297
+ ### Community 55 - "Community 55"
298
+ Cohesion: 1.0
299
+ Nodes (1): Load + run Chatterbox inside a single GPU-decorated scope. ZeroGPU only int
300
+
301
+ ### Community 56 - "Community 56"
302
+ Cohesion: 1.0
303
+ Nodes (1): Remove trailing noise/artifacts after speech ends.
304
+
305
+ ### Community 57 - "Community 57"
306
+ Cohesion: 1.0
307
+ Nodes (1): Hard-trim TTS output to orig_dur * headroom, with a short fade-out.
308
+
309
+ ### Community 58 - "Community 58"
310
+ Cohesion: 1.0
311
+ Nodes (1): Clip audio to max_sec to prevent excessively slow voice cloning.
312
+
313
+ ### Community 59 - "Community 59"
314
+ Cohesion: 1.0
315
+ Nodes (1): Numpy variant of _trim_trailing_noise for engines returning np.ndarray.
316
+
317
+ ### Community 60 - "Community 60"
318
+ Cohesion: 1.0
319
+ Nodes (1): Perform full OmniVoice processing (load + generate batch) inside a GPU-decorated
320
+
321
+ ### Community 61 - "Community 61"
322
+ Cohesion: 1.0
323
+ Nodes (1): Generate speech for all segments using OmniVoice voice cloning.
324
+
325
+ ### Community 62 - "Community 62"
326
+ Cohesion: 1.0
327
+ Nodes (1): Synthesise translated text for each segment using voice cloned from reference au
328
+
329
+ ### Community 63 - "Community 63"
330
+ Cohesion: 1.0
331
+ Nodes (1): torch==2.6.0
332
+
333
+ ### Community 64 - "Community 64"
334
+ Cohesion: 1.0
335
+ Nodes (1): fastapi
336
+
337
+ ### Community 65 - "Community 65"
338
+ Cohesion: 1.0
339
+ Nodes (1): yt-dlp
340
+
341
+ ### Community 66 - "Community 66"
342
+ Cohesion: 1.0
343
+ Nodes (1): diffusers==0.29.0
344
+
345
+ ### Community 67 - "Community 67"
346
+ Cohesion: 1.0
347
+ Nodes (1): ARTIFACTS_ROOT env
348
+
349
+ ### Community 68 - "Community 68"
350
+ Cohesion: 1.0
351
+ Nodes (1): AWS g4dn.xlarge alternative
352
+
353
+ ### Community 69 - "Community 69"
354
+ Cohesion: 1.0
355
+ Nodes (1): nodejs (system pkg)
356
+
357
+ ### Community 70 - "Community 70"
358
+ Cohesion: 1.0
359
+ Nodes (1): fonts-noto-core / cjk
360
+
361
+ ### Community 71 - "Community 71"
362
+ Cohesion: 1.0
363
+ Nodes (1): graphify project rules
364
+
365
+ ## Knowledge Gaps
366
+ - **329 isolated node(s):** `server.py — FastAPI backend for VideoVoice. Endpoints: POST /api/jobs`, `Download video from Instagram/YouTube using yt-dlp.`, `Allow only trusted social platforms for yt-dlp.`, `Read media duration from ffprobe.`, `Report CUDA/MPS availability.` (+324 more)
367
+ These have ≤1 connection - possible missing edges or undocumented components.
368
+ - **Thin community `Community 25`** (2 nodes): `gradio==6.8.0`, `gradio==6.12.0 (omni)`
369
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
370
+ - **Thin community `Community 33`** (1 nodes): `Load a Qwen3 TTS model and its processor in HuggingFace `from_pretrained` style.`
371
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
372
+ - **Thin community `Community 34`** (1 nodes): `Build voice-clone prompt items from reference audio (and optionally reference te`
373
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
374
+ - **Thin community `Community 35`** (1 nodes): `Voice clone speech using the Base model. You can provide either:`
375
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
376
+ - **Thin community `Community 36`** (1 nodes): `Generate speech with the VoiceDesign model using natural-language style instruct`
377
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
378
+ - **Thin community `Community 37`** (1 nodes): `Generate speech with the CustomVoice model using a predefined speaker id, option`
379
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
380
+ - **Thin community `Community 38`** (1 nodes): `Delete stale per-job artifact directories from ARTIFACTS_ROOT.`
381
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
382
+ - **Thin community `Community 39`** (1 nodes): `Reject oversized uploads before body parsing.`
383
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
384
+ - **Thin community `Community 40`** (1 nodes): `Run the translation pipeline in a background thread, pushing progress to the job`
385
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
386
+ - **Thin community `Community 41`** (1 nodes): `List whitelisted MP4 demo videos from outputs/ and data/.`
387
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
388
+ - **Thin community `Community 42`** (1 nodes): `Return curated showcase entries with resolved streaming URLs.`
389
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
390
+ - **Thin community `Community 43`** (1 nodes): `Submit a video for translation.`
391
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
392
+ - **Thin community `Community 44`** (1 nodes): `Poll endpoint returning new messages since index `after`, plus live wait status.`
393
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
394
+ - **Thin community `Community 45`** (1 nodes): `User selects a TTS model after previewing.`
395
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
396
+ - **Thin community `Community 46`** (1 nodes): `Serve a preview audio WAV file.`
397
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
398
+ - **Thin community `Community 47`** (1 nodes): `Download the translated video.`
399
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
400
+ - **Thin community `Community 48`** (1 nodes): `Create artifact directories and start background cleanup.`
401
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
402
+ - **Thin community `Community 49`** (1 nodes): `Sync TTS audio using pause-aware strategy: compress silences first, then atempo.`
403
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
404
+ - **Thin community `Community 50`** (1 nodes): `Rewrite WAV with silence regions compressed to keep_ratio of their original dura`
405
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
406
+ - **Thin community `Community 51`** (1 nodes): `Insert extra silence distributed across detected pause points.`
407
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
408
+ - **Thin community `Community 52`** (1 nodes): `Generate a silent WAV file of given duration.`
409
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
410
+ - **Thin community `Community 53`** (1 nodes): `Sync each TTS segment to its original timestamp window and stitch into a single`
411
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
412
+ - **Thin community `Community 54`** (1 nodes): `Translate the text of each segment into target_language in batches. Args:`
413
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
414
+ - **Thin community `Community 55`** (1 nodes): `Load + run Chatterbox inside a single GPU-decorated scope. ZeroGPU only int`
415
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
416
+ - **Thin community `Community 56`** (1 nodes): `Remove trailing noise/artifacts after speech ends.`
417
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
418
+ - **Thin community `Community 57`** (1 nodes): `Hard-trim TTS output to orig_dur * headroom, with a short fade-out.`
419
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
420
+ - **Thin community `Community 58`** (1 nodes): `Clip audio to max_sec to prevent excessively slow voice cloning.`
421
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
422
+ - **Thin community `Community 59`** (1 nodes): `Numpy variant of _trim_trailing_noise for engines returning np.ndarray.`
423
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
424
+ - **Thin community `Community 60`** (1 nodes): `Perform full OmniVoice processing (load + generate batch) inside a GPU-decorated`
425
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
426
+ - **Thin community `Community 61`** (1 nodes): `Generate speech for all segments using OmniVoice voice cloning.`
427
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
428
+ - **Thin community `Community 62`** (1 nodes): `Synthesise translated text for each segment using voice cloned from reference au`
429
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
430
+ - **Thin community `Community 63`** (1 nodes): `torch==2.6.0`
431
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
432
+ - **Thin community `Community 64`** (1 nodes): `fastapi`
433
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
434
+ - **Thin community `Community 65`** (1 nodes): `yt-dlp`
435
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
436
+ - **Thin community `Community 66`** (1 nodes): `diffusers==0.29.0`
437
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
438
+ - **Thin community `Community 67`** (1 nodes): `ARTIFACTS_ROOT env`
439
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
440
+ - **Thin community `Community 68`** (1 nodes): `AWS g4dn.xlarge alternative`
441
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
442
+ - **Thin community `Community 69`** (1 nodes): `nodejs (system pkg)`
443
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
444
+ - **Thin community `Community 70`** (1 nodes): `fonts-noto-core / cjk`
445
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
446
+ - **Thin community `Community 71`** (1 nodes): `graphify project rules`
447
+ Too small to be a meaningful cluster - may be noise or needs more connections extracted.
448
+
449
+ ## Suggested Questions
450
+ _Questions this graph is uniquely positioned to answer:_
451
+
452
+ - **Why does `synthesise_segments()` connect `Community 6` to `Community 8`, `Community 11`?**
453
+ _High betweenness centrality (0.324) - this node is a cross-community bridge._
454
+ - **Why does `generate()` connect `Community 6` to `Community 0`, `Community 4`?**
455
+ _High betweenness centrality (0.200) - this node is a cross-community bridge._
456
+ - **Are the 44 inferred relationships involving `Qwen3TTSSpeakerEncoderConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
457
+ _`Qwen3TTSSpeakerEncoderConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
458
+ - **Are the 44 inferred relationships involving `Qwen3TTSTalkerCodePredictorConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
459
+ _`Qwen3TTSTalkerCodePredictorConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
460
+ - **Are the 44 inferred relationships involving `Qwen3TTSTalkerConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
461
+ _`Qwen3TTSTalkerConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
462
+ - **Are the 44 inferred relationships involving `Qwen3TTSConfig` (e.g. with `Res2NetBlock` and `SqueezeExcitationBlock`) actually correct?**
463
+ _`Qwen3TTSConfig` has 44 INFERRED edges - model-reasoned connections that need verification._
464
+ - **What connects `server.py — FastAPI backend for VideoVoice. Endpoints: POST /api/jobs`, `Download video from Instagram/YouTube using yt-dlp.`, `Allow only trusted social platforms for yt-dlp.` to the rest of the system?**
465
+ _329 weakly-connected nodes found - possible documentation gaps or missing edges._
graphify-out/graph.html ADDED
The diff for this file is too large to render. See raw diff
 
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ ffmpeg
2
+ nodejs
3
+ fonts-noto-core
4
+ fonts-noto-cjk
pipeline.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ pipeline.py — Core pipeline: CLI entrypoint + importable run_pipeline() for Gradio.
4
+
5
+ Usage:
6
+ python pipeline.py --input data/test_video_3.mp4 --target-lang Spanish
7
+ """
8
+
9
+ import argparse
10
+ import os
11
+ import io
12
+ import logging
13
+ import os
14
+ import shutil
15
+ import sys
16
+ import threading
17
+ import time
18
+ from pathlib import Path
19
+ from typing import Generator
20
+
21
+ from steps.s1_extract_audio import extract_audio, extract_audio_hq
22
+ from steps.s2_transcribe import transcribe, POLLEN_TRANSCRIBE_MODEL
23
+ from steps.s3_translate import translate
24
+ from steps.s4_tts import synthesise_segments
25
+ from steps.s5_sync import sync_and_stitch
26
+ from steps.s6_captions import generate_captions
27
+ from steps.s6_merge import merge_audio_video
28
+
29
def _log_step_done(label: str, start: float):
    """Report how long a finished step took, then print a separator rule.

    Args:
        label: Short step tag shown in square brackets (e.g. "s1").
        start: ``time.time()`` reading captured when the step began.
    """
    seconds_taken = time.time() - start
    if seconds_taken < 60:
        # Sub-minute steps are reported as whole seconds only.
        print(f"[{label}] Duration: {int(seconds_taken)}s")
    else:
        whole_minutes, leftover = divmod(seconds_taken, 60)
        print(f"[{label}] Duration: {int(whole_minutes)}m {int(leftover)}s")
    # Fixed-width rule that visually separates consecutive steps in the log.
    print("=" * 40)
38
+
39
+
40
# Maps the human-readable target-language names accepted by run_pipeline()
# to the short language code that is passed to the TTS step as `language_id`.
# Names missing from this table fall back to "es" at the lookup site.
LANGUAGE_CODES = {
    "Arabic": "ar",
    "Chinese": "zh",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Finnish": "fi",
    "French": "fr",
    "German": "de",
    "Greek": "el",
    "Hebrew": "he",
    "Hindi": "hi",
    "Italian": "it",
    "Japanese": "ja",
    "Korean": "ko",
    "Malay": "ms",
    "Norwegian": "no",
    "Polish": "pl",
    "Portuguese": "pt",
    "Russian": "ru",
    "Spanish": "es",
    "Swahili": "sw",
    "Swedish": "sv",
    "Turkish": "tr",
    # NOTE(review): Urdu is mapped to the Hindi code "hi" rather than "ur".
    # Presumably a deliberate fallback for a TTS engine without Urdu support;
    # confirm against the engines' supported-language lists.
    "Urdu": "hi",
}
66
+
67
+
68
+ def run_pipeline(
69
+ video_path: str,
70
+ target_language: str = "Spanish",
71
+ source_language: str = "auto",
72
+ output_path: str | None = None,
73
+ voice_mode: str = "chatterbox",
74
+ preview_event: threading.Event | None = None,
75
+ job_state: dict | None = None,
76
+ captions: bool = True,
77
+ preserve_music: bool = False,
78
+ data_dir: str | None = None,
79
+ video_link: str | None = None,
80
+ ) -> Generator[str | dict, None, str]:
81
+ """
82
+ Run the full translation pipeline, yielding progress messages.
83
+
84
+ Args:
85
+ video_path: Path to the input video file.
86
+ target_language: Target language name (e.g. "Spanish").
87
+ source_language: ISO-639-1 code of the source language, or "auto" for
88
+ Whisper to auto-detect (default "auto"). Forcing a wrong code makes
89
+ Whisper silently translate-and-transcribe instead of transcribing.
90
+ output_path: Where to save the output video. Auto-generated if None.
91
+ voice_mode: TTS engine to use ("chatterbox" or "omnivoice").
92
+ In Space deployments, this must match TTS_ENGINE env var.
93
+ preview_event: Deprecated - kept for compatibility, but unused in single-engine mode.
94
+ job_state: Shared dict with the server.
95
+
96
+ Yields:
97
+ str: Progress messages for each step.
98
+ dict: Special sentinel when previews are ready.
99
+
100
+ Returns:
101
+ str: Path to the translated output video.
102
+ """
103
+ # Single-engine mode: voice_mode must match TTS_ENGINE if set
104
+ space_engine = os.getenv("TTS_ENGINE")
105
+ if space_engine and voice_mode != space_engine:
106
+ yield f"⚠️ Warning: voice_mode='{voice_mode}' but Space TTS_ENGINE='{space_engine}'. Using {space_engine}.\n"
107
+ voice_mode = space_engine
108
+
109
+ # Fixed step count (no more preview_both mode)
110
+ total_steps = 6 + (1 if preserve_music else 0)
111
+
112
+ # Prepare output path
113
+ if output_path is None:
114
+ if data_dir:
115
+ output_path = str(Path(data_dir) / "output.mp4")
116
+ else:
117
+ stem = Path(video_path).stem
118
+ output_path = f"output_{stem}_{target_language.lower()}.mp4"
119
+
120
+ # Clean tmp dir
121
+ shutil.rmtree("tmp", ignore_errors=True)
122
+ os.makedirs("tmp/audio/source", exist_ok=True)
123
+
124
+ # Set up logging to tmp/logs.txt (clean logs only, no torch/chatterbox noise)
125
+ log_path = "tmp/logs.txt"
126
+ _log_file = open(log_path, "w", encoding="utf-8")
127
+ _orig_stdout = sys.stdout
128
+ _orig_stderr = sys.stderr
129
+
130
+ # Patterns to filter out of log file (still shown in terminal)
131
+ _NOISE = (
132
+ "Sampling:", "sampling", "UserWarning", "FutureWarning", "DeprecationWarning",
133
+ "torch.backends", "torch.functional", "torch.fft", "torchaudio/compliance",
134
+ "sdp_kernel", "LoRACompatible", "pkg_resources", "Fetching",
135
+ "output_attentions", "TRANSFORMERS_VERBOSITY",
136
+ "istft", "stft", "resize_", "inverse_transform",
137
+ "PerthNet", "loaded Perth", "diffusers/models",
138
+ "chatterbox/models/s3gen", "alignment_stream_analyzer",
139
+ "WARNING:chatterbox",
140
+ )
141
+
142
+ class _Tee(io.TextIOBase):
143
+ """Write to both the original stream and the log file (filtered)."""
144
+ def __init__(self, original, filter_noise=False):
145
+ self._original = original
146
+ self._filter = filter_noise
147
+ def write(self, s):
148
+ self._original.write(s)
149
+ if self._filter and any(p in s for p in _NOISE):
150
+ return len(s)
151
+ if not _log_file.closed:
152
+ _log_file.write(s)
153
+ _log_file.flush()
154
+ return len(s)
155
+ def flush(self):
156
+ self._original.flush()
157
+ if not _log_file.closed:
158
+ _log_file.flush()
159
+
160
+ sys.stdout = _Tee(_orig_stdout, filter_noise=True)
161
+ sys.stderr = _Tee(_orig_stderr, filter_noise=True)
162
+
163
+ try:
164
+ yield f"🎬 Starting pipeline: {video_path} → {target_language}\n"
165
+
166
+ # Step 1: Extract audio
167
+ yield f"🔊 Step 1/{total_steps}: Extracting audio...\n"
168
+ _t0 = time.time()
169
+ audio_path = extract_audio(video_path, "tmp/audio/source/extracted_audio.wav")
170
+ yield f" ✓ Audio extracted: {audio_path}\n"
171
+
172
+ # Step 1b: Source separation (conditional)
173
+ vocals_path = audio_path # default: use full mix
174
+ music_path = None
175
+ if preserve_music:
176
+ from steps.s1b_separate import separate_audio
177
+
178
+ audio_hq = extract_audio_hq(video_path, "tmp/audio/source/extracted_audio_hq.wav")
179
+ _log_step_done("s1", _t0)
180
+
181
+ yield f"🎵 Step 2/{total_steps}: Separating vocals from background music...\n"
182
+ _t0 = time.time()
183
+ vocals_path, music_path = separate_audio(audio_hq, "tmp/audio/source")
184
+ yield f" ✓ Vocals and accompaniment separated\n"
185
+ _log_step_done("s1b", _t0)
186
+ else:
187
+ _log_step_done("s1", _t0)
188
+
189
+ # Step offset: steps after separation shift by 1 when preserve_music is on
190
+ step_offset = 1 if preserve_music else 0
191
+
192
+ # Step 2: Transcribe
193
+ yield f"📝 Step {2 + step_offset}/{total_steps}: Transcribing (Pollinations Whisper / mlx-whisper)...\n"
194
+ _t0 = time.time()
195
+ segments = transcribe(vocals_path, language=source_language)
196
+ yield f" ✓ {len(segments)} segments transcribed\n"
197
+ for seg in segments:
198
+ yield f" [{seg['start']:.1f}s–{seg['end']:.1f}s] {seg['text']}\n"
199
+
200
+ # Dump transcription to tmp for inspection
201
+ import json as _json
202
+ from urllib.parse import urlparse, urlunparse
203
+ with open("tmp/transcription.json", "w", encoding="utf-8") as _tf:
204
+ out_data = {
205
+ "model_provider": "pollinations",
206
+ "model_name": POLLEN_TRANSCRIBE_MODEL,
207
+ "source_language": source_language,
208
+ "audio_path": vocals_path,
209
+ "segment_count": len(segments),
210
+ "total_duration": round(segments[-1]["end"], 2) if segments else 0,
211
+ "segments": [
212
+ {
213
+ "index": i,
214
+ "start": seg["start"],
215
+ "end": seg["end"],
216
+ "duration": round(seg["end"] - seg["start"], 2),
217
+ "text": seg["text"],
218
+ **({"words": seg["words"]} if "words" in seg else {}),
219
+ }
220
+ for i, seg in enumerate(segments)
221
+ ],
222
+ }
223
+ if video_link:
224
+ parsed = urlparse(video_link)
225
+ clean_link = urlunparse(parsed._replace(query="", fragment=""))
226
+ out_data = {"video_link": clean_link, **out_data}
227
+
228
+ _json.dump(out_data, _tf, indent=2, ensure_ascii=False)
229
+
230
+ _log_step_done("s2", _t0)
231
+
232
+ # Step 3: Translate
233
+ yield f"🌍 Step {3 + step_offset}/{total_steps}: Translating to {target_language}...\n"
234
+ _t0 = time.time()
235
+ segments = translate(segments, target_language)
236
+ yield f" ✓ Translation complete\n"
237
+ for seg in segments:
238
+ yield f" → {seg['translated_text']}\n"
239
+
240
+ target_lang_code = LANGUAGE_CODES.get(target_language, "es")
241
+ _log_step_done("s3", _t0)
242
+
243
+ # ── Step 4: TTS Synthesis ───────────────────────────────
244
+ model_name = voice_mode # Uses TTS_ENGINE env var in Space deployments
245
+
246
+ yield f"🗣️ Step {4 + step_offset}/{total_steps}: Synthesising speech ({model_name})...\n"
247
+ _t0 = time.time()
248
+ tts_gen = synthesise_segments(
249
+ segments, vocals_path,
250
+ language_id=target_lang_code,
251
+ output_dir="tmp/audio/tts",
252
+ model_name=model_name,
253
+ )
254
+ for msg in tts_gen:
255
+ if isinstance(msg, dict) and "__TTS_RESULT__" in msg:
256
+ segments = msg["__TTS_RESULT__"]
257
+ else:
258
+ yield msg
259
+
260
+ yield f" ✓ {len(segments)} segments synthesised\n"
261
+ _log_step_done("s4_tts", _t0)
262
+
263
+ # Step 5: Sync
264
+ yield f"⏱️ Step {5 + step_offset}/{total_steps}: Syncing audio to original timestamps...\n"
265
+ _t0 = time.time()
266
+ final_audio = sync_and_stitch(segments, "tmp/audio/final_audio.wav", "tmp/audio/tts_synced")
267
+ yield f" ✓ Audio synced: {final_audio}\n"
268
+ _log_step_done("s5", _t0)
269
+
270
+ # Captions + Merge
271
+ captions_path = None
272
+ _t0 = time.time()
273
+ if captions:
274
+ captions_path = generate_captions(segments, "tmp/captions.ass", target_language=target_language)
275
+ yield f" ✓ Captions generated: {captions_path}\n"
276
+
277
+ # Step 6: Merge
278
+ music_label = " + music" if music_path else ""
279
+ yield f"🎞️ Step {6 + step_offset}/{total_steps}: Merging translated audio{' + captions' if captions_path else ''}{music_label} into video...\n"
280
+ result = merge_audio_video(video_path, final_audio, output_path, captions_path=captions_path, music_path=music_path)
281
+ _log_step_done("s6", _t0)
282
+ yield f"\n✅ Done! Output saved to: {result}\n"
283
+
284
+ finally:
285
+ sys.stdout = _orig_stdout
286
+ sys.stderr = _orig_stderr
287
+ if not _log_file.closed:
288
+ _log_file.close()
289
+
290
+ if data_dir:
291
+ def _safe_copy(src, dst_name):
292
+ if os.path.exists(src):
293
+ shutil.copy2(src, os.path.join(data_dir, dst_name))
294
+ _safe_copy(log_path, "logs.txt")
295
+ _safe_copy("tmp/transcription.json", "transcription.json")
296
+ _safe_copy("tmp/llm_calls.json", "llm_calls.json")
297
+ _safe_copy("tmp/audio/tts/tts_manifest.json", "tts_manifest.json")
298
+ _safe_copy("tmp/audio/tts/segment_comparison.json", "segment_comparison.json")
299
+
300
+ print(f"[pipeline] Logs saved → {log_path}")
301
+
302
+ return result
303
+
304
+
305
def _collect_output(gen: Generator) -> tuple[list[str], str]:
    """Drain ``gen``, echoing text messages, and capture its return value.

    Every non-dict item yielded by the generator is printed (without an
    extra newline, flushed immediately) and appended to the returned list.
    Dict items are skipped silently: they are sentinels, not log lines.

    Returns:
        ``(messages, output_path)`` where ``output_path`` is whatever the
        generator returned (``None`` if it returned nothing).
    """
    collected = []
    while True:
        try:
            item = next(gen)
        except StopIteration as finished:
            # The generator's ``return`` value travels on StopIteration.value.
            return collected, finished.value
        if isinstance(item, dict):
            # Ignore preview sentinels in CLI mode (deprecated preview_both flow)
            continue
        collected.append(item)
        print(item, end="", flush=True)
320
+
321
+
322
def main():
    """CLI entry point: parse arguments, run the pipeline, print the result path."""
    cli = argparse.ArgumentParser(description="Video Translation Pipeline")

    # (flags, keyword arguments) in the order they should appear in --help.
    option_table = [
        (("--input",), {"required": True, "help": "Input video path"}),
        (
            ("--target-lang",),
            {
                "default": "Spanish",
                "choices": list(LANGUAGE_CODES.keys()),
                "help": "Target language (default: Spanish)",
            },
        ),
        (
            ("--source-lang",),
            {
                "default": "auto",
                "help": "Source language ISO-639-1 code or 'auto' to let Whisper detect (default: auto)",
            },
        ),
        (("--output",), {"default": None, "help": "Output video path"}),
        (
            ("--voice-mode",),
            {
                "default": "chatterbox",
                "choices": ["chatterbox", "omnivoice", "qwen3"],
                "help": "TTS engine to use (default: chatterbox). Must match TTS_ENGINE env var in Space deployments.",
            },
        ),
        (
            ("--preserve-music",),
            {
                "action": "store_true",
                "help": "Separate and preserve background music using Demucs",
            },
        ),
    ]
    for flags, spec in option_table:
        cli.add_argument(*flags, **spec)
    options = cli.parse_args()

    pipeline_gen = run_pipeline(
        video_path=options.input,
        target_language=options.target_lang,
        source_language=options.source_lang,
        output_path=options.output,
        voice_mode=options.voice_mode,
        preserve_music=options.preserve_music,
    )
    # Progress lines are printed while draining; only the return value is needed here.
    _, final_path = _collect_output(pipeline_gen)
    print(f"\nFinal output: {final_path}")
360
+
361
+
362
+ if __name__ == "__main__":
363
+ main()
pyproject.toml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "videovoice"
3
+ version = "1.0.0"
4
+ description = "AI-powered short video translation with zero-shot voice cloning"
5
+ readme = "README.md"
6
+ license = "MIT"
7
+ requires-python = ">=3.10,<3.13"
8
+ dependencies = [
9
+ "openai>=2.30.0",
10
+ "requests>=2.33.0",
11
+ "python-dotenv>=1.2.2",
12
+ "pydub>=0.25.1",
13
+ "ffmpeg-python>=0.2.0",
14
+ "mlx-whisper>=0.4.3",
15
+ "tqdm>=4.67.3",
16
+ "fastapi>=0.135.2",
17
+ "uvicorn[standard]>=0.42.0",
18
+ "python-multipart>=0.0.22",
19
+ "yt-dlp>=2026.3.17",
20
+ "sse-starlette>=3.3.4",
21
+ "soundfile>=0.13.1",
22
+ "deep-translator>=1.11.4",
23
+ "demucs>=4.0.1",
24
+ "boto3>=1.42.82",
25
+ "torch==2.6.0",
26
+ "torchaudio==2.6.0",
27
+ "slowapi>=0.1.9",
28
+ "faster-whisper>=1.2.1",
29
+ "spaces>=0.48.3",
30
+ "openai-whisper>=20240930",
31
+ "gradio>=6.12.0",
32
+ "accelerate>=1.12.0",
33
+ "transformers>=4.57.3",
34
+ ]
35
+
36
+ [project.optional-dependencies]
37
+ # HF Spaces install from requirements-{cbox,omni}.txt and ignore these.
38
+ # Locally: `uv sync --extra chatterbox` installs the PyPI chatterbox-tts
39
+ # (we skip-worktree the vendored ./chatterbox/ folder so it doesn't shadow
40
+ # the PyPI package). `--extra omnivoice` is heavier and optional.
41
+ chatterbox = ["chatterbox-tts>=0.1.7"]
42
+ omnivoice = ["omnivoice>=0.1.4"]
43
+
44
+ [tool.uv]
45
+ # Declare chatterbox and omnivoice extras as mutually exclusive so uv
46
+ # doesn't try to resolve them into one lockfile view.
47
+ conflicts = [
48
+ [{ extra = "chatterbox" }, { extra = "omnivoice" }],
49
+ ]
50
+ override-dependencies = [
51
+ # onnxruntime 1.24.x metadata claims py3.10 support but no 3.10 wheels
52
+ # ship on PyPI — force resolution to the last version that has 3.10 wheels.
53
+ "onnxruntime<1.24",
54
+ # chatterbox-tts==0.1.7 pins gradio==6.8.0, but app.py needs >=6.12.0
55
+ # for gradio.Server. Override so the extras can coexist in a lockfile;
56
+ # gradio is only loaded by app.py (HF), so the local chatterbox install
57
+ # never exercises gradio code.
58
+ "gradio>=6.12.0",
59
+ ]
requirements-cbox.txt ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setuptools<70.0.0
2
+ # Core ML
3
+ torch==2.8.0
4
+ torchaudio==2.8.0
5
+ accelerate==1.12.0
6
+ transformers>=4.57.3
7
+ diffusers==0.29.0
8
+ safetensors==0.5.3
9
+
10
+ # Audio processing
11
+ librosa==0.11.0
12
+ soundfile
13
+ pydub
14
+ demucs==4.0.1
15
+ openunmix
16
+ pyloudnorm
17
+
18
+ # Transcription
19
+ faster-whisper
20
+
21
+ # Translation
22
+ deep-translator
23
+
24
+ # TTS
25
+ conformer==0.3.2
26
+ omegaconf
27
+ pykakasi==2.3.0
28
+ resemble-perth>=1.0.0
29
+ s3tokenizer
30
+ spacy-pkuseg
31
+
32
+ # API / server
33
+ fastapi
34
+ uvicorn
35
+ slowapi
36
+ sse-starlette
37
+ python-multipart
38
+ python-dotenv
39
+ pydantic
40
+
41
+ # HuggingFace
42
+ huggingface-hub
43
+ spaces
44
+
45
+ # Utilities
46
+ openai
47
+ boto3
48
+ yt-dlp
49
+ ffmpeg-python
50
+ numpy<2.0.0
51
+ pandas<2.3.0
requirements-omni.txt ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for OmniVoice TTS Space (ZeroGPU / Python 3.10)
2
+ # TTS Engine: OmniVoice (set TTS_ENGINE=omnivoice in Space Secrets)
3
+ #
4
+ # This Space serves only the OmniVoice TTS engine, avoiding dependency
5
+ # conflicts with chatterbox-tts (which pins transformers==5.2.0).
6
+
7
+ accelerate==1.12.0
8
+ aiofiles
9
+ annotated-types
10
+ anyio
11
+ audioread
12
+ av
13
+ beautifulsoup4
14
+ boto3
15
+ botocore
16
+ brotli
17
+ catalogue
18
+ certifi
19
+ cffi
20
+ cfgv
21
+ charset-normalizer
22
+ click
23
+ cloudpickle
24
+ coloredlogs
25
+ conformer
26
+ ctranslate2
27
+ decorator
28
+ deep-translator
29
+ demucs==4.0.1
30
+ deprecated
31
+ diffusers
32
+ distlib
33
+ distro
34
+ dora-search
35
+ einops
36
+ fastapi
37
+ faster-whisper
38
+ ffmpeg-python
39
+ ffmpy
40
+ filelock
41
+ flatbuffers
42
+ fsspec
43
+ future
44
+ gradio==6.12.0
45
+ gradio-client
46
+ h11
47
+ httpcore
48
+ httptools
49
+ httpx
50
+ huggingface-hub
51
+ humanfriendly
52
+ identify
53
+ idna
54
+ importlib-metadata
55
+ jaconv
56
+ jinja2
57
+ jiter
58
+ jmespath
59
+ joblib
60
+ julius
61
+ lameenc
62
+ lazy-loader
63
+ librosa
64
+ limits
65
+ llvmlite
66
+ markdown-it-py
67
+ markupsafe
68
+ mdurl
69
+ ml-dtypes
70
+ mlx; sys_platform == 'darwin'
71
+ mlx-whisper; sys_platform == 'darwin'
72
+ more-itertools
73
+ mpmath
74
+ msgpack
75
+ networkx
76
+ nodeenv
77
+ numba
78
+ numpy<2.0.0
79
+ omegaconf
80
+ onnx
81
+ onnxruntime
82
+ openai
83
+ openai-whisper
84
+ openunmix
85
+ orjson
86
+ packaging
87
+ pandas<2.3.0
88
+ pillow
89
+ platformdirs
90
+ pooch
91
+ pre-commit
92
+ protobuf
93
+ psutil
94
+ pycparser
95
+ pydantic
96
+ pydantic-core
97
+ pydub
98
+ pygments
99
+ pykakasi
100
+ pyloudnorm
101
+ python-dateutil
102
+ python-discovery
103
+ python-dotenv
104
+ python-multipart
105
+ pytz
106
+ pyyaml
107
+ regex
108
+ resemble-perth
109
+ retrying
110
+ rich
111
+ s3tokenizer
112
+ s3transfer
113
+ safehttpx
114
+ safetensors
115
+ scikit-learn
116
+ scipy
117
+ semantic-version
118
+ setuptools
119
+ shellingham
120
+ six
121
+ slowapi
122
+ sniffio
123
+ soundfile
124
+ soupsieve
125
+ soxr
126
+ spaces
127
+ spacy-pkuseg
128
+ srsly
129
+ sse-starlette
130
+ starlette
131
+ submitit
132
+ sympy
133
+ threadpoolctl
134
+ tiktoken
135
+ tokenizers
136
+ tomlkit
137
+ torch==2.8.0
138
+ torchaudio==2.8.0
139
+ tqdm
140
+ transformers>=4.57.3
141
+ treetable
142
+ typer
143
+ typing-extensions
144
+ typing-inspection
145
+ tzdata
146
+ urllib3
147
+ uvicorn
148
+ uvloop; sys_platform != 'win32'
149
+ virtualenv
150
+ watchfiles
151
+ websockets
152
+ wrapt
153
+ yt-dlp
154
+ zipp
155
+
156
+ # OmniVoice TTS
157
+ omnivoice>=0.1.4
requirements-qwen3.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for Qwen3-TTS Space (ZeroGPU / Python 3.10)
2
+ # TTS Engine: Qwen3-TTS Base 1.7B (set TTS_ENGINE=qwen3 in Space Secrets)
3
+ #
4
+ # This Space serves only the Qwen3-TTS engine, mirroring the chatterbox/
5
+ # omnivoice split. Pins are derived from the official Qwen/Qwen3-TTS Space
6
+ # (torch 2.8, transformers 4.57.3) plus the VideoVoice pipeline's
7
+ # transcription/translation/audio dependencies.
8
+
9
+ # ── Qwen3-TTS core (matches Qwen/Qwen3-TTS Space) ────────────
10
+ # NOTE: `qwen_tts` is NOT a PyPI package. The Qwen3TTSModel class is loaded
11
+ # from a vendored `qwen_tts/` directory at the repo root, mirroring the
12
+ # vendored `chatterbox/` folder pattern. Copy that directory from
13
+ # https://huggingface.co/spaces/Qwen/Qwen3-TTS/tree/main/qwen_tts into this
14
+ # repo before deploying.
15
+ torch==2.8.0
16
+ torchaudio==2.8.0
17
+ transformers==4.57.3
18
+ accelerate==1.12.0
19
+ einops
20
+ librosa
21
+ soundfile
22
+ sox
23
+ onnxruntime
24
+ kernels
25
+ spaces
26
+
27
+ # ── VideoVoice pipeline (transcription + translation + IO) ──
28
+ fastapi
29
+ uvicorn
30
+ slowapi
31
+ sse-starlette
32
+ python-multipart
33
+ python-dotenv
34
+ pydantic
35
+
36
+ faster-whisper
37
+ openai-whisper
38
+ mlx; sys_platform == 'darwin'
39
+ mlx-whisper; sys_platform == 'darwin'
40
+
41
+ deep-translator
42
+ openai
43
+
44
+ demucs==4.0.1
45
+ openunmix
46
+ pyloudnorm
47
+ pydub
48
+ ffmpeg-python
49
+
50
+ huggingface-hub
51
+ boto3
52
+ yt-dlp
53
+ gradio==6.12.0
54
+ numpy<2.0.0
55
+ pandas<2.3.0
requirements.txt ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Requirements for Dramabox Space (ZeroGPU / Python 3.10)
2
+ # TTS Engine: Resemble Dramabox (set TTS_ENGINE=dramabox in Space Secrets)
3
+ #
4
+ # This Space serves the Dramabox "directable speech" model via the
5
+ # /api/tools/dramabox tools endpoint. The dub pipeline is reachable but
6
+ # rejects voice_mode != "dramabox" (server.py), and the frontend never
7
+ # routes dub requests here.
8
+ #
9
+ # NOTE: The Dramabox inference glue (TTSServer, model_downloader) is NOT
10
+ # a PyPI package. Vendor it from
11
+ # https://huggingface.co/spaces/ResembleAI/Dramabox/tree/main/src
12
+ # into this repo as `dramabox_src/` before deploying. The tools_api/dramabox
13
+ # worker adds that path to sys.path on first request.
14
+
15
+ # ── Dramabox core (verbatim from upstream ResembleAI/Dramabox Space) ──
16
+ torch==2.8.0
17
+ torchaudio==2.8.0
18
+ # pydantic 2.11+ emits bool-shorthand `additionalProperties: True` which
19
+ # crashes gradio_client's get_type. 2.10.6 is the last version emitting
20
+ # the dict form — Dramabox requires this pin.
21
+ pydantic==2.10.6
22
+ safetensors>=0.4.0
23
+ accelerate>=0.25.0
24
+ peft>=0.7.0
25
+ av>=12.0.0
26
+ einops>=0.7.0
27
+ PyYAML>=6.0
28
+ sentencepiece>=0.1.99
29
+ transformers>=4.45.0
30
+ huggingface_hub>=0.20.0,<1.0
31
+ bitsandbytes>=0.45.0
32
+ gradio==5.7.1
33
+ spaces>=0.30.0
34
+ soundfile>=0.12.0
35
+ resemble-perth @ git+https://github.com/resemble-ai/Perth.git@master
36
+
37
+ # ── VideoVoice pipeline (server.py / app.py imports these at startup) ──
38
+ fastapi
39
+ uvicorn
40
+ slowapi
41
+ sse-starlette
42
+ python-multipart
43
+ python-dotenv
44
+
45
+ faster-whisper
46
+ openai-whisper
47
+ mlx; sys_platform == 'darwin'
48
+ mlx-whisper; sys_platform == 'darwin'
49
+
50
+ deep-translator
51
+ openai
52
+
53
+ demucs==4.0.1
54
+ openunmix
55
+ pyloudnorm
56
+ pydub
57
+ ffmpeg-python
58
+
59
+ boto3
60
+ yt-dlp
61
+ numpy<2.0.0
62
+ pandas<2.3.0
scripts/prefetch_models.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prefetch model weights into HF_HOME for faster cold starts on Spaces."""
2
+
3
+ import os
4
+
5
+
6
def _prefetch_chatterbox() -> None:
    """Load the Chatterbox multilingual TTS model once on CPU.

    The loaded model is discarded; the call is made only for its side
    effects (presumably populating the local model cache — confirm against
    the chatterbox package).
    """
    # Imported lazily so the script still runs when chatterbox is not installed
    # and another TTS engine is selected.
    from chatterbox.mtl_tts import ChatterboxMultilingualTTS as tts_cls

    print("[prefetch] Chatterbox Multilingual TTS")
    tts_cls.from_pretrained("cpu")
11
+
12
+
13
def _prefetch_faster_whisper() -> None:
    """Instantiate each configured faster-whisper model on CPU (int8).

    Model names come from ``FASTER_WHISPER_MODELS`` (comma-separated) when
    that variable is set and non-empty; otherwise the single name in
    ``FASTER_WHISPER_MODEL`` is used, defaulting to ``large-v3``.
    """
    from faster_whisper import WhisperModel

    configured = os.getenv("FASTER_WHISPER_MODELS")
    if not configured:
        names = [os.getenv("FASTER_WHISPER_MODEL", "large-v3")]
    else:
        # Drop blank entries such as the one produced by a trailing comma.
        names = []
        for piece in configured.split(","):
            piece = piece.strip()
            if piece:
                names.append(piece)

    for name in names:
        print(f"[prefetch] faster-whisper {name}")
        WhisperModel(name, device="cpu", compute_type="int8")
25
+
26
+
27
def _prefetch_demucs() -> None:
    """Load the ``htdemucs`` Demucs model once and discard it."""
    from demucs.pretrained import get_model as load_demucs_model

    print("[prefetch] Demucs htdemucs")
    load_demucs_model("htdemucs")
32
+
33
+
34
def main() -> None:
    """Run every prefetch step appropriate for the configured ``TTS_ENGINE``.

    Chatterbox is prefetched only when ``TTS_ENGINE`` (default
    ``chatterbox``, compared case-insensitively) selects it; faster-whisper
    and Demucs are always prefetched.
    """
    engine = os.getenv("TTS_ENGINE", "chatterbox").lower()
    hf_home = os.getenv("HF_HOME", "<unset>")
    print(f"[prefetch] HF_HOME={hf_home}")

    if engine != "chatterbox":
        print(f"[prefetch] skipping chatterbox prefetch for TTS_ENGINE={engine}")
    else:
        _prefetch_chatterbox()

    for prefetch_step in (_prefetch_faster_whisper, _prefetch_demucs):
        prefetch_step()
    print("[prefetch] done")
44
+
45
+
46
+ if __name__ == "__main__":
47
+ main()
server.py ADDED
@@ -0,0 +1,929 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ server.py — FastAPI backend for VideoVoice.
3
+
4
+ Endpoints:
5
+ POST /api/jobs — Submit a video for translation (file upload or URL)
6
+ GET /api/jobs/{id} — SSE stream of pipeline progress
7
+ GET /api/jobs/{id}/result — Download the translated video
8
+ POST /api/jobs/{id}/select-model — Select TTS model after preview
9
+ GET /api/jobs/{id}/preview/{model} — Stream preview audio
10
+ GET /api/demo-videos — List available demo videos (outputs + data)
11
+ GET /api/demo-videos/{video_id}/stream — Stream demo video by ID
12
+ GET /api/showcase — Curated before/after showcase entries
13
+ """
14
+ import asyncio
15
+ import hashlib
16
+ import json
17
+ import os
18
+ import subprocess
19
+ import shutil
20
+ import threading
21
+ import time
22
+ import uuid
23
+ import re
24
+ from pathlib import Path
25
+ from urllib.parse import urlparse
26
+ from typing import Optional
27
+
28
+ from dotenv import load_dotenv
29
+ from fastapi import FastAPI, APIRouter, File, Form, HTTPException, Request, UploadFile, Header
30
+ from fastapi.middleware.cors import CORSMiddleware
31
+ from fastapi import Request
32
+ from fastapi.responses import FileResponse, JSONResponse
33
+ from fastapi.staticfiles import StaticFiles
34
+ from pydantic import BaseModel
35
+ from slowapi import Limiter, _rate_limit_exceeded_handler
36
+ from slowapi.errors import RateLimitExceeded
37
+ from slowapi.middleware import SlowAPIMiddleware
38
+ from slowapi.util import get_remote_address
39
+ from sse_starlette.sse import EventSourceResponse
40
+
41
+ load_dotenv()
42
+
43
+ # TTS_ENGINE controls which TTS backend this Space serves
44
+ TTS_ENGINE = os.getenv("TTS_ENGINE", "chatterbox").lower()
45
+ if TTS_ENGINE not in ("chatterbox", "omnivoice", "qwen3", "dramabox"):
46
+ raise ValueError(f"Invalid TTS_ENGINE: {TTS_ENGINE}. Use 'chatterbox', 'omnivoice', 'qwen3', or 'dramabox'.")
47
+
48
+ # ── Config ────────────────────────────────────────────────
49
+ PORT = int(os.getenv("PORT", "7860"))
50
+ MAX_FILE_SIZE_MB = 90
51
+ MAX_DURATION_SEC = 90
52
+ MAX_UPLOAD_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
53
+
54
+
55
def _default_artifacts_root() -> Path:
    """Pick the default directory for job artifacts.

    Returns ``/data/jobs`` when ``/data`` can be created (if missing) and is
    writable, i.e. when the Space has persistent storage mounted (Docker
    deploys, or Gradio SDK Spaces with persistent storage enabled).
    Otherwise returns ``/tmp/videovoice_jobs``, which is the case on
    Zero GPU / Gradio SDK Spaces without the paid persistent-storage add-on.
    """
    persistent = Path("/data/jobs")
    data_mount = persistent.parent
    try:
        data_mount.mkdir(parents=True, exist_ok=True)
        writable = os.access(data_mount, os.W_OK)
    except OSError:
        # PermissionError is a subclass of OSError, so this covers both.
        writable = False
    return persistent if writable else Path("/tmp/videovoice_jobs")
68
+
69
+
70
+ ARTIFACTS_ROOT = Path(os.getenv("ARTIFACTS_ROOT") or _default_artifacts_root())
71
+ ALLOWED_YTDLP_HOSTS = {
72
+ "instagram.com",
73
+ "youtube.com",
74
+ "youtu.be",
75
+ "tiktok.com",
76
+ "vm.tiktok.com",
77
+ }
78
+ PERSISTENT_ARTIFACT_DIRS = {"uploads", "outputs", "data", "tmp", "tools"}
79
+ REAPER_INTERVAL_SECONDS = 10 * 60
80
+ REAPER_MAX_AGE_SECONDS = 2 * 60 * 60
81
+
82
+
83
def _parse_allowed_origins(value: str) -> list[str]:
    """Split a comma-separated origin list into trimmed, non-empty entries.

    Falls back to ``["http://localhost:5173"]`` when nothing usable remains.
    """
    cleaned = []
    for piece in value.split(","):
        piece = piece.strip()
        if piece:
            cleaned.append(piece)
    if cleaned:
        return cleaned
    return ["http://localhost:5173"]
86
+
87
+
88
+ ALLOWED_ORIGINS = _parse_allowed_origins(
89
+ os.getenv("ALLOWED_ORIGINS", "http://localhost:5173")
90
+ )
91
+
92
+ # ── App ────────────────────────────────────────────────
93
+ router = APIRouter()
94
+ _RATE_LIMIT_ENABLED = os.getenv("DISABLE_RATE_LIMIT", "").lower() not in ("1", "true", "yes")
95
+ limiter = Limiter(key_func=get_remote_address, enabled=_RATE_LIMIT_ENABLED)
96
+ # Note: app.state.limiter, exception handlers, and SlowAPIMiddleware
97
+ # are now configured on the main Server instance in app.py.
98
+
99
+ # ── In-memory job store ────────────────────────────────
100
+ # Structure: { job_id: { status, messages[], result_path, error, created_at,
101
+ # voice_mode, preview_paths, preview_event, selected_model } }
102
+ jobs: dict = {}
103
+
104
+ # ── GPU job queue ─────────────────────────────────────
105
+ # Only 1 GPU job at a time — others wait in FIFO order
106
+ gpu_semaphore = threading.Semaphore(1)
107
+ gpu_queue: list[str] = [] # ordered list of queued job_ids waiting for GPU
108
+ gpu_active: dict = { # the currently running job's live info
109
+ "job_id": None,
110
+ "started_at": None,
111
+ "step": 0,
112
+ "total_steps": 6,
113
+ "step_label": "",
114
+ }
115
+ # Per-step timing history: { step_num: [durations] } — learns real per-step costs
116
+ step_durations: dict[int, list[float]] = {}
117
+ session_active_jobs: dict[str, str] = {}
118
+ artifact_reaper_task: Optional[asyncio.Task] = None
119
+
120
+
121
+ UPLOAD_DIR = ARTIFACTS_ROOT / "uploads"
122
+ OUTPUT_DIR = ARTIFACTS_ROOT / "outputs"
123
+ SHOWCASE_DIR = ARTIFACTS_ROOT / "data" / "showcase"
124
+ SHOWCASE_FILE = ARTIFACTS_ROOT / "data" / "showcase.json"
125
+ DEMO_VIDEO_DIRS = {
126
+ "outputs": OUTPUT_DIR,
127
+ "data": ARTIFACTS_ROOT / "data",
128
+ "showcase": SHOWCASE_DIR,
129
+ }
130
+
131
+
132
+ # ── Helpers ────────────────────────────────────────────
133
def _download_url(url: str, dest: str) -> str:
    """Download a single video with yt-dlp and return the destination path.

    This function does not validate the host; callers are expected to have
    vetted ``url`` beforehand.

    Args:
        url: Page or media URL handed to yt-dlp.
        dest: Output path passed to yt-dlp's ``-o`` option.

    Returns:
        ``dest`` unchanged.

    Raises:
        RuntimeError: yt-dlp exited with a non-zero status (the message
            carries the first 300 characters of its stderr).
        subprocess.TimeoutExpired: yt-dlp did not finish within 120 seconds.
    """
    result = subprocess.run(
        [
            "yt-dlp",
            "--no-playlist",
            "--max-filesize", "100M",
            "--js-runtimes", "node",
            "--extractor-args", "youtube:player_client=android,ios,web_safari",
            "-f", "mp4/best[ext=mp4]/best",
            "-o", dest,
            # End-of-options marker: the URL is user-supplied, so make sure a
            # value beginning with "-" can never be parsed as a yt-dlp flag.
            "--",
            url,
        ],
        capture_output=True,
        text=True,
        timeout=120,
    )
    if result.returncode != 0:
        raise RuntimeError(f"yt-dlp failed: {result.stderr[:300]}")
    return dest
153
+
154
+
155
def _is_allowed_video_host(url: str) -> bool:
    """Return True only for http(s) URLs on a trusted social platform.

    A URL is accepted when its scheme is ``http`` or ``https`` and its host
    is listed in ``ALLOWED_YTDLP_HOSTS`` or is a subdomain of instagram.com,
    youtube.com or tiktok.com. Anything that cannot be parsed, has no host,
    or uses another scheme is rejected rather than raising.
    """
    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse raises on malformed netlocs (e.g. an unbalanced "[");
        # treat those as "not allowed" instead of letting the error escape.
        return False

    # Only web URLs are meaningful here; this also rejects oddities such as
    # "-x://host", which some Python versions parse as a valid scheme.
    if parsed.scheme.lower() not in ("http", "https"):
        return False
    if not host:
        return False

    return (
        host in ALLOWED_YTDLP_HOSTS
        or host.endswith(".instagram.com")
        or host.endswith(".youtube.com")
        or host.endswith(".tiktok.com")
    )
168
+
169
+
170
def _probe_duration_seconds(path: str) -> float:
    """Return the container duration of ``path`` in seconds, as reported by ffprobe.

    Raises:
        RuntimeError: ffprobe exited non-zero, or its output was not a number.
    """
    command = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "csv=p=0",
        path,
    ]
    probe = subprocess.run(command, capture_output=True, text=True, timeout=30)
    if probe.returncode != 0:
        raise RuntimeError(f"ffprobe failed: {probe.stderr[:300]}")

    raw_duration = probe.stdout.strip()
    try:
        return float(raw_duration)
    except ValueError as exc:
        raise RuntimeError("ffprobe returned an invalid duration value") from exc
191
+
192
+
193
def _gpu_available() -> bool:
    """Return True when torch reports a usable CUDA or MPS device.

    Any failure (torch missing, backend probing raising) is reported as
    "no GPU" rather than propagated.
    """
    try:
        import torch

        # Older torch builds have no ``backends.mps`` attribute at all.
        if hasattr(torch.backends, "mps"):
            has_mps = torch.backends.mps.is_available()
        else:
            has_mps = False
        return bool(torch.cuda.is_available() or has_mps)
    except Exception:
        return False
202
+
203
+
204
def _queue_depth() -> int:
    """Return the number of waiting jobs plus one if a job is currently running."""
    running = 1 if gpu_active["job_id"] else 0
    waiting = len(gpu_queue)
    return waiting + running
207
+
208
+
209
def _is_job_active(job_id: str) -> bool:
    """Return True when ``job_id`` is known and its status is queued or running."""
    record = jobs.get(job_id)
    return bool(record) and record.get("status") in {"queued", "running"}
215
+
216
+
217
def _release_session_lock(job: dict) -> None:
    """Drop the session's active-job entry if it still points at ``job``.

    Does nothing when the job has no session id, or when the session has
    since been claimed by a different job.
    """
    owner = job.get("session_id")
    if owner and session_active_jobs.get(owner) == job.get("job_id"):
        session_active_jobs.pop(owner, None)
223
+
224
+
225
def _demo_video_id(folder: str, filename: str) -> str:
    """Derive a stable opaque ID for a whitelisted demo video.

    The ID is the first 20 hex characters of the SHA-256 digest of
    ``"<folder>/<filename>"`` encoded as UTF-8.
    """
    hasher = hashlib.sha256()
    hasher.update("{}/{}".format(folder, filename).encode("utf-8"))
    return hasher.hexdigest()[:20]
229
+
230
+
231
def _collect_demo_videos():
    """Scan the demo directories for ``.mp4`` files.

    Returns:
        ``(videos, lookup)``: ``videos`` is a list of metadata dicts sorted
        case-insensitively by name, then folder, then URL; ``lookup`` maps
        each video ID to its file path.
    """
    entries = []
    path_by_id = {}

    for folder, directory in DEMO_VIDEO_DIRS.items():
        if not (directory.exists() and directory.is_dir()):
            continue

        # Only direct children are considered; sub-directories are not walked.
        mp4_files = (
            child
            for child in directory.iterdir()
            if child.is_file() and child.suffix.lower() == ".mp4"
        )
        for video_file in mp4_files:
            info = video_file.stat()
            video_id = _demo_video_id(folder, video_file.name)
            entries.append(
                {
                    "id": video_id,
                    "name": video_file.name,
                    "url": f"/api/demo-videos/{video_id}/stream",
                    "folder": folder,
                    "size_bytes": info.st_size,
                    "modified_at": int(info.st_mtime),
                }
            )
            path_by_id[video_id] = video_file

    def _sort_key(entry):
        return (
            entry["name"].lower(),
            entry["folder"].lower(),
            entry["url"].lower(),
        )

    entries.sort(key=_sort_key)
    return entries, path_by_id
266
+
267
+
268
def _queue_status_for(job_id: str) -> str | None:
    """Describe where a waiting job stands in the GPU queue.

    Returns ``None`` when ``job_id`` is not queued. Otherwise returns a
    human-readable line with the job's 1-based queue position, the running
    job's current step and, when per-step timing history exists, a rough
    ETA.
    """
    try:
        position = gpu_queue.index(job_id) + 1  # 1-based position
    except ValueError:
        return None

    running = gpu_active
    if not running["job_id"]:
        return f"Queue position: {position} — GPU starting up..."

    current_step = running["step"]
    step_count = running["total_steps"]
    step_name = running["step_label"]

    # Build the ETA from per-step history if we have any.
    eta_suffix = ""
    if current_step > 0 and step_durations:
        estimate = 0
        for step_no in range(current_step, step_count + 1):
            samples = step_durations.get(step_no, [])
            if samples:
                estimate += sum(samples) / len(samples)
            else:
                # No history for this step yet: assume 15 seconds.
                estimate += 15
        # Scale by queue position (jobs ahead).
        estimate = int(estimate * position)
        if estimate >= 60:
            minutes, seconds = divmod(estimate, 60)
            eta_suffix = f" — ~{minutes}m {seconds:02d}s remaining"
        elif estimate > 0:
            eta_suffix = f" — ~{estimate}s remaining"

    noun = "job" if position == 1 else "jobs"
    progress = f"Step {current_step}/{step_count}"
    if step_name:
        progress = f"{progress} — {step_name}"
    return f"{position} {noun} ahead ({progress}){eta_suffix}"
303
+
304
+
305
def _config_languages() -> list[str]:
    """Return the language names the pipeline exposes (the keys of ``LANGUAGE_CODES``)."""
    # Imported lazily, matching the other pipeline lookups in this module.
    from pipeline import LANGUAGE_CODES as codes

    return [name for name in codes.keys()]
310
+
311
+
312
def _chatterbox_language_options() -> list[dict]:
    """Return ``{"name", "code"}`` entries for every language in the pipeline's ``LANGUAGE_CODES``."""
    from pipeline import LANGUAGE_CODES as codes

    options = []
    for language_name, language_code in codes.items():
        options.append({"name": language_name, "code": language_code})
    return options
316
+
317
+
318
def _omnivoice_language_options() -> list[dict]:
    """Return ``{"name", "code"}`` entries for every language in ``OMNIVOICE_LANGUAGE_CODES``."""
    from steps.lang.omnivoice_languages import OMNIVOICE_LANGUAGE_CODES as codes

    options = []
    for language_name, language_code in codes.items():
        options.append({"name": language_name, "code": language_code})
    return options
322
+
323
+
324
def _qwen3_language_options() -> list[dict]:
    """Return ``{"name", "code"}`` entries for every language in ``QWEN3_LANGUAGE_CODES``."""
    from steps.lang.qwen3_languages import QWEN3_LANGUAGE_CODES as codes

    options = []
    for language_name, language_code in codes.items():
        options.append({"name": language_name, "code": language_code})
    return options
328
+
329
+
330
async def _artifact_reaper_loop():
    """Periodically purge stale job directories and finished job records.

    Each pass removes every directory directly under ``ARTIFACTS_ROOT``
    whose name is not in ``PERSISTENT_ARTIFACT_DIRS`` and whose mtime is
    older than ``REAPER_MAX_AGE_SECONDS``, then forgets in-memory jobs that
    are complete or errored and were created longer ago than that same
    limit. Errors during a pass are printed and the loop carries on after
    sleeping ``REAPER_INTERVAL_SECONDS``.
    """
    finished_states = {"complete", "error"}
    while True:
        try:
            now = time.time()

            for entry in ARTIFACTS_ROOT.iterdir():
                if (
                    entry.is_dir()
                    and entry.name not in PERSISTENT_ARTIFACT_DIRS
                    and now - entry.stat().st_mtime > REAPER_MAX_AGE_SECONDS
                ):
                    shutil.rmtree(entry, ignore_errors=True)

            # Collect first, delete afterwards: the dict must not change
            # size while it is being iterated.
            expired = []
            for job_id, state in jobs.items():
                if state.get("status") not in finished_states:
                    continue
                if (now - state.get("created_at", now)) > REAPER_MAX_AGE_SECONDS:
                    expired.append(job_id)
            for job_id in expired:
                jobs.pop(job_id, None)
        except Exception as exc:
            print(f"[reaper] cleanup error: {exc}")

        await asyncio.sleep(REAPER_INTERVAL_SECONDS)
357
+
358
+
359
async def enforce_content_length_limit(request: Request, call_next):
    """Reject oversized ``POST /api/jobs`` uploads before the body is parsed.

    Only the declared ``Content-Length`` header is inspected: a value above
    ``MAX_UPLOAD_BYTES`` yields 413, a non-integer value yields 400, and a
    missing header (or any other route/method) passes straight through to
    ``call_next``.
    """
    is_job_upload = (
        request.method.upper() == "POST" and request.url.path == "/api/jobs"
    )
    declared = request.headers.get("content-length") if is_job_upload else None

    if declared:
        try:
            declared_bytes = int(declared)
        except ValueError:
            return JSONResponse(
                status_code=400,
                content={"detail": "Invalid Content-Length header."},
            )
        if declared_bytes > MAX_UPLOAD_BYTES:
            return JSONResponse(
                status_code=413,
                content={"detail": f"File too large (max {MAX_FILE_SIZE_MB}MB)."},
            )

    return await call_next(request)
377
+
378
+
379
async def _run_pipeline_async(
    job_id: str,
    video_path: str,
    target_lang: str,
    source_lang: str,
    voice_mode: str,
    captions: bool = True,
    preserve_music: bool = True,
    video_link: Optional[str] = None,
):
    """Run the translation pipeline in a background thread, pushing progress to the job store.

    The coroutine waits for ``gpu_semaphore`` (polling once per second so the
    event loop stays responsive), then drives the ``run_pipeline`` generator
    in the default executor and turns what it yields into entries of
    ``job["messages"]``.

    Args:
        job_id: Key of the job in ``jobs``; also the artifact sub-directory name.
        video_path: Path of the input video, forwarded to ``run_pipeline``.
        target_lang: Forwarded to ``run_pipeline`` as ``target_language``.
        source_lang: Forwarded to ``run_pipeline`` as ``source_language``.
        voice_mode: Forwarded to ``run_pipeline`` as ``voice_mode``.
        captions: Forwarded to ``run_pipeline``.
        preserve_music: Forwarded to ``run_pipeline``; also adds one step to
            the expected step count.
        video_link: Forwarded to ``run_pipeline``.

    Failures are not raised to the caller: they are recorded on the job as
    ``status == "error"`` plus an ``error`` message entry.
    """
    from pipeline import run_pipeline

    job = jobs[job_id]
    job["status"] = "queued"

    # Join the queue and publish an initial wait status for pollers.
    gpu_queue.append(job_id)
    job["_wait_status"] = _queue_status_for(job_id) or "Waiting for GPU..."

    # Wait for GPU without blocking the event loop — update queue status each tick
    while not gpu_semaphore.acquire(blocking=False):
        job["_wait_status"] = _queue_status_for(job_id) or "Waiting for GPU..."
        await asyncio.sleep(1)

    # Leave the queue, mark as running
    if job_id in gpu_queue:
        gpu_queue.remove(job_id)
    job["_wait_status"] = None
    job["status"] = "running"

    # 6 base pipeline steps, plus one more when music restoration is enabled.
    # NOTE(review): confirm the base count against the "Step X/Y" messages
    # that pipeline.py actually emits.
    total_steps = 6 + (1 if preserve_music else 0)
    gpu_active["job_id"] = job_id
    gpu_active["started_at"] = time.time()
    gpu_active["step"] = 0
    gpu_active["total_steps"] = total_steps
    gpu_active["step_label"] = ""

    job["messages"].append({"type": "progress", "message": "GPU acquired — starting pipeline...", "step": 0})
    start = time.time()
    step_start = time.time()

    try:
        data_dir = str(ARTIFACTS_ROOT / job_id)
        os.makedirs(data_dir, exist_ok=True)
        output_path = str(Path(data_dir) / "output.mp4")

        # Note: preview_both mode removed in single-engine Spaces
        # Each Space only serves one TTS engine (TTS_ENGINE env var)
        preview_event = None

        gen = run_pipeline(
            video_path=video_path,
            target_language=target_lang,
            source_language=source_lang,
            output_path=output_path,
            voice_mode=voice_mode,
            preview_event=preview_event,
            job_state=job,
            captions=captions,
            preserve_music=preserve_music,
            data_dir=data_dir,
            video_link=video_link,
        )

        step = 0

        def _record_step_duration():
            """Append the current step's elapsed time, keeping the last 10 samples."""
            if step <= 0:
                return
            history = step_durations.setdefault(step, [])
            history.append(time.time() - step_start)
            if len(history) > 10:
                history.pop(0)

        def _run_gen():
            """Drain the pipeline generator in a worker thread and return its result."""
            nonlocal step, step_start
            output = None
            try:
                while True:
                    msg = next(gen)

                    # Handle preview-ready sentinel dict
                    if isinstance(msg, dict) and msg.get("__PREVIEW_READY__"):
                        preview_paths = msg["paths"]
                        job["preview_paths"] = preview_paths

                        # Build preview URLs for every model that produced a file
                        preview_urls = {
                            model_name: f"/api/jobs/{job_id}/preview/{model_name}"
                            for model_name, path in preview_paths.items()
                            if path
                        }

                        job["messages"].append({
                            "type": "voice_preview",
                            "step": 4,
                            "previews": preview_urls,
                        })
                        continue

                    # Anything that is not a string is ignored
                    if not isinstance(msg, str):
                        continue

                    # Detect step transitions and record per-step timing
                    if "Step" in msg and f"/{total_steps}" in msg:
                        try:
                            new_step = int(
                                msg.split("Step")[1].split("/")[0].strip()
                            )
                            # Record duration of the step that just ended
                            _record_step_duration()

                            step = new_step
                            step_start = time.time()

                            # Extract step label (text after "Step X/Y: ")
                            label = msg.split(":", 1)[1].strip() if ":" in msg else ""
                            # Remove emoji prefix
                            label = label.lstrip("🔊📝🌍🗣️⏱️🎞️🎧 ")
                            gpu_active["step"] = step
                            gpu_active["step_label"] = label

                        except (ValueError, IndexError):
                            # Not a parseable "Step X/Y" line: best effort,
                            # forward it as an ordinary progress message.
                            pass

                    job["messages"].append({
                        "type": "progress",
                        "message": msg.strip(),
                        "step": step,
                    })

            except StopIteration as e:
                # The generator's return value is the pipeline result.
                output = e.value
            except Exception as e:
                # Pipeline crashed — set error status directly from
                # the thread so the frontend sees it immediately,
                # rather than relying on exception propagation through
                # run_in_executor (which can silently swallow errors
                # when stdout/stderr are in a broken state).
                import traceback
                tb = traceback.format_exc()
                print(f"[pipeline] CRASH in job {job_id}: {e}\n{tb}")
                job["status"] = "error"
                job["messages"].append({
                    "type": "error",
                    "message": f"Pipeline crashed: {e}",
                })
                return None

            # Record the final step's duration
            _record_step_duration()
            return output

        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() is deprecated in this context.
        loop = asyncio.get_running_loop()
        result_path = await loop.run_in_executor(None, _run_gen)

        # If _run_gen already reported an error, do not mark the job complete.
        if job["status"] != "error":
            elapsed = round(time.time() - start)
            job["status"] = "complete"
            job["result_path"] = result_path or output_path
            job["messages"].append({"type": "complete", "elapsed": elapsed})

    except Exception as e:
        job["status"] = "error"
        job["messages"].append({"type": "error", "message": str(e)})

    finally:
        # Free GPU memory between jobs. This is best effort: a failing torch
        # import or cache call must never prevent the semaphore release below,
        # otherwise every later job would wait forever.
        try:
            import gc
            import torch
            gc.collect()
            if hasattr(torch, "mps") and torch.backends.mps.is_available():
                torch.mps.empty_cache()
        except Exception as exc:
            print(f"[pipeline] GPU memory cleanup failed for job {job_id}: {exc}")

        try:
            gpu_active["job_id"] = None
            gpu_active["started_at"] = None
            gpu_active["step"] = 0
            gpu_active["step_label"] = ""
            if job_id in gpu_queue:
                gpu_queue.remove(job_id)
            _release_session_lock(job)
        finally:
            gpu_semaphore.release()
562
+
563
+
564
+ # ── Routes ─────────────────────────────────────────────
565
+
566
@router.get("/api/health")
async def health():
    # Liveness probe: a static "ok" plus a snapshot of GPU and queue state.
    payload = {
        "status": "ok",
        "gpu_available": _gpu_available(),
        "queue_depth": _queue_depth(),
        "active_job_id": gpu_active["job_id"],
    }
    return JSONResponse(payload)
576
+
577
+
578
@router.get("/api/config")
async def config():
    # Client-facing settings: upload limits, language tables and the single
    # TTS engine this deployment serves.
    limits = {
        "max_file_size_mb": MAX_FILE_SIZE_MB,
        "max_duration_sec": MAX_DURATION_SEC,
    }
    language_tables = {
        "languages": _config_languages(),
        "chatterbox_languages": _chatterbox_language_options(),
        "omnivoice_languages": _omnivoice_language_options(),
        "qwen3_languages": _qwen3_language_options(),
    }
    engine = {
        "tts_models": [TTS_ENGINE],
        "tts_engine": TTS_ENGINE,
    }
    return JSONResponse({**limits, **language_tables, **engine})
592
+
593
+
594
@router.get("/api/demo-videos")
async def list_demo_videos():
    """List whitelisted MP4 demo videos from outputs/ and data/."""
    # The helper returns (listing, id -> path lookup); only the listing is sent.
    video_entries, _path_lookup = _collect_demo_videos()
    body = {"videos": video_entries}
    return JSONResponse(body)
599
+
600
+
601
@router.get("/api/demo-videos/{video_id}/stream")
async def stream_demo_video(video_id: str):
    """Stream a demo video by opaque ID (no client-provided path)."""
    # The ID is resolved through the server-side lookup; unknown IDs get a 404.
    _video_entries, paths_by_id = _collect_demo_videos()
    resolved = paths_by_id.get(video_id)
    if resolved:
        return FileResponse(
            str(resolved),
            media_type="video/mp4",
            filename=resolved.name,
        )

    raise HTTPException(404, "Demo video not found.")
614
+
615
+
616
@router.get("/api/showcase")
async def get_showcase():
    """Return curated showcase entries with resolved streaming URLs.

    Reads ``SHOWCASE_FILE`` (JSON, UTF-8). For every ``their_dub`` / ``our_dub``
    object of type ``"local"`` that has a ``filename``, a ``url`` pointing at
    the demo-video streaming route is added. A missing, unreadable or
    malformed file yields an empty list rather than an error.
    """
    if not SHOWCASE_FILE.exists():
        return JSONResponse({"showcases": []})

    try:
        data = json.loads(SHOWCASE_FILE.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return JSONResponse({"showcases": []})

    # Tolerate structurally unexpected JSON instead of failing with a 500.
    showcases = data.get("showcases", []) if isinstance(data, dict) else []
    if not isinstance(showcases, list):
        showcases = []

    for entry in showcases:
        if not isinstance(entry, dict):
            continue
        for key in ("their_dub", "our_dub"):
            dub = entry.get(key)
            if isinstance(dub, dict) and dub.get("type") == "local" and dub.get("filename"):
                video_id = _demo_video_id("showcase", dub["filename"])
                dub["url"] = f"/api/demo-videos/{video_id}/stream"

    return JSONResponse({"showcases": showcases})
636
+
637
+
638
@router.post("/api/jobs")
@limiter.limit("3/hour")
async def create_job(
    request: Request,
    file: Optional[UploadFile] = File(None),
    url: Optional[str] = Form(None),
    target_language: str = Form("Spanish"),
    source_language: str = Form("auto"),
    voice_mode: str = Form("chatterbox"),
    captions: str = Form("true"),
    preserve_music: str = Form("false"),
    x_session_id: Optional[str] = Header(default=None, alias="X-Session-Id"),
):
    """Submit a video for translation.

    Accepts either an uploaded file or a URL on an allowed host, stores the
    input under a fresh per-job directory in ``ARTIFACTS_ROOT``, validates its
    duration, registers the job and starts the pipeline in the background.

    Responses:
        200 ``{"job_id", "status": "queued"}`` on success.
        400 when neither input is given, the URL host is not allowed, the
            download fails, or the duration cannot be validated / is too long.
        409 ``{"existing_job_id"}`` when the session already has an active job.
        413 when the uploaded file exceeds ``MAX_UPLOAD_BYTES``.
    """
    if not file and not url:
        raise HTTPException(400, "Provide either a file upload or a URL.")

    # One active job per session: report the running one, or clear a stale entry.
    if x_session_id:
        existing_job_id = session_active_jobs.get(x_session_id)
        if existing_job_id:
            if _is_job_active(existing_job_id):
                return JSONResponse(
                    status_code=409,
                    content={"existing_job_id": existing_job_id},
                )
            session_active_jobs.pop(x_session_id, None)

    # Validate voice_mode - only TTS_ENGINE is valid for this Space
    # "preview_both" is disabled in single-engine mode (no way to choose between engines)
    valid_modes = (TTS_ENGINE,)
    if voice_mode not in valid_modes:
        voice_mode = TTS_ENGINE

    # Prefer a human-recognisable job id taken from the video URL.
    # The patterns are tried in order and the first match wins.
    url_id_patterns = (
        r'/(?:reel|reels|p)/([A-Za-z0-9_-]+)',  # Instagram
        r'(?:v=|youtu\.be/)([\w-]+)',           # YouTube
        r'vm\.tiktok\.com/([\w-]+)',            # TikTok (vm.tiktok.com)
        r'/video/(\d+)',                        # TikTok (standard /video/xxx)
    )
    job_id = None
    if url:
        if not _is_allowed_video_host(url):
            raise HTTPException(400, "Unsupported URL host.")

        for pattern in url_id_patterns:
            m = re.search(pattern, url)
            if m:
                job_id = m.group(1)
                break

    if not job_id:
        job_id = str(uuid.uuid4())[:12]

    # Avoid clobbering an existing artifact directory: append _1, _2, ...
    base_job_id = job_id
    counter = 1
    job_dir = ARTIFACTS_ROOT / job_id
    while job_dir.exists():
        job_id = f"{base_job_id}_{counter}"
        job_dir = ARTIFACTS_ROOT / job_id
        counter += 1

    job_dir.mkdir(parents=True, exist_ok=True)

    video_path = ""

    if file:
        # Save the upload in chunks. This keeps memory use flat and enforces
        # the size cap even when the request carried no usable Content-Length.
        ext = Path(file.filename or "video.mp4").suffix or ".mp4"
        video_path = str(job_dir / f"input{ext}")
        chunk_size = 1024 * 1024
        written = 0
        try:
            with open(video_path, "wb") as f:
                while chunk := await file.read(chunk_size):
                    written += len(chunk)
                    if written > MAX_UPLOAD_BYTES:
                        raise HTTPException(413, f"File too large (max {MAX_FILE_SIZE_MB}MB).")
                    f.write(chunk)
        except BaseException:
            # Do not leave a half-written job directory behind; always re-raise.
            shutil.rmtree(job_dir, ignore_errors=True)
            raise
    elif url:
        # Download from URL
        # NOTE(review): this call is synchronous, so it runs on the event-loop
        # thread for the whole download — consider offloading it to a worker.
        video_path = str(job_dir / "input.mp4")
        try:
            _download_url(url, video_path)
        except Exception as e:
            shutil.rmtree(job_dir, ignore_errors=True)
            raise HTTPException(400, f"Failed to download video: {e}")

    try:
        duration_seconds = _probe_duration_seconds(video_path)
    except Exception as exc:
        shutil.rmtree(job_dir, ignore_errors=True)
        raise HTTPException(400, f"Could not validate video duration: {exc}")

    if duration_seconds > MAX_DURATION_SEC:
        shutil.rmtree(job_dir, ignore_errors=True)
        raise HTTPException(400, f"Video exceeds {MAX_DURATION_SEC} seconds limit.")

    # Initialize job
    jobs[job_id] = {
        "job_id": job_id,
        "status": "queued",
        "messages": [],
        "result_path": None,
        "error": None,
        "created_at": time.time(),
        "voice_mode": voice_mode,
        "preview_paths": None,
        "preview_event": None,
        "selected_model": None,
        "session_id": x_session_id,
    }
    if x_session_id:
        session_active_jobs[x_session_id] = job_id

    # Start pipeline in background
    enable_captions = captions.lower() == "true"
    enable_music = preserve_music.lower() == "true"
    task = asyncio.create_task(
        _run_pipeline_async(job_id, video_path, target_language, source_language, voice_mode, enable_captions, enable_music, url)
    )
    # The event loop only keeps weak references to tasks, so hold a strong one
    # until the task finishes; otherwise it may be garbage-collected mid-run.
    background_tasks = getattr(request.app.state, "pipeline_tasks", None)
    if background_tasks is None:
        background_tasks = set()
        request.app.state.pipeline_tasks = background_tasks
    background_tasks.add(task)
    task.add_done_callback(background_tasks.discard)

    return JSONResponse({"job_id": job_id, "status": "queued"})
762
+
763
+
764
@router.get("/api/jobs/{job_id}")
@limiter.limit("20/second")
async def job_status_poll(request: Request, job_id: str, after: int = 0):
    """Poll endpoint returning new messages since index `after`, plus live wait status.

    Args:
        request: Incoming request (taken by the rate-limit decorator).
        job_id: Job to poll; unknown ids produce a 404.
        after: Index of the first message the client has not seen yet.
            Negative values are treated as 0.

    Returns:
        JSON with ``messages`` (the new entries), ``next`` (the cursor to send
        on the following poll) and ``wait_status``.
    """
    if job_id not in jobs:
        raise HTTPException(404, "Job not found.")

    # A negative index would slice from the END of the list and make the
    # returned cursor go backwards, so clamp it.
    after = max(after, 0)

    job = jobs[job_id]
    messages = job["messages"][after:]

    # Include live wait ETA (updated in-place, not a queued message)
    wait_status = job.get("_wait_status")

    return JSONResponse(
        {"messages": messages, "next": after + len(messages), "wait_status": wait_status},
        headers={"Cache-Control": "no-cache, no-store"},
    )
781
+
782
+
783
class ModelSelection(BaseModel):
    """Request body for the select-model route."""

    # Name of the chosen TTS model; the route compares it against TTS_ENGINE.
    model: str
785
+
786
+
787
@router.post("/api/jobs/{job_id}/select-model")
async def select_model(job_id: str, selection: ModelSelection):
    """User selects a TTS model after previewing."""
    job = jobs.get(job_id)
    if not job:
        raise HTTPException(404, "Job not found.")

    chosen = selection.model
    if chosen != TTS_ENGINE:
        raise HTTPException(400, f"Invalid model. This Space only serves {TTS_ENGINE}.")

    job["selected_model"] = chosen

    # Signal the job's preview event, if one is set, so a waiting pipeline can continue.
    waiter = job.get("preview_event")
    if waiter:
        waiter.set()

    return JSONResponse({"status": "ok", "selected": chosen})
804
+
805
+
806
@router.get("/api/jobs/{job_id}/preview/{model_name}")
async def get_preview_audio(job_id: str, model_name: str):
    """Serve a preview audio WAV file."""
    job = jobs.get(job_id)
    if not job:
        raise HTTPException(404, "Job not found.")

    # Only the engine served by this deployment is a valid preview name.
    if model_name != TTS_ENGINE:
        raise HTTPException(400, f"Invalid model name. This Space serves {TTS_ENGINE} only.")

    available = job.get("preview_paths")
    if not available:
        raise HTTPException(404, "Previews not yet generated.")

    wav_path = available.get(model_name)
    if wav_path and Path(wav_path).exists():
        return FileResponse(
            wav_path,
            media_type="audio/wav",
            filename=f"preview_{model_name}.wav",
        )

    raise HTTPException(404, f"Preview for '{model_name}' not available.")
829
+
830
+
831
@router.get("/api/jobs/{job_id}/result")
async def job_result(job_id: str):
    """Download the translated video."""
    job = jobs.get(job_id)
    if not job:
        raise HTTPException(404, "Job not found.")

    # The result is only served once the job has finished successfully.
    status = job["status"]
    if status != "complete":
        raise HTTPException(400, f"Job is {status}, not complete.")

    result_file = job["result_path"]
    if not (result_file and Path(result_file).exists()):
        raise HTTPException(404, "Result file not found.")

    return FileResponse(
        result_file,
        media_type="video/mp4",
        filename=f"videovoice_{job_id}.mp4",
    )
847
+
848
+
849
@router.on_event("startup")
async def startup_event():
    """Create artifact directories and start background cleanup."""
    global artifact_reaper_task

    required_dirs = (
        ARTIFACTS_ROOT,
        UPLOAD_DIR,
        OUTPUT_DIR,
        ARTIFACTS_ROOT / "data",
        ARTIFACTS_ROOT / "tmp",
    )
    for directory in required_dirs:
        directory.mkdir(parents=True, exist_ok=True)

    # DISABLE_CLEANUP=1/true/yes turns the reaper off entirely.
    cleanup_disabled = os.getenv("DISABLE_CLEANUP", "").lower() in ("1", "true", "yes")
    if cleanup_disabled:
        print("[reaper] DISABLE_CLEANUP is set — artifact reaper will not run")
        return

    # Start the reaper unless one is already running.
    if artifact_reaper_task is None or artifact_reaper_task.done():
        artifact_reaper_task = asyncio.create_task(_artifact_reaper_loop())
864
+
865
+
866
@router.on_event("shutdown")
async def shutdown_event():
    # Stop the background reaper, if it is still running, and wait for it to finish.
    global artifact_reaper_task
    reaper = artifact_reaper_task
    if reaper is None or reaper.done():
        return

    reaper.cancel()
    try:
        await reaper
    except asyncio.CancelledError:
        # Expected: this is the cancellation requested just above.
        pass
875
+
876
+
877
+ # ── No-cache headers for dev/tunnel (ensures Cloudflare serves fresh files) ──
878
+ from starlette.middleware.base import BaseHTTPMiddleware
879
+
880
+ # Phase 1.7 marker: remove legacy static middleware when React FE fully owns UI.
881
class NoCacheStaticMiddleware(BaseHTTPMiddleware):
    """Add no-cache response headers for ``/`` and for .css/.js/.html paths."""

    async def dispatch(self, request: Request, call_next):
        response = await call_next(request)
        path = request.url.path
        if path == '/' or path.endswith(('.css', '.js', '.html')):
            no_cache_headers = (
                ('Cache-Control', 'no-cache, no-store, must-revalidate'),
                ('Pragma', 'no-cache'),
            )
            for header, value in no_cache_headers:
                response.headers[header] = value
        return response
888
+
889
+ # Standalone middleware and static mounts removed (now handled in app.py/main app)
890
+
891
+
892
+ # ── Local dev entrypoint ──────────────────────────────
893
+ # On HF Spaces `app.py` creates its own Server and imports this router, so
894
+ # the block below is skipped. Locally, `python server.py` builds a minimal
895
+ # FastAPI wrapper around the router so there's something for uvicorn to run.
896
if __name__ == "__main__":
    # Build a standalone FastAPI app that wraps the shared router.
    local_app = FastAPI(title="VideoVoice API (local)")
    # Attach the module-level rate limiter and its error handler to this app.
    local_app.state.limiter = limiter
    local_app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
    local_app.add_middleware(SlowAPIMiddleware)
    local_app.add_middleware(NoCacheStaticMiddleware)
    local_app.add_middleware(
        CORSMiddleware,
        allow_origins=ALLOWED_ORIGINS,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Route every request on this app through the upload-size guard.
    @local_app.middleware("http")
    async def _local_content_length(request: Request, call_next):
        return await enforce_content_length_limit(request, call_next)

    local_app.include_router(router)

    # Tools API — independent of pipeline; safe to include here too.
    from tools_api import router as tools_router
    local_app.include_router(tools_router)

    # Serve the legacy static frontend at / so `python server.py` keeps the
    # old dev UX (open http://localhost:8000 to hit frontend/index.html).
    # The React SPA in production is deployed separately to S3.
    # Mounted after the routers above are included; skipped when the
    # frontend/ directory is absent.
    frontend_dir = Path(__file__).parent / "frontend"
    if frontend_dir.exists():
        local_app.mount("/", StaticFiles(directory=str(frontend_dir), html=True), name="frontend")

    import uvicorn
    # The PORT environment variable overrides the default of 8000.
    port = int(os.getenv("PORT", 8000))
    uvicorn.run(local_app, host="0.0.0.0", port=port)
social_distributor/.env.example ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VideoVoice data directory (default: ../data relative to poster/)
2
+ VIDEOVOICE_DATA_DIR=/path/to/VideoVoice/data
3
+
4
+ # Pollinations LLM (for caption generation)
5
+ POLLEN_MODEL=gemini-search
6
+ POLLEN_API_KEY=pollinations
7
+
8
+ # AWS Bedrock fallback (for caption generation)
9
+ AWS_REGION=us-east-1
10
+ BEDROCK_MODEL=qwen.qwen3-next-80b-a3b
11
+ # AWS_ACCESS_KEY_ID=
12
+ # AWS_SECRET_ACCESS_KEY=
13
+
14
+ # Posting settings
15
+ POST_DELAY=30
16
+ HEADLESS=true
social_distributor/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ poster/auth/storage/
4
+ *.pyc
5
+ .env
6
+ post_history.json
7
+ creator_cache.json
8
+ errors/
social_distributor/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Social Media Distributor
2
+
3
+ Automated social media posting for VideoVoice dubbed videos. Posts AI-dubbed videos to Instagram, TikTok, and YouTube with AI-generated captions.
4
+
5
+ ## Features
6
+
7
+ - **Multi-platform posting**: Instagram, TikTok, YouTube
8
+ - **AI-generated captions**: Uses Pollinations LLM with AWS Bedrock fallback
9
+ - **Creator handle extraction**: Automatically pulls creator info from source videos
10
+ - **Smart scheduling**: Configurable delays between posts to avoid rate limits
11
+ - **Session management**: Persistent browser sessions (no repeated logins)
12
+ - **Post tracking**: Tracks what was posted to avoid duplicates
13
+
14
+ ## Setup
15
+
16
+ ### 1. Install Dependencies
17
+
18
+ ```bash
19
+ # Using uv (recommended)
20
+ uv sync
21
+
22
+ # Or using pip
23
+ pip install .
24
+ playwright install
25
+ ```
26
+
27
+ ### 2. Configure Environment
28
+
29
+ Copy the example environment file and edit:
30
+
31
+ ```bash
32
+ cp .env.example .env
33
+ ```
34
+
35
+ Edit `.env`:
36
+ ```env
37
+ # VideoVoice data directory (where dubbed video folders are)
38
+ VIDEOVOICE_DATA_DIR=/path/to/VideoVoice/data
39
+
40
+ # LLM for caption generation (Pollinations)
41
+ POLLEN_MODEL=gemini-search
42
+ POLLEN_API_KEY=pollinations
43
+
44
+ # Optional: AWS Bedrock fallback
45
+ AWS_REGION=us-east-1
46
+ BEDROCK_MODEL=qwen.qwen3-next-80b-a3b
47
+ AWS_ACCESS_KEY_ID=...
48
+ AWS_SECRET_ACCESS_KEY=...
49
+
50
+ # Posting behavior
51
+ POST_DELAY=30 # Seconds between posts
52
+ HEADLESS=true # Run browser headlessly
53
+ ```
54
+
55
+ ### 3. Login to Platforms
56
+
57
+ You need to authenticate with each platform once. This opens a browser window for you to log in:
58
+
59
+ ```bash
60
+ # Login to Instagram
61
+ python post.py login instagram
62
+
63
+ # Login to TikTok
64
+ python post.py login tiktok
65
+
66
+ # Login to YouTube
67
+ python post.py login youtube
68
+ ```
69
+
70
+ Sessions are saved in `poster/auth/storage/` — you won't need to log in again.
71
+
72
+ ## Usage
73
+
74
+ ### Post Videos
75
+
76
+ Post all videos from a data folder:
77
+
78
+ ```bash
79
+ # Post all platforms (default)
80
+ python post.py post /path/to/VideoVoice/data/Dxxxxxxxxx
81
+
82
+ # Post to specific platforms only
83
+ python post.py post /path/to/data/Folder1 -p instagram,tiktok
84
+
85
+ # Post multiple folders
86
+ python post.py post Folder1 Folder2 Folder3
87
+
88
+ # Dry run (generate captions but don't post)
89
+ python post.py post Folder1 --dry-run
90
+
91
+ # Force re-post even if already posted
92
+ python post.py post Folder1 --force
93
+
94
+ # Override language (e.g., if auto-detection is wrong)
95
+ python post.py post Folder1 --lang-override "Urdu"
96
+
97
+ # Customize delay between posts
98
+ python post.py post Folder1 --delay 60
99
+
100
+ # Run with visible browser (for debugging)
101
+ python post.py post Folder1 --no-headless
102
+ ```
103
+
104
+ ### Preview Captions
105
+
106
+ Generate and preview captions without posting:
107
+
108
+ ```bash
109
+ # Preview captions for all platforms
110
+ python post.py caption /path/to/data/Folder1
111
+
112
+ # Preview for specific platforms
113
+ python post.py caption Folder1 -p youtube
114
+
115
+ # Preview multiple folders
116
+ python post.py caption Folder1 Folder2 Folder3
117
+ ```
118
+
119
+ ### Check Posting History
120
+
121
+ ```bash
122
+ python post.py status
123
+ ```
124
+
125
+ Shows a table of all posted videos with timestamps and status.
126
+
127
+ ## Command Reference
128
+
129
+ | Command | Description |
130
+ |---------|-------------|
131
+ | `python post.py login <platform>` | Authenticate with a platform |
132
+ | `python post.py post <folders...>` | Post videos to social media |
133
+ | `python post.py caption <folders...>` | Preview generated captions |
134
+ | `python post.py status` | View posting history |
135
+
136
+ ### Post Options
137
+
138
+ | Option | Description |
139
+ |--------|-------------|
140
+ | `-p, --platforms` | Comma-separated platforms (default: instagram,tiktok,youtube) |
141
+ | `--force` | Re-post even if already posted |
142
+ | `--dry-run` | Generate captions but don't post |
143
+ | `--delay <seconds>` | Seconds between posts |
144
+ | `--headless / --no-headless` | Run browser headlessly |
145
+ | `--lang-override <name>` | Override target language (e.g., "Urdu") |
146
+
147
+ ## How It Works
148
+
149
+ 1. **Loads videos** from VideoVoice data folders
150
+ 2. **Extracts creator info** from the original video link
151
+ 3. **Generates captions** using AI (Pollinations LLM)
152
+ 4. **Posts to each platform** with platform-optimized formatting
153
+ 5. **Tracks posts** in `post_history.json`
154
+
155
+ ## File Structure
156
+
157
+ ```
158
+ social_distributor/
159
+ ├── post.py # CLI entry point
160
+ ├── poster/
161
+ │ ├── auth/
162
+ │ │ ├── session.py # Browser session management
163
+ │ │ └── storage/ # Saved session files
164
+ │ ├── platforms/
165
+ │ │ ├── base.py # Base poster class
166
+ │ │ ├── instagram.py # Instagram automation
167
+ │ │ ├── tiktok.py # TikTok automation
168
+ │ │ └── youtube.py # YouTube automation
169
+ │ ├── caption_gen.py # AI caption generation
170
+ │ ├── creator_extract.py # Creator handle extraction
171
+ │ ├── video_loader.py # Video metadata loading
172
+ │ ├── post_log.py # Post history tracking
173
+ │ ├── config.py # Configuration & constants
174
+ │ └── models.py # Data models
175
+ ├── .env # Your environment config
176
+ └── post_history.json # Auto-generated post log
177
+ ```
178
+
179
+ ## Troubleshooting
180
+
181
+ **Login fails / session expires:**
182
+ ```bash
183
+ # Re-login to the platform
184
+ python post.py login instagram
185
+ ```
186
+
187
+ **Caption generation fails:**
188
+ - Check your `POLLEN_API_KEY` in `.env`
189
+ - Or configure AWS Bedrock credentials as fallback
190
+
191
+ **Post fails on specific platform:**
192
+ - Use `--no-headless` to see the browser and debug
193
+ - Check `post_history.json` for error messages
194
+ - Platforms may require re-authentication periodically
195
+
196
+ **Videos not found:**
197
+ - Ensure `VIDEOVOICE_DATA_DIR` points to your VideoVoice `data/` folder
198
+ - Folder names should match VideoVoice video IDs (e.g., `Dxxxxxxxxx`)
199
+
200
+ ## Notes
201
+
202
+ - Instagram and TikTok use browser automation (Playwright)
203
+ - YouTube posts via web upload (requires logged-in session)
204
+ - First login for each platform opens a real browser window
205
+ - Headless mode runs faster but hides the browser (use `--no-headless` to debug)
social_distributor/post.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """VideoVoice Social Media Poster — CLI entrypoint."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import asyncio
7
+ import time
8
+
9
+ import click
10
+ from rich.console import Console
11
+ from rich.table import Table
12
+
13
+ console = Console()
14
+
15
+ ALL_PLATFORMS = ["instagram", "tiktok", "youtube"]
16
+
17
+
18
@click.group()
def cli():
    """VideoVoice Social Media Poster — post dubbed videos to Instagram, TikTok, and YouTube."""
    # Root command group: it does no work itself; the subcommands are
    # registered on it with @cli.command().
    pass
22
+
23
+
24
+ # ── Login command ────────────────────────────────────────────────────────
25
+
26
+
27
@cli.command()
@click.argument("platform", type=click.Choice(ALL_PLATFORMS))
def login(platform: str):
    """Interactively log in to a platform (opens a browser window)."""
    # Imported inside the command, so the session module is only loaded when login runs.
    from poster.auth.session import interactive_login

    login_coro = interactive_login(platform)
    asyncio.run(login_coro)
34
+
35
+
36
+ # ── Caption preview command ──────────────────────────────────────────────
37
+
38
+
39
@cli.command()
@click.argument("folders", nargs=-1, required=True)
@click.option("--platforms", "-p", default="instagram,tiktok,youtube", help="Comma-separated platforms")
@click.option("--lang-override", default=None, help="Override target language name (e.g. 'Urdu')")
def caption(folders: tuple[str, ...], platforms: str, lang_override: str | None):
    """Preview generated captions without posting."""
    from poster.caption_gen import format_caption, generate_caption
    from poster.video_loader import load_videos

    # Normalise the platform list: ignore empty entries (e.g. a trailing
    # comma) and casing, then reject names that are not supported so a typo
    # fails up front instead of once per video.
    target_platforms = [p.strip().lower() for p in platforms.split(",") if p.strip()]
    unknown = [p for p in target_platforms if p not in ALL_PLATFORMS]
    if unknown or not target_platforms:
        problem = f"unknown platform(s): {', '.join(unknown)}" if unknown else "no platform given"
        raise click.BadParameter(
            f"{problem}. Choose from: {', '.join(ALL_PLATFORMS)}",
            param_hint="--platforms",
        )

    videos = load_videos(list(folders), lang_override)

    if not videos:
        console.print("[red]No valid videos found.[/red]")
        return

    for video in videos:
        console.print(f"\n[bold]{'=' * 60}[/bold]")
        console.print(f"[bold]Video:[/bold] {video.video_id}")
        console.print(f"[bold]Source:[/bold] {video.source_language} -> {video.target_language_name}")
        console.print(f"[bold]Link:[/bold] {video.video_link or 'N/A'}")

        for platform in target_platforms:
            console.print(f"\n[cyan]--- {platform.upper()} ---[/cyan]")
            try:
                caption_data = generate_caption(video, platform)
                result = format_caption(caption_data, video, platform)

                # For YouTube the formatted result is unpacked as (title, description).
                if platform == "youtube":
                    title, desc = result
                    console.print(f"[bold]Title:[/bold] {title}")
                    console.print(f"[bold]Description:[/bold]\n{desc}")
                else:
                    console.print(f"[bold]Caption:[/bold]\n{result}")
            except Exception as e:
                # Preview tool: report the failure and carry on with the next platform.
                console.print(f"[red]Caption generation failed: {e}[/red]")
75
+
76
+
77
+ # ── Post command ─────────────────────────────────────────────────────────
78
+
79
+
80
@cli.command()
@click.argument("folders", nargs=-1, required=True)
@click.option("--platforms", "-p", default="instagram,tiktok,youtube", help="Comma-separated platforms")
@click.option("--force", is_flag=True, help="Re-post even if already posted")
@click.option("--dry-run", is_flag=True, help="Generate captions but don't post")
@click.option("--delay", default=None, type=int, help="Seconds between posts (default: from env)")
@click.option("--headless/--no-headless", default=None, help="Run browser headlessly")
@click.option("--lang-override", default=None, help="Override target language name (e.g. 'Urdu')")
def post(
    folders: tuple[str, ...],
    platforms: str,
    force: bool,
    dry_run: bool,
    delay: int | None,
    headless: bool | None,
    lang_override: str | None,
):
    """Post dubbed videos to social media platforms."""
    # Thin synchronous wrapper: the real work happens in the async implementation.
    folder_list = list(folders)
    posting_coro = _post_async(
        folder_list, platforms, force, dry_run, delay, headless, lang_override
    )
    asyncio.run(posting_coro)
101
+
102
+
103
async def _post_async(
    folders: list[str],
    platforms_str: str,
    force: bool,
    dry_run: bool,
    delay: int | None,
    headless: bool | None,
    lang_override: str | None,
):
    """Generate captions for each video and post it to every requested platform.

    Args:
        folders: Folders handed to ``load_videos``.
        platforms_str: Comma-separated platform names (case-insensitive;
            blanks and duplicates are ignored).
        force: Re-post even when the post log says the video was already posted.
        dry_run: Generate and print captions but never call a poster.
        delay: Seconds to wait between posts; ``None`` falls back to
            ``POST_DELAY`` from the poster config.
        headless: Forwarded to ``get_context``.
        lang_override: Forwarded to ``load_videos``.

    The function prints progress to the console, records every attempted post
    through ``post_log.record`` and finishes by printing a summary table.
    """
    from playwright.async_api import async_playwright

    from poster import post_log
    from poster.auth.session import get_context, has_session
    from poster.caption_gen import format_caption, generate_caption
    from poster.config import POST_DELAY
    from poster.creator_extract import extract_creator
    from poster.models import PostResult
    from poster.platforms.instagram import InstagramPoster
    from poster.platforms.tiktok import TikTokPoster
    from poster.platforms.youtube import YouTubePoster
    from poster.video_loader import load_videos

    poster_classes = {
        "instagram": InstagramPoster,
        "tiktok": TikTokPoster,
        "youtube": YouTubePoster,
    }

    # Normalise the platform list: trim, lowercase, drop empty entries (e.g. a
    # trailing comma) and de-duplicate while keeping the user's order. A
    # duplicate would otherwise launch a second browser whose context
    # overwrites the first one in the dict and is never closed.
    target_platforms = list(
        dict.fromkeys(
            name
            for name in (part.strip().lower() for part in platforms_str.split(","))
            if name
        )
    )
    if not target_platforms:
        console.print("[red]No platforms specified.[/red]")
        return

    # Reject unknown names up front instead of failing with a KeyError after
    # browsers have already been launched.
    unknown = [name for name in target_platforms if name not in poster_classes]
    if unknown:
        console.print(
            f"[red]Unknown platform(s): {', '.join(unknown)}. "
            f"Choose from: {', '.join(poster_classes)}[/red]"
        )
        return

    post_delay = delay if delay is not None else POST_DELAY

    # Validate sessions exist
    for platform in target_platforms:
        if not has_session(platform):
            console.print(
                f"[red]No session for {platform}. "
                f"Run: python post.py login {platform}[/red]"
            )
            return

    # Load videos
    videos = load_videos(folders, lang_override)
    if not videos:
        console.print("[red]No valid videos found.[/red]")
        return

    console.print(f"\n[bold]Posting {len(videos)} video(s) to {', '.join(target_platforms)}[/bold]")
    if dry_run:
        console.print("[yellow]DRY RUN — captions will be generated but nothing will be posted[/yellow]")

    results: list[PostResult] = []

    async with async_playwright() as pw:
        contexts = {}
        posters = {}

        try:
            # Create one browser context (and poster) per platform
            for platform in target_platforms:
                ctx = await get_context(pw, platform, headless=headless)
                contexts[platform] = ctx
                posters[platform] = poster_classes[platform](ctx)

            # Use first available context for creator extraction
            extract_ctx = next(iter(contexts.values()))

            last_video_index = len(videos) - 1
            last_platform = target_platforms[-1]

            for i, video in enumerate(videos):
                console.print(f"\n[bold]{'=' * 60}[/bold]")
                console.print(f"[bold]Video {i + 1}/{len(videos)}:[/bold] {video.video_id}")

                # Extract creator handle
                creator_handle = await extract_creator(video.video_link, extract_ctx)

                for platform in target_platforms:
                    console.print(f"\n[cyan]--- {platform.upper()} ---[/cyan]")

                    # Check if already posted
                    if not force and post_log.is_posted(video.video_id, platform):
                        console.print("[yellow]Already posted — skipping (use --force to re-post)[/yellow]")
                        results.append(PostResult(
                            video_id=video.video_id,
                            platform=platform,
                            status="skipped",
                            timestamp="",
                        ))
                        continue

                    # Generate caption
                    try:
                        caption_data = generate_caption(video, platform, creator_handle)
                        formatted = format_caption(caption_data, video, platform, creator_handle)
                    except Exception as e:
                        console.print(f"[red]Caption generation failed: {e}[/red]")
                        # Surface the failure in the summary table; it is not
                        # written to the post log because nothing was posted.
                        results.append(PostResult(
                            video_id=video.video_id,
                            platform=platform,
                            status="failed",
                            timestamp="",
                            error=f"Caption generation failed: {e}",
                        ))
                        continue

                    if platform == "youtube":
                        title, description = formatted
                        console.print(f"[dim]Title: {title}[/dim]")
                        console.print(f"[dim]Description: {description[:150]}...[/dim]")
                    else:
                        description = formatted
                        title = None
                        console.print(f"[dim]Caption: {description[:150]}...[/dim]")

                    if dry_run:
                        console.print("[yellow]DRY RUN — skipping actual post[/yellow]")
                        continue

                    # Post
                    poster = posters[platform]
                    if platform == "youtube":
                        result = await poster.post(
                            video.output_path, description,
                            video_id=video.video_id, title=title,
                        )
                    else:
                        result = await poster.post(
                            video.output_path, description,
                            video_id=video.video_id,
                        )

                    result.caption_used = description if isinstance(description, str) else str(description)
                    results.append(result)
                    post_log.record(result)

                    if result.status == "success":
                        console.print(f"[green]Posted to {platform}![/green]")
                    else:
                        console.print(f"[red]Failed: {result.error}[/red]")

                    # Delay between posts; pointless after the final one
                    is_final_post = i == last_video_index and platform == last_platform
                    if post_delay > 0 and not is_final_post:
                        console.print(f"[dim]Waiting {post_delay}s before next post...[/dim]")
                        await asyncio.sleep(post_delay)
        finally:
            # Always close every browser that was opened, even when context
            # creation or posting raised part-way through.
            for platform, ctx in contexts.items():
                try:
                    await ctx.browser.close()
                except Exception as e:
                    console.print(f"[yellow]Could not close {platform} browser: {e}[/yellow]")

    # Print summary
    _print_summary(results)
242
+
243
+
244
def _print_summary(results: list):
    """Print a Rich table with one row per posting result.

    Does nothing when ``results`` is empty. Each entry is expected to expose
    ``video_id``, ``platform``, ``status`` and ``error`` attributes.
    """
    if not results:
        return

    # Colour used for the status cell; unrecognised statuses render white.
    colours = {"success": "green", "failed": "red", "skipped": "yellow"}

    summary = Table(title="Posting Summary")
    summary.add_column("Video", style="bold")
    for heading in ("Platform", "Status", "Error"):
        summary.add_column(heading)

    for entry in results:
        colour = colours.get(entry.status, "white")
        status_cell = f"[{colour}]{entry.status}[/{colour}]"
        summary.add_row(entry.video_id, entry.platform, status_cell, entry.error or "")

    console.print()
    console.print(summary)
270
+
271
+
272
+ # ── Status command ───────────────────────────────────────────────────────
273
+
274
+
275
@cli.command()
def status():
    """Show posting history."""
    # NOTE: the docstring above doubles as the click help text, so it stays
    # a single line.
    from poster import post_log

    history = post_log.get_all()
    if not history:
        console.print("[yellow]No posting history yet.[/yellow]")
        return

    # Colour for the status cell; anything else renders white.
    colours = {"success": "green", "failed": "red"}

    table = Table(title="Posting History")
    table.add_column("Video ID", style="bold")
    for heading in ("Platform", "Status", "Timestamp", "Error"):
        table.add_column(heading)

    # The log is iterated as {video_id: {platform: info-dict}}.
    for video_id, per_platform in history.items():
        for platform, info in per_platform.items():
            colour = colours.get(info.get("status", ""), "white")
            status_cell = f"[{colour}]{info.get('status', 'unknown')}[/{colour}]"
            # Keep only the first 19 characters of the stored timestamp.
            stamp = info.get("timestamp", "")[:19]
            error_text = info.get("error", "") or ""
            table.add_row(video_id, platform, status_cell, stamp, error_text)

    console.print(table)
308
+
309
+
310
+ if __name__ == "__main__":
311
+ cli()
social_distributor/poster/__init__.py ADDED
File without changes
social_distributor/poster/auth/__init__.py ADDED
File without changes
social_distributor/poster/auth/session.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Browser session management — persistent login via Playwright storage state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from playwright.async_api import BrowserContext, Playwright, async_playwright
8
+ from rich.console import Console
9
+
10
+ from ..config import AUTH_STORAGE_DIR, HEADLESS
11
+
12
+ console = Console()
13
+
14
+ PLATFORM_LOGIN_URLS = {
15
+ "instagram": "https://www.instagram.com/accounts/login/",
16
+ "tiktok": "https://www.tiktok.com/login",
17
+ "youtube": "https://studio.youtube.com/",
18
+ }
19
+
20
+ # Mobile UA for Instagram (required for mobile web Reels upload)
21
+ MOBILE_USER_AGENT = (
22
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) "
23
+ "AppleWebKit/605.1.15 (KHTML, like Gecko) "
24
+ "Version/17.0 Mobile/15E148 Safari/604.1"
25
+ )
26
+
27
+ DESKTOP_USER_AGENT = (
28
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
29
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
30
+ "Chrome/120.0.0.0 Safari/537.36"
31
+ )
32
+
33
+
34
def _state_path(platform: str) -> Path:
    """Return the storage-state JSON path for ``platform``.

    The auth storage directory is created (including parents) as a side
    effect, so the returned path always has an existing parent directory.
    """
    storage_dir = AUTH_STORAGE_DIR
    storage_dir.mkdir(parents=True, exist_ok=True)
    return storage_dir / f"{platform}_state.json"
37
+
38
+
39
def has_session(platform: str) -> bool:
    """Tell whether a saved browser session file exists for ``platform``."""
    state_file = _state_path(platform)
    return state_file.exists()
41
+
42
+
43
async def interactive_login(platform: str) -> None:
    """Launch a headed browser for the user to log in manually.

    After login, saves the browser storage state for future use.

    Args:
        platform: Key of ``PLATFORM_LOGIN_URLS``; an unknown key prints an
            error and returns without opening a browser.
    """
    # Local import: this module does not import asyncio at file level.
    import asyncio

    login_url = PLATFORM_LOGIN_URLS.get(platform)
    if not login_url:
        console.print(f"[red]Unknown platform: {platform}[/red]")
        return

    console.print(f"\n[bold]Opening {platform.title()} login page...[/bold]")
    console.print("[yellow]Please log in manually in the browser window.[/yellow]")
    console.print("[yellow]Press Enter here when you're done logging in.[/yellow]\n")

    # Instagram is driven through its mobile web UI, so log in with the same
    # mobile profile that get_context() uses later.
    use_mobile = platform == "instagram"

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                user_agent=MOBILE_USER_AGENT if use_mobile else DESKTOP_USER_AGENT,
                viewport={"width": 414, "height": 896} if use_mobile else {"width": 1280, "height": 800},
                is_mobile=use_mobile,
                has_touch=use_mobile,
            )

            page = await context.new_page()
            await page.goto(login_url, wait_until="domcontentloaded")

            # Wait for user to finish logging in. input() blocks, so run it in
            # a worker thread to keep the event loop (and Playwright's
            # connection to the browser) responsive in the meantime.
            await asyncio.get_running_loop().run_in_executor(
                None, input, ">>> Press Enter after you've logged in... "
            )

            # Save state
            state_file = _state_path(platform)
            await context.storage_state(path=str(state_file))
            console.print(f"[green]Session saved for {platform.title()}![/green]")
        finally:
            # Close the window even if navigation or saving failed.
            await browser.close()
80
+
81
+
82
async def get_context(
    pw: Playwright,
    platform: str,
    headless: bool | None = None,
) -> BrowserContext:
    """Get a browser context with saved session state.

    Args:
        pw: A started Playwright instance.
        platform: Platform whose saved storage state should be loaded.
        headless: Whether to launch Chromium headlessly; ``None`` falls back
            to the ``HEADLESS`` config value.

    Returns:
        A new context in a freshly launched Chromium browser. The caller owns
        the browser and is responsible for closing it (``context.browser``).

    Raises:
        FileNotFoundError: If no session exists — user must run login first.
    """
    state_file = _state_path(platform)
    if not state_file.exists():
        raise FileNotFoundError(
            f"No saved session for {platform}. Run: python post.py login {platform}"
        )

    if headless is None:
        headless = HEADLESS

    # Instagram uses a mobile profile (UA, viewport, touch); others use desktop.
    use_mobile = platform == "instagram"

    browser = await pw.chromium.launch(headless=headless)
    try:
        context = await browser.new_context(
            storage_state=str(state_file),
            user_agent=MOBILE_USER_AGENT if use_mobile else DESKTOP_USER_AGENT,
            viewport={"width": 414, "height": 896} if use_mobile else {"width": 1280, "height": 800},
            is_mobile=use_mobile,
            has_touch=use_mobile,
        )
    except Exception:
        # The caller never receives a handle to this browser, so close it
        # here rather than leaking the process (e.g. unreadable state file).
        await browser.close()
        raise

    return context
social_distributor/poster/caption_gen.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LLM-based caption generation for social media posts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ from rich.console import Console
8
+
9
+ from .config import (
10
+ INSTAGRAM_CAPTION_LIMIT,
11
+ POLLEN_MODEL,
12
+ TIKTOK_CAPTION_LIMIT,
13
+ YOUTUBE_DESCRIPTION_LIMIT,
14
+ YOUTUBE_TITLE_LIMIT,
15
+ bedrock_converse,
16
+ build_pollinations_client,
17
+ )
18
+ from .models import VideoData
19
+
20
+ console = Console()
21
+
22
+ PLATFORM_LIMITS = {
23
+ "instagram": INSTAGRAM_CAPTION_LIMIT,
24
+ "tiktok": TIKTOK_CAPTION_LIMIT,
25
+ "youtube": YOUTUBE_DESCRIPTION_LIMIT,
26
+ }
27
+
28
+ PLATFORM_HASHTAGS = {
29
+ "instagram": "#Reels #ReelsViral #ExplorePage",
30
+ "tiktok": "#fyp #foryou #foryoupage",
31
+ "youtube": "#Shorts #YouTubeShorts",
32
+ }
33
+
34
+
35
def _build_system_prompt(platform: str) -> str:
    """Build the system prompt that tells the LLM how to write the caption.

    The prompt embeds the platform's character limit (2200 when the platform
    has no entry in ``PLATFORM_LIMITS``), its hashtag set, and the JSON shape
    to return: ``title`` + ``description`` for YouTube, ``caption`` otherwise.
    """
    max_chars = PLATFORM_LIMITS.get(platform, 2200)
    platform_tags = PLATFORM_HASHTAGS.get(platform, "")
    platform_label = platform.title()

    if platform == "youtube":
        output_spec = "Return a JSON object with two fields: `title` (under " + str(YOUTUBE_TITLE_LIMIT) + " chars, punchy) and `description` (the full caption)."
    else:
        output_spec = "Return a JSON object with one field: `caption` (the full caption text)."

    return f"""You are a social media caption writer for VideoVoice, an AI voice-cloning video dubbing tool.

Your job: write a catchy, engaging caption for a dubbed video posted on {platform_label}.

VideoVoice's key differentiator: platform tools give you an option (subtitle overlay). We give you a BRAND NEW video with cloned voice — same speaker, new language. Background music preserved. 23+ languages. "2x Reach, Same Effort."

Rules:
1. Highlight the magic of hearing this content in the target language with the SAME voice (AI voice cloning, not just subtitles)
2. Be conversational, create curiosity, make people want to watch
3. ALWAYS include the original video link to credit the original creator
4. If a creator handle is provided, tag them with @
5. Stay within {max_chars} characters total
6. Include relevant hashtags: #VideoVoice #AIDubbing #VoiceCloning + language-specific + {platform_tags}
7. Write the caption primarily in English

{output_spec}

Example tone: "What's more interesting than hearing the power of English motivation but in the magic of Turkish words? Same voice. Same energy. New language."

IMPORTANT: Return ONLY valid JSON, no markdown fences."""
59
+
60
+
61
def _build_user_prompt(video: VideoData, creator_handle: str | None) -> str:
    """Assemble the newline-separated user message describing one video.

    Transcripts are clipped (500 characters of the original, 300 of the
    translation) to keep the prompt small. The link and creator lines are
    only included when those values are present.
    """
    source_snippet = video.original_text[:500]
    target_snippet = video.translated_text[:300]

    lines = [
        f"Source language: {video.source_language}",
        f"Target language: {video.target_language_name} ({video.target_language_code})",
        f"Original transcript (excerpt): {source_snippet}",
        f"Translated text (excerpt): {target_snippet}",
    ]

    optional_lines = []
    if video.video_link:
        optional_lines.append(f"Original video link: {video.video_link}")
    if creator_handle:
        optional_lines.append(f"Original creator: @{creator_handle}")
    lines.extend(optional_lines)

    return "\n".join(lines)
79
+
80
+
81
def _parse_response(raw: str, platform: str) -> dict:
    """Parse the LLM JSON response, with fallback for markdown fences.

    Args:
        raw: Text returned by the model.
        platform: Target platform; decides the shape of the fallback dict.

    Returns:
        The decoded JSON object. When the text is not valid JSON, or is valid
        JSON but not an object (a bare list, string or number), the text is
        wrapped as ``{"caption": ...}`` or, for YouTube, as a
        ``title``/``description`` pair, so callers can always use ``.get``.
    """
    raw = raw.strip()
    # Strip markdown code fences if present
    if raw.startswith("```"):
        raw = "\n".join(
            line for line in raw.split("\n") if not line.strip().startswith("```")
        )

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = None

    if isinstance(parsed, dict):
        return parsed

    # Not a JSON object: treat the whole thing as a caption
    if platform == "youtube":
        return {"title": "Dubbed with AI Voice Cloning", "description": raw}
    return {"caption": raw}
97
+
98
+
99
def generate_caption(
    video: VideoData,
    platform: str,
    creator_handle: str | None = None,
) -> dict:
    """Generate a caption using Pollinations LLM, with Bedrock fallback.

    Returns dict with 'caption' key (or 'title' + 'description' for YouTube).

    Raises:
        RuntimeError: When both providers fail; chained to the Bedrock error.
    """
    system_prompt = _build_system_prompt(platform)
    user_prompt = _build_user_prompt(video, creator_handle)

    # Primary: Pollinations. Any failure here (network, API, parsing) drops
    # through to the Bedrock attempt below.
    try:
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        completion = build_pollinations_client().chat.completions.create(
            model=POLLEN_MODEL,
            messages=chat_messages,
            temperature=0.7,
        )
        reply_text = completion.choices[0].message.content
        console.print(f"[green]Caption generated via Pollinations[/green] ({platform})")
        return _parse_response(reply_text, platform)
    except Exception as e:
        console.print(f"[yellow]Pollinations failed: {e}. Trying Bedrock...[/yellow]")

    # Fallback: AWS Bedrock
    try:
        reply_text = bedrock_converse(system_prompt, user_prompt, temperature=0.7)
        console.print(f"[green]Caption generated via Bedrock[/green] ({platform})")
        return _parse_response(reply_text, platform)
    except Exception as e:
        console.print(f"[red]Bedrock also failed: {e}[/red]")
        raise RuntimeError(f"Caption generation failed for {video.video_id} on {platform}") from e
136
+
137
+
138
def format_caption(
    caption_data: dict,
    video: VideoData,
    platform: str,
    creator_handle: str | None = None,
) -> str | tuple[str, str]:
    """Ensure the final caption always contains the original link and creator credit.

    Args:
        caption_data: Parsed LLM output (``caption``, or ``title`` +
            ``description`` for YouTube).
        video: Video being posted; only ``video_link`` is read here.
        platform: Target platform name.
        creator_handle: Creator username without the leading ``@``, if known.

    Returns:
        A string for Instagram/TikTok, or (title, description) tuple for YouTube.
    """
    is_youtube = platform == "youtube"

    # The model does not always use the key that was asked for, and may emit
    # null values. Accept either key on every platform and coerce missing or
    # null values to text so the containment checks below cannot raise.
    if is_youtube:
        title = caption_data.get("title") or "AI Voice Dubbed"
        desc = caption_data.get("description") or caption_data.get("caption") or ""
    else:
        desc = caption_data.get("caption") or caption_data.get("description") or ""
    if not isinstance(desc, str):
        desc = str(desc)

    # Ensure original link is present
    if video.video_link and video.video_link not in desc:
        desc += f"\n\nOriginal: {video.video_link}"

    # Ensure creator tag is present
    if creator_handle and f"@{creator_handle}" not in desc:
        desc += f"\nCredit: @{creator_handle}"

    if is_youtube:
        return str(title), desc
    return desc
social_distributor/poster/config.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Central configuration — env loading, constants, language maps."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from dotenv import load_dotenv
7
+ from openai import OpenAI
8
+
9
+ load_dotenv(Path(__file__).resolve().parent.parent / ".env")
10
+
11
+ # ── Paths ────────────────────────────────────────────────────────────────
12
+ POSTER_ROOT = Path(__file__).resolve().parent.parent
13
+ VIDEOVOICE_DATA_DIR = Path(
14
+ os.getenv("VIDEOVOICE_DATA_DIR", str(POSTER_ROOT.parent / "data"))
15
+ )
16
+ AUTH_STORAGE_DIR = POSTER_ROOT / "poster" / "auth" / "storage"
17
+ POST_LOG_PATH = POSTER_ROOT / "post_history.json"
18
+ CREATOR_CACHE_PATH = POSTER_ROOT / "creator_cache.json"
19
+
20
+ # ── Pollinations LLM (primary) ───────────────────────────────────────────
21
+ POLLINATIONS_BASE = "https://gen.pollinations.ai/v1"
22
+ POLLEN_MODEL = os.getenv("POLLEN_MODEL", "gemini-search")
23
+
24
+
25
def build_pollinations_client() -> OpenAI:
    """Create an OpenAI-compatible client pointed at the Pollinations endpoint.

    The API key is the first non-empty value among POLLEN_API_KEY_SECONDARY,
    POLLEN_API_KEY and POLLINATIONS_API_KEY, falling back to the literal
    "pollinations" when none of them is set.
    """
    api_key = "pollinations"
    for env_name in ("POLLEN_API_KEY_SECONDARY", "POLLEN_API_KEY", "POLLINATIONS_API_KEY"):
        candidate = os.getenv(env_name)
        if candidate:
            api_key = candidate
            break
    return OpenAI(base_url=POLLINATIONS_BASE, api_key=api_key)
33
+
34
+
35
+ # ── Bedrock fallback ─────────────────────────────────────────────────────
36
+ BEDROCK_REGION = os.getenv("AWS_REGION", "us-east-1")
37
+ BEDROCK_MODEL = os.getenv("BEDROCK_MODEL", "qwen.qwen3-next-80b-a3b")
38
+
39
+
40
def bedrock_converse(system_prompt: str, user_text: str, temperature: float = 0.3) -> str:
    """Send one system + user exchange to Bedrock's Converse API.

    Returns the text of the first content part of the reply, stripped of
    surrounding whitespace. boto3 is imported lazily so it is only needed
    when this fallback path is actually used.
    """
    import boto3

    runtime = boto3.client("bedrock-runtime", region_name=BEDROCK_REGION)
    user_message = {"role": "user", "content": [{"text": user_text}]}
    reply = runtime.converse(
        modelId=BEDROCK_MODEL,
        messages=[user_message],
        system=[{"text": system_prompt}],
        inferenceConfig={"temperature": temperature},
    )
    first_part = reply["output"]["message"]["content"][0]
    return first_part["text"].strip()
51
+
52
+
53
+ # ── Language code → name (reversed from pipeline.py LANGUAGE_CODES) ──────
54
+ LANGUAGE_CODE_TO_NAME: dict[str, str] = {
55
+ "ar": "Arabic",
56
+ "zh": "Chinese",
57
+ "da": "Danish",
58
+ "nl": "Dutch",
59
+ "en": "English",
60
+ "fi": "Finnish",
61
+ "fr": "French",
62
+ "de": "German",
63
+ "el": "Greek",
64
+ "he": "Hebrew",
65
+ "hi": "Hindi",
66
+ "it": "Italian",
67
+ "ja": "Japanese",
68
+ "ko": "Korean",
69
+ "ms": "Malay",
70
+ "no": "Norwegian",
71
+ "pl": "Polish",
72
+ "pt": "Portuguese",
73
+ "ru": "Russian",
74
+ "es": "Spanish",
75
+ "sw": "Swahili",
76
+ "sv": "Swedish",
77
+ "tr": "Turkish",
78
+ }
79
+
80
+ # ── Platform caption limits ──────────────────────────────────────────────
81
+ INSTAGRAM_CAPTION_LIMIT = 2200
82
+ TIKTOK_CAPTION_LIMIT = 4000
83
+ YOUTUBE_TITLE_LIMIT = 100
84
+ YOUTUBE_DESCRIPTION_LIMIT = 5000
85
+
86
+ # ── Posting settings ─────────────────────────────────────────────────────
87
+ POST_DELAY = int(os.getenv("POST_DELAY", "30"))
88
+ HEADLESS = os.getenv("HEADLESS", "true").lower() == "true"
social_distributor/poster/creator_extract.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Extract original creator @username from video URLs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+
8
+ from rich.console import Console
9
+
10
+ from .config import CREATOR_CACHE_PATH
11
+
12
+ console = Console()
13
+
14
+
15
def _load_cache() -> dict[str, str]:
    """Load the video-link → creator-username cache from disk.

    Returns an empty dict when the cache file does not exist, cannot be read,
    is not valid JSON, or does not hold a JSON object. The cache is only an
    optimisation, so a damaged file must not abort creator extraction.
    """
    if not CREATOR_CACHE_PATH.exists():
        return {}

    try:
        with open(CREATOR_CACHE_PATH) as f:
            data = json.load(f)
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        console.print(f"[yellow]Ignoring unreadable creator cache: {err}[/yellow]")
        return {}

    return data if isinstance(data, dict) else {}
20
+
21
+
22
def _save_cache(cache: dict[str, str]) -> None:
    """Write the creator cache to disk as 2-space-indented JSON, replacing the old file."""
    serialized = json.dumps(cache, indent=2)
    with open(CREATOR_CACHE_PATH, "w") as out:
        out.write(serialized)
25
+
26
+
27
async def extract_creator(video_link: str | None, browser_context=None) -> str | None:
    """Look up the original creator's username for a video URL.

    The URL is matched against Instagram, TikTok and YouTube host names and
    handed to the matching extractor, which visits the page through the given
    Playwright browser context. Successful lookups are cached on disk by URL;
    failed lookups are not cached.

    Returns the username without a leading ``@``, or ``None`` when the link is
    empty, the host is not recognised, or extraction failed.
    """
    if not video_link:
        return None

    known = _load_cache()
    if video_link in known:
        return known[video_link]

    handle = None

    try:
        if "instagram.com" in video_link:
            extractor = _extract_instagram
        elif "tiktok.com" in video_link:
            extractor = _extract_tiktok
        elif "youtube.com" in video_link or "youtu.be" in video_link:
            extractor = _extract_youtube
        else:
            extractor = None

        if extractor is not None:
            handle = await extractor(video_link, browser_context)
    except Exception as e:
        # Best effort: a failed lookup just means no creator credit.
        console.print(f"[yellow]Creator extraction failed: {e}[/yellow]")

    if not handle:
        return handle

    # Clean up username
    handle = handle.strip().lstrip("@")
    known[video_link] = handle
    _save_cache(known)
    console.print(f"[green]Creator found:[/green] @{handle}")
    return handle
60
+
61
+
62
async def _extract_instagram(url: str, ctx) -> str | None:
    """Instagram: open the reel and read a username from og:title or the final URL.

    Returns ``None`` when no browser context is supplied or neither source
    yields a match. The page is always closed before returning.
    """
    if not ctx:
        return None

    page = await ctx.new_page()
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        await page.wait_for_timeout(2000)

        # First choice: the leading token of the og:title meta tag
        # ("Username on Instagram: ..." or "@username").
        meta = await page.query_selector('meta[property="og:title"]')
        title_text = await meta.get_attribute("content") if meta else None
        if title_text:
            from_title = re.match(r"^@?(\w[\w.]+)", title_text)
            if from_title:
                return from_title.group(1)

        # Second choice: the post-redirect URL, which may look like
        # /username/reel/ID.
        from_url = re.search(r"instagram\.com/([^/]+)/reel", page.url)
        return from_url.group(1) if from_url else None
    finally:
        await page.close()
92
+
93
+
94
async def _extract_tiktok(url: str, ctx) -> str | None:
    """TikTok: load the link (following any redirect) and read /@username.

    Falls back to the first ``@handle`` in the og:title meta tag. Returns
    ``None`` when no browser context is supplied or nothing matches. The page
    is always closed before returning.
    """
    if not ctx:
        return None

    page = await ctx.new_page()
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        await page.wait_for_timeout(2000)

        # Short links redirect to the canonical /@username/video/ID form.
        from_url = re.search(r"/@([^/]+)", page.url)
        if from_url:
            return from_url.group(1)

        # Fallback: check meta tags
        meta = await page.query_selector('meta[property="og:title"]')
        title_text = await meta.get_attribute("content") if meta else None
        if not title_text:
            return None
        from_title = re.search(r"@(\w[\w.]+)", title_text)
        return from_title.group(1) if from_title else None
    finally:
        await page.close()
121
+
122
+
123
async def _extract_youtube(url: str, ctx) -> str | None:
    """YouTube: open the video page and read the channel name from its metadata.

    Tries the ``content`` attribute of ``link[itemprop="name"]`` first, then
    the text after the last " - " in og:title. Returns ``None`` when no
    browser context is supplied or neither source is usable. The page is
    always closed before returning.
    """
    if not ctx:
        return None

    page = await ctx.new_page()
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        await page.wait_for_timeout(2000)

        # Try link[itemprop="name"] inside the channel section
        name_link = await page.query_selector('link[itemprop="name"]')
        channel_name = await name_link.get_attribute("content") if name_link else None
        if channel_name:
            return channel_name

        # Fallback: og:title often has "Video Title - Channel Name"
        meta = await page.query_selector('meta[property="og:title"]')
        title_text = await meta.get_attribute("content") if meta else None
        if title_text and " - " in title_text:
            return title_text.rsplit(" - ", 1)[-1].strip()
        return None
    finally:
        await page.close()
social_distributor/poster/models.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data models for the poster pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass
class VideoData:
    """One dubbed video plus the text and metadata used to caption and post it."""

    video_id: str  # identifier used in logs, results and the post history
    output_path: str  # path of the video file handed to the platform posters
    video_link: str | None  # URL of the original video, if known
    source_language: str
    target_language_code: str
    target_language_name: str
    original_text: str  # original transcript (excerpted when building prompts)
    translated_text: str  # translated transcript (excerpted when building prompts)
    platform_type: str | None  # "instagram" | "tiktok" | "youtube" | None
19
+
20
+
21
@dataclass
class PostResult:
    """Outcome of one attempt to post one video to one platform."""

    video_id: str
    platform: str
    status: str  # "success" | "failed" | "skipped"
    timestamp: str  # ISO-8601 UTC string when built by a poster; "" for skipped entries
    caption_used: str = ""  # caption/description text submitted with the post
    error: str | None = None  # failure reason, if any
    url: str | None = None  # optional URL for the post
social_distributor/poster/platforms/__init__.py ADDED
File without changes
social_distributor/poster/platforms/base.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Abstract base class for platform posters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import random
7
+ from abc import ABC, abstractmethod
8
+ from datetime import datetime, timezone
9
+
10
+ from playwright.async_api import BrowserContext, Page
11
+
12
+ from ..models import PostResult
13
+
14
+
15
class BasePoster(ABC):
    """Common scaffolding for the per-platform posters.

    Subclasses set ``platform`` and implement ``post`` and ``is_logged_in``.
    The helpers here provide randomised pauses, failure screenshots and
    construction of ``PostResult`` records.
    """

    platform: str = ""

    def __init__(self, context: BrowserContext):
        # Playwright browser context carrying the saved login session.
        self.context = context

    @abstractmethod
    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path`` with ``caption`` and report the outcome."""
        ...

    @abstractmethod
    async def is_logged_in(self) -> bool:
        """Tell whether the saved session is still authenticated."""
        ...

    async def _human_delay(self, min_s: float = 1.0, max_s: float = 3.0) -> None:
        """Sleep for a random duration between ``min_s`` and ``max_s`` seconds."""
        pause = random.uniform(min_s, max_s)
        await asyncio.sleep(pause)

    async def _screenshot_on_error(self, page: Page, video_id: str) -> None:
        """Save a debug screenshot on failure."""
        from ..config import POSTER_ROOT

        target_dir = POSTER_ROOT / "errors"
        target_dir.mkdir(exist_ok=True)
        stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        shot_file = target_dir / f"{self.platform}_{video_id}_{stamp}.png"
        await page.screenshot(path=str(shot_file))

    def _make_result(
        self,
        video_id: str,
        status: str,
        caption: str = "",
        error: str | None = None,
        url: str | None = None,
    ) -> PostResult:
        """Build a ``PostResult`` for this platform stamped with the current UTC time."""
        now_iso = datetime.now(timezone.utc).isoformat()
        return PostResult(
            video_id=video_id,
            platform=self.platform,
            status=status,
            timestamp=now_iso,
            caption_used=caption,
            error=error,
            url=url,
        )
social_distributor/poster/platforms/instagram.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Instagram Reel posting via Playwright (mobile web viewport)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from rich.console import Console
6
+
7
+ from .base import BasePoster
8
+ from ..models import PostResult
9
+
10
+ console = Console()
11
+
12
+
13
class InstagramPoster(BasePoster):
    """Posts a video as an Instagram Reel by driving the Instagram web UI.

    All element lookups use lists of alternative selectors and take the first
    one that matches.
    """

    platform = "instagram"

    async def is_logged_in(self) -> bool:
        """Return True when the home page shows navigation and no login form.

        Any error while loading or querying the page is reported as False.
        """
        page = await self.context.new_page()
        try:
            await page.goto("https://www.instagram.com/", wait_until="domcontentloaded", timeout=15000)
            await page.wait_for_timeout(3000)
            # A username input means we were served the login form.
            login_form = await page.query_selector('input[name="username"]')
            if login_form:
                return False
            nav = await page.query_selector('nav, div[role="navigation"]')
            return nav is not None
        except Exception:
            return False
        finally:
            await page.close()

    async def _dismiss_popups(self, page) -> None:
        """Dismiss common Instagram popups (notifications, app switch, cookies)."""
        dismiss_selectors = [
            'button:has-text("Not Now")',
            'button:has-text("Cancel")',
            'button:has-text("Accept All")',
            'button:has-text("Allow All Cookies")',
            'button:has-text("Decline")',
        ]
        for selector in dismiss_selectors:
            try:
                btn = await page.query_selector(selector)
                if btn and await btn.is_visible():
                    await btn.click()
                    await self._human_delay(0.5, 1)
            except Exception:
                # Best effort: a popup that cannot be clicked is ignored.
                pass

    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path`` as a Reel with ``caption``.

        Args:
            video_path: Local path of the video file to upload.
            caption: Caption text typed into the caption field.
            **kwargs: ``video_id`` (defaults to "unknown") is used in the
                result and in error-screenshot file names.

        Returns:
            A ``PostResult`` with status "success" or "failed". Exceptions are
            caught and turned into a "failed" result; the page is always closed.
        """
        video_id = kwargs.get("video_id", "unknown")
        page = await self.context.new_page()

        try:
            console.print(f"[cyan]Instagram:[/cyan] Navigating to Instagram...")
            await page.goto("https://www.instagram.com/", wait_until="domcontentloaded", timeout=20000)
            await page.wait_for_timeout(3000)

            await self._dismiss_popups(page)

            # A visible login form means the saved session is no longer valid.
            login_form = await page.query_selector('input[name="username"]')
            if login_form:
                return self._make_result(video_id, "failed", caption, error="Not logged in — session expired")

            await self._human_delay(1, 2)

            # Click the create/new post button
            console.print(f"[cyan]Instagram:[/cyan] Opening create dialog...")
            create_selectors = [
                'svg[aria-label="New post"]',
                'a[href="/create/"]',
                'div[role="menuitem"] svg[aria-label*="New"]',
                'a[href="/create/select/"]',
                '[aria-label="New post"]',
                'svg[aria-label="New Post"]',
            ]

            create_clicked = False
            for selector in create_selectors:
                el = await page.query_selector(selector)
                if el:
                    await el.click()
                    create_clicked = True
                    break

            # No create button found: navigate straight to the create URL.
            if not create_clicked:
                await page.goto("https://www.instagram.com/create/select/", wait_until="domcontentloaded")
                await page.wait_for_timeout(2000)

            await self._human_delay(2, 3)
            await self._dismiss_popups(page)

            # ── FIX: Instagram's file input is hidden by default.
            # Wait for it to be *attached* to the DOM (not visible),
            # then call set_input_files() which works on hidden inputs.
            console.print(f"[cyan]Instagram:[/cyan] Uploading video...")
            try:
                file_input = await page.wait_for_selector(
                    'input[type="file"]',
                    state="attached",  # <-- was default "visible", which timed out
                    timeout=15000,
                )
            except Exception:
                # Fallback: query directly without waiting
                file_input = await page.query_selector('input[type="file"]')

            if not file_input:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="File input not found in DOM")

            # Unhide the input via JS as a safety measure, then set the file
            await page.evaluate(
                """el => {
                    el.style.display = 'block';
                    el.style.opacity = '1';
                    el.style.visibility = 'visible';
                }""",
                file_input,
            )
            await file_input.set_input_files(video_path)

            await self._human_delay(3, 5)

            # Instagram may show aspect ratio / crop screen — look for Reel tab
            # NOTE(review): 'div:has-text("Reel")' looks broad enough to match
            # outer container divs too, and query_selector takes the first
            # match — confirm this clicks the intended tab.
            reel_tab = await page.query_selector('div:has-text("Reel"), button:has-text("Reel")')
            if reel_tab:
                await reel_tab.click()
                await self._human_delay(1, 2)

            # Click through editing steps (crop, filters, etc.)
            # At most three "Next" clicks; stops early once no Next button is found.
            for _ in range(3):
                next_btn = await page.query_selector(
                    'button:has-text("Next"), div[role="button"]:has-text("Next")'
                )
                if next_btn:
                    await next_btn.click()
                    await self._human_delay(2, 3)
                    await self._dismiss_popups(page)
                else:
                    break

            # Fill in the caption
            console.print(f"[cyan]Instagram:[/cyan] Adding caption...")
            caption_selectors = [
                'textarea[aria-label*="Write a caption"]',
                'textarea[placeholder*="Write a caption"]',
                'div[contenteditable="true"][role="textbox"]',
                'div[aria-label*="Write a caption"]',
            ]

            caption_filled = False
            for selector in caption_selectors:
                editor = await page.query_selector(selector)
                if editor:
                    await editor.click()
                    await self._human_delay(0.5, 1)
                    # Typed key by key (10 ms per key) rather than filled in one go.
                    await page.keyboard.type(caption, delay=10)
                    caption_filled = True
                    break

            # A missing caption field only warns; the reel is still shared
            # without a caption.
            if not caption_filled:
                console.print("[yellow]Instagram: Could not find caption field[/yellow]")

            await self._human_delay(2, 3)

            # Click Share
            console.print(f"[cyan]Instagram:[/cyan] Sharing...")
            share_btn = await page.query_selector(
                'button:has-text("Share"), div[role="button"]:has-text("Share")'
            )
            if share_btn:
                await share_btn.click()
            else:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Could not find Share button")

            # Wait for upload to complete
            console.print(f"[cyan]Instagram:[/cyan] Waiting for upload to complete...")
            await page.wait_for_timeout(10000)

            # Check for success
            try:
                await page.wait_for_selector(
                    'div:has-text("Your reel has been shared"), '
                    'div:has-text("Reel shared"), '
                    'span:has-text("Your reel has been shared"), '
                    'img[alt="Animated checkmark"]',
                    timeout=60000,
                )
                console.print(f"[green]Instagram: Reel shared successfully![/green]")
                return self._make_result(video_id, "success", caption)
            except Exception:
                # No confirmation element within 60 s: fall back to the URL.
                # NOTE(review): the first comparison is already implied by the
                # second. If the create flow opened as a dialog on the home
                # URL (button-click path above), "/create" is never in the
                # URL, so this branch would report success even when sharing
                # failed — confirm against the live UI.
                if page.url == "https://www.instagram.com/" or "/create" not in page.url:
                    console.print(f"[green]Instagram: Likely posted (redirected to feed)[/green]")
                    return self._make_result(video_id, "success", caption)

                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Share confirmation not detected")

        except Exception as e:
            # Any unexpected error becomes a "failed" result; the screenshot
            # itself is best effort.
            try:
                await self._screenshot_on_error(page, video_id)
            except Exception:
                pass
            return self._make_result(video_id, "failed", caption, error=str(e))
        finally:
            await page.close()
social_distributor/poster/platforms/tiktok.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """TikTok video posting via Playwright (tiktok.com/upload)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from rich.console import Console
6
+
7
+ from .base import BasePoster
8
+ from ..models import PostResult
9
+
10
+ console = Console()
11
+
12
+
13
class TikTokPoster(BasePoster):
    """Upload videos to TikTok by automating tiktok.com/upload with Playwright.

    The class uses ``self.context`` to open pages and calls ``_human_delay``,
    ``_screenshot_on_error`` and ``_make_result``; none of these are defined
    here, so they are presumably provided by ``BasePoster``.
    """

    platform = "tiktok"

    async def is_logged_in(self) -> bool:
        """Return True when the upload page loads and exposes a file input.

        Returns False when the browser ends up on a ``/login`` URL, when no
        file input is found, or when any exception occurs while checking.
        The page opened for the check is always closed.
        """
        page = await self.context.new_page()
        try:
            await page.goto("https://www.tiktok.com/upload", wait_until="domcontentloaded", timeout=15000)
            await page.wait_for_timeout(3000)
            # If redirected to login, we're not logged in
            if "/login" in page.url:
                return False
            # Look for upload area
            upload_area = await page.query_selector('input[type="file"]')
            return upload_area is not None
        except Exception:
            return False
        finally:
            await page.close()

    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path`` with ``caption`` and report the outcome.

        Args:
            video_path: Path handed to the upload page's file input.
            caption: Text typed into the caption editor (skipped with a
                warning when no editor is found).
            **kwargs: ``video_id`` (default ``"unknown"``) identifies the
                video in the result and in error screenshots.

        Returns:
            The value of ``self._make_result(...)`` with status ``"success"``
            or ``"failed"``. Exceptions raised during the flow are converted
            into a ``"failed"`` result carrying ``str(e)`` as the error.
        """
        # Local import keeps this block self-contained; the select-all
        # shortcut is Cmd+A on macOS but Ctrl+A elsewhere. Pressing "Meta+a"
        # on Linux/Windows selects nothing, so the pre-filled caption would
        # not be cleared and the new caption would be appended to it.
        import sys

        select_all = "Meta+a" if sys.platform == "darwin" else "Control+a"

        video_id = kwargs.get("video_id", "unknown")
        page = await self.context.new_page()

        try:
            console.print("[cyan]TikTok:[/cyan] Navigating to upload page...")
            await page.goto("https://www.tiktok.com/upload", wait_until="domcontentloaded", timeout=20000)
            await page.wait_for_timeout(3000)

            if "/login" in page.url:
                return self._make_result(video_id, "failed", caption, error="Not logged in — session expired")

            # Upload video via file input
            console.print("[cyan]TikTok:[/cyan] Uploading video...")
            file_input = await page.wait_for_selector('input[type="file"]', timeout=10000)
            await file_input.set_input_files(video_path)

            # Wait for video to process (the upload indicator / thumbnail appears)
            await self._human_delay(3, 5)

            # Wait for video processing — look for the editor/preview to appear.
            # If none of the selectors shows up in time, fall back to a fixed
            # extra wait instead of aborting.
            try:
                await page.wait_for_selector(
                    'div[class*="editor"], div[class*="preview"], div[class*="video-card"]',
                    timeout=60000,
                )
            except Exception:
                console.print("[yellow]TikTok: Waiting for upload processing...[/yellow]")
                await page.wait_for_timeout(10000)

            await self._human_delay(2, 4)

            # Fill in the caption
            console.print("[cyan]TikTok:[/cyan] Adding caption...")

            # TikTok uses a contenteditable div for the caption;
            # try multiple selectors for the caption editor, first match wins.
            caption_selectors = [
                'div[contenteditable="true"]',
                'div[data-placeholder*="caption"]',
                'div[class*="caption"] div[contenteditable="true"]',
                '.public-DraftEditor-content',
            ]

            caption_editor = None
            for selector in caption_selectors:
                caption_editor = await page.query_selector(selector)
                if caption_editor:
                    break

            if caption_editor:
                await caption_editor.click()
                await self._human_delay(0.5, 1)
                # Clear existing text and type new caption
                await page.keyboard.press(select_all)
                await page.keyboard.press("Backspace")
                await self._human_delay(0.3, 0.5)
                await page.keyboard.type(caption, delay=10)
            else:
                console.print("[yellow]TikTok: Could not find caption editor[/yellow]")

            await self._human_delay(2, 3)

            # Click Post button
            console.print("[cyan]TikTok:[/cyan] Posting...")
            post_button_selectors = [
                'button:has-text("Post")',
                'button[class*="post-button"]',
                'div[class*="btn-post"] button',
            ]

            posted = False
            for selector in post_button_selectors:
                btn = await page.query_selector(selector)
                if btn and await btn.is_enabled():
                    await btn.click()
                    posted = True
                    break

            if not posted:
                # Fallback: locate the button by its accessible role and name
                try:
                    await page.get_by_role("button", name="Post").click()
                    posted = True
                except Exception:
                    pass

            if not posted:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Could not find Post button")

            # Wait for upload to complete
            console.print("[cyan]TikTok:[/cyan] Waiting for upload to complete...")
            await page.wait_for_timeout(10000)

            # Check for success indicators
            success = False
            try:
                await page.wait_for_selector(
                    'div:has-text("uploaded"), div:has-text("Your video"), div[class*="success"]',
                    timeout=30000,
                )
                success = True
            except Exception:
                # If URL changed away from upload page, likely success
                if "/upload" not in page.url:
                    success = True

            if success:
                console.print("[green]TikTok: Posted successfully![/green]")
                return self._make_result(video_id, "success", caption)
            else:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Upload may not have completed")

        except Exception as e:
            # Best-effort screenshot: a failure here must not mask the original error.
            try:
                await self._screenshot_on_error(page, video_id)
            except Exception:
                pass
            return self._make_result(video_id, "failed", caption, error=str(e))
        finally:
            await page.close()
social_distributor/poster/platforms/youtube.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """YouTube Shorts posting via Playwright (studio.youtube.com)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from rich.console import Console
6
+
7
+ from .base import BasePoster
8
+ from ..models import PostResult
9
+
10
+ console = Console()
11
+
12
+
13
class YouTubePoster(BasePoster):
    """Publish videos through YouTube Studio by automating it with Playwright.

    The class uses ``self.context`` to open pages and calls ``_human_delay``,
    ``_screenshot_on_error`` and ``_make_result``; none of these are defined
    here, so they are presumably provided by ``BasePoster``.
    """

    platform = "youtube"

    async def is_logged_in(self) -> bool:
        """Return True when YouTube Studio loads and shows the Create button.

        Returns False when the browser lands on ``accounts.google.com``, when
        the Create button is not found, or when any exception occurs. The
        page opened for the check is always closed.
        """
        page = await self.context.new_page()
        try:
            await page.goto("https://studio.youtube.com/", wait_until="domcontentloaded", timeout=15000)
            await page.wait_for_timeout(3000)
            if "accounts.google.com" in page.url:
                return False
            # Look for the Create button in YouTube Studio
            create_btn = await page.query_selector('#create-icon, button[aria-label="Create"]')
            return create_btn is not None
        except Exception:
            return False
        finally:
            await page.close()

    async def post(self, video_path: str, caption: str, **kwargs) -> PostResult:
        """Upload ``video_path`` via the Studio upload wizard and publish it.

        Args:
            video_path: Path handed to the upload dialog's file input.
            caption: Text typed into the description field when it is found.
            **kwargs: ``video_id`` (default ``"unknown"``) identifies the
                video in results and screenshots; ``title`` (default
                ``"AI Voice Dubbed"``) is truncated to 100 characters.

        Returns:
            The value of ``self._make_result(...)`` with status ``"success"``
            (including the video URL when a link is found) or ``"failed"``.
            Exceptions raised during the flow are converted into a
            ``"failed"`` result carrying ``str(e)`` as the error.
        """
        # Local import keeps this block self-contained; the select-all
        # shortcut is Cmd+A on macOS but Ctrl+A elsewhere. Pressing "Meta+a"
        # on Linux/Windows selects nothing, so the typed title would be
        # appended to whatever the title box already contains.
        import sys

        select_all = "Meta+a" if sys.platform == "darwin" else "Control+a"

        video_id = kwargs.get("video_id", "unknown")
        title = kwargs.get("title", "AI Voice Dubbed")
        page = await self.context.new_page()

        try:
            console.print("[cyan]YouTube:[/cyan] Navigating to YouTube Studio...")
            await page.goto("https://studio.youtube.com/", wait_until="domcontentloaded", timeout=20000)
            await page.wait_for_timeout(3000)

            if "accounts.google.com" in page.url:
                return self._make_result(video_id, "failed", caption, error="Not logged in — session expired")

            # Click Create button
            console.print("[cyan]YouTube:[/cyan] Opening upload dialog...")
            create_btn = await page.wait_for_selector(
                '#create-icon, button[aria-label="Create"]', timeout=10000
            )
            await create_btn.click()
            await self._human_delay(1, 2)

            # Click "Upload videos"
            upload_option = await page.wait_for_selector(
                'tp-yt-paper-item:has-text("Upload videos"), #text-item-0', timeout=5000
            )
            await upload_option.click()
            await self._human_delay(1, 2)

            # Upload video file
            console.print("[cyan]YouTube:[/cyan] Uploading video...")
            file_input = await page.wait_for_selector('input[type="file"]', timeout=10000)
            await file_input.set_input_files(video_path)

            # Wait for upload to start processing
            await self._human_delay(3, 5)

            # Wait for the details form to appear; on timeout fall back to a
            # fixed extra wait instead of aborting.
            try:
                await page.wait_for_selector(
                    '#textbox[aria-label*="title"], div[id="textbox"]',
                    timeout=60000,
                )
            except Exception:
                console.print("[yellow]YouTube: Waiting for upload form...[/yellow]")
                await page.wait_for_timeout(10000)

            await self._human_delay(1, 2)

            # Fill in title
            console.print("[cyan]YouTube:[/cyan] Setting title and description...")
            title_input = await page.query_selector('#textbox[aria-label*="title"]')
            if title_input:
                await title_input.click()
                # Select the existing title so typing replaces it.
                await page.keyboard.press(select_all)
                await page.keyboard.type(title[:100], delay=10)

            await self._human_delay(1, 2)

            # Fill in description
            desc_input = await page.query_selector(
                '#textbox[aria-label*="description"], '
                'div[aria-label*="Tell viewers about your video"]'
            )
            if desc_input:
                await desc_input.click()
                await page.keyboard.type(caption, delay=5)

            await self._human_delay(1, 2)

            # Handle "Made for kids" — select "No, it's not made for kids"
            not_for_kids = await page.query_selector(
                'tp-yt-paper-radio-button[name="NOT_MADE_FOR_KIDS"], '
                '#radioLabel:has-text("not made for kids")'
            )
            if not_for_kids:
                await not_for_kids.click()
                await self._human_delay(0.5, 1)

            # Click Next through the wizard steps (Elements, Checks, Visibility)
            for step_name in ["Elements", "Checks", "Visibility"]:
                console.print(f"[cyan]YouTube:[/cyan] Step: {step_name}...")
                next_btn = await page.query_selector('#next-button, button:has-text("Next")')
                if next_btn:
                    await next_btn.click()
                    await self._human_delay(2, 3)

            # Set visibility to Public
            public_radio = await page.query_selector(
                'tp-yt-paper-radio-button[name="PUBLIC"], '
                '#radioLabel:has-text("Public")'
            )
            if public_radio:
                await public_radio.click()
                await self._human_delay(1, 2)

            # Click Publish / Done
            console.print("[cyan]YouTube:[/cyan] Publishing...")
            publish_btn = await page.query_selector(
                '#done-button, button:has-text("Publish"), button:has-text("Done")'
            )
            if publish_btn:
                await publish_btn.click()
            else:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Could not find Publish button")

            # Wait for publish confirmation
            await page.wait_for_timeout(10000)

            # Check for success — dialog may show "Video published" or close
            try:
                await page.wait_for_selector(
                    'div:has-text("Video published"), a[href*="youtu"]',
                    timeout=30000,
                )
                # Try to extract the video URL
                link_el = await page.query_selector('a[href*="youtu.be"], a[href*="youtube.com/watch"]')
                video_url = None
                if link_el:
                    video_url = await link_el.get_attribute("href")

                console.print("[green]YouTube: Published successfully![/green]")
                return self._make_result(video_id, "success", caption, url=video_url)
            except Exception:
                await self._screenshot_on_error(page, video_id)
                return self._make_result(video_id, "failed", caption, error="Publish confirmation not detected")

        except Exception as e:
            # Best-effort screenshot: a failure here must not mask the original error.
            try:
                await self._screenshot_on_error(page, video_id)
            except Exception:
                pass
            return self._make_result(video_id, "failed", caption, error=str(e))
        finally:
            await page.close()
social_distributor/poster/post_log.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """JSON-based posting history for deduplication."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import datetime, timezone
7
+
8
+ from .config import POST_LOG_PATH
9
+ from .models import PostResult
10
+
11
+
12
def _load() -> dict:
    """Read the posting history from POST_LOG_PATH.

    Returns an empty dict when the log file does not exist yet.
    """
    if not POST_LOG_PATH.exists():
        return {}
    with open(POST_LOG_PATH) as log_file:
        history = json.load(log_file)
    return history
17
+
18
+
19
def _save(data: dict) -> None:
    """Overwrite POST_LOG_PATH with ``data`` serialized as indented JSON."""
    with open(POST_LOG_PATH, "w") as log_file:
        json.dump(data, log_file, indent=2)
22
+
23
+
24
def is_posted(video_id: str, platform: str) -> bool:
    """Return True when the log holds a "success" entry for this video/platform pair."""
    per_platform = _load().get(video_id, {})
    outcome = per_platform.get(platform, {})
    return outcome.get("status") == "success"
28
+
29
+
30
def record(result: PostResult) -> None:
    """Store ``result`` in the log under its video id and platform, then persist.

    An existing entry for the same video/platform pair is replaced.
    """
    history = _load()
    per_platform = history.setdefault(result.video_id, {})
    per_platform[result.platform] = {
        "status": result.status,
        "timestamp": result.timestamp,
        "caption": result.caption_used,
        "error": result.error,
        "url": result.url,
    }
    _save(history)
42
+
43
+
44
def get_all() -> dict:
    """Return the complete posting history as loaded from disk."""
    history = _load()
    return history
social_distributor/poster/video_loader.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Load and validate VideoVoice data folders into VideoData objects."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from pathlib import Path
8
+
9
+ from rich.console import Console
10
+
11
+ from .config import LANGUAGE_CODE_TO_NAME, VIDEOVOICE_DATA_DIR
12
+ from .models import VideoData
13
+
14
+ console = Console()
15
+
16
+
17
+ def _detect_platform(video_link: str | None) -> str | None:
18
+ if not video_link:
19
+ return None
20
+ if re.search(r"/reels?/", video_link):
21
+ return "instagram"
22
+ if "tiktok.com" in video_link:
23
+ return "tiktok"
24
+ if "youtube.com" in video_link or "youtu.be" in video_link:
25
+ return "youtube"
26
+ return None
27
+
28
+
29
def load_video(folder_name: str, lang_override: str | None = None) -> VideoData | None:
    """Load a single video folder. Returns None if the folder is invalid.

    A folder is invalid when it does not exist, lacks ``output.mp4`` or
    ``transcription.json``, or when ``transcription.json`` is not a readable
    UTF-8 JSON object. ``segment_comparison.json`` is optional: when it is
    missing or unreadable, the target language falls back to ``"en"`` and the
    translated text to an empty string.

    Args:
        folder_name: Name of the folder under VIDEOVOICE_DATA_DIR; also used
            as the resulting ``video_id``.
        lang_override: When given, used as the target language name instead of
            the name looked up from the detected language code.

    Returns:
        A populated ``VideoData``, or None when the folder is invalid.
    """
    folder = VIDEOVOICE_DATA_DIR / folder_name
    if not folder.is_dir():
        console.print(f"[red]Folder not found:[/red] {folder}")
        return None

    output_mp4 = folder / "output.mp4"
    if not output_mp4.exists():
        console.print(f"[red]No output.mp4 in:[/red] {folder_name}")
        return None

    # Read transcription.json
    transcription_path = folder / "transcription.json"
    if not transcription_path.exists():
        console.print(f"[red]No transcription.json in:[/red] {folder_name}")
        return None

    # JSON is UTF-8 by specification; relying on the locale's default encoding
    # breaks non-ASCII transcripts on systems whose default is not UTF-8.
    try:
        with open(transcription_path, encoding="utf-8") as f:
            transcription = json.load(f)
    except ValueError as err:  # JSONDecodeError and UnicodeDecodeError
        console.print(f"[red]Invalid transcription.json in:[/red] {folder_name} ({err})")
        return None
    if not isinstance(transcription, dict):
        console.print(f"[red]Invalid transcription.json in:[/red] {folder_name} (expected a JSON object)")
        return None

    video_link = transcription.get("video_link")
    source_language = transcription.get("source_language", "en")
    original_text = " ".join(
        seg.get("text", "") for seg in transcription.get("segments", [])
    )

    # Read segment_comparison.json
    seg_comp_path = folder / "segment_comparison.json"
    target_lang_code = "en"
    translated_text = ""

    if seg_comp_path.exists():
        try:
            with open(seg_comp_path, encoding="utf-8") as f:
                segments = json.load(f)
        except ValueError as err:
            # The file is optional, so a corrupt one degrades to the defaults.
            console.print(
                f"[yellow]Ignoring invalid segment_comparison.json in:[/yellow] {folder_name} ({err})"
            )
            segments = None
        if segments and isinstance(segments, list):
            target_lang_code = segments[0].get("language_id", "en")
            # Prefer the text actually sent to TTS, falling back to the raw translation.
            translated_text = " ".join(
                seg.get("tts_text", "") or seg.get("translated_text", "")
                for seg in segments
            )

    target_lang_name = lang_override or LANGUAGE_CODE_TO_NAME.get(
        target_lang_code, target_lang_code
    )

    return VideoData(
        video_id=folder_name,
        output_path=str(output_mp4),
        video_link=video_link,
        source_language=source_language,
        target_language_code=target_lang_code,
        target_language_name=target_lang_name,
        original_text=original_text,
        translated_text=translated_text,
        platform_type=_detect_platform(video_link),
    )
86
+
87
+
88
def load_videos(
    folder_names: list[str], lang_override: str | None = None
) -> list[VideoData]:
    """Load each named folder in order and return the ones that loaded.

    Folders for which ``load_video`` returns a falsy value are skipped; every
    successful load is reported on the console.
    """
    loaded = []
    for folder_name in folder_names:
        item = load_video(folder_name, lang_override)
        if not item:
            continue
        loaded.append(item)
        console.print(
            f"[green]Loaded:[/green] {folder_name} "
            f"({item.source_language} -> {item.target_language_name})"
        )
    return loaded
social_distributor/pyproject.toml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "videovoice-poster"
3
+ version = "0.1.0"
4
+ description = "Automated social media posting for VideoVoice dubbed videos"
5
+ requires-python = ">=3.10"
6
+ dependencies = [
7
+ "playwright>=1.40",
8
+ "openai>=1.0",
9
+ "boto3>=1.34",
10
+ "click>=8.0",
11
+ "rich>=13.0",
12
+ "python-dotenv>=1.0",
13
+ ]
14
+
15
+ [tool.hatch.build.targets.wheel]
16
+ packages = ["poster"]
17
+
18
+ [build-system]
19
+ requires = ["hatchling"]
20
+ build-backend = "hatchling.build"
social_distributor/uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
steps/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Video Translation Pipeline — steps package
steps/lang/__init__.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Language-specific handlers for the translation pipeline.
2
+
3
+ Each language that needs special handling gets its own module (e.g. urdu.py).
4
+ This package provides a simple dispatcher so s3_translate.py stays language-agnostic.
5
+ """
6
+
7
+
8
+ def _get_handler(target_language: str):
9
+ """Lazy-import language handler module if it exists."""
10
+ lang = target_language.lower()
11
+ if lang == "urdu":
12
+ from . import urdu
13
+ return urdu
14
+ return None
15
+
16
+
17
def get_translation_prompt(target_language: str, default_prompt: str) -> str:
    """Return the handler's translation prompt, or ``default_prompt`` without one.

    The default is used when the language has no handler module or the handler
    does not define ``get_translation_prompt``.
    """
    handler = _get_handler(target_language)
    if not handler or not hasattr(handler, 'get_translation_prompt'):
        return default_prompt
    return handler.get_translation_prompt()
23
+
24
+
25
def get_fallback_mode(target_language: str) -> str:
    """Return the fallback translator mode for ``target_language``.

    Delegates to the handler's ``get_fallback_mode`` when it exists; otherwise
    the mode is "google".
    """
    handler = _get_handler(target_language)
    if not handler or not hasattr(handler, 'get_fallback_mode'):
        return "google"
    return handler.get_fallback_mode()
31
+
32
+
33
def post_translate(segments: list[dict], target_language: str) -> list[dict]:
    """Apply the handler's post-translation step, if the language has one.

    Segments are returned unchanged when there is no handler module or the
    handler does not define ``post_translate``.
    """
    handler = _get_handler(target_language)
    if not handler or not hasattr(handler, 'post_translate'):
        return segments
    return handler.post_translate(segments)
steps/lang/_shared.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared utilities for language-specific translation handlers."""
2
+ import json
3
+ import os
4
+ import re
5
+ from datetime import datetime, timezone
6
+
7
+ from openai import OpenAI
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+
12
+ POLLINATIONS_BASE = "https://gen.pollinations.ai/v1"
13
+ MODEL = os.getenv("POLLEN_MODEL", "openai-large")
14
+
15
+
16
def build_client() -> OpenAI:
    """Create an OpenAI-compatible client that talks to the Pollinations endpoint.

    The API key is the first non-empty value among POLLEN_API_KEY_SECONDARY,
    POLLEN_API_KEY and POLLINATIONS_API_KEY; if none is set, the placeholder
    "pollinations" is used.
    """
    api_key = "pollinations"
    for env_name in ("POLLEN_API_KEY_SECONDARY", "POLLEN_API_KEY", "POLLINATIONS_API_KEY"):
        candidate = os.getenv(env_name)
        if candidate:
            api_key = candidate
            break
    return OpenAI(base_url=POLLINATIONS_BASE, api_key=api_key)
25
+
26
+
27
+ _LLM_LOG_PATH = "tmp/llm_calls.json"
28
+
29
+
30
def log_llm_call(
    step: str,
    provider: str,
    model: str,
    system_prompt: str,
    user_prompt: str,
    response: str,
    temperature: float,
) -> None:
    """Append one LLM call record to the JSON log at ``_LLM_LOG_PATH``.

    The whole log is read, extended and rewritten. A missing or undecodable
    log file is treated as an empty history, and the log's directory is
    created when needed.
    """
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "step": step,
        "provider": provider,
        "model": model,
        "temperature": temperature,
        "system_prompt": system_prompt,
        "user_prompt": user_prompt,
        "response": response,
    }

    history = []
    try:
        with open(_LLM_LOG_PATH, "r", encoding="utf-8") as src:
            history = json.load(src)
    except (FileNotFoundError, json.JSONDecodeError):
        # Start a fresh history when there is nothing usable on disk.
        pass

    history.append(record)

    log_dir = os.path.dirname(_LLM_LOG_PATH) or "."
    os.makedirs(log_dir, exist_ok=True)
    with open(_LLM_LOG_PATH, "w", encoding="utf-8") as dst:
        json.dump(history, dst, indent=2, ensure_ascii=False)
62
+
63
+
64
def parse_json_array(raw: str) -> list:
    """Parse a JSON array from LLM output, tolerating markdown fences and chatter.

    Candidates are tried in order: the whole stripped text, the widest
    ``[...]`` span, then the widest ``{...}`` span. A candidate that is not
    valid JSON, or that decodes to something other than a list or dict, is
    skipped so the next one still gets a chance.

    Normalization:
        * dict  -> list of its values, in insertion order
        * list  -> each non-empty nested list is replaced by its first
                   element; every other item is converted with ``str``

    Args:
        raw: Raw text returned by the model.

    Returns:
        The normalized list.

    Raises:
        ValueError: If no candidate yields a list or dict.
    """

    def _normalize(parsed):
        # Returns None for scalars so the caller moves on to the next candidate.
        if isinstance(parsed, dict):
            return list(parsed.values())
        if isinstance(parsed, list):
            return [
                item[0] if isinstance(item, list) and len(item) > 0 else str(item)
                for item in parsed
            ]
        return None

    def _candidates(text):
        # Lazily produce candidates so the regex scans only run when needed.
        yield text
        for pattern in (r'\[.*\]', r'\{.*\}'):
            match = re.search(pattern, text, re.DOTALL)
            if match:
                yield match.group()

    raw = raw.strip()

    for candidate in _candidates(raw):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        items = _normalize(parsed)
        if items is not None:
            return items

    raise ValueError(f"Could not parse JSON array from LLM response:\n{raw[:200]}")
96
+
97
+
98
def bedrock_converse(system_prompt: str, user_text: str, temperature: float = 0.1, step: str = "bedrock") -> str:
    """Send one system+user exchange to Bedrock and return the reply text.

    The region comes from AWS_REGION (default "us-east-1") and the model from
    BEDROCK_MODEL (default "qwen.qwen3-next-80b-a3b"). The text of the first
    content block is stripped, logged via ``log_llm_call`` and returned.
    """
    import boto3

    model_id = os.getenv("BEDROCK_MODEL", "qwen.qwen3-next-80b-a3b")
    runtime = boto3.client(
        "bedrock-runtime",
        region_name=os.getenv("AWS_REGION", "us-east-1"),
    )

    user_message = {"role": "user", "content": [{"text": user_text}]}
    reply = runtime.converse(
        modelId=model_id,
        messages=[user_message],
        system=[{"text": system_prompt}],
        inferenceConfig={"temperature": temperature},
    )
    first_block = reply["output"]["message"]["content"][0]
    text = first_block["text"].strip()

    log_llm_call(
        step=step,
        provider="bedrock",
        model=model_id,
        system_prompt=system_prompt,
        user_prompt=user_text,
        response=text,
        temperature=temperature,
    )

    return text
121
+
122
+
123
def bedrock_fallback(segments: list[dict], numbered: str, system_prompt: str, max_retries: int = 2) -> list[dict]:
    """Fallback translator using AWS Bedrock.

    The request is retried when the reply cannot be parsed or when it contains
    the wrong number of items.

    Args:
        segments: Source segments; each result is a copy of a segment with a
            ``translated_text`` key added.
        numbered: The numbered source lines sent as the user message.
        system_prompt: Base translation prompt; a strict item-count
            instruction is appended to it.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so that at least one request is always made.

    Returns:
        One dict per input segment, in order, each with ``translated_text``.

    Raises:
        ValueError: If the last attempt is unparseable or has the wrong
            number of items.
    """
    expected = len(segments)
    # Guard against max_retries <= 0, which previously skipped the loop and
    # failed later with an unbound variable.
    attempts = max(1, max_retries)
    strict_prompt = (
        system_prompt
        + f"\n\nCRITICAL: You MUST return exactly {expected} items in the JSON array "
        f"— one per input line. Do NOT merge, skip, or split any lines."
    )

    print(f"[lang] Bedrock fallback: translating {expected} segments")

    translated_list = []
    for attempt in range(1, attempts + 1):
        is_last = attempt == attempts
        raw = bedrock_converse(strict_prompt, numbered, step="s3_translate_bedrock")

        try:
            candidate = parse_json_array(raw)
        except ValueError as err:
            # json.JSONDecodeError is a ValueError subclass, so both kinds of
            # parse failure are retried like a count mismatch.
            print(f"[lang] Bedrock response could not be parsed (attempt {attempt}/{attempts}): {err}")
            if is_last:
                raise
            continue

        if len(candidate) == expected:
            translated_list = candidate
            break

        print(f"[lang] Bedrock returned {len(candidate)}/{expected} items (attempt {attempt}/{attempts})")
        if is_last:
            raise ValueError(
                f"Bedrock translation returned {len(candidate)} items but expected {expected} after {attempts} attempts"
            )

    # Strip list numbering the model may echo back ("1. ", "2) ", "3- ").
    # Items are coerced to str because dict-shaped replies can carry non-string values.
    cleaned = [re.sub(r'^\d+[\.\)\-]\s*', '', str(t)) for t in translated_list]
    result = [{**seg, "translated_text": t} for seg, t in zip(segments, cleaned)]
    print("[lang] Bedrock fallback translation complete ✓")
    return result
steps/lang/omnivoice_languages.py ADDED
@@ -0,0 +1,652 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AUTO-GENERATED from k2-fsa/OmniVoice omnivoice/utils/lang_map.py
2
+ # Source: https://github.com/k2-fsa/OmniVoice/blob/master/omnivoice/utils/lang_map.py
3
+ """Omnivoice-supported languages (display name -> Omnivoice language id)."""
4
+
5
+ OMNIVOICE_LANGUAGE_CODES: dict[str, str] = {
6
+ "Abadi": "kbt",
7
+ "Abkhazian": "ab",
8
+ "Abron": "abr",
9
+ "Abua": "abn",
10
+ "Adamawa Fulfulde": "fub",
11
+ "Adyghe": "ady",
12
+ "Afade": "aal",
13
+ "Afrikaans": "af",
14
+ "Agwagwune": "yay",
15
+ "Aja (Benin)": "ajg",
16
+ "Akebu": "keu",
17
+ "Alago": "ala",
18
+ "Albanian": "sq",
19
+ "Algerian Arabic": "arq",
20
+ "Algerian Saharan Arabic": "aao",
21
+ "Ambo-Pasco Quechua": "qva",
22
+ "Ambonese Malay": "abs",
23
+ "Amdo Tibetan": "adx",
24
+ "Amharic": "am",
25
+ "Anaang": "anw",
26
+ "Angika": "anp",
27
+ "Antankarana Malagasy": "xmv",
28
+ "Aragonese": "an",
29
+ "Arbëreshë Albanian": "aae",
30
+ "Arequipa-La Unión Quechua": "qxu",
31
+ "Armenian": "hy",
32
+ "Ashe": "ahs",
33
+ "Ashéninka Perené": "prq",
34
+ "Askopan": "eiv",
35
+ "Assamese": "as",
36
+ "Asturian": "ast",
37
+ "Atayal": "tay",
38
+ "Awak": "awo",
39
+ "Ayacucho Quechua": "quy",
40
+ "Azerbaijani": "az",
41
+ "Baatonum": "bba",
42
+ "Bacama": "bcy",
43
+ "Bade": "bde",
44
+ "Bafia": "ksf",
45
+ "Bafut": "bfd",
46
+ "Bagirmi Fulfulde": "fui",
47
+ "Bago-Kusuntu": "bqg",
48
+ "Baharna Arabic": "abv",
49
+ "Bakoko": "bkh",
50
+ "Balanta-Ganja": "bjt",
51
+ "Balti": "bft",
52
+ "Bamenyam": "bce",
53
+ "Bamun": "bax",
54
+ "Bangwinji": "bsj",
55
+ "Banjar": "bjn",
56
+ "Bankon": "abb",
57
+ "Baoulé": "bci",
58
+ "Bara Malagasy": "bhr",
59
+ "Barok": "bjk",
60
+ "Basa (Cameroon)": "bas",
61
+ "Basa (Nigeria)": "bzw",
62
+ "Bashkir": "ba",
63
+ "Basque": "eu",
64
+ "Batak Mandailing": "btm",
65
+ "Batanga": "bnm",
66
+ "Bateri": "btv",
67
+ "Bats": "bbl",
68
+ "Bayot": "bda",
69
+ "Bebele": "beb",
70
+ "Belarusian": "be",
71
+ "Bengali": "bn",
72
+ "Betawi": "bew",
73
+ "Bhili": "bhb",
74
+ "Bhojpuri": "bho",
75
+ "Bilur": "bxf",
76
+ "Bima": "bhp",
77
+ "Bodo": "brx",
78
+ "Boghom": "bux",
79
+ "Bokyi": "bky",
80
+ "Bomu": "bmq",
81
+ "Bondei": "bou",
82
+ "Borgu Fulfulde": "fue",
83
+ "Bosnian": "bs",
84
+ "Brahui": "brh",
85
+ "Braj": "bra",
86
+ "Breton": "br",
87
+ "Buduma": "bdm",
88
+ "Buginese": "bug",
89
+ "Bukharic": "bhh",
90
+ "Bulgarian": "bg",
91
+ "Bulu (Cameroon)": "bum",
92
+ "Bundeli": "bns",
93
+ "Bunun": "bnn",
94
+ "Bura-Pabir": "bwr",
95
+ "Burak": "bys",
96
+ "Burmese": "my",
97
+ "Burushaski": "bsk",
98
+ "Cacaloxtepec Mixtec": "miu",
99
+ "Cajatambo North Lima Quechua": "qvl",
100
+ "Cakfem-Mushere": "cky",
101
+ "Cameroon Pidgin": "wes",
102
+ "Campidanese Sardinian": "sro",
103
+ "Cantonese": "yue",
104
+ "Catalan": "ca",
105
+ "Cebuano": "ceb",
106
+ "Cen": "cen",
107
+ "Central Kurdish": "ckb",
108
+ "Central Nahuatl": "nhn",
109
+ "Central Pame": "pbs",
110
+ "Central Pashto": "pst",
111
+ "Central Puebla Nahuatl": "ncx",
112
+ "Central Tarahumara": "tar",
113
+ "Central Yupik": "esu",
114
+ "Central-Eastern Niger Fulfulde": "fuq",
115
+ "Chadian Arabic": "shu",
116
+ "Chichewa": "ny",
117
+ "Chichicapan Zapotec": "zpv",
118
+ "Chiga": "cgg",
119
+ "Chimalapa Zoque": "zoh",
120
+ "Chimborazo Highland Quichua": "qug",
121
+ "Chinese": "zh",
122
+ "Chiquián Ancash Quechua": "qxa",
123
+ "Chitwania Tharu": "the",
124
+ "Chokwe": "cjk",
125
+ "Chuvash": "cv",
126
+ "Cibak": "ckl",
127
+ "Coastal Konjo": "kjc",
128
+ "Copainalá Zoque": "zoc",
129
+ "Cornish": "kw",
130
+ "Corongo Ancash Quechua": "qwa",
131
+ "Croatian": "hr",
132
+ "Cross River Mbembe": "mfn",
133
+ "Cuyamecalco Mixtec": "xtu",
134
+ "Czech": "cs",
135
+ "Dadiya": "dbd",
136
+ "Dagbani": "dag",
137
+ "Dameli": "dml",
138
+ "Danish": "da",
139
+ "Dargwa": "dar",
140
+ "Dazaga": "dzg",
141
+ "Deccan": "dcc",
142
+ "Degema": "deg",
143
+ "Dera (Nigeria)": "kna",
144
+ "Dghwede": "dgh",
145
+ "Dhatki": "mki",
146
+ "Dhivehi": "dv",
147
+ "Dhofari Arabic": "adf",
148
+ "Dijim-Bwilim": "cfa",
149
+ "Dogri": "dgo",
150
+ "Domaaki": "dmk",
151
+ "Dotyali": "dty",
152
+ "Duala": "dua",
153
+ "Dutch": "nl",
154
+ "DũYa": "ldb",
155
+ "Dyula": "dyu",
156
+ "Eastern Balochi": "bgp",
157
+ "Eastern Bolivian Guaraní": "gui",
158
+ "Eastern Egyptian Bedawi Arabic": "avl",
159
+ "Eastern Krahn": "kqo",
160
+ "Eastern Mari": "mhr",
161
+ "Eastern Yiddish": "ydd",
162
+ "Ebrié": "ebr",
163
+ "Eggon": "ego",
164
+ "Egyptian Arabic": "arz",
165
+ "Ejagham": "etu",
166
+ "Eleme": "elm",
167
+ "Eloyi": "afo",
168
+ "Embu": "ebu",
169
+ "English": "en",
170
+ "Erzya": "myv",
171
+ "Esan": "ish",
172
+ "Esperanto": "eo",
173
+ "Estonian": "et",
174
+ "Eton (Cameroon)": "eto",
175
+ "Ewondo": "ewo",
176
+ "Extremaduran": "ext",
177
+ "Fang (Equatorial Guinea)": "fan",
178
+ "Fanti": "fat",
179
+ "Farefare": "gur",
180
+ "Fe'fe'": "fmp",
181
+ "Filipino": "fil",
182
+ "Filomena Mata-Coahuitlán Totonac": "tlp",
183
+ "Finnish": "fi",
184
+ "Fipa": "fip",
185
+ "French": "fr",
186
+ "Fulah": "ff",
187
+ "Galician": "gl",
188
+ "Gambian Wolof": "wof",
189
+ "Ganda": "lg",
190
+ "Garhwali": "gbm",
191
+ "Gawar-Bati": "gwt",
192
+ "Gawri": "gwc",
193
+ "Gbagyi": "gbr",
194
+ "Gbari": "gby",
195
+ "Geji": "gyz",
196
+ "Gen": "gej",
197
+ "Georgian": "ka",
198
+ "German": "de",
199
+ "Geser-Gorom": "ges",
200
+ "Gheg Albanian": "aln",
201
+ "Ghomálá'": "bbj",
202
+ "Gidar": "gid",
203
+ "Glavda": "glw",
204
+ "Goan Konkani": "gom",
205
+ "Goaria": "gig",
206
+ "Goemai": "ank",
207
+ "Gola": "gol",
208
+ "Greek": "el",
209
+ "Guarani": "gn",
210
+ "Guduf-Gava": "gdf",
211
+ "Guerrero Amuzgo": "amu",
212
+ "Gujarati": "gu",
213
+ "Gujari": "gju",
214
+ "Gulf Arabic": "afb",
215
+ "Gurgula": "ggg",
216
+ "Gusii": "guz",
217
+ "Gusilay": "gsl",
218
+ "Gweno": "gwe",
219
+ "Güilá Zapotec": "ztu",
220
+ "Hadothi": "hoj",
221
+ "Hahon": "hah",
222
+ "Haitian": "ht",
223
+ "Hakha Chin": "cnh",
224
+ "Hakö": "hao",
225
+ "Halia": "hla",
226
+ "Hausa": "ha",
227
+ "Hawaiian": "haw",
228
+ "Hazaragi": "haz",
229
+ "Hebrew": "he",
230
+ "Hemba": "hem",
231
+ "Herero": "hz",
232
+ "Highland Konjo": "kjk",
233
+ "Hijazi Arabic": "acw",
234
+ "Hindi": "hi",
235
+ "Huarijio": "var",
236
+ "Huautla Mazatec": "mau",
237
+ "Huaxcaleca Nahuatl": "nhq",
238
+ "Huba": "hbb",
239
+ "Huitepec Mixtec": "mxs",
240
+ "Hula": "hul",
241
+ "Hungarian": "hu",
242
+ "Hunjara-Kaina Ke": "hkk",
243
+ "Hwana": "hwo",
244
+ "Ibibio": "ibb",
245
+ "Icelandic": "is",
246
+ "Idakho-Isukha-Tiriki": "ida",
247
+ "Idoma": "idu",
248
+ "Igbo": "ig",
249
+ "Igo": "ahl",
250
+ "Ikposo": "kpo",
251
+ "Ikwere": "ikw",
252
+ "Imbabura Highland Quichua": "qvi",
253
+ "Indonesian": "id",
254
+ "Indus Kohistani": "mvy",
255
+ "Interlingua (International Auxiliary Language Association)": "ia",
256
+ "Inupiaq": "ik",
257
+ "Irish": "ga",
258
+ "Iron Ossetic": "os",
259
+ "Isekiri": "its",
260
+ "Isoko": "iso",
261
+ "Italian": "it",
262
+ "Ito": "itw",
263
+ "Itzá": "itz",
264
+ "Ixtayutla Mixtec": "vmj",
265
+ "Izon": "ijc",
266
+ "Jambi Malay": "jax",
267
+ "Japanese": "ja",
268
+ "Jaqaru": "jqr",
269
+ "Jauja Wanca Quechua": "qxw",
270
+ "Jaunsari": "jns",
271
+ "Javanese": "jv",
272
+ "Jiba": "juo",
273
+ "Jju": "kaj",
274
+ "Judeo-Moroccan Arabic": "aju",
275
+ "Juxtlahuaca Mixtec": "vmc",
276
+ "Kabardian": "kbd",
277
+ "Kabras": "lkb",
278
+ "Kabuverdianu": "kea",
279
+ "Kabyle": "kab",
280
+ "Kachi Koli": "gjk",
281
+ "Kairak": "ckr",
282
+ "Kalabari": "ijn",
283
+ "Kalasha": "kls",
284
+ "Kalenjin": "kln",
285
+ "Kalkoti": "xka",
286
+ "Kamba": "kam",
287
+ "Kamo": "kcq",
288
+ "Kanauji": "bjj",
289
+ "Kanembu": "kbl",
290
+ "Kannada": "kn",
291
+ "Karekare": "kai",
292
+ "Kashmiri": "ks",
293
+ "Kathoriya Tharu": "tkt",
294
+ "Kati": "bsh",
295
+ "Kazakh": "kk",
296
+ "Keiyo": "eyo",
297
+ "Khams Tibetan": "khg",
298
+ "Khana": "ogo",
299
+ "Khetrani": "xhe",
300
+ "Khmer": "km",
301
+ "Khowar": "khw",
302
+ "Kinga": "zga",
303
+ "Kinnauri": "kfk",
304
+ "Kinyarwanda": "rw",
305
+ "Kirghiz": "ky",
306
+ "Kirya-Konzəl": "fkk",
307
+ "Kochila Tharu": "thq",
308
+ "Kohistani Shina": "plk",
309
+ "Kohumono": "bcs",
310
+ "Kok Borok": "trp",
311
+ "Kol (Papua New Guinea)": "kol",
312
+ "Kom (Cameroon)": "bkm",
313
+ "Koma": "kmy",
314
+ "Konkani": "knn",
315
+ "Konzo": "koo",
316
+ "Korean": "ko",
317
+ "Korwa": "kfp",
318
+ "Kota (India)": "kfe",
319
+ "Koti": "eko",
320
+ "Kuanua": "ksd",
321
+ "Kuanyama": "kj",
322
+ "Kui (India)": "uki",
323
+ "Kulung (Nigeria)": "bbu",
324
+ "Kuot": "kto",
325
+ "Kushi": "kuh",
326
+ "Kwambi": "kwm",
327
+ "Kwasio": "nmg",
328
+ "Lala-Roba": "lla",
329
+ "Lamang": "hia",
330
+ "Lao": "lo",
331
+ "Larike-Wakasihu": "alo",
332
+ "Lasi": "lss",
333
+ "Latgalian": "ltg",
334
+ "Latvian": "lv",
335
+ "Levantine Arabic": "apc",
336
+ "Liana-Seti": "ste",
337
+ "Liberia Kpelle": "xpe",
338
+ "Liberian English": "lir",
339
+ "Libyan Arabic": "ayl",
340
+ "Ligurian": "lij",
341
+ "Lijili": "mgi",
342
+ "Lingala": "ln",
343
+ "Lithuanian": "lt",
344
+ "Loarki": "lrk",
345
+ "Logooli": "rag",
346
+ "Logudorese Sardinian": "src",
347
+ "Loja Highland Quichua": "qvj",
348
+ "Loloda": "loa",
349
+ "Longuda": "lnu",
350
+ "Loxicha Zapotec": "ztp",
351
+ "Luba-Lulua": "lua",
352
+ "Luo": "luo",
353
+ "Lushai": "lus",
354
+ "Luxembourgish": "lb",
355
+ "Maasina Fulfulde": "ffm",
356
+ "Maba (Chad)": "mde",
357
+ "Macedo-Romanian": "rup",
358
+ "Macedonian": "mk",
359
+ "Mada (Cameroon)": "mxu",
360
+ "Mafa": "maf",
361
+ "Maithili": "mai",
362
+ "Malay": "ms",
363
+ "Malayalam": "ml",
364
+ "Mali": "gcc",
365
+ "Malinaltepec Me'phaa": "tcf",
366
+ "Maltese": "mt",
367
+ "Mandara": "tbf",
368
+ "Mandjak": "mfv",
369
+ "Manggarai": "mqy",
370
+ "Manipuri": "mni",
371
+ "Mansoanka": "msw",
372
+ "Manx": "gv",
373
+ "Maori": "mi",
374
+ "Marathi": "mr",
375
+ "Marghi Central": "mrt",
376
+ "Marghi South": "mfm",
377
+ "Maria (India)": "mrr",
378
+ "Marwari (Pakistan)": "mve",
379
+ "Masana": "mcn",
380
+ "Masikoro Malagasy": "msh",
381
+ "Matsés": "mcf",
382
+ "Mazaltepec Zapotec": "zpy",
383
+ "Mazatlán Mazatec": "vmz",
384
+ "Mazatlán Mixe": "mzl",
385
+ "Mbe": "mfo",
386
+ "Mbo (Cameroon)": "mbo",
387
+ "Mbum": "mdd",
388
+ "Medumba": "byv",
389
+ "Mekeo": "mek",
390
+ "Meru": "mer",
391
+ "Mesopotamian Arabic": "acm",
392
+ "Mewari": "mtr",
393
+ "Min Nan Chinese": "nan",
394
+ "Mingrelian": "xmf",
395
+ "Mitlatongo Mixtec": "vmm",
396
+ "Miya": "mkf",
397
+ "Mokpwe": "bri",
398
+ "Moksha": "mdf",
399
+ "Mom Jango": "ver",
400
+ "Mongolian": "mn",
401
+ "Moroccan Arabic": "ary",
402
+ "Motu": "meu",
403
+ "Mpiemo": "mcx",
404
+ "Mpumpong": "mgg",
405
+ "Mundang": "mua",
406
+ "Mungaka": "mhk",
407
+ "Musey": "mse",
408
+ "Musgu": "mug",
409
+ "Musi": "mui",
410
+ "Naba": "mne",
411
+ "Najdi Arabic": "ars",
412
+ "Nalik": "nal",
413
+ "Nawdm": "nmz",
414
+ "Ndonga": "ng",
415
+ "Neapolitan": "nap",
416
+ "Nepali": "npi",
417
+ "Ngamo": "nbh",
418
+ "Ngas": "anc",
419
+ "Ngiemboon": "nnh",
420
+ "Ngizim": "ngi",
421
+ "Ngomba": "jgo",
422
+ "Ngombale": "nla",
423
+ "Nigerian Fulfulde": "fuv",
424
+ "Nigerian Pidgin": "pcm",
425
+ "Nimadi": "noe",
426
+ "Nobiin": "fia",
427
+ "North Mesopotamian Arabic": "ayp",
428
+ "North Moluccan Malay": "max",
429
+ "Northern Betsimisaraka Malagasy": "bmm",
430
+ "Northern Hindko": "hno",
431
+ "Northern Kurdish": "kmr",
432
+ "Northern Pame": "pmq",
433
+ "Northern Pashto": "pbu",
434
+ "Northern Uzbek": "uzn",
435
+ "Northwest Gbaya": "gya",
436
+ "Norwegian": "no",
437
+ "Norwegian Bokmål": "nb",
438
+ "Norwegian Nynorsk": "nn",
439
+ "Notsi": "ncf",
440
+ "Nyankpa": "yes",
441
+ "Nyungwe": "nyu",
442
+ "Nzanyi": "nja",
443
+ "Nüpode Huitoto": "hux",
444
+ "Occitan": "oc",
445
+ "Od": "odk",
446
+ "Odia": "ory",
447
+ "Odual": "odu",
448
+ "Omani Arabic": "acx",
449
+ "Orizaba Nahuatl": "nlv",
450
+ "Orma": "orc",
451
+ "Ormuri": "oru",
452
+ "Oromo": "om",
453
+ "Pahari-Potwari": "phr",
454
+ "Paiwan": "pwn",
455
+ "Panjabi": "pa",
456
+ "Papuan Malay": "pmy",
457
+ "Parkari Koli": "kvx",
458
+ "Pedi": "nso",
459
+ "Pero": "pip",
460
+ "Persian": "fa",
461
+ "Petats": "pex",
462
+ "Phalura": "phl",
463
+ "Piemontese": "pms",
464
+ "Piya-Kwonci": "piy",
465
+ "Plateau Malagasy": "plt",
466
+ "Polish": "pl",
467
+ "Poqomam": "poc",
468
+ "Portuguese": "pt",
469
+ "Pulaar": "fuc",
470
+ "Pular": "fuf",
471
+ "Puno Quechua": "qxp",
472
+ "Pushto": "ps",
473
+ "Pökoot": "pko",
474
+ "Qaqet": "byx",
475
+ "Quiotepec Chinantec": "chq",
476
+ "Rana Tharu": "thr",
477
+ "Rangi": "lag",
478
+ "Rapoisi": "kyx",
479
+ "Ratahan": "rth",
480
+ "Rayón Zoque": "zor",
481
+ "Romanian": "ro",
482
+ "Romansh": "rm",
483
+ "Rombo": "rof",
484
+ "Rotokas": "roo",
485
+ "Rukai": "dru",
486
+ "Russian": "ru",
487
+ "Sacapulteco": "quv",
488
+ "Saidi Arabic": "aec",
489
+ "Sakalava Malagasy": "skg",
490
+ "Sakizaya": "szy",
491
+ "Saleman": "sau",
492
+ "Samba Daka": "ccg",
493
+ "Samba Leko": "ndi",
494
+ "San Felipe Otlaltepec Popoloca": "pow",
495
+ "San Francisco Del Mar Huave": "hue",
496
+ "San Juan Atzingo Popoloca": "poe",
497
+ "San Martín Itunyoso Triqui": "trq",
498
+ "San Miguel El Grande Mixtec": "mig",
499
+ "Sansi": "ssi",
500
+ "Sanskrit": "sa",
501
+ "Santa Ana de Tusi Pasco Quechua": "qxt",
502
+ "Santa Catarina Albarradas Zapotec": "ztn",
503
+ "Santali": "sat",
504
+ "Santiago del Estero Quichua": "qus",
505
+ "Saposa": "sps",
506
+ "Saraiki": "skr",
507
+ "Sardinian": "sc",
508
+ "Saya": "say",
509
+ "Sediq": "trv",
510
+ "Serbian": "sr",
511
+ "Seri": "sei",
512
+ "Shina": "scl",
513
+ "Shona": "sn",
514
+ "Siar-Lak": "sjr",
515
+ "Sibe": "nco",
516
+ "Sicilian": "scn",
517
+ "Sihuas Ancash Quechua": "qws",
518
+ "Sikkimese": "sip",
519
+ "Sinaugoro": "snc",
520
+ "Sindhi": "sd",
521
+ "Sindhi Bhil": "sbn",
522
+ "Sinhala": "si",
523
+ "Sinicahua Mixtec": "xti",
524
+ "Sipacapense": "qum",
525
+ "Siwai": "siw",
526
+ "Slovak": "sk",
527
+ "Slovenian": "sl",
528
+ "Solos": "sol",
529
+ "Somali": "so",
530
+ "Soninke": "snk",
531
+ "South Giziga": "giz",
532
+ "South Ucayali Ashéninka": "cpy",
533
+ "Southeastern Nochixtlán Mixtec": "mxy",
534
+ "Southern Betsimisaraka Malagasy": "bzc",
535
+ "Southern Pashto": "pbt",
536
+ "Southern Pastaza Quechua": "qup",
537
+ "Soyaltepec Mazatec": "vmp",
538
+ "Spanish": "es",
539
+ "Standard Arabic": "arb",
540
+ "Standard Moroccan Tamazight": "zgh",
541
+ "Sudanese Arabic": "apd",
542
+ "Sulka": "sua",
543
+ "Svan": "sva",
544
+ "Swahili": "sw",
545
+ "Swedish": "sv",
546
+ "Tae'": "rob",
547
+ "Tahaggart Tamahaq": "thv",
548
+ "Taita": "dav",
549
+ "Tajik": "tg",
550
+ "Tamil": "ta",
551
+ "Tandroy-Mahafaly Malagasy": "tdx",
552
+ "Tangale": "tan",
553
+ "Tanosy Malagasy": "txy",
554
+ "Tarok": "yer",
555
+ "Tatar": "tt",
556
+ "Tedaga": "tuq",
557
+ "Telugu": "te",
558
+ "Tem": "kdh",
559
+ "Teop": "tio",
560
+ "Tepeuxila Cuicatec": "cux",
561
+ "Tepinapa Chinantec": "cte",
562
+ "Tera": "ttr",
563
+ "Terei": "buo",
564
+ "Termanu": "twu",
565
+ "Tesaka Malagasy": "tkg",
566
+ "Tetelcingo Nahuatl": "nhg",
567
+ "Teutila Cuicatec": "cut",
568
+ "Thai": "th",
569
+ "Tibetan": "bo",
570
+ "Tidaá Mixtec": "mtx",
571
+ "Tidore": "tvo",
572
+ "Tigak": "tgc",
573
+ "Tigre": "tig",
574
+ "Tigrinya": "ti",
575
+ "Tilquiapan Zapotec": "zts",
576
+ "Tinputz": "tpz",
577
+ "Tlacoapa Me'phaa": "tpl",
578
+ "Tlacoatzintepec Chinantec": "ctl",
579
+ "Tlingit": "tli",
580
+ "Toki Pona": "tok",
581
+ "Tomoip": "tqp",
582
+ "Tondano": "tdn",
583
+ "Tonsea": "txs",
584
+ "Tooro": "ttj",
585
+ "Torau": "ttu",
586
+ "Torwali": "trw",
587
+ "Tsimihety Malagasy": "xmw",
588
+ "Tsotso": "lto",
589
+ "Tswana": "tn",
590
+ "Tugen": "tuy",
591
+ "Tuki": "bag",
592
+ "Tula": "tul",
593
+ "Tulu": "tcy",
594
+ "Tunen": "tvu",
595
+ "Tungag": "lcm",
596
+ "Tunisian Arabic": "aeb",
597
+ "Tupuri": "tui",
598
+ "Turkana": "tuv",
599
+ "Turkish": "tr",
600
+ "Turkmen": "tk",
601
+ "Tututepec Mixtec": "mtu",
602
+ "Twi": "tw",
603
+ "Ubaghara": "byc",
604
+ "Uighur": "ug",
605
+ "Ukrainian": "uk",
606
+ "Umbundu": "umb",
607
+ "Upper Sorbian": "hsb",
608
+ "Urdu": "ur",
609
+ "Ushojo": "ush",
610
+ "Uzbek": "uz",
611
+ "Vai": "vai",
612
+ "Vietnamese": "vi",
613
+ "Votic": "vot",
614
+ "Võro": "vro",
615
+ "Waci Gbe": "wci",
616
+ "Wadiyara Koli": "kxp",
617
+ "Waja": "wja",
618
+ "Wakhi": "wbl",
619
+ "Wanga": "lwg",
620
+ "Wapan": "juk",
621
+ "Warji": "wji",
622
+ "Welsh": "cy",
623
+ "Wemale": "weo",
624
+ "Western Frisian": "fy",
625
+ "Western Highland Purepecha": "pua",
626
+ "Western Juxtlahuaca Mixtec": "jmx",
627
+ "Western Maninkakan": "mlq",
628
+ "Western Mari": "mrj",
629
+ "Western Niger Fulfulde": "fuh",
630
+ "Western Panjabi": "pnb",
631
+ "Wolof": "wo",
632
+ "Wuzlam": "udl",
633
+ "Xanaguía Zapotec": "ztg",
634
+ "Xhosa": "xh",
635
+ "Yace": "ekr",
636
+ "Yakut": "sah",
637
+ "Yalahatan": "jal",
638
+ "Yanahuanca Pasco Quechua": "qur",
639
+ "Yangben": "yav",
640
+ "Yaqui": "yaq",
641
+ "Yauyos Quechua": "qux",
642
+ "Yekhee": "ets",
643
+ "Yiddish": "yi",
644
+ "Yidgha": "ydg",
645
+ "Yoruba": "yo",
646
+ "Yutanduchi Mixtec": "mab",
647
+ "Zacatlán-Ahuacatlán-Tepetzintla Nahuatl": "nhi",
648
+ "Zarma": "dje",
649
+ "Zaza": "zza",
650
+ "Zulu": "zu",
651
+ "Ömie": "aom",
652
+ }
steps/lang/qwen3_languages.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Qwen3-TTS supported target languages.
# Source: https://huggingface.co/spaces/Qwen/Qwen3-TTS (LANGUAGES constant in app.py)
"""Qwen3-supported languages (display name -> ISO-639-1 code)."""

# Keys are the human-readable display names; values are two-letter codes.
# Insertion order (alphabetical by display name) is preserved.
QWEN3_LANGUAGE_CODES: dict[str, str] = dict(
    Chinese="zh",
    English="en",
    French="fr",
    German="de",
    Japanese="ja",
    Korean="ko",
    Portuguese="pt",
    Russian="ru",
    Spanish="es",
)
steps/lang/urdu.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Urdu-specific translation handlers.
2
+
3
+ Handles:
4
+ - Urdu-specific translation prompt (Nastaliq script, spoken Urdu vocabulary)
5
+ - Urdu → Devanagari transliteration for TTS (Chatterbox needs Devanagari)
6
+ - Devanagari → Urdu script conversion for captions
7
+ """
8
+ import json
9
+ import re
10
+
11
+ from ._shared import build_client, parse_json_array, bedrock_converse, MODEL, log_llm_call
12
+
13
+
14
+ # ── Public dispatcher hooks ──────────────────────────────────────────────────
15
+
16
def get_translation_prompt() -> str:
    """Build the system prompt used when translating numbered lines into Urdu.

    The prompt is assembled from five thematic sections (role, vocabulary
    rules, duration constraint, TTS-friendly phrasing, output format) and
    returned as one string.
    """
    role = (
        "You are a professional voice-over translator for commonly spoken Urdu. "
        "Translate the following numbered lines into Urdu (Nastaliq/Arabic script).\n\n"
    )

    language_rules = (
        "LANGUAGE RULES:\n"
        "- Use ONLY everyday spoken Urdu — the kind heard on Pakistani news, dramas, and streets.\n"
        "- Use Urdu, Persian, and Arabic-origin vocabulary ONLY. "
        "NEVER use Sanskrit-origin Hindi words (e.g. use محبت not پیار, زندگی not جیون, "
        "وقت not سمے, لیکن not پرنتو, اگر not یدی).\n"
        "- Keep it natural and conversational, not literary or formal.\n"
        "- NEVER insert English words, interjections, or filler sounds (Oh, Ah, Hmm, Well, So). "
        "Translate ALL such expressions into Urdu equivalents.\n\n"
    )

    duration_rules = (
        "CRITICAL — DURATION CONSTRAINT:\n"
        "Each line shows its spoken duration in brackets (e.g. [4.6s]). "
        "The translation will be spoken by TTS and MUST fit within that duration.\n"
        "STRICT RULE: Your translation MUST have FEWER words than the original English. "
        "If the English has 10 words, aim for 7-8 Urdu words maximum.\n"
        "Every word must earn its place — if removing a word doesn't lose core meaning, remove it. "
        "Paraphrase aggressively. Use shorter synonyms. Merge clauses. "
        "A concise translation that fits the time is ALWAYS better than a complete one that overflows.\n\n"
    )

    tts_rules = (
        "TTS COMPATIBILITY — IMPORTANT:\n"
        "The TTS model struggles with long sentences that have multiple commas or clauses. "
        "Restructure into short, direct sentences — but the TOTAL text must still fit the duration shown in brackets. "
        "Do NOT add extra words or content when restructuring. The goal is simpler phrasing, not more text.\n"
        "Each output line is still ONE item in the array (one per input line). "
        "You may use multiple short sentences within that single line, but it must all fit the original duration.\n\n"
    )

    output_format = (
        "Write ONLY in Urdu script (Nastaliq/Arabic script). "
        "Return ONLY a JSON array of translated strings, in order, no extra text. "
        "Do NOT include the duration prefix or numbering in the output — only the translated text itself. "
        'Example input: 1. [3.0s] Hello\n2. [2.5s] Goodbye '
        'Example output: ["سلام", "خدا حافظ"]'
    )

    return "".join((role, language_rules, duration_rules, tts_rules, output_format))
49
+
50
+
51
def get_fallback_mode() -> str:
    """Name the fallback translator for Urdu: Bedrock rather than Google Translate."""
    fallback = "bedrock"
    return fallback
54
+
55
+
56
+ _ENGLISH_FILLERS = re.compile(
57
+ r'\b(Oh|Ah|Hmm|Well|So|Right|Okay|OK|Um|Uh|Hey|Wow|Ooh|Aah)[\.\!\,]?\s*',
58
+ re.IGNORECASE,
59
+ )
60
+
61
+
62
def post_translate(segments: list[dict]) -> list[dict]:
    """Apply Urdu-specific clean-up to freshly translated segments.

    For every segment, English filler words matched by ``_ENGLISH_FILLERS``
    are removed from ``translated_text`` and the result is stripped and
    written back in place. The segments are then handed to
    ``transliterate_to_devanagari``, which sets ``tts_text``; captions keep
    using ``translated_text`` as-is (already Urdu script).

    Args:
        segments: Segment dicts; ``translated_text`` is read (default ``""``).

    Returns:
        Whatever ``transliterate_to_devanagari`` returns for the segments.
    """
    for segment in segments:
        original = segment.get("translated_text", "")
        # Drop English interjections that leaked into the translation.
        segment["translated_text"] = _ENGLISH_FILLERS.sub("", original).strip()

    return transliterate_to_devanagari(segments)
76
+
77
+
78
+ # ── Transliteration: Urdu → Devanagari (for TTS) ────────────────────────────
79
+
80
# Character-level Urdu (Arabic script) → Devanagari table. Alif ('ا') is
# intentionally absent: its rendering depends on position and is handled in
# _urdu_to_rough_devanagari. Characters mapped to '' are dropped.
_URDU_TO_DEVA = {
    'آ': 'आ', 'ب': 'ब', 'پ': 'प', 'ت': 'त', 'ٹ': 'ट', 'ث': 'स',
    'ج': 'ज', 'چ': 'च', 'ح': 'ह', 'خ': 'ख़', 'د': 'द', 'ڈ': 'ड',
    'ذ': 'ज़', 'ر': 'र', 'ڑ': 'ड़', 'ز': 'ज़', 'ژ': 'झ', 'س': 'स',
    'ش': 'श', 'ص': 'स', 'ض': 'ज़', 'ط': 'त', 'ظ': 'ज़', 'ع': 'अ',
    'غ': 'ग़', 'ف': 'फ़', 'ق': 'क़', 'ک': 'क', 'ك': 'क', 'گ': 'ग',
    'ل': 'ल', 'م': 'म', 'ن': 'न', 'ں': 'ं', 'و': 'व', 'ہ': 'ह',
    'ه': 'ह', 'ھ': '्ह', 'ی': 'य', 'ي': 'य', 'ے': 'े', 'ئ': 'इ',
    'َ': 'ा', 'ِ': 'ि', 'ُ': 'ु', 'ٰ': 'ा', 'ّ': '्', 'ً': 'न',
    'ٔ': '', 'ء': '', 'ؓ': '', '۔': '।', '،': ',', '؟': '?', '؛': ';',
}


def _urdu_to_rough_devanagari(text: str) -> str:
    """Map Urdu script to Devanagari one character at a time.

    The result is deliberately "rough": consonants come from
    ``_URDU_TO_DEVA``, but short vowels are missing or wrong because Urdu
    script does not write them explicitly. Characters without a mapping are
    passed through unchanged.

    Alif is position-sensitive: at the start of the text or directly after
    any whitespace character (space, newline, tab, ...) it becomes the
    independent vowel 'अ'; anywhere else it becomes the matra 'ा'.

    Args:
        text: Urdu-script text (may be empty).

    Returns:
        The rough Devanagari rendering of *text*.
    """
    pieces = []
    at_word_start = True  # start of text counts as a word boundary
    for ch in text:
        if ch == 'ا':
            pieces.append('अ' if at_word_start else 'ा')
        else:
            pieces.append(_URDU_TO_DEVA.get(ch, ch))
        # Any whitespace (not only ' ') opens a new word for the next char.
        at_word_start = ch.isspace()

    # Fix a common edge case: ئ + ے (e.g., in بروئے) maps to 'इ' + 'े',
    # which should read as the single vowel 'ए'.
    return ''.join(pieces).replace('इे', 'ए')
111
+
112
+
113
def _polish_devanagari_vowels(client, model, numbered, expected_count, max_attempts=2):
    """Ask an LLM to fix only the vowels of rough Devanagari conversions.

    Sends *numbered* (the Urdu originals paired with their rough Devanagari)
    to ``client.chat.completions.create`` and validates the reply. A reply is
    accepted only if it parses as a JSON array of exactly *expected_count*
    strings and NONE of them contains a character from the Arabic Unicode
    block (U+0600–U+06FF). Otherwise the call is retried, up to
    *max_attempts* times; after Arabic-script output the retry prompt is
    prefixed with an explicit "Devanagari only" warning.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model: Model identifier passed through to the client.
        numbered: User message with the numbered Urdu/rough pairs.
        expected_count: Number of items the reply must contain.
        max_attempts: Maximum number of LLM calls.

    Returns:
        The list of polished Devanagari strings, or ``None`` if every attempt
        failed (bad JSON, wrong count, wrong script, or an exception).
    """
    base_prompt = (
        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari.\n\n"
        "STRICT RULES:\n"
        "- Do NOT change, replace, or translate ANY word. Keep every single Urdu word exactly.\n"
        "- Only add or fix vowel matras (ा ि ी ु ू े ै ो ौ ं ँ)\n"
        "- Add nuqta dots where needed: क़ ख़ ग़ ज़ फ़\n"
        "- Add halant (्) for conjuncts where needed\n\n"
        "EXAMPLES:\n"
        "Urdu: محبت | rough: महबत | fixed: मोहब्बत\n"
        "Urdu: استعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
        "Urdu: حکمت | rough: हकमत | fixed: हिकमत\n"
        "Urdu: طاقت | rough: ताक़त | fixed: ताक़त\n"
        "Urdu: ہمدردی | rough: हमदरदय | fixed: हमदर्दी\n"
        "Urdu: پیروی | rough: पयरवय | fixed: पैरवी\n"
        "Urdu: کریم | rough: करयम | fixed: करीम\n\n"
        "Return ONLY a JSON array of corrected Devanagari strings, in order, one per input."
    )
    script_warning = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n"
    # Rebuilt from base_prompt on each retry so the warning never stacks up.
    prompt = base_prompt

    for attempt in range(1, max_attempts + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": numbered},
                ],
                temperature=0.1,
            )
            # A missing message body is treated as an unparseable reply
            # rather than crashing on None.strip().
            raw = (response.choices[0].message.content or "").strip()
            log_llm_call(
                step="urdu_vowel_polish", provider="pollinations", model=model,
                system_prompt=prompt, user_prompt=numbered,
                response=raw, temperature=0.1,
            )

            try:
                polished_list = parse_json_array(raw)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Attempt {attempt}: Could not parse response as JSON")
                continue

            if len(polished_list) != expected_count:
                print(f"[urdu] Attempt {attempt}: Got {len(polished_list)} items, expected {expected_count}")
                continue

            if not all(isinstance(item, str) for item in polished_list):
                print(f"[urdu] Attempt {attempt}: Response items are not all strings")
                continue

            # Reject the reply if ANY item still contains Arabic/Urdu script;
            # checking only a leading sample would let later items slip by.
            if any('\u0600' <= ch <= '\u06FF' for item in polished_list for ch in item):
                print(f"[urdu] Attempt {attempt}: Output still contains Urdu script — retrying")
                prompt = script_warning + base_prompt
                continue

            return polished_list

        except Exception as e:
            # Best-effort step: any client/logging failure just burns an attempt.
            print(f"[urdu] LLM error on attempt {attempt}: {e}")

    return None
176
+
177
+
178
def transliterate_to_devanagari(segments: list[dict]) -> list[dict]:
    """Convert Urdu-script translations to Devanagari for TTS.

    Sets ``tts_text`` on every segment (in place) using a hybrid approach:

    1. Deterministic character mapping via ``_urdu_to_rough_devanagari``.
    2. LLM vowel polishing via ``_polish_devanagari_vowels`` (client from
       ``build_client()``, model ``MODEL``).
    3. If that yields nothing, up to two ``bedrock_converse`` attempts with
       the same validation (JSON array, right length, all strings, no
       Arabic-block characters in ANY item).
    4. If everything fails, the rough mapping from step 1 is used.

    Args:
        segments: Segment dicts; ``translated_text`` is read (default ``""``).

    Returns:
        The same ``segments`` list; returned untouched when empty.
    """
    if not segments:
        return segments

    print("[urdu] Starting Hybrid Urdu → Devanagari conversion...")

    # Step 1: Deterministic mapping to rough Devanagari
    rough_texts = [
        _urdu_to_rough_devanagari(seg.get("translated_text", ""))
        for seg in segments
    ]

    expected = len(segments)
    numbered = "\n".join(
        f"{i + 1}. Urdu: {seg.get('translated_text', '')}\n Rough: {rough_texts[i]}"
        for i, seg in enumerate(segments)
    )

    # Try Pollinations
    client = build_client()
    polished_list = _polish_devanagari_vowels(client, MODEL, numbered, expected)

    if polished_list:
        for seg, deva_text in zip(segments, polished_list):
            seg["tts_text"] = deva_text
        print("[urdu] Urdu → Devanagari hybrid transliteration complete ✓")
        return segments

    print("[urdu] Pollinations Polish failed — trying Bedrock fallback...")

    # Bedrock Fallback
    try:
        base_prompt = (
            "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
            "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
            "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari. Do NOT change/replace/translate ANY word.\n\n"
            "EXAMPLES:\nمحبت | rough: महबत | fixed: मोहब्बत\nاستعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
            "حکمت | rough: हकमत | fixed: हिकमत\nہمدردی | rough: हमदरदय | fixed: हमदर्दी\n\n"
            "Return ONLY a JSON array of corrected Devanagari strings."
        )
        script_warning = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n"
        system_prompt = base_prompt

        for attempt in range(1, 3):
            raw = bedrock_converse(system_prompt, numbered, step="urdu_vowel_polish_bedrock")

            try:
                polished_list = parse_json_array(raw)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Bedrock attempt {attempt}: Could not parse response")
                continue

            if len(polished_list) != expected:
                print(f"[urdu] Bedrock attempt {attempt}: Got {len(polished_list)} items, expected {expected}")
                continue

            if not all(isinstance(item, str) for item in polished_list):
                print(f"[urdu] Bedrock attempt {attempt}: Response items are not all strings")
                continue

            # Every item is checked, not just a leading sample, so Urdu
            # script cannot reach the TTS input through a later segment.
            if any('\u0600' <= ch <= '\u06FF' for item in polished_list for ch in item):
                print(f"[urdu] Bedrock attempt {attempt}: Output contains Urdu script — retrying")
                system_prompt = script_warning + base_prompt
                continue

            for seg, deva_text in zip(segments, polished_list):
                seg["tts_text"] = deva_text
            print("[urdu] Urdu → Devanagari transliteration (Bedrock) complete ✓")
            return segments

    except Exception as e:
        # Best-effort: a Bedrock failure must not abort the pipeline.
        print(f"[urdu] WARNING: Bedrock fallback failed ({e})")

    print("[urdu] WARNING: All polishing failed. Falling back to rough Devanagari.")
    for seg, r_text in zip(segments, rough_texts):
        seg["tts_text"] = r_text
    return segments
255
+
256
+
257
+ # ── Transliteration: Devanagari → Urdu script (for captions) ────────────────
258
+
259
def transliterate_to_urdu_script(segments: list[dict]) -> list[dict]:
    """Convert Devanagari Urdu translations to Urdu (Nastaliq/Arabic) script.

    Intended for subtitles: on success, ``caption_text`` is set on every
    segment (in place). The primary provider is the client from
    ``build_client()``; ``bedrock_converse`` is tried whenever the primary
    attempt raises OR returns the wrong number of items. If both providers
    fail, the segments are returned without ``caption_text``.

    Args:
        segments: Segment dicts; ``translated_text`` is read (default ``""``).

    Returns:
        The same ``segments`` list; returned untouched when empty.
    """
    if not segments:
        return segments

    expected = len(segments)
    numbered = "\n".join(
        f"{i + 1}. {seg.get('translated_text', '')}" for i, seg in enumerate(segments)
    )

    system_prompt = (
        "You are a script converter. Convert the following Devanagari Urdu text into Urdu script (Nastaliq/Arabic script). "
        "This is NOT translation — the language is already Urdu, just written in Devanagari. "
        "Convert it to proper Urdu script preserving every word exactly.\n\n"
        "Return ONLY a JSON array of converted strings, in order, no extra text. "
        "Do NOT include numbering in the output."
    )

    # Primary provider (Pollinations).
    client = build_client()
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": numbered},
            ],
            temperature=0.1,
        )

        raw = response.choices[0].message.content.strip()
        log_llm_call(
            step="urdu_script_convert", provider="pollinations", model=MODEL,
            system_prompt=system_prompt, user_prompt=numbered,
            response=raw, temperature=0.1,
        )
        urdu_list = parse_json_array(raw)

        if len(urdu_list) == expected:
            for seg, urdu_text in zip(segments, urdu_list):
                seg["caption_text"] = urdu_text
            print("[urdu] Urdu script transliteration complete ✓")
            return segments

        # A wrong item count is just another kind of bad reply: fall through
        # to Bedrock exactly like a parse error or exception does.
        print(f"[urdu] WARNING: Urdu script returned {len(urdu_list)} items, expected {expected} — trying Bedrock...")

    except Exception as e:
        print(f"[urdu] Pollinations transliteration failed ({e}) — trying Bedrock...")

    # Fallback provider (Bedrock).
    try:
        raw = bedrock_converse(system_prompt, numbered, step="urdu_script_convert_bedrock")
        urdu_list = parse_json_array(raw)

        if len(urdu_list) == expected:
            for seg, urdu_text in zip(segments, urdu_list):
                seg["caption_text"] = urdu_text
            print("[urdu] Urdu script transliteration (Bedrock) complete ✓")
            return segments

        print(f"[urdu] WARNING: Bedrock Urdu script returned {len(urdu_list)} items, expected {expected}. Using Devanagari for captions")

    except Exception as e2:
        print(f"[urdu] WARNING: Bedrock transliteration also failed ({e2}), using Devanagari for captions")

    return segments
steps/s1_extract_audio.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 1-2: Extract audio track from input video.
3
+ Outputs a 16 kHz mono WAV suitable for Whisper + Chatterbox.
4
+ """
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+
9
def _ffmpeg_extract_wav(
    video_path: str,
    output_path: str,
    sample_rate: int,
    channels: int,
    error_label: str,
    done_label: str,
) -> str:
    """Run ffmpeg to write the audio track of *video_path* as 16-bit PCM WAV.

    Creates the parent directory of *output_path*, overwrites any existing
    file (``-y``), and raises ``RuntimeError`` with ffmpeg's stderr when the
    process exits non-zero. Returns *output_path* unchanged.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn",                      # no video
        "-acodec", "pcm_s16le",     # PCM 16-bit
        "-ar", str(sample_rate),
        "-ac", str(channels),
        output_path,
    ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg {error_label} failed:\n{result.stderr}")

    print(f"[s1] {done_label} → {output_path}")
    return output_path


def extract_audio(video_path: str, output_path: str = "tmp/audio/source/extracted_audio.wav") -> str:
    """
    Extract audio from video using ffmpeg as 16 kHz mono WAV.

    Args:
        video_path: Path to the input video file.
        output_path: Where to save the extracted audio (WAV).

    Returns:
        ``output_path`` exactly as passed in (it is not made absolute).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    return _ffmpeg_extract_wav(
        video_path,
        output_path,
        sample_rate=16000,  # 16 kHz (Whisper standard)
        channels=1,         # mono
        error_label="audio extraction",
        done_label="Audio extracted",
    )


def extract_audio_hq(video_path: str, output_path: str = "tmp/audio/source/extracted_audio_hq.wav") -> str:
    """
    Extract high-quality 44.1 kHz stereo audio for source separation (Demucs).

    Args:
        video_path: Path to the input video file.
        output_path: Where to save the HQ audio (WAV).

    Returns:
        ``output_path`` exactly as passed in (it is not made absolute).

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    return _ffmpeg_extract_wav(
        video_path,
        output_path,
        sample_rate=44100,  # 44.1 kHz for Demucs
        channels=2,         # stereo
        error_label="HQ audio extraction",
        done_label="HQ audio extracted",
    )
steps/s1b_separate.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 1b: Separate vocals from accompaniment using Demucs (Python API).
3
+
4
+ In-process inference so ZeroGPU can intercept the GPU allocation via
5
+ `@spaces.GPU`. Works on CUDA, MPS, and CPU without code changes.
6
+ Only runs when preserve_music=True.
7
+ """
8
+ import shutil
9
+ import subprocess
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ import torchaudio
14
+
15
+ import spaces
16
+
17
+
18
+ _MODEL = None
19
+
20
+
21
def _select_device() -> str:
    """Pick the torch device string to run on: "cuda", then "mps", else "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    # Older torch builds have no `mps` backend attribute at all.
    mps_backend = getattr(torch.backends, "mps", None)
    has_mps = mps_backend is not None and mps_backend.is_available()
    return "mps" if has_mps else "cpu"
27
+
28
+
29
def _get_model():
    """Return the process-wide htdemucs model, loading it on first use.

    The model is fetched with ``demucs.pretrained.get_model("htdemucs")``,
    switched to eval mode, placed on the CPU and cached in the module-level
    ``_MODEL``. The demucs import is deferred to the first call so importing
    this module stays cheap on non-GPU environments.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    from demucs.pretrained import get_model

    print("[s1b] Loading htdemucs on cpu...")
    htdemucs = get_model("htdemucs")
    htdemucs.eval()
    htdemucs.to("cpu")
    _MODEL = htdemucs
    return _MODEL
41
+
42
+
43
@spaces.GPU(duration=120)
def _apply_demucs(mix: torch.Tensor, device: str) -> torch.Tensor:
    """Run htdemucs on *mix* inside the GPU scope and return the stems on CPU.

    Args:
        mix: Input mixture, shape ``[1, channels, time]``.
        device: Device type string to run inference on.

    Returns:
        Separated sources as a CPU tensor, ``[batch, sources, channels, time]``.
    """
    from demucs.apply import apply_model

    net = _get_model()
    current_device = next(net.parameters()).device.type
    if current_device != device:
        print(f"[s1b] Moving htdemucs to {device} inside GPU scope...")
        net = net.to(device)

    mix_on_device = mix.to(device)
    with torch.no_grad():
        # apply_model returns [batch, sources, channels, time]
        separated = apply_model(
            net,
            mix_on_device,
            shifts=1,
            split=True,
            overlap=0.25,
            device=device,
        )
    return separated.cpu()
63
+
64
+
65
def _load_and_normalise(audio_hq_path: str, target_sr: int, target_ch: int) -> tuple[torch.Tensor, float, float]:
    """Load a WAV, match the model's rate/channels, and z-normalise it.

    Args:
        audio_hq_path: Path of the audio file to load with torchaudio.
        target_sr: Sample rate to resample to when the file's rate differs.
        target_ch: Channel count wanted. Mono is duplicated only when this
            is 2; files with more channels keep their first ``target_ch``.

    Returns:
        ``(batch, mean, std)`` — the normalised waveform with a leading batch
        dimension, plus the scalar mean and std (std floored at 1e-8) needed
        to undo the normalisation.
    """
    waveform, file_sr = torchaudio.load(audio_hq_path)

    if file_sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, file_sr, target_sr)

    n_channels = waveform.shape[0]
    if n_channels == 1 and target_ch == 2:
        # mono → duplicate into two identical channels
        waveform = waveform.repeat(2, 1)
    elif n_channels > target_ch:
        # too many channels → keep the leading ones
        waveform = waveform[:target_ch]

    mu = waveform.mean()
    sigma = waveform.std().clamp_min(1e-8)  # avoid dividing by ~0 on silence
    normalised = (waveform - mu) / sigma
    return normalised.unsqueeze(0), mu.item(), sigma.item()
81
+
82
+
83
def separate_audio(
    audio_hq_path: str,
    output_dir: str = "tmp",
) -> tuple[str, str]:
    """
    Split an audio file into vocals and accompaniment with Demucs htdemucs.

    Writes ``vocals.wav`` and ``accompaniment.wav`` (model sample rate) plus
    ``vocals_16k.wav`` (16 kHz mono, produced by ffmpeg) into *output_dir*,
    and removes a leftover ``demucs`` sub-directory if one exists.

    Args:
        audio_hq_path: Path to input audio (any sample rate / channels).
        output_dir: Directory to write output stems (created if missing).

    Returns:
        (vocals_16k_path, accompaniment_path)

    Raises:
        RuntimeError: If the model has no "vocals" source, or if the ffmpeg
            resample step exits non-zero.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    demucs_model = _get_model()
    device = _select_device()
    model_sr = demucs_model.samplerate
    model_ch = demucs_model.audio_channels
    stem_names = list(demucs_model.sources)

    print(f"[s1b] Running Demucs htdemucs on {device} (Python API)...")
    mix, mean, std = _load_and_normalise(audio_hq_path, model_sr, model_ch)

    # Undo the z-normalisation, then drop the batch dim
    # → [num_sources, channels, time]
    stems = (_apply_demucs(mix, device) * std + mean)[0]

    try:
        vocals_idx = stem_names.index("vocals")
    except ValueError as e:
        raise RuntimeError(f"htdemucs is missing 'vocals' source: {stem_names}") from e

    vocals = stems[vocals_idx]
    # Accompaniment = every stem except the vocals, summed.
    no_vocals = sum(stem for idx, stem in enumerate(stems) if idx != vocals_idx)

    vocals_path = str(out_dir / "vocals.wav")
    accompaniment_path = str(out_dir / "accompaniment.wav")
    vocals_16k_path = str(out_dir / "vocals_16k.wav")

    torchaudio.save(vocals_path, vocals, model_sr)
    torchaudio.save(accompaniment_path, no_vocals, model_sr)
    print(f"[s1b] Vocals saved → {vocals_path}")
    print(f"[s1b] Accompaniment saved → {accompaniment_path}")

    # Resample vocals to 16 kHz mono for Whisper/TTS via ffmpeg
    # (torchaudio resample works but ffmpeg is more predictable for downstream)
    resample_cmd = ["ffmpeg", "-y", "-i", vocals_path, "-ar", "16000", "-ac", "1", vocals_16k_path]
    proc = subprocess.run(resample_cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"FFmpeg vocals resample failed:\n{proc.stderr}")

    print(f"[s1b] Vocals (16 kHz) saved → {vocals_16k_path}")

    # Leftover cleanup for any previously-shelled-out demucs runs
    legacy_dir = out_dir / "demucs"
    if legacy_dir.exists():
        shutil.rmtree(str(legacy_dir), ignore_errors=True)

    return vocals_16k_path, accompaniment_path
steps/s2_transcribe.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 3: Transcribe audio with timestamps.
3
+
4
+ Primary local backend (device-dependent):
5
+ - Apple MPS: mlx-whisper
6
+ - CUDA: openai-whisper (PyTorch-based)
7
+ - CPU: faster-whisper
8
+
9
+ Remote backend (tried first; the local backend is used when it fails or returns nothing):
10
+ - Pollinations Whisper API (verbose_json)
11
+ """
12
+ import os
13
+
14
+ import requests
15
+ import torch
16
+ from dotenv import load_dotenv
17
+
18
+ import spaces
19
+
20
+ load_dotenv()
21
+
22
# Endpoint that receives the multipart transcription request.
POLLINATIONS_URL = "https://gen.pollinations.ai/v1/audio/transcriptions"
# Model names are overridable through the environment; each default is used
# when the corresponding variable is unset.
POLLEN_TRANSCRIBE_MODEL = os.getenv("POLLEN_TRANSCRIBE_MODEL", "whisper-large-v3")
MLX_MODEL = os.getenv("MLX_WHISPER_MODEL", "mlx-community/whisper-large-mlx")
FASTER_WHISPER_MODEL = os.getenv("FASTER_WHISPER_MODEL", "large-v3")
OPENAI_WHISPER_MODEL = os.getenv("OPENAI_WHISPER_MODEL", "large-v3")
# Name of the environment variable that forces a specific local backend.
LOCAL_WHISPER_BACKEND_ENV = "VIDEOVOICE_WHISPER_BACKEND"
# The only values accepted for that override.
_VALID_LOCAL_BACKENDS = {
    "mlx-whisper",
    "openai-whisper-cuda",
    "faster-whisper-cpu",
}

# Process-wide model caches, filled lazily on first use.
# faster-whisper models are keyed by (device, compute_type).
_FASTER_WHISPER_MODELS = {}
# The openai-whisper model is a single instance, None until first loaded.
_OPENAI_WHISPER_MODEL = None
36
+
37
+
38
+ def _running_on_hf_space() -> bool:
39
+ return bool(
40
+ os.getenv("SPACE_ID")
41
+ or os.getenv("SPACE_HOST")
42
+ or os.getenv("HF_SPACE_ID")
43
+ )
44
+
45
+
46
def _get_local_whisper_backend() -> str:
    """
    Pick the local transcription backend at call time.

    An explicit override in the VIDEOVOICE_WHISPER_BACKEND environment variable
    wins when it names a valid backend. Otherwise the order is: Apple MPS →
    mlx-whisper, Hugging Face Space → CPU faster-whisper, CUDA → openai-whisper,
    and CPU faster-whisper as the last resort.

    HF Spaces default to CPU because ZeroGPU can report CUDA availability
    outside an active @spaces.GPU call, which makes import-time backend
    selection unreliable.

    Raises:
        ValueError: the override is set to an unknown backend name.
    """
    requested = os.getenv(LOCAL_WHISPER_BACKEND_ENV, "").strip().lower()
    if requested:
        if requested in _VALID_LOCAL_BACKENDS:
            return requested
        raise ValueError(
            f"Invalid {LOCAL_WHISPER_BACKEND_ENV}={requested!r}. "
            f"Expected one of: {', '.join(sorted(_VALID_LOCAL_BACKENDS))}."
        )

    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mlx-whisper"

    # PyTorch-based path so @spaces.GPU can intercept the CUDA allocation.
    # faster-whisper uses CTranslate2 which bypasses PyTorch and breaks ZeroGPU.
    if not _running_on_hf_space() and torch.cuda.is_available():
        return "openai-whisper-cuda"

    return "faster-whisper-cpu"
75
+
76
+
77
+ def _extract_words(raw_words: list[dict]) -> list[dict]:
78
+ """Normalise word timestamps into {word, start, end}."""
79
+ output = []
80
+ for raw in raw_words or []:
81
+ start = raw.get("start")
82
+ end = raw.get("end")
83
+ if start is None or end is None:
84
+ continue
85
+ output.append(
86
+ {
87
+ "word": str(raw.get("word", "")).strip(),
88
+ "start": float(start),
89
+ "end": float(end),
90
+ }
91
+ )
92
+ return output
93
+
94
+
95
+ def _normalise_segments(segments: list[dict]) -> list[dict]:
96
+ """Return canonical segment schema with word-level timestamps."""
97
+ output = []
98
+ for seg in segments:
99
+ start = seg.get("start")
100
+ end = seg.get("end")
101
+ if start is None or end is None:
102
+ continue
103
+ words = _extract_words(seg.get("words", []))
104
+ output.append(
105
+ {
106
+ "start": float(start),
107
+ "end": float(end),
108
+ "text": str(seg.get("text", "")).strip(),
109
+ "words": words,
110
+ }
111
+ )
112
+ return output
113
+
114
+
115
+ # Max duration (seconds) before a segment is considered oversized and needs splitting.
116
+ _MAX_SEGMENT_DURATION = 15.0
117
+ # Preferred pause gap (seconds) to use as a split point.
118
+ _PAUSE_THRESHOLD = 0.4
119
+
120
+
121
+ def _split_oversized_segments(segments: list[dict]) -> list[dict]:
122
+ """Split segments longer than _MAX_SEGMENT_DURATION using word timings."""
123
+ output = []
124
+ for seg in segments:
125
+ duration = seg["end"] - seg["start"]
126
+ words = seg.get("words", [])
127
+ real_words = [w for w in words if w["word"]]
128
+
129
+ if duration <= _MAX_SEGMENT_DURATION or len(real_words) < 2:
130
+ output.append(seg)
131
+ continue
132
+
133
+ chunks = []
134
+ chunk_start_idx = 0
135
+ chunk_start_time = real_words[0]["start"]
136
+
137
+ for i in range(len(real_words) - 1):
138
+ elapsed = real_words[i]["end"] - chunk_start_time
139
+ gap = real_words[i + 1]["start"] - real_words[i]["end"]
140
+ should_split = (
141
+ (elapsed >= _MAX_SEGMENT_DURATION and gap >= 0.15)
142
+ or (elapsed >= _MAX_SEGMENT_DURATION * 0.5 and gap >= _PAUSE_THRESHOLD)
143
+ )
144
+ if should_split:
145
+ chunks.append(real_words[chunk_start_idx : i + 1])
146
+ chunk_start_idx = i + 1
147
+ chunk_start_time = real_words[i + 1]["start"]
148
+
149
+ if chunk_start_idx < len(real_words):
150
+ chunks.append(real_words[chunk_start_idx:])
151
+
152
+ for chunk_words in chunks:
153
+ output.append(
154
+ {
155
+ "start": chunk_words[0]["start"],
156
+ "end": chunk_words[-1]["end"],
157
+ "text": " ".join(w["word"] for w in chunk_words).strip(),
158
+ "words": chunk_words,
159
+ }
160
+ )
161
+
162
+ return output
163
+
164
+
165
def _assign_words_to_segments(segments: list[dict], words: list[dict]) -> None:
    """Attach top-level words to the segments that fully contain them.

    Mutates each segment in place, setting ``seg["words"]`` to the normalised
    words whose start and end both lie within the segment's time range.
    """
    timed_words = _extract_words(words)
    for segment in segments:
        contained = []
        for word in timed_words:
            if word["start"] >= segment["start"] and word["end"] <= segment["end"]:
                contained.append(word)
        segment["words"] = contained
172
+
173
+
174
def _segments_from_pollinations(audio_path: str, language: str) -> list[dict]:
    """Call the Pollinations Whisper API and return canonical segments.

    Args:
        audio_path: Path of the audio file to upload.
        language: Language code, or "auto"/empty to let Whisper detect it.

    Returns:
        Normalised segments; an empty list when the response has none.

    Raises:
        requests.HTTPError: the API answered with an error status.
        OSError: the audio file could not be opened.
    """
    api_key = (
        os.getenv("POLLEN_API_KEY_SECONDARY")
        or os.getenv("POLLEN_API_KEY")
        or os.getenv("POLLINATIONS_API_KEY", "")
    )
    headers = {"Authorization": f"Bearer {api_key}"}

    with open(audio_path, "rb") as audio_file:
        files = {"file": (os.path.basename(audio_path), audio_file, "audio/wav")}
        # When the caller passes "auto" (or empty), omit the `language` field so
        # Whisper auto-detects. Forcing a wrong language code makes Whisper
        # silently switch to translate-mode (e.g. Hindi audio + language="en"
        # produces an English translation, not a Hindi transcript).
        # NOTE(review): only "word" granularity is requested; confirm the
        # endpoint still returns `segments` in that case.
        data = {
            "model": POLLEN_TRANSCRIBE_MODEL,
            "response_format": "verbose_json",
            "temperature": 0,
            "timestamp_granularities[]": "word",
        }
        if language and language.lower() not in ("auto", ""):
            data["language"] = language
        response = requests.post(
            POLLINATIONS_URL,
            headers=headers,
            files=files,
            data=data,
            timeout=120,
        )

    response.raise_for_status()
    result = response.json()

    # `or []` also covers an explicit JSON null, which `.get(key, [])` would
    # hand through as None and crash the normaliser.
    segments = _normalise_segments(result.get("segments") or [])
    # Some responses carry words only at the top level; distribute them.
    if not any(seg.get("words") for seg in segments) and "words" in result:
        _assign_words_to_segments(segments, result["words"])

    return _normalise_segments(segments)
213
+
214
+
215
+ def _segments_from_mlx(audio_path: str, language: str) -> list[dict]:
216
+ """Run mlx-whisper locally."""
217
+ print("[s2] Using mlx-whisper backend...")
218
+ try:
219
+ import mlx_whisper
220
+ except ImportError:
221
+ raise ImportError("mlx-whisper is not installed. Run: uv add mlx-whisper")
222
+
223
+ result = mlx_whisper.transcribe(
224
+ audio_path,
225
+ path_or_hf_repo=MLX_MODEL,
226
+ language=language if language != "auto" else None,
227
+ word_timestamps=True,
228
+ )
229
+ return _normalise_segments(result.get("segments", []))
230
+
231
+
232
def _get_faster_whisper_model(device: str, compute_type: str):
    """Return the faster-whisper model for (device, compute_type), loading it once."""
    from faster_whisper import WhisperModel

    cache_key = (device, compute_type)
    if cache_key in _FASTER_WHISPER_MODELS:
        return _FASTER_WHISPER_MODELS[cache_key]

    model = WhisperModel(
        FASTER_WHISPER_MODEL,
        device=device,
        compute_type=compute_type,
    )
    _FASTER_WHISPER_MODELS[cache_key] = model
    return model
244
+
245
+
246
def _segments_from_faster_whisper_impl(
    audio_path: str,
    language: str,
    device: str,
    compute_type: str,
) -> list[dict]:
    """Transcribe with faster-whisper and convert its objects to canonical dicts.

    "auto" is passed to the model as ``language=None``. Words lacking a start
    or end time are skipped.
    """
    model = _get_faster_whisper_model(device=device, compute_type=compute_type)
    forced_language = language if language != "auto" else None
    segment_iter, _ = model.transcribe(
        audio_path,
        language=forced_language,
        word_timestamps=True,
    )

    results = []
    for segment in segment_iter:
        timed_words = [
            {
                "word": str(w.word or "").strip(),
                "start": float(w.start),
                "end": float(w.end),
            }
            for w in segment.words or []
            if w.start is not None and w.end is not None
        ]
        results.append(
            {
                "start": float(segment.start),
                "end": float(segment.end),
                "text": str(segment.text or "").strip(),
                "words": timed_words,
            }
        )
    return results
281
+
282
+
283
def _segments_from_faster_whisper_cpu(
    audio_path: str,
    language: str,
) -> list[dict]:
    """Run faster-whisper on CPU with int8 compute.

    Carries no GPU decorator, so it runs outside the ZeroGPU budget.
    """
    return _segments_from_faster_whisper_impl(
        audio_path,
        language,
        device="cpu",
        compute_type="int8",
    )
289
+
290
+
291
def _get_openai_whisper_model():
    """Return the process-wide openai-whisper model, loading it on first use.

    The model is placed on CUDA when torch reports it available, else on CPU.

    Raises:
        ImportError: openai-whisper is not installed.
    """
    global _OPENAI_WHISPER_MODEL
    if _OPENAI_WHISPER_MODEL is not None:
        return _OPENAI_WHISPER_MODEL

    try:
        import whisper as openai_whisper
    except ImportError as exc:
        raise ImportError("openai-whisper is not installed") from exc

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"[s2] Loading openai-whisper ({OPENAI_WHISPER_MODEL}) on {device}...")
    _OPENAI_WHISPER_MODEL = openai_whisper.load_model(OPENAI_WHISPER_MODEL, device=device)
    return _OPENAI_WHISPER_MODEL
304
+
305
+
306
@spaces.GPU(duration=60)
def _segments_from_openai_whisper(
    audio_path: str,
    language: str,
) -> list[dict]:
    """Transcribe with openai-whisper inside a @spaces.GPU call (PyTorch-native, ZeroGPU-compatible).

    "auto" is passed to the model as ``language=None``.
    """
    forced_language = language if language != "auto" else None
    whisper_model = _get_openai_whisper_model()
    transcription = whisper_model.transcribe(
        audio_path,
        language=forced_language,
        word_timestamps=True,
        verbose=False,
    )
    raw_segments = transcription.get("segments", [])
    return _normalise_segments(raw_segments)
320
+
321
+
322
def _segments_from_local_backend(audio_path: str, language: str) -> list[dict]:
    """Run whichever local whisper backend the runtime detection selects.

    The openai-whisper path falls back to CPU faster-whisper when the package
    cannot be imported.
    """
    backend = _get_local_whisper_backend()

    if backend == "mlx-whisper":
        return _segments_from_mlx(audio_path, language)

    if backend != "openai-whisper-cuda":
        print("[s2] Using faster-whisper backend (cpu)...")
        return _segments_from_faster_whisper_cpu(audio_path, language)

    print("[s2] Using openai-whisper backend (cuda)...")
    try:
        return _segments_from_openai_whisper(audio_path, language)
    except ImportError:
        print("[s2] openai-whisper unavailable; falling back to faster-whisper (cpu).")
        return _segments_from_faster_whisper_cpu(audio_path, language)
339
+
340
+
341
def transcribe(audio_path: str, language: str = "en") -> list[dict]:
    """
    Transcribe audio into the canonical segment schema.

    Backends are tried in this order:
      1. Pollinations API (fast, offloads computation)
      2. Local backend (GPU/MPS if available, otherwise CPU)

    An empty Pollinations result counts as a miss and triggers the local
    backend. Oversized segments are split before the result is returned.

    Raises:
        RuntimeError: no backend produced a result; the message lists the
            error from each backend that failed.
    """
    print(f"[s2] Transcribing {audio_path} (lang={language})...")

    failures = []
    segments = None

    # 1. Remote attempt
    try:
        print("[s2] Trying Pollinations API...")
        remote = _segments_from_pollinations(audio_path, language)
        if remote:
            print(f"[s2] Pollinations returned {len(remote)} segments ✓")
            segments = remote
    except Exception as exc:
        print(f"[s2] Pollinations error ({exc}) — falling back to local backend.")
        failures.append(f"Pollinations: {exc}")
        segments = None

    # 2. Local attempt (GPU or CPU), only when the remote one gave nothing
    if segments is None:
        try:
            backend = _get_local_whisper_backend()
            print(f"[s2] Trying local backend ({backend})...")
            segments = _segments_from_local_backend(audio_path, language)
            if segments:
                print(f"[s2] Local backend returned {len(segments)} segments ✓")
        except Exception as exc:
            print(f"[s2] Local backend error ({exc}).")
            failures.append(f"Local backend: {exc}")
            segments = None

    if segments is None:
        suffix = f" Details: {' | '.join(failures)}" if failures else ""
        raise RuntimeError(f"Transcription failed on all available backends.{suffix}")

    before = len(segments)
    segments = _split_oversized_segments(segments)
    if len(segments) != before:
        print(f"[s2] Split {before} oversized segment(s) → {len(segments)} segments")

    return _normalise_segments(segments)
steps/s3_translate.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Step 4: Translate segment texts using Pollinations chat completions API
3
+ (OpenAI-compatible endpoint, no extra API key needed beyond POLLEN_API_KEY).
4
+ """
5
+ import re
6
+
7
+ from .lang._shared import build_client, bedrock_fallback, parse_json_array, MODEL, log_llm_call
8
+ from .lang import get_translation_prompt, get_fallback_mode, post_translate
9
+
10
+
11
def _translate_batch(segments: list[dict], target_language: str) -> list[dict]:
    """Translate a batch of segments into target_language.

    The batch is sent as one numbered list to the chat-completions client from
    ``build_client()``. The reply must parse into a JSON array with exactly one
    item per segment; a wrong count is retried once. Any exception on that
    path (including a final count mismatch) triggers the fallback: the Bedrock
    helper when ``get_fallback_mode`` returns "bedrock", otherwise Google
    Translate, one segment at a time.

    Args:
        segments: Segment dicts; ``start``, ``end`` and ``text`` are read.
        target_language: Language name interpolated into the prompt and,
            lower-cased, passed to the Google Translate fallback.

    Returns:
        New dicts copying each segment and adding ``translated_text``. An
        empty input is returned as-is.

    Raises:
        Exception: errors raised by the Google Translate fallback propagate.
    """
    if not segments:
        return segments

    # Build single-shot batch: include duration so the LLM can match spoken length
    numbered = "\n".join(
        f"{i+1}. [{s['end'] - s['start']:.1f}s] {s['text']}"
        for i, s in enumerate(segments)
    )

    # Default prompt (generic, works for most languages)
    default_prompt = (
        f"You are a voice-over dubbing writer — not a translator. "
        f"Your job is to write what a native {target_language} speaker would *actually say out loud* "
        f"in a casual, natural conversation. Forget the source words. Capture the meaning, tone, and energy.\n\n"

        f"INPUT FORMAT:\n"
        f"Numbered lines with a spoken duration in brackets, e.g.: 1. [4.6s] Hello there\n\n"

        f"OUTPUT FORMAT:\n"
        f"A JSON array of {target_language} strings — one per input line, in order. "
        f"No numbering, no brackets, no extra text.\n"
        f'Shape: ["<first line translated into {target_language}>", "<second line translated into {target_language}>"]\n\n'

        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        f"SCORING RUBRIC — evaluate every line against these before outputting:\n"
        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        f"[1] NATURALNESS — weight: HIGH\n"
        f" Would a native speaker actually say this in real life?\n"
        f" ✗ Fail: dictionary phrasing, formal register, textbook grammar\n"
        f" ✓ Pass: contractions, colloquial rhythm, everyday vocabulary\n"
        f" Ask yourself: 'Would I hear this in a TV show or on the street?' If no → rewrite.\n\n"

        f"[2] SPOKEN FIT — weight: CRITICAL\n"
        f" The line will be read by TTS within the duration shown in brackets.\n"
        f" Fewer words is almost always safer. Aim for 70–80% of the original word count.\n"
        f" ✗ Fail: translation is longer or same length as the English\n"
        f" ✓ Pass: shorter, with no loss of core meaning or emotional tone\n"
        f" Trick: cut filler, merge ideas, use contractions and short-form spoken words.\n\n"

        f"[3] TTS READABILITY — weight: HIGH\n"
        f" Long sentences with multiple commas trip up TTS engines.\n"
        f" ✗ Fail: 'She met him, her true love, on a rainy evening, in the city she once fled.'\n"
        f" ✓ Pass: 'She met him on a rainy evening. Her true love. In the city she once fled.'\n"
        f" Short beats. Natural pauses. Each sentence punches clean.\n\n"

        f"[4] EMOTIONAL REGISTER — weight: HIGH\n"
        f" Match the tone of the original: casual, urgent, tender, funny, sarcastic — whatever it is.\n"
        f" ✗ Fail: a sarcastic line becomes polite; a tender moment becomes clinical\n"
        f" ✓ Pass: the emotional texture is preserved even if the words are completely different\n\n"

        f"[5] TRANSLATION PURITY — weight: MEDIUM\n"
        f" Every word in the output must be {target_language}. No words from the original "
        f"language should leak through.\n"
        f" This includes: filler words (Oh, Hmm, Well, So, Right when not native to "
        f"{target_language}), names used as exclamations, brand-style interjections. "
        f"Find the {target_language} equivalent every time.\n\n"

        f"[6] WORD-FOR-WORD TRAP — weight: HIGH (avoid this)\n"
        f" Do NOT translate word by word. No one speaks that way.\n"
        f" ✗ Fail: a literal one-to-one rendering that preserves the source word order\n"
        f" ✓ Pass: a restructured line that reads naturally in {target_language} "
        f"while keeping the same meaning\n"
        f" Restructure freely. {target_language} has its own natural word order — use it.\n\n"

        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        f"BEFORE RETURNING OUTPUT:\n"
        f"For each line, silently run this checklist:\n"
        f" □ Would a native speaker say this naturally out loud?\n"
        f" □ Is it shorter than the English original?\n"
        f" □ Are there any commas that create awkward TTS pauses? → break into short sentences\n"
        f" □ Does the emotional tone match?\n"
        f" □ Are there any English words hiding in the output?\n"
        f"If any box fails → rewrite that line. Then output.\n"
        f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        f"Return ONLY the JSON array. No preamble, no explanation, no duration prefixes."
    )

    # Let language-specific handler override the prompt if needed
    system_prompt = get_translation_prompt(target_language, default_prompt)

    # The item-count rule is appended to whichever prompt was selected, so the
    # one-output-per-input contract holds for language-specific prompts too.
    expected = len(segments)
    strict_prompt = (
        system_prompt
        + f"\n\nCRITICAL: You MUST return exactly {expected} items in the JSON array "
        f"— one per input line. Do NOT merge, skip, or split any lines."
    )

    client = build_client()
    max_retries = 2
    try:
        for attempt in range(1, max_retries + 1):
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": strict_prompt},
                    {"role": "user", "content": numbered},
                ],
                temperature=0.2,
            )

            raw = response.choices[0].message.content.strip()
            log_llm_call(
                step="s3_translate", provider="pollinations", model=MODEL,
                system_prompt=strict_prompt, user_prompt=numbered,
                response=raw, temperature=0.2,
            )
            translated_list = parse_json_array(raw)

            # Only an exact one-to-one count is usable; otherwise retry.
            if len(translated_list) == expected:
                break

            print(f"[s3] Pollinations returned {len(translated_list)}/{expected} items (attempt {attempt}/{max_retries})")
            if attempt == max_retries:
                # Raised inside the try so the handler below routes to the fallback.
                raise ValueError(
                    f"Translation returned {len(translated_list)} items but expected {expected} after {max_retries} attempts"
                )

        # Strip any leading "1." / "1)" / "1-" numbering the model echoed back.
        cleaned = [re.sub(r'^\d+[\.\)\-]\s*', '', t) for t in translated_list]

        result = []
        for seg, translated_text in zip(segments, cleaned):
            result.append({**seg, "translated_text": translated_text})

        print(f"[s3] Translating via Pollinations complete ✓")
        return result

    except Exception as e:
        print(f"[s3] Pollinations translation error ({e}) — using fallback.")

        # Language-specific fallback routing
        if get_fallback_mode(target_language) == "bedrock":
            # Receives the prompt without the appended item-count rule.
            return bedrock_fallback(segments, numbered, system_prompt)

        # Default: Google Translate
        from deep_translator import GoogleTranslator
        try:
            translator = GoogleTranslator(source='auto', target=target_language.lower())
        except Exception as e2:
            print(f"[s3] Fallback failed to init translator ({e2})")
            raise

        # Translated one segment at a time; errors here are not caught.
        result = []
        for seg in segments:
            translated_text = translator.translate(seg["text"])
            result.append({**seg, "translated_text": translated_text})

        print(f"[s3] Translation via fallback complete ✓")
        return result
163
+
164
+
165
def translate(segments: list[dict], target_language: str) -> list[dict]:
    """
    Add a translation to every segment, working through the list in batches.

    Args:
        segments: List of {start, end, text} dicts.
        target_language: Full language name, e.g. "Spanish", "French", "Hindi".

    Returns:
        The segments with 'translated_text' added to each one, after the
        language-specific post-processing hook has run (which may add further
        fields such as 'tts_text'). An empty input is returned unchanged.
    """
    if not segments:
        return segments

    total = len(segments)
    print(f"[s3] Translating {total} segments → {target_language} (in batches)...")

    batch_size = 15
    announce_batches = total > batch_size
    translated = []

    for batch_no, offset in enumerate(range(0, total, batch_size), start=1):
        batch = segments[offset:offset + batch_size]
        if announce_batches:
            print(f"[s3] Processing batch {batch_no} ({len(batch)} items)...")
        translated.extend(_translate_batch(batch, target_language))

    # Run language-specific post-processing (e.g., Urdu transliteration)
    return post_translate(translated, target_language)