Spaces:
Running on Zero
Running on Zero
| """Urdu-specific translation handlers. | |
| Handles: | |
| - Urdu-specific translation prompt (Nastaliq script, spoken Urdu vocabulary) | |
| - Urdu → Devanagari transliteration for TTS (Chatterbox needs Devanagari) | |
| - Devanagari → Urdu script conversion for captions | |
| """ | |
| import json | |
| import re | |
| from ._shared import build_client, parse_json_array, bedrock_converse, MODEL, log_llm_call | |
| # ── Public dispatcher hooks ────────────────────────────────────────────────── | |
def get_translation_prompt() -> str:
    """Build the system prompt used when translating numbered lines into Urdu.

    The prompt is assembled from five thematic sections (role, language
    rules, duration constraint, TTS compatibility, output format) that are
    concatenated without any separator, so the resulting text is exactly the
    sum of the literals below.

    Returns:
        The complete system prompt as a single string.
    """
    # Who the model is and what it is asked to do.
    role_section = (
        "You are a professional voice-over translator for commonly spoken Urdu. "
        "Translate the following numbered lines into Urdu (Nastaliq/Arabic script).\n\n"
    )

    # Vocabulary and register constraints.
    language_section = (
        "LANGUAGE RULES:\n"
        "- Use ONLY everyday spoken Urdu — the kind heard on Pakistani news, dramas, and streets.\n"
        "- Use Urdu, Persian, and Arabic-origin vocabulary ONLY. "
        "NEVER use Sanskrit-origin Hindi words (e.g. use محبت not پیار, زندگی not جیون, "
        "وقت not سمے, لیکن not پرنتو, اگر not یدی).\n"
        "- Keep it natural and conversational, not literary or formal.\n"
        "- NEVER insert English words, interjections, or filler sounds (Oh, Ah, Hmm, Well, So). "
        "Translate ALL such expressions into Urdu equivalents.\n\n"
    )

    # The translation is spoken by TTS and has to fit the per-line duration.
    duration_section = (
        "CRITICAL — DURATION CONSTRAINT:\n"
        "Each line shows its spoken duration in brackets (e.g. [4.6s]). "
        "The translation will be spoken by TTS and MUST fit within that duration.\n"
        "STRICT RULE: Your translation MUST have FEWER words than the original English. "
        "If the English has 10 words, aim for 7-8 Urdu words maximum.\n"
        "Every word must earn its place — if removing a word doesn't lose core meaning, remove it. "
        "Paraphrase aggressively. Use shorter synonyms. Merge clauses. "
        "A concise translation that fits the time is ALWAYS better than a complete one that overflows.\n\n"
    )

    # Sentence-shape guidance for the TTS model.
    tts_section = (
        "TTS COMPATIBILITY — IMPORTANT:\n"
        "The TTS model struggles with long sentences that have multiple commas or clauses. "
        "Restructure into short, direct sentences — but the TOTAL text must still fit the duration shown in brackets. "
        "Do NOT add extra words or content when restructuring. The goal is simpler phrasing, not more text.\n"
        "Each output line is still ONE item in the array (one per input line). "
        "You may use multiple short sentences within that single line, but it must all fit the original duration.\n\n"
    )

    # Required response shape: a bare JSON array, one string per input line.
    output_section = (
        "Write ONLY in Urdu script (Nastaliq/Arabic script). "
        "Return ONLY a JSON array of translated strings, in order, no extra text. "
        "Do NOT include the duration prefix or numbering in the output — only the translated text itself. "
        'Example input: 1. [3.0s] Hello\n2. [2.5s] Goodbye '
        'Example output: ["سلام", "خدا حافظ"]'
    )

    sections = (
        role_section,
        language_section,
        duration_section,
        tts_section,
        output_section,
    )
    return "".join(sections)
def get_fallback_mode() -> str:
    """Name the fallback translation backend for Urdu.

    Urdu falls back to Bedrock rather than Google Translate.

    Returns:
        The fallback mode identifier, ``"bedrock"``.
    """
    fallback_backend = "bedrock"
    return fallback_backend
| _ENGLISH_FILLERS = re.compile( | |
| r'\b(Oh|Ah|Hmm|Well|So|Right|Okay|OK|Um|Uh|Hey|Wow|Ooh|Aah)[\.\!\,]?\s*', | |
| re.IGNORECASE, | |
| ) | |
def post_translate(segments: list[dict]) -> list[dict]:
    """Apply Urdu-specific clean-up to freshly translated segments.

    Two things happen, in this order:

    1. Each segment's ``translated_text`` is rewritten in place with any
       leaked English fillers removed and surrounding whitespace trimmed.
    2. The segments are handed to ``transliterate_to_devanagari``, which
       adds the ``tts_text`` field used for speech synthesis.

    Captions keep using ``translated_text`` as-is, since it is already in
    Urdu script.

    Args:
        segments: Segment dicts; ``translated_text`` is read with a default
            of ``""`` when the key is absent.

    Returns:
        Whatever ``transliterate_to_devanagari`` returns for ``segments``.
    """
    for segment in segments:
        translated = segment.get("translated_text", "")
        # Remove leaked English fillers, then trim the leftover whitespace.
        segment["translated_text"] = _ENGLISH_FILLERS.sub("", translated).strip()

    return transliterate_to_devanagari(segments)
| # ── Transliteration: Urdu → Devanagari (for TTS) ──────────────────────────── | |
| _URDU_TO_DEVA = { | |
| 'آ': 'आ', 'ب': 'ब', 'پ': 'प', 'ت': 'त', 'ٹ': 'ट', 'ث': 'स', | |
| 'ج': 'ज', 'چ': 'च', 'ح': 'ह', 'خ': 'ख़', 'د': 'द', 'ڈ': 'ड', | |
| 'ذ': 'ज़', 'ر': 'र', 'ڑ': 'ड़', 'ز': 'ज़', 'ژ': 'झ', 'س': 'स', | |
| 'ش': 'श', 'ص': 'स', 'ض': 'ज़', 'ط': 'त', 'ظ': 'ज़', 'ع': 'अ', | |
| 'غ': 'ग़', 'ف': 'फ़', 'ق': 'क़', 'ک': 'क', 'ك': 'क', 'گ': 'ग', | |
| 'ل': 'ल', 'م': 'म', 'ن': 'न', 'ں': 'ं', 'و': 'व', 'ہ': 'ह', | |
| 'ه': 'ह', 'ھ': '्ह', 'ی': 'य', 'ي': 'य', 'ے': 'े', 'ئ': 'इ', | |
| 'َ': 'ा', 'ِ': 'ि', 'ُ': 'ु', 'ٰ': 'ा', 'ّ': '्', 'ً': 'न', | |
| 'ٔ': '', 'ء': '', 'ؓ': '', '۔': '।', '،': ',', '؟': '?', '؛': ';', | |
| } | |
| def _urdu_to_rough_devanagari(text: str) -> str: | |
| """Deterministic character mapping from Urdu to Devanagari. | |
| Consonants are mapped correctly, but short vowels are omitted/incorrect | |
| because Urdu script doesn't explicitly mark them.""" | |
| result = [] | |
| for i, ch in enumerate(text): | |
| if ch == 'ا': | |
| # Word-initial alif is 'अ', otherwise 'ा' | |
| result.append('अ' if i == 0 or text[i - 1] == ' ' else 'ा') | |
| elif ch in _URDU_TO_DEVA: | |
| result.append(_URDU_TO_DEVA[ch]) | |
| else: | |
| result.append(ch) | |
| # Fix a common edge case: ئ + ے (e.g., in بروئے) | |
| rough = ''.join(result) | |
| rough = rough.replace('इे', 'ए') | |
| return rough | |
| def _polish_devanagari_vowels(client, model, numbered, expected_count, max_attempts=2): | |
| """Use an LLM to ONLY fix vowels in the rough Devanagari conversion, preserving exact vocabulary.""" | |
| prompt = ( | |
| "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH " | |
| "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n" | |
| "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari.\n\n" | |
| "STRICT RULES:\n" | |
| "- Do NOT change, replace, or translate ANY word. Keep every single Urdu word exactly.\n" | |
| "- Only add or fix vowel matras (ा ि ी ु ू े ै ो ौ ं ँ)\n" | |
| "- Add nuqta dots where needed: क़ ख़ ग़ ज़ फ़\n" | |
| "- Add halant (्) for conjuncts where needed\n\n" | |
| "EXAMPLES:\n" | |
| "Urdu: محبت | rough: महबत | fixed: मोहब्बत\n" | |
| "Urdu: استعمال | rough: असतअमाल | fixed: इस्तेमाल\n" | |
| "Urdu: حکمت | rough: हकमत | fixed: हिकमत\n" | |
| "Urdu: طاقت | rough: ताक़त | fixed: ताक़त\n" | |
| "Urdu: ہمدردی | rough: हमदरदय | fixed: हमदर्दी\n" | |
| "Urdu: پیروی | rough: पयरवय | fixed: पैरवी\n" | |
| "Urdu: کریم | rough: करयम | fixed: करीम\n\n" | |
| "Return ONLY a JSON array of corrected Devanagari strings, in order, one per input." | |
| ) | |
| for attempt in range(1, max_attempts + 1): | |
| try: | |
| response = client.chat.completions.create( | |
| model=model, | |
| messages=[ | |
| {"role": "system", "content": prompt}, | |
| {"role": "user", "content": numbered}, | |
| ], | |
| temperature=0.1, | |
| ) | |
| raw = response.choices[0].message.content.strip() | |
| log_llm_call( | |
| step="urdu_vowel_polish", provider="pollinations", model=model, | |
| system_prompt=prompt, user_prompt=numbered, | |
| response=raw, temperature=0.1, | |
| ) | |
| try: | |
| polished_list = parse_json_array(raw) | |
| except (json.JSONDecodeError, ValueError): | |
| print(f"[urdu] Attempt {attempt}: Could not parse response as JSON") | |
| continue | |
| if len(polished_list) != expected_count: | |
| print(f"[urdu] Attempt {attempt}: Got {len(polished_list)} items, expected {expected_count}") | |
| continue | |
| # Quick check if it's returning Arabic/Urdu script instead | |
| sample = " ".join(polished_list[:3]) | |
| bad_chars = sum(1 for ch in sample if '\u0600' <= ch <= '\u06FF') | |
| if bad_chars > 0: | |
| print(f"[urdu] Attempt {attempt}: Output still contains Urdu script — retrying") | |
| prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + prompt | |
| continue | |
| return polished_list | |
| except Exception as e: | |
| print(f"[urdu] LLM error on attempt {attempt}: {e}") | |
| return None | |
def transliterate_to_devanagari(segments: list[dict]) -> list[dict]:
    """Convert Urdu-script translations to Devanagari for TTS.

    Adds a ``tts_text`` field to every segment, using a hybrid approach:

    1. Deterministic character mapping produces rough Devanagari.
    2. An LLM (Pollinations first, then Bedrock) fixes the vowels.
    3. If both LLM paths fail, the rough Devanagari is used as-is.

    An LLM reply is accepted only when it has one string per segment and no
    item contains Arabic-block characters (U+0600–U+06FF). Every item is
    checked, so Urdu script late in the list cannot slip through to TTS.

    Args:
        segments: Segment dicts; ``translated_text`` is read with a default
            of ``""`` when the key is absent.

    Returns:
        The same ``segments`` list, mutated in place (returned unchanged
        when it is empty).
    """
    if not segments:
        return segments
    print("[urdu] Starting Hybrid Urdu → Devanagari conversion...")

    # Step 1: Deterministic mapping to rough Devanagari
    rough_texts = [
        _urdu_to_rough_devanagari(seg.get("translated_text", ""))
        for seg in segments
    ]
    expected = len(segments)
    numbered = "\n".join(
        f"{i + 1}. Urdu: {seg.get('translated_text', '')}\n   Rough: {rough_texts[i]}"
        for i, seg in enumerate(segments)
    )

    # Step 2a: Try Pollinations
    client = build_client()
    polished_list = _polish_devanagari_vowels(client, MODEL, numbered, expected)
    if polished_list:
        for seg, deva_text in zip(segments, polished_list):
            seg["tts_text"] = deva_text
        print("[urdu] Urdu → Devanagari hybrid transliteration complete ✓")
        return segments
    print("[urdu] Pollinations Polish failed — trying Bedrock fallback...")

    # Step 2b: Bedrock fallback. Any exception abandons this path and falls
    # through to the rough text below.
    try:
        system_prompt = (
            "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
            "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
            "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari. Do NOT change/replace/translate ANY word.\n\n"
            "EXAMPLES:\nمحبت | rough: महबत | fixed: मोहब्बत\nاستعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
            "حکمت | rough: हकमत | fixed: हिकमत\nہمدردی | rough: हमदरदय | fixed: हमदर्दी\n\n"
            "Return ONLY a JSON array of corrected Devanagari strings."
        )
        for attempt in range(1, 3):
            raw = bedrock_converse(system_prompt, numbered, step="urdu_vowel_polish_bedrock")
            try:
                polished_list = parse_json_array(raw)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Bedrock attempt {attempt}: Could not parse response")
                continue
            if len(polished_list) != expected:
                print(f"[urdu] Bedrock attempt {attempt}: Got {len(polished_list)} items, expected {expected}")
                continue
            if not all(isinstance(item, str) for item in polished_list):
                print(f"[urdu] Bedrock attempt {attempt}: Response contains non-string items")
                continue
            # Reject replies that still contain Arabic/Urdu script anywhere.
            has_urdu_script = any(
                '\u0600' <= ch <= '\u06FF'
                for item in polished_list
                for ch in item
            )
            if has_urdu_script:
                print(f"[urdu] Bedrock attempt {attempt}: Output contains Urdu script — retrying")
                # Strengthen the instruction for the next attempt.
                system_prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_prompt
                continue
            for seg, deva_text in zip(segments, polished_list):
                seg["tts_text"] = deva_text
            print("[urdu] Urdu → Devanagari transliteration (Bedrock) complete ✓")
            return segments
    except Exception as e:
        print(f"[urdu] WARNING: Bedrock fallback failed ({e})")

    # Step 3: Last resort — unpolished character mapping.
    print("[urdu] WARNING: All polishing failed. Falling back to rough Devanagari.")
    for seg, r_text in zip(segments, rough_texts):
        seg["tts_text"] = r_text
    return segments
| # ── Transliteration: Devanagari → Urdu script (for captions) ──────────────── | |
def transliterate_to_urdu_script(segments: list[dict]) -> list[dict]:
    """Convert Devanagari Urdu translations to Urdu (Nastaliq/Arabic) script for subtitles.

    On success, adds a ``caption_text`` field to every segment. The primary
    provider (Pollinations) is tried first. Bedrock is tried whenever the
    primary attempt does not yield a usable result: an exception, an
    unparseable reply, or a reply with the wrong number of items. If Bedrock
    also fails, segments are returned without ``caption_text``.

    Args:
        segments: Segment dicts; ``translated_text`` is read with a default
            of ``""`` when the key is absent.

    Returns:
        The same ``segments`` list, mutated in place (returned unchanged
        when it is empty).
    """
    if not segments:
        return segments

    texts = [seg.get("translated_text", "") for seg in segments]
    numbered = "\n".join(f"{i + 1}. {t}" for i, t in enumerate(texts))
    expected = len(segments)
    system_prompt = (
        "You are a script converter. Convert the following Devanagari Urdu text into Urdu script (Nastaliq/Arabic script). "
        "This is NOT translation — the language is already Urdu, just written in Devanagari. "
        "Convert it to proper Urdu script preserving every word exactly.\n\n"
        "Return ONLY a JSON array of converted strings, in order, no extra text. "
        "Do NOT include numbering in the output."
    )

    # Primary provider.
    client = build_client()
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": numbered},
            ],
            temperature=0.1,
        )
        raw = response.choices[0].message.content.strip()
        log_llm_call(
            step="urdu_script_convert", provider="pollinations", model=MODEL,
            system_prompt=system_prompt, user_prompt=numbered,
            response=raw, temperature=0.1,
        )
        urdu_list = parse_json_array(raw)
        if len(urdu_list) == expected:
            for seg, urdu_text in zip(segments, urdu_list):
                seg["caption_text"] = urdu_text
            print("[urdu] Urdu script transliteration complete ✓")
            return segments
        # A wrong item count is a failed attempt like any other: fall through
        # to Bedrock instead of giving up on captions immediately.
        print(f"[urdu] WARNING: Urdu script returned {len(urdu_list)} items, expected {expected} — trying Bedrock...")
    except Exception as e:
        print(f"[urdu] Pollinations transliteration failed ({e}) — trying Bedrock...")

    # Bedrock fallback.
    try:
        raw = bedrock_converse(system_prompt, numbered, step="urdu_script_convert_bedrock")
        urdu_list = parse_json_array(raw)
        if len(urdu_list) != expected:
            print(f"[urdu] WARNING: Bedrock Urdu script returned {len(urdu_list)} items, expected {expected}. Using Devanagari for captions")
            return segments
        for seg, urdu_text in zip(segments, urdu_list):
            seg["caption_text"] = urdu_text
        print("[urdu] Urdu script transliteration (Bedrock) complete ✓")
    except Exception as e2:
        print(f"[urdu] WARNING: Bedrock transliteration also failed ({e2}), using Devanagari for captions")
    return segments