"""Urdu-specific translation handlers. Handles: - Urdu-specific translation prompt (Nastaliq script, spoken Urdu vocabulary) - Urdu → Devanagari transliteration for TTS (Chatterbox needs Devanagari) - Devanagari → Urdu script conversion for captions """ import json import re from ._shared import build_client, parse_json_array, bedrock_converse, MODEL, log_llm_call # ── Public dispatcher hooks ────────────────────────────────────────────────── def get_translation_prompt() -> str: """Return the Urdu-specific system prompt for translation.""" return ( "You are a professional voice-over translator for commonly spoken Urdu. " "Translate the following numbered lines into Urdu (Nastaliq/Arabic script).\n\n" "LANGUAGE RULES:\n" "- Use ONLY everyday spoken Urdu — the kind heard on Pakistani news, dramas, and streets.\n" "- Use Urdu, Persian, and Arabic-origin vocabulary ONLY. " "NEVER use Sanskrit-origin Hindi words (e.g. use محبت not پیار, زندگی not جیون, " "وقت not سمے, لیکن not پرنتو, اگر not یدی).\n" "- Keep it natural and conversational, not literary or formal.\n" "- NEVER insert English words, interjections, or filler sounds (Oh, Ah, Hmm, Well, So). " "Translate ALL such expressions into Urdu equivalents.\n\n" "CRITICAL — DURATION CONSTRAINT:\n" "Each line shows its spoken duration in brackets (e.g. [4.6s]). " "The translation will be spoken by TTS and MUST fit within that duration.\n" "STRICT RULE: Your translation MUST have FEWER words than the original English. " "If the English has 10 words, aim for 7-8 Urdu words maximum.\n" "Every word must earn its place — if removing a word doesn't lose core meaning, remove it. " "Paraphrase aggressively. Use shorter synonyms. Merge clauses. " "A concise translation that fits the time is ALWAYS better than a complete one that overflows.\n\n" "TTS COMPATIBILITY — IMPORTANT:\n" "The TTS model struggles with long sentences that have multiple commas or clauses. " "Restructure into short, direct sentences — but the TOTAL text must still fit the duration shown in brackets. " "Do NOT add extra words or content when restructuring. The goal is simpler phrasing, not more text.\n" "Each output line is still ONE item in the array (one per input line). " "You may use multiple short sentences within that single line, but it must all fit the original duration.\n\n" "Write ONLY in Urdu script (Nastaliq/Arabic script). " "Return ONLY a JSON array of translated strings, in order, no extra text. " "Do NOT include the duration prefix or numbering in the output — only the translated text itself. " 'Example input: 1. [3.0s] Hello\n2. [2.5s] Goodbye ' 'Example output: ["سلام", "خدا حافظ"]' ) def get_fallback_mode() -> str: """Urdu uses Bedrock instead of Google Translate as fallback.""" return "bedrock" _ENGLISH_FILLERS = re.compile( r'\b(Oh|Ah|Hmm|Well|So|Right|Okay|OK|Um|Uh|Hey|Wow|Ooh|Aah)[\.\!\,]?\s*', re.IGNORECASE, ) def post_translate(segments: list[dict]) -> list[dict]: """Run Urdu-specific post-processing after translation. - Strips leaked English fillers. - Transliterates Urdu script → Devanagari for TTS (sets 'tts_text'). - Captions use translated_text directly (already Urdu/Nastaliq script). """ for seg in segments: text = seg.get("translated_text", "") # Strip leaked English fillers clean_text = _ENGLISH_FILLERS.sub("", text).strip() seg["translated_text"] = clean_text return transliterate_to_devanagari(segments) # ── Transliteration: Urdu → Devanagari (for TTS) ──────────────────────────── _URDU_TO_DEVA = { 'آ': 'आ', 'ب': 'ब', 'پ': 'प', 'ت': 'त', 'ٹ': 'ट', 'ث': 'स', 'ج': 'ज', 'چ': 'च', 'ح': 'ह', 'خ': 'ख़', 'د': 'द', 'ڈ': 'ड', 'ذ': 'ज़', 'ر': 'र', 'ڑ': 'ड़', 'ز': 'ज़', 'ژ': 'झ', 'س': 'स', 'ش': 'श', 'ص': 'स', 'ض': 'ज़', 'ط': 'त', 'ظ': 'ज़', 'ع': 'अ', 'غ': 'ग़', 'ف': 'फ़', 'ق': 'क़', 'ک': 'क', 'ك': 'क', 'گ': 'ग', 'ل': 'ल', 'م': 'म', 'ن': 'न', 'ں': 'ं', 'و': 'व', 'ہ': 'ह', 'ه': 'ह', 'ھ': '्ह', 'ی': 'य', 'ي': 'य', 'ے': 'े', 'ئ': 'इ', 'َ': 'ा', 'ِ': 'ि', 'ُ': 'ु', 'ٰ': 'ा', 'ّ': '्', 'ً': 'न', 'ٔ': '', 'ء': '', 'ؓ': '', '۔': '।', '،': ',', '؟': '?', '؛': ';', } def _urdu_to_rough_devanagari(text: str) -> str: """Deterministic character mapping from Urdu to Devanagari. Consonants are mapped correctly, but short vowels are omitted/incorrect because Urdu script doesn't explicitly mark them.""" result = [] for i, ch in enumerate(text): if ch == 'ا': # Word-initial alif is 'अ', otherwise 'ा' result.append('अ' if i == 0 or text[i - 1] == ' ' else 'ा') elif ch in _URDU_TO_DEVA: result.append(_URDU_TO_DEVA[ch]) else: result.append(ch) # Fix a common edge case: ئ + ے (e.g., in بروئے) rough = ''.join(result) rough = rough.replace('इे', 'ए') return rough def _polish_devanagari_vowels(client, model, numbered, expected_count, max_attempts=2): """Use an LLM to ONLY fix vowels in the rough Devanagari conversion, preserving exact vocabulary.""" prompt = ( "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH " "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n" "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari.\n\n" "STRICT RULES:\n" "- Do NOT change, replace, or translate ANY word. Keep every single Urdu word exactly.\n" "- Only add or fix vowel matras (ा ि ी ु ू े ै ो ौ ं ँ)\n" "- Add nuqta dots where needed: क़ ख़ ग़ ज़ फ़\n" "- Add halant (्) for conjuncts where needed\n\n" "EXAMPLES:\n" "Urdu: محبت | rough: महबत | fixed: मोहब्बत\n" "Urdu: استعمال | rough: असतअमाल | fixed: इस्तेमाल\n" "Urdu: حکمت | rough: हकमत | fixed: हिकमत\n" "Urdu: طاقت | rough: ताक़त | fixed: ताक़त\n" "Urdu: ہمدردی | rough: हमदरदय | fixed: हमदर्दी\n" "Urdu: پیروی | rough: पयरवय | fixed: पैरवी\n" "Urdu: کریم | rough: करयम | fixed: करीम\n\n" "Return ONLY a JSON array of corrected Devanagari strings, in order, one per input." ) for attempt in range(1, max_attempts + 1): try: response = client.chat.completions.create( model=model, messages=[ {"role": "system", "content": prompt}, {"role": "user", "content": numbered}, ], temperature=0.1, ) raw = response.choices[0].message.content.strip() log_llm_call( step="urdu_vowel_polish", provider="pollinations", model=model, system_prompt=prompt, user_prompt=numbered, response=raw, temperature=0.1, ) try: polished_list = parse_json_array(raw) except (json.JSONDecodeError, ValueError): print(f"[urdu] Attempt {attempt}: Could not parse response as JSON") continue if len(polished_list) != expected_count: print(f"[urdu] Attempt {attempt}: Got {len(polished_list)} items, expected {expected_count}") continue # Quick check if it's returning Arabic/Urdu script instead sample = " ".join(polished_list[:3]) bad_chars = sum(1 for ch in sample if '\u0600' <= ch <= '\u06FF') if bad_chars > 0: print(f"[urdu] Attempt {attempt}: Output still contains Urdu script — retrying") prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + prompt continue return polished_list except Exception as e: print(f"[urdu] LLM error on attempt {attempt}: {e}") return None def transliterate_to_devanagari(segments: list[dict]) -> list[dict]: """Convert Urdu script translations to Devanagari for TTS. Adds 'tts_text' field to each segment. Uses a hybrid approach: Deterministic char mapping + LLM vowel polishing.""" if not segments: return segments print("[urdu] Starting Hybrid Urdu → Devanagari conversion...") # Step 1: Deterministic mapping to rough Devanagari rough_texts = [] for seg in segments: urdu_text = seg.get("translated_text", "") rough_deva = _urdu_to_rough_devanagari(urdu_text) rough_texts.append(rough_deva) expected = len(segments) numbered = "\n".join( f"{i + 1}. Urdu: {seg.get('translated_text', '')}\n Rough: {rough_texts[i]}" for i, seg in enumerate(segments) ) # Try Pollinations client = build_client() polished_list = _polish_devanagari_vowels(client, MODEL, numbered, expected) if polished_list: for seg, deva_text in zip(segments, polished_list): seg["tts_text"] = deva_text print("[urdu] Urdu → Devanagari hybrid transliteration complete ✓") return segments print("[urdu] Pollinations Polish failed — trying Bedrock fallback...") # Bedrock Fallback try: system_prompt = ( "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH " "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n" "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari. Do NOT change/replace/translate ANY word.\n\n" "EXAMPLES:\nمحبت | rough: महबत | fixed: मोहब्बत\nاستعمال | rough: असतअमाल | fixed: इस्तेमाल\n" "حکمت | rough: हकमत | fixed: हिकमत\nहमदरदی | rough: हमदरदय | fixed: हमदर्दी\n\n" "Return ONLY a JSON array of corrected Devanagari strings." ) for attempt in range(1, 3): raw = bedrock_converse(system_prompt, numbered, step="urdu_vowel_polish_bedrock") try: polished_list = parse_json_array(raw) except (json.JSONDecodeError, ValueError): print(f"[urdu] Bedrock attempt {attempt}: Could not parse response") continue if len(polished_list) != expected: print(f"[urdu] Bedrock attempt {attempt}: Got {len(polished_list)} items, expected {expected}") continue sample = " ".join(polished_list[:3]) bad_chars = sum(1 for ch in sample if '\u0600' <= ch <= '\u06FF') if bad_chars > 0: print(f"[urdu] Bedrock attempt {attempt}: Output contains Urdu script — retrying") system_prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_prompt continue for seg, deva_text in zip(segments, polished_list): seg["tts_text"] = deva_text print("[urdu] Urdu → Devanagari transliteration (Bedrock) complete ✓") return segments except Exception as e: print(f"[urdu] WARNING: Bedrock fallback failed ({e})") print("[urdu] WARNING: All polishing failed. Falling back to rough Devanagari.") for seg, r_text in zip(segments, rough_texts): seg["tts_text"] = r_text return segments # ── Transliteration: Devanagari → Urdu script (for captions) ──────────────── def transliterate_to_urdu_script(segments: list[dict]) -> list[dict]: """Convert Devanagari Urdu translations to Urdu (Nastaliq/Arabic) script for subtitles. Adds 'caption_text' field to each segment.""" if not segments: return segments texts = [seg.get("translated_text", "") for seg in segments] numbered = "\n".join(f"{i + 1}. {t}" for i, t in enumerate(texts)) system_prompt = ( "You are a script converter. Convert the following Devanagari Urdu text into Urdu script (Nastaliq/Arabic script). " "This is NOT translation — the language is already Urdu, just written in Devanagari. " "Convert it to proper Urdu script preserving every word exactly.\n\n" "Return ONLY a JSON array of converted strings, in order, no extra text. " "Do NOT include numbering in the output." ) client = build_client() try: response = client.chat.completions.create( model=MODEL, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": numbered}, ], temperature=0.1, ) raw = response.choices[0].message.content.strip() log_llm_call( step="urdu_script_convert", provider="pollinations", model=MODEL, system_prompt=system_prompt, user_prompt=numbered, response=raw, temperature=0.1, ) urdu_list = parse_json_array(raw) if len(urdu_list) != len(segments): print(f"[urdu] WARNING: Urdu script returned {len(urdu_list)} items, expected {len(segments)}. Using Devanagari for captions") return segments for seg, urdu_text in zip(segments, urdu_list): seg["caption_text"] = urdu_text print("[urdu] Urdu script transliteration complete ✓") return segments except Exception as e: print(f"[urdu] Pollinations transliteration failed ({e}) — trying Bedrock...") try: raw = bedrock_converse(system_prompt, numbered, step="urdu_script_convert_bedrock") urdu_list = parse_json_array(raw) if len(urdu_list) != len(segments): print(f"[urdu] WARNING: Bedrock Urdu script returned {len(urdu_list)} items, expected {len(segments)}. Using Devanagari for captions") return segments for seg, urdu_text in zip(segments, urdu_list): seg["caption_text"] = urdu_text print("[urdu] Urdu script transliteration (Bedrock) complete ✓") return segments except Exception as e2: print(f"[urdu] WARNING: Bedrock transliteration also failed ({e2}), using Devanagari for captions") return segments