# videovoice/steps/lang/urdu.py
# Rafii's picture
# deploy: switch to chatterbox requirements @ 787c1dc
# 02ad302
"""Urdu-specific translation handlers.
Handles:
- Urdu-specific translation prompt (Nastaliq script, spoken Urdu vocabulary)
- Urdu → Devanagari transliteration for TTS (Chatterbox needs Devanagari)
- Devanagari → Urdu script conversion for captions
"""
import json
import re
import unicodedata

from ._shared import build_client, parse_json_array, bedrock_converse, MODEL, log_llm_call
# ── Public dispatcher hooks ──────────────────────────────────────────────────
def get_translation_prompt() -> str:
    """Build the system prompt used for English → Urdu voice-over translation.

    The prompt is assembled from five sections (role, vocabulary rules,
    duration constraint, TTS-friendly phrasing, output format) and returned
    as one string. The text is consumed verbatim by the LLM, so every
    character matters.

    Returns:
        The complete system prompt.
    """
    role = (
        "You are a professional voice-over translator for commonly spoken Urdu. "
        "Translate the following numbered lines into Urdu (Nastaliq/Arabic script).\n\n"
    )

    # Vocabulary: spoken Urdu only, no Sanskrit-origin words, no English fillers.
    language_rules = (
        "LANGUAGE RULES:\n"
        "- Use ONLY everyday spoken Urdu — the kind heard on Pakistani news, dramas, and streets.\n"
        "- Use Urdu, Persian, and Arabic-origin vocabulary ONLY. "
        "NEVER use Sanskrit-origin Hindi words (e.g. use محبت not پیار, زندگی not جیون, "
        "وقت not سمے, لیکن not پرنتو, اگر not یدی).\n"
        "- Keep it natural and conversational, not literary or formal.\n"
        "- NEVER insert English words, interjections, or filler sounds (Oh, Ah, Hmm, Well, So). "
        "Translate ALL such expressions into Urdu equivalents.\n\n"
    )

    # Each input line carries its spoken duration; the translation must fit it.
    duration_rules = (
        "CRITICAL — DURATION CONSTRAINT:\n"
        "Each line shows its spoken duration in brackets (e.g. [4.6s]). "
        "The translation will be spoken by TTS and MUST fit within that duration.\n"
        "STRICT RULE: Your translation MUST have FEWER words than the original English. "
        "If the English has 10 words, aim for 7-8 Urdu words maximum.\n"
        "Every word must earn its place — if removing a word doesn't lose core meaning, remove it. "
        "Paraphrase aggressively. Use shorter synonyms. Merge clauses. "
        "A concise translation that fits the time is ALWAYS better than a complete one that overflows.\n\n"
    )

    # Short sentences for the TTS model, without growing the total text.
    tts_rules = (
        "TTS COMPATIBILITY — IMPORTANT:\n"
        "The TTS model struggles with long sentences that have multiple commas or clauses. "
        "Restructure into short, direct sentences — but the TOTAL text must still fit the duration shown in brackets. "
        "Do NOT add extra words or content when restructuring. The goal is simpler phrasing, not more text.\n"
        "Each output line is still ONE item in the array (one per input line). "
        "You may use multiple short sentences within that single line, but it must all fit the original duration.\n\n"
    )

    # Output contract: a bare JSON array of Urdu-script strings.
    output_format = (
        "Write ONLY in Urdu script (Nastaliq/Arabic script). "
        "Return ONLY a JSON array of translated strings, in order, no extra text. "
        "Do NOT include the duration prefix or numbering in the output — only the translated text itself. "
        'Example input: 1. [3.0s] Hello\n2. [2.5s] Goodbye '
        'Example output: ["سلام", "خدا حافظ"]'
    )

    sections = (role, language_rules, duration_rules, tts_rules, output_format)
    return "".join(sections)
def get_fallback_mode() -> str:
    """Name the fallback translation backend for Urdu.

    Returns:
        ``"bedrock"`` — Urdu falls back to Bedrock rather than Google Translate.
    """
    fallback_backend = "bedrock"
    return fallback_backend
# Matches a standalone English interjection (case-insensitive), plus one
# optional trailing punctuation mark and any following whitespace, so the
# filler can be deleted without leaving a gap.
# The closing \b is essential: without it the pattern also eats the start of
# ordinary Latin-script words and names ("Software" -> "ftware",
# "Ahmed" -> "med", "Sony" -> "ny").
_ENGLISH_FILLERS = re.compile(
    r'\b(Oh|Ah|Hmm|Well|So|Right|Okay|OK|Um|Uh|Hey|Wow|Ooh|Aah)\b[\.\!\,]?\s*',
    re.IGNORECASE,
)
def post_translate(segments: list[dict]) -> list[dict]:
    """Apply Urdu-specific clean-up to freshly translated segments.

    For every segment, English fillers matched by ``_ENGLISH_FILLERS`` are
    removed from ``translated_text`` and the result is stripped and written
    back in place. The segments are then handed to
    ``transliterate_to_devanagari``, which sets ``tts_text`` for TTS.
    Captions keep using ``translated_text``, which is already in Urdu script.

    Args:
        segments: Segment dicts; each is mutated in place.

    Returns:
        Whatever ``transliterate_to_devanagari`` returns for ``segments``.
    """
    for segment in segments:
        translated = segment.get("translated_text", "")
        # Drop any English interjections the model leaked into the Urdu text.
        segment["translated_text"] = _ENGLISH_FILLERS.sub("", translated).strip()

    return transliterate_to_devanagari(segments)
# ── Transliteration: Urdu → Devanagari (for TTS) ────────────────────────────
# Character-level Urdu → Devanagari table. Alif ('ا') is deliberately absent:
# its mapping depends on position within the word and is handled in
# _urdu_to_rough_devanagari. Entries mapped to '' are dropped from the output.
_URDU_TO_DEVA = {
    'آ': 'आ', 'ب': 'ब', 'پ': 'प', 'ت': 'त', 'ٹ': 'ट', 'ث': 'स',
    'ج': 'ज', 'چ': 'च', 'ح': 'ह', 'خ': 'ख़', 'د': 'द', 'ڈ': 'ड',
    'ذ': 'ज़', 'ر': 'र', 'ڑ': 'ड़', 'ز': 'ज़', 'ژ': 'झ', 'س': 'स',
    'ش': 'श', 'ص': 'स', 'ض': 'ज़', 'ط': 'त', 'ظ': 'ज़', 'ع': 'अ',
    'غ': 'ग़', 'ف': 'फ़', 'ق': 'क़', 'ک': 'क', 'ك': 'क', 'گ': 'ग',
    'ل': 'ल', 'م': 'म', 'ن': 'न', 'ں': 'ं', 'و': 'व', 'ہ': 'ह',
    'ه': 'ह', 'ھ': '्ह', 'ی': 'य', 'ي': 'य', 'ے': 'े', 'ئ': 'इ',
    'َ': 'ा', 'ِ': 'ि', 'ُ': 'ु', 'ٰ': 'ा', 'ّ': '्', 'ً': 'न',
    'ٔ': '', 'ء': '', 'ؓ': '', '۔': '।', '،': ',', '؟': '?', '؛': ';',
}


def _urdu_to_rough_devanagari(text: str) -> str:
    """Deterministic character mapping from Urdu to Devanagari.

    Consonants are mapped correctly, but short vowels are omitted/incorrect
    because Urdu script doesn't explicitly mark them.

    Alif is position-dependent: at the start of a word it becomes the
    independent vowel 'अ', inside a word the matra 'ा'. A word starts at the
    beginning of the text or after any character that is neither a letter nor
    a combining mark — so newlines, tabs, brackets, quotes and punctuation
    count as boundaries, not just an ASCII space. Emitting the matra there
    would leave it with no base consonant to attach to.

    Args:
        text: Urdu-script text. Characters without a table entry are passed
            through unchanged.

    Returns:
        The rough Devanagari rendering of ``text``.
    """
    pieces = []
    in_word = False  # True while the previous character was a letter or mark
    for ch in text:
        if ch == 'ا':
            pieces.append('ा' if in_word else 'अ')
        else:
            pieces.append(_URDU_TO_DEVA.get(ch, ch))
        # General categories L* (letters) and M* (combining marks such as
        # zabar/zer/pesh) keep us inside the current word.
        in_word = unicodedata.category(ch)[0] in 'LM'

    # Fix a common edge case: ئ + ے (e.g., in بروئے)
    return ''.join(pieces).replace('इे', 'ए')
def _polish_devanagari_vowels(client, model, numbered, expected_count, max_attempts=2):
    """Ask a chat model to repair the vowels of rough Devanagari, keeping every word.

    Each attempt sends the correction prompt plus ``numbered`` to
    ``client.chat.completions.create``, logs the exchange through
    ``log_llm_call`` and validates the reply. A reply is accepted when it
    parses as a JSON array, has exactly ``expected_count`` items, and its
    first three items contain no character in U+0600–U+06FF (Arabic/Urdu
    script). If Urdu script is detected, a stronger instruction is prepended
    to the prompt before the next attempt. Any exception raised during an
    attempt is printed and counts as a failed attempt.

    Args:
        client: Chat client exposing ``chat.completions.create``.
        model: Model identifier forwarded to the client.
        numbered: User message listing each Urdu line with its rough conversion.
        expected_count: Required number of items in the returned array.
        max_attempts: How many times to query the model before giving up.

    Returns:
        The parsed list of corrected strings, or ``None`` if no attempt
        produced a valid reply.
    """
    system_prompt = (
        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari.\n\n"
        "STRICT RULES:\n"
        "- Do NOT change, replace, or translate ANY word. Keep every single Urdu word exactly.\n"
        "- Only add or fix vowel matras (ा ि ी ु ू े ै ो ौ ं ँ)\n"
        "- Add nuqta dots where needed: क़ ख़ ग़ ज़ फ़\n"
        "- Add halant (्) for conjuncts where needed\n\n"
        "EXAMPLES:\n"
        "Urdu: محبت | rough: महबत | fixed: मोहब्बत\n"
        "Urdu: استعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
        "Urdu: حکمت | rough: हकमत | fixed: हिकमत\n"
        "Urdu: طاقت | rough: ताक़त | fixed: ताक़त\n"
        "Urdu: ہمدردی | rough: हमदरदय | fixed: हमदर्दी\n"
        "Urdu: پیروی | rough: पयरवय | fixed: पैरवी\n"
        "Urdu: کریم | rough: करयम | fixed: करीम\n\n"
        "Return ONLY a JSON array of corrected Devanagari strings, in order, one per input."
    )

    for attempt in range(1, max_attempts + 1):
        try:
            conversation = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": numbered},
            ]
            completion = client.chat.completions.create(
                model=model,
                messages=conversation,
                temperature=0.1,
            )
            reply = completion.choices[0].message.content.strip()
            log_llm_call(
                step="urdu_vowel_polish", provider="pollinations", model=model,
                system_prompt=system_prompt, user_prompt=numbered,
                response=reply, temperature=0.1,
            )

            try:
                candidates = parse_json_array(reply)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Attempt {attempt}: Could not parse response as JSON")
                continue

            if len(candidates) != expected_count:
                print(f"[urdu] Attempt {attempt}: Got {len(candidates)} items, expected {expected_count}")
                continue

            # Spot-check the first few items for Arabic/Urdu script leaking through.
            preview = " ".join(candidates[:3])
            if any('\u0600' <= ch <= '\u06FF' for ch in preview):
                print(f"[urdu] Attempt {attempt}: Output still contains Urdu script — retrying")
                # Escalate the instruction for the next attempt.
                system_prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_prompt
                continue

            return candidates
        except Exception as e:
            # Best effort: a failed request just uses up one attempt.
            print(f"[urdu] LLM error on attempt {attempt}: {e}")

    return None
def transliterate_to_devanagari(segments: list[dict]) -> list[dict]:
    """Convert Urdu script translations to Devanagari for TTS.

    Sets ``tts_text`` on every segment (the dicts are mutated in place) using
    a hybrid approach: deterministic character mapping followed by LLM vowel
    polishing. Sources are tried in this order, and the first valid result wins:

    1. ``_polish_devanagari_vowels`` with the client from ``build_client()``
       (logged as "Pollinations").
    2. ``bedrock_converse``, up to two attempts. A reply is rejected if it is
       not a JSON array, has the wrong item count, or its first three items
       contain Arabic/Urdu script (U+0600–U+06FF).
    3. The unpolished deterministic conversion.

    Args:
        segments: Segment dicts; ``translated_text`` is read from each
            (missing keys are treated as an empty string).

    Returns:
        The same ``segments`` list. Empty input is returned untouched.
    """
    if not segments:
        return segments

    print("[urdu] Starting Hybrid Urdu → Devanagari conversion...")

    # Step 1: Deterministic mapping to rough Devanagari
    rough_texts = [
        _urdu_to_rough_devanagari(seg.get("translated_text", ""))
        for seg in segments
    ]

    expected = len(segments)
    numbered = "\n".join(
        f"{i + 1}. Urdu: {seg.get('translated_text', '')}\n Rough: {rough_texts[i]}"
        for i, seg in enumerate(segments)
    )

    # Try Pollinations
    client = build_client()
    polished_list = _polish_devanagari_vowels(client, MODEL, numbered, expected)

    if polished_list:
        for seg, deva_text in zip(segments, polished_list):
            seg["tts_text"] = deva_text
        print("[urdu] Urdu → Devanagari hybrid transliteration complete ✓")
        return segments

    print("[urdu] Pollinations Polish failed — trying Bedrock fallback...")

    # Bedrock Fallback
    system_prompt = (
        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari. Do NOT change/replace/translate ANY word.\n\n"
        "EXAMPLES:\nمحبت | rough: महबत | fixed: मोहब्बत\nاستعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
        # The source word of the last example must be Urdu script (it was
        # previously a corrupted Devanagari/Arabic mix).
        "حکمت | rough: हकमत | fixed: हिकमत\nہمدردی | rough: हमदरदय | fixed: हमदर्दी\n\n"
        "Return ONLY a JSON array of corrected Devanagari strings."
    )

    for attempt in range(1, 3):
        # The try is per attempt so one failed request does not cancel the
        # remaining retry, matching how _polish_devanagari_vowels handles errors.
        try:
            raw = bedrock_converse(system_prompt, numbered, step="urdu_vowel_polish_bedrock")
            try:
                polished_list = parse_json_array(raw)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Bedrock attempt {attempt}: Could not parse response")
                continue

            if len(polished_list) != expected:
                print(f"[urdu] Bedrock attempt {attempt}: Got {len(polished_list)} items, expected {expected}")
                continue

            # Spot-check the first few items for Arabic/Urdu script.
            sample = " ".join(polished_list[:3])
            if any('\u0600' <= ch <= '\u06FF' for ch in sample):
                print(f"[urdu] Bedrock attempt {attempt}: Output contains Urdu script — retrying")
                system_prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_prompt
                continue

            for seg, deva_text in zip(segments, polished_list):
                seg["tts_text"] = deva_text
            print("[urdu] Urdu → Devanagari transliteration (Bedrock) complete ✓")
            return segments
        except Exception as e:
            print(f"[urdu] WARNING: Bedrock fallback attempt {attempt} failed ({e})")

    # Last resort: TTS gets the unpolished character mapping.
    print("[urdu] WARNING: All polishing failed. Falling back to rough Devanagari.")
    for seg, r_text in zip(segments, rough_texts):
        seg["tts_text"] = r_text
    return segments
# ── Transliteration: Devanagari → Urdu script (for captions) ────────────────
def transliterate_to_urdu_script(segments: list[dict]) -> list[dict]:
    """Convert Devanagari Urdu translations to Urdu (Nastaliq/Arabic) script for subtitles.

    On success, ``caption_text`` is set on every segment (mutated in place).
    The client from ``build_client()`` is queried first. If that path raises,
    ``bedrock_converse`` is tried once. If a provider answers with the wrong
    number of items, or both providers fail, the segments are returned
    without ``caption_text`` being set.

    Args:
        segments: Segment dicts; ``translated_text`` is read from each
            (missing keys are treated as an empty string).

    Returns:
        The same ``segments`` list. Empty input is returned untouched.
    """
    if not segments:
        return segments

    numbered = "\n".join(
        f"{idx + 1}. {seg.get('translated_text', '')}"
        for idx, seg in enumerate(segments)
    )

    system_prompt = (
        "You are a script converter. Convert the following Devanagari Urdu text into Urdu script (Nastaliq/Arabic script). "
        "This is NOT translation — the language is already Urdu, just written in Devanagari. "
        "Convert it to proper Urdu script preserving every word exactly.\n\n"
        "Return ONLY a JSON array of converted strings, in order, no extra text. "
        "Do NOT include numbering in the output."
    )

    client = build_client()

    # First choice: the default chat client.
    try:
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": numbered},
        ]
        completion = client.chat.completions.create(
            model=MODEL,
            messages=conversation,
            temperature=0.1,
        )
        reply = completion.choices[0].message.content.strip()
        log_llm_call(
            step="urdu_script_convert", provider="pollinations", model=MODEL,
            system_prompt=system_prompt, user_prompt=numbered,
            response=reply, temperature=0.1,
        )

        converted = parse_json_array(reply)
        if len(converted) == len(segments):
            for seg, urdu_text in zip(segments, converted):
                seg["caption_text"] = urdu_text
            print("[urdu] Urdu script transliteration complete ✓")
        else:
            # A count mismatch is final: no Bedrock retry, captions stay as-is.
            print(f"[urdu] WARNING: Urdu script returned {len(converted)} items, expected {len(segments)}. Using Devanagari for captions")
        return segments
    except Exception as e:
        print(f"[urdu] Pollinations transliteration failed ({e}) — trying Bedrock...")

    # Reached only when the first path raised: single Bedrock attempt.
    try:
        reply = bedrock_converse(system_prompt, numbered, step="urdu_script_convert_bedrock")
        converted = parse_json_array(reply)
        if len(converted) == len(segments):
            for seg, urdu_text in zip(segments, converted):
                seg["caption_text"] = urdu_text
            print("[urdu] Urdu script transliteration (Bedrock) complete ✓")
        else:
            print(f"[urdu] WARNING: Bedrock Urdu script returned {len(converted)} items, expected {len(segments)}. Using Devanagari for captions")
    except Exception as e2:
        print(f"[urdu] WARNING: Bedrock transliteration also failed ({e2}), using Devanagari for captions")

    return segments