Spaces:

Rafii
/

videovoice

Running on Zero

App Files Files Community

videovoice / steps /lang /urdu.py

Rafii

deploy: switch to chatterbox requirements @ 787c1dc

02ad302 about 1 month ago

raw

history blame contribute delete

15.2 kB

	"""Urdu-specific translation handlers.

	Handles:
	- Urdu-specific translation prompt (Nastaliq script, spoken Urdu vocabulary)
	- Urdu → Devanagari transliteration for TTS (Chatterbox needs Devanagari)
	- Devanagari → Urdu script conversion for captions
	"""
	import json
	import re

	from ._shared import build_client, parse_json_array, bedrock_converse, MODEL, log_llm_call


	# ── Public dispatcher hooks ──────────────────────────────────────────────────

	def get_translation_prompt() -> str:
	    """Build the system prompt used when translating into spoken Urdu.

	    The prompt is assembled from five sections: the translator role, the
	    vocabulary/language rules, the spoken-duration constraint, the TTS
	    phrasing guidance, and the required JSON-array output format.

	    Returns:
	        The complete system prompt as a single string.
	    """
	    role = (
	        "You are a professional voice-over translator for commonly spoken Urdu. "
	        "Translate the following numbered lines into Urdu (Nastaliq/Arabic script).\n\n"
	    )

	    # Vocabulary guidance: steer the model away from Hindi-register words.
	    language_rules = (
	        "LANGUAGE RULES:\n"
	        "- Use ONLY everyday spoken Urdu — the kind heard on Pakistani news, dramas, and streets.\n"
	        "- Use Urdu, Persian, and Arabic-origin vocabulary ONLY. "
	        "NEVER use Sanskrit-origin Hindi words (e.g. use محبت not پیار, زندگی not جیون, "
	        "وقت not سمے, لیکن not پرنتو, اگر not یدی).\n"
	        "- Keep it natural and conversational, not literary or formal.\n"
	        "- NEVER insert English words, interjections, or filler sounds (Oh, Ah, Hmm, Well, So). "
	        "Translate ALL such expressions into Urdu equivalents.\n\n"
	    )

	    # Each input line carries a "[N.Ns]" duration that the output must fit.
	    duration_rules = (
	        "CRITICAL — DURATION CONSTRAINT:\n"
	        "Each line shows its spoken duration in brackets (e.g. [4.6s]). "
	        "The translation will be spoken by TTS and MUST fit within that duration.\n"
	        "STRICT RULE: Your translation MUST have FEWER words than the original English. "
	        "If the English has 10 words, aim for 7-8 Urdu words maximum.\n"
	        "Every word must earn its place — if removing a word doesn't lose core meaning, remove it. "
	        "Paraphrase aggressively. Use shorter synonyms. Merge clauses. "
	        "A concise translation that fits the time is ALWAYS better than a complete one that overflows.\n\n"
	    )

	    tts_rules = (
	        "TTS COMPATIBILITY — IMPORTANT:\n"
	        "The TTS model struggles with long sentences that have multiple commas or clauses. "
	        "Restructure into short, direct sentences — but the TOTAL text must still fit the duration shown in brackets. "
	        "Do NOT add extra words or content when restructuring. The goal is simpler phrasing, not more text.\n"
	        "Each output line is still ONE item in the array (one per input line). "
	        "You may use multiple short sentences within that single line, but it must all fit the original duration.\n\n"
	    )

	    output_format = (
	        "Write ONLY in Urdu script (Nastaliq/Arabic script). "
	        "Return ONLY a JSON array of translated strings, in order, no extra text. "
	        "Do NOT include the duration prefix or numbering in the output — only the translated text itself. "
	        'Example input: 1. [3.0s] Hello\n2. [2.5s] Goodbye '
	        'Example output: ["سلام", "خدا حافظ"]'
	    )

	    return "".join((role, language_rules, duration_rules, tts_rules, output_format))


	def get_fallback_mode() -> str:
	    """Name the fallback translation backend for Urdu.

	    Returns:
	        The string "bedrock": Urdu falls back to Bedrock rather than
	        Google Translate.
	    """
	    fallback_backend = "bedrock"
	    return fallback_backend


	# Matches a standalone English filler/interjection (case-insensitive) together
	# with one optional trailing '.', '!' or ',' and any following whitespace, so
	# that substituting "" removes the filler cleanly.
	#
	# The alternatives are separated by a bare '|': an escaped '\|' would match a
	# literal pipe character and the pattern would never strip anything.
	# The word boundary after the group keeps longer Latin-script words that merely
	# start with a filler (e.g. "Sony", "Ahmed", "Wellington") intact.
	_ENGLISH_FILLERS = re.compile(
	    r'\b(Oh|Ah|Hmm|Well|So|Right|Okay|OK|Um|Uh|Hey|Wow|Ooh|Aah)\b[.!,]?\s*',
	    re.IGNORECASE,
	)


	def post_translate(segments: list[dict]) -> list[dict]:
	    """Apply the Urdu-specific clean-up that follows translation.

	    For every segment, English filler words that leaked into
	    'translated_text' are removed and the result is whitespace-trimmed and
	    written back (the key is created if it was missing). The segments are
	    then handed to transliterate_to_devanagari, which fills in 'tts_text'.

	    Args:
	        segments: Segment dicts; each is updated in place.

	    Returns:
	        Whatever transliterate_to_devanagari returns for these segments.
	    """
	    for segment in segments:
	        untrimmed = _ENGLISH_FILLERS.sub("", segment.get("translated_text", ""))
	        segment["translated_text"] = untrimmed.strip()

	    return transliterate_to_devanagari(segments)


	# ── Transliteration: Urdu → Devanagari (for TTS) ────────────────────────────

	# Per-character Urdu → Devanagari lookup used for the rough transliteration.
	# Plain alif ('ا') is deliberately absent: it is position-dependent and is
	# handled by the transliteration function itself.
	_URDU_TO_DEVA = {
	    # Letters (several Urdu letters collapse onto the same Devanagari sign)
	    'آ': 'आ',
	    'ب': 'ब',
	    'پ': 'प',
	    'ت': 'त',
	    'ٹ': 'ट',
	    'ث': 'स',
	    'ج': 'ज',
	    'چ': 'च',
	    'ح': 'ह',
	    'خ': 'ख़',
	    'د': 'द',
	    'ڈ': 'ड',
	    'ذ': 'ज़',
	    'ر': 'र',
	    'ڑ': 'ड़',
	    'ز': 'ज़',
	    'ژ': 'झ',
	    'س': 'स',
	    'ش': 'श',
	    'ص': 'स',
	    'ض': 'ज़',
	    'ط': 'त',
	    'ظ': 'ज़',
	    'ع': 'अ',
	    'غ': 'ग़',
	    'ف': 'फ़',
	    'ق': 'क़',
	    'ک': 'क',
	    'ك': 'क',
	    'گ': 'ग',
	    'ل': 'ल',
	    'م': 'म',
	    'ن': 'न',
	    'ں': 'ं',
	    'و': 'व',
	    'ہ': 'ह',
	    'ه': 'ह',
	    'ھ': '्ह',
	    'ی': 'य',
	    'ي': 'य',
	    'ے': 'े',
	    'ئ': 'इ',
	    # Diacritics (combining marks)
	    'َ': 'ा',
	    'ِ': 'ि',
	    'ُ': 'ु',
	    'ٰ': 'ा',
	    'ّ': '्',
	    'ً': 'न',
	    # Signs that are dropped from the output
	    'ٔ': '',
	    'ء': '',
	    'ؓ': '',
	    # Punctuation
	    '۔': '।',
	    '،': ',',
	    '؟': '?',
	    '؛': ';',
	}


	def _urdu_to_rough_devanagari(text: str) -> str:
	    """Map Urdu script to Devanagari one character at a time.

	    Consonants come out right, but short vowels are missing or wrong because
	    Urdu script does not write them explicitly; the result is only a rough
	    draft.

	    Args:
	        text: Urdu-script text.

	    Returns:
	        The rough Devanagari rendering. Characters with no entry in
	        _URDU_TO_DEVA (including whitespace) are passed through unchanged.
	    """
	    pieces = []
	    # True at the start of the text and right after any whitespace character.
	    # Checking isspace() rather than only ' ' means an alif that follows a
	    # newline, tab or non-breaking space is still treated as word-initial
	    # instead of becoming a dangling 'ा' matra with no base letter.
	    at_word_start = True
	    for ch in text:
	        if ch == 'ا':
	            # Word-initial alif is the independent vowel 'अ', otherwise 'ा'.
	            pieces.append('अ' if at_word_start else 'ा')
	        elif ch.isspace():
	            # No whitespace character is a key of the mapping table.
	            pieces.append(ch)
	        else:
	            pieces.append(_URDU_TO_DEVA.get(ch, ch))
	        at_word_start = ch.isspace()

	    # Fix a common edge case: ئ + ے (e.g., in بروئے) should read as 'ए'.
	    return ''.join(pieces).replace('इे', 'ए')


	def _polish_devanagari_vowels(client, model, numbered, expected_count, max_attempts=2):
	    """Ask an LLM to fix only the vowels of a rough Devanagari conversion.

	    Args:
	        client: Chat client exposing client.chat.completions.create(...).
	        model: Model identifier passed through to the client.
	        numbered: User message holding the numbered Urdu / rough-Devanagari pairs.
	        expected_count: Number of items the returned JSON array must contain.
	        max_attempts: How many times to call the model before giving up.

	    Returns:
	        The list of polished strings on success, or None when every attempt
	        failed (request error, unparsable JSON, wrong item count, or output
	        that still contains Arabic-block characters).
	    """
	    # The example rows use a plain '|' separator. Writing '\|' inside a normal
	    # string literal is an invalid escape sequence (a SyntaxWarning on
	    # Python 3.12+) and would also send stray backslashes to the model.
	    prompt = (
	        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
	        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
	        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari.\n\n"
	        "STRICT RULES:\n"
	        "- Do NOT change, replace, or translate ANY word. Keep every single Urdu word exactly.\n"
	        "- Only add or fix vowel matras (ा ि ी ु ू े ै ो ौ ं ँ)\n"
	        "- Add nuqta dots where needed: क़ ख़ ग़ ज़ फ़\n"
	        "- Add halant (्) for conjuncts where needed\n\n"
	        "EXAMPLES:\n"
	        "Urdu: محبت | rough: महबत | fixed: मोहब्बत\n"
	        "Urdu: استعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
	        "Urdu: حکمت | rough: हकमत | fixed: हिकमत\n"
	        "Urdu: طاقت | rough: ताक़त | fixed: ताक़त\n"
	        "Urdu: ہمدردی | rough: हमदरदय | fixed: हमदर्दी\n"
	        "Urdu: پیروی | rough: पयरवय | fixed: पैरवी\n"
	        "Urdu: کریم | rough: करयम | fixed: करीम\n\n"
	        "Return ONLY a JSON array of corrected Devanagari strings, in order, one per input."
	    )

	    for attempt in range(1, max_attempts + 1):
	        try:
	            response = client.chat.completions.create(
	                model=model,
	                messages=[
	                    {"role": "system", "content": prompt},
	                    {"role": "user", "content": numbered},
	                ],
	                temperature=0.1,
	            )
	            raw = response.choices[0].message.content.strip()
	            log_llm_call(
	                step="urdu_vowel_polish", provider="pollinations", model=model,
	                system_prompt=prompt, user_prompt=numbered,
	                response=raw, temperature=0.1,
	            )

	            try:
	                polished_list = parse_json_array(raw)
	            except (json.JSONDecodeError, ValueError):
	                print(f"[urdu] Attempt {attempt}: Could not parse response as JSON")
	                continue

	            if len(polished_list) != expected_count:
	                print(f"[urdu] Attempt {attempt}: Got {len(polished_list)} items, expected {expected_count}")
	                continue

	            # Quick check on the first few items: any character from the
	            # Arabic block (U+0600–U+06FF) means Urdu script came back.
	            sample = " ".join(polished_list[:3])
	            if any('\u0600' <= ch <= '\u06FF' for ch in sample):
	                print(f"[urdu] Attempt {attempt}: Output still contains Urdu script — retrying")
	                # Strengthen the instruction for the next attempt.
	                prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + prompt
	                continue

	            return polished_list

	        except Exception as e:
	            # Best effort: report the failure and move on to the next attempt.
	            print(f"[urdu] LLM error on attempt {attempt}: {e}")

	    return None


	def transliterate_to_devanagari(segments: list[dict]) -> list[dict]:
	    """Convert Urdu-script translations to Devanagari for TTS.

	    Hybrid approach: a deterministic character mapping produces a rough
	    Devanagari draft, then an LLM polishes the vowels. Pollinations is tried
	    first, then Bedrock (two attempts); if both fail the rough draft is used.

	    Args:
	        segments: Segment dicts whose 'translated_text' holds Urdu script.

	    Returns:
	        The same list, with a 'tts_text' field set on every segment
	        (segments are modified in place). An empty list is returned as is.
	    """
	    if not segments:
	        return segments

	    print("[urdu] Starting Hybrid Urdu → Devanagari conversion...")

	    # Step 1: Deterministic mapping to rough Devanagari
	    rough_texts = [
	        _urdu_to_rough_devanagari(seg.get("translated_text", ""))
	        for seg in segments
	    ]

	    expected = len(segments)
	    numbered = "\n".join(
	        f"{i + 1}. Urdu: {seg.get('translated_text', '')}\n Rough: {rough_texts[i]}"
	        for i, seg in enumerate(segments)
	    )

	    # Step 2: Try Pollinations
	    client = build_client()
	    polished_list = _polish_devanagari_vowels(client, MODEL, numbered, expected)

	    if polished_list:
	        for seg, deva_text in zip(segments, polished_list):
	            seg["tts_text"] = deva_text
	        print("[urdu] Urdu → Devanagari hybrid transliteration complete ✓")
	        return segments

	    print("[urdu] Pollinations Polish failed — trying Bedrock fallback...")

	    # Step 3: Bedrock fallback.
	    # The example rows use a plain '|' separator ('\|' in a normal string is
	    # an invalid escape sequence), and the last example's source word is
	    # written fully in Urdu script.
	    system_prompt = (
	        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
	        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
	        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari. Do NOT change/replace/translate ANY word.\n\n"
	        "EXAMPLES:\nمحبت | rough: महबत | fixed: मोहब्बत\nاستعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
	        "حکمت | rough: हकमत | fixed: हिकमत\nہمدردی | rough: हमदरदय | fixed: हमदर्दी\n\n"
	        "Return ONLY a JSON array of corrected Devanagari strings."
	    )

	    for attempt in range(1, 3):
	        # The try sits inside the loop so that one failed request does not
	        # cancel the remaining attempt.
	        try:
	            raw = bedrock_converse(system_prompt, numbered, step="urdu_vowel_polish_bedrock")

	            try:
	                polished_list = parse_json_array(raw)
	            except (json.JSONDecodeError, ValueError):
	                print(f"[urdu] Bedrock attempt {attempt}: Could not parse response")
	                continue

	            if len(polished_list) != expected:
	                print(f"[urdu] Bedrock attempt {attempt}: Got {len(polished_list)} items, expected {expected}")
	                continue

	            # Quick check on the first few items for Arabic-block characters.
	            sample = " ".join(polished_list[:3])
	            if any('\u0600' <= ch <= '\u06FF' for ch in sample):
	                print(f"[urdu] Bedrock attempt {attempt}: Output contains Urdu script — retrying")
	                system_prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_prompt
	                continue
	        except Exception as e:
	            print(f"[urdu] WARNING: Bedrock fallback failed ({e})")
	            continue

	        for seg, deva_text in zip(segments, polished_list):
	            seg["tts_text"] = deva_text
	        print("[urdu] Urdu → Devanagari transliteration (Bedrock) complete ✓")
	        return segments

	    # Step 4: Last resort — use the unpolished deterministic draft.
	    print("[urdu] WARNING: All polishing failed. Falling back to rough Devanagari.")
	    for seg, r_text in zip(segments, rough_texts):
	        seg["tts_text"] = r_text
	    return segments


	# ── Transliteration: Devanagari → Urdu script (for captions) ────────────────

	def transliterate_to_urdu_script(segments: list[dict]) -> list[dict]:
	    """Convert Devanagari Urdu text to Urdu (Nastaliq/Arabic) script for subtitles.

	    Pollinations is asked first. If that request fails, cannot be parsed, or
	    returns the wrong number of items, Bedrock is tried once. When both fail
	    the segments are returned without a 'caption_text' field.

	    NOTE(review): this reads 'translated_text' and treats it as Devanagari,
	    while post_translate leaves 'translated_text' in Urdu script — confirm
	    which callers still rely on this conversion.

	    Args:
	        segments: Segment dicts; 'translated_text' is the text to convert.

	    Returns:
	        The same list. On success every segment gains 'caption_text'
	        (segments are modified in place). An empty list is returned as is.
	    """
	    if not segments:
	        return segments

	    expected = len(segments)
	    texts = [seg.get("translated_text", "") for seg in segments]
	    numbered = "\n".join(f"{i + 1}. {t}" for i, t in enumerate(texts))

	    system_prompt = (
	        "You are a script converter. Convert the following Devanagari Urdu text into Urdu script (Nastaliq/Arabic script). "
	        "This is NOT translation — the language is already Urdu, just written in Devanagari. "
	        "Convert it to proper Urdu script preserving every word exactly.\n\n"
	        "Return ONLY a JSON array of converted strings, in order, no extra text. "
	        "Do NOT include numbering in the output."
	    )

	    urdu_list = None
	    done_message = "[urdu] Urdu script transliteration complete ✓"

	    # Primary provider: Pollinations.
	    client = build_client()
	    try:
	        response = client.chat.completions.create(
	            model=MODEL,
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": numbered},
	            ],
	            temperature=0.1,
	        )

	        raw = response.choices[0].message.content.strip()
	        log_llm_call(
	            step="urdu_script_convert", provider="pollinations", model=MODEL,
	            system_prompt=system_prompt, user_prompt=numbered,
	            response=raw, temperature=0.1,
	        )
	        candidate = parse_json_array(raw)

	        if len(candidate) == expected:
	            urdu_list = candidate
	        else:
	            # A wrong item count is a bad response like any other, so it
	            # falls through to Bedrock instead of giving up immediately.
	            print(f"[urdu] WARNING: Urdu script returned {len(candidate)} items, expected {expected} — trying Bedrock...")
	    except Exception as e:
	        print(f"[urdu] Pollinations transliteration failed ({e}) — trying Bedrock...")

	    # Fallback provider: Bedrock (single attempt).
	    if urdu_list is None:
	        try:
	            raw = bedrock_converse(system_prompt, numbered, step="urdu_script_convert_bedrock")
	            candidate = parse_json_array(raw)
	        except Exception as e2:
	            print(f"[urdu] WARNING: Bedrock transliteration also failed ({e2}), using Devanagari for captions")
	            return segments

	        if len(candidate) != expected:
	            print(f"[urdu] WARNING: Bedrock Urdu script returned {len(candidate)} items, expected {expected}. Using Devanagari for captions")
	            return segments

	        urdu_list = candidate
	        done_message = "[urdu] Urdu script transliteration (Bedrock) complete ✓"

	    for seg, urdu_text in zip(segments, urdu_list):
	        seg["caption_text"] = urdu_text

	    print(done_message)
	    return segments