Spaces:

Rafii
/

videovoice

Running on Zero

github-actions[bot]

deploy: switch to chatterbox requirements @ c80cad4

ffee483 8 days ago

8.89 kB

	"""
	Step 4: Translate segment texts using Pollinations chat completions API
	(OpenAI-compatible endpoint, no extra API key needed beyond POLLEN_API_KEY).
	"""
	import re

	from .lang._shared import build_client, bedrock_fallback, parse_json_array, MODEL, log_llm_call
	from .lang import get_translation_prompt, get_fallback_mode, post_translate


	# Matches a list-style numbering prefix ("3. ", "3) ", "3- ") at the start of a
	# line. The negative lookahead keeps decimals and ranges ("3.5", "2-3") intact.
	_NUMBERING_PREFIX = re.compile(r'^(\d+)[.)\-](?!\d)\s*')


	def _default_dubbing_prompt(target_language: str) -> str:
	    """Build the generic dubbing-writer system prompt for target_language."""
	    return (
	        "You are a voice-over dubbing writer — not a translator. "
	        f"Your job is to write what a native {target_language} speaker would actually say out loud "
	        "in a casual, natural conversation. Forget the source words. Capture the meaning, tone, and energy.\n\n"

	        "INPUT FORMAT:\n"
	        "Numbered lines with a spoken duration in brackets, e.g.: 1. [4.6s] Hello there\n\n"

	        "OUTPUT FORMAT:\n"
	        f"A JSON array of {target_language} strings — one per input line, in order. "
	        "No numbering, no brackets, no extra text.\n"
	        f'Shape: ["<first line translated into {target_language}>", "<second line translated into {target_language}>"]\n\n'

	        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
	        "SCORING RUBRIC — evaluate every line against these before outputting:\n"
	        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

	        "[1] NATURALNESS — weight: HIGH\n"
	        " Would a native speaker actually say this in real life?\n"
	        " ✗ Fail: dictionary phrasing, formal register, textbook grammar\n"
	        " ✓ Pass: contractions, colloquial rhythm, everyday vocabulary\n"
	        " Ask yourself: 'Would I hear this in a TV show or on the street?' If no → rewrite.\n\n"

	        "[2] SPOKEN FIT — weight: CRITICAL\n"
	        " The line will be read by TTS within the duration shown in brackets.\n"
	        " Fewer words is almost always safer. Aim for 70–80% of the original word count.\n"
	        " ✗ Fail: translation is longer or same length as the English\n"
	        " ✓ Pass: shorter, with no loss of core meaning or emotional tone\n"
	        " Trick: cut filler, merge ideas, use contractions and short-form spoken words.\n\n"

	        "[3] TTS READABILITY — weight: HIGH\n"
	        " Long sentences with multiple commas trip up TTS engines.\n"
	        " ✗ Fail: 'She met him, her true love, on a rainy evening, in the city she once fled.'\n"
	        " ✓ Pass: 'She met him on a rainy evening. Her true love. In the city she once fled.'\n"
	        " Short beats. Natural pauses. Each sentence punches clean.\n\n"

	        "[4] EMOTIONAL REGISTER — weight: HIGH\n"
	        " Match the tone of the original: casual, urgent, tender, funny, sarcastic — whatever it is.\n"
	        " ✗ Fail: a sarcastic line becomes polite; a tender moment becomes clinical\n"
	        " ✓ Pass: the emotional texture is preserved even if the words are completely different\n\n"

	        "[5] TRANSLATION PURITY — weight: MEDIUM\n"
	        f" Every word in the output must be {target_language}. No words from the original "
	        "language should leak through.\n"
	        " This includes: filler words (Oh, Hmm, Well, So, Right when not native to "
	        f"{target_language}), names used as exclamations, brand-style interjections. "
	        f"Find the {target_language} equivalent every time.\n\n"

	        "[6] WORD-FOR-WORD TRAP — weight: HIGH (avoid this)\n"
	        " Do NOT translate word by word. No one speaks that way.\n"
	        " ✗ Fail: a literal one-to-one rendering that preserves the source word order\n"
	        f" ✓ Pass: a restructured line that reads naturally in {target_language} "
	        "while keeping the same meaning\n"
	        f" Restructure freely. {target_language} has its own natural word order — use it.\n\n"

	        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
	        "BEFORE RETURNING OUTPUT:\n"
	        "For each line, silently run this checklist:\n"
	        " □ Would a native speaker say this naturally out loud?\n"
	        " □ Is it shorter than the English original?\n"
	        " □ Are there any commas that create awkward TTS pauses? → break into short sentences\n"
	        " □ Does the emotional tone match?\n"
	        " □ Are there any English words hiding in the output?\n"
	        "If any box fails → rewrite that line. Then output.\n"
	        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

	        "Return ONLY the JSON array. No preamble, no explanation, no duration prefixes."
	    )


	def _strip_echoed_numbering(items: list[str]) -> list[str]:
	    """Remove "N." / "N)" / "N-" prefixes the model echoed back from the numbered input.

	    A prefix is removed only when its number equals the item's 1-based position
	    (the numbering used in the request) and the delimiter is not followed by a
	    digit. Translations that legitimately start with a number ("3.5 millones",
	    "2-3 días", "1990. ...") are therefore left untouched.
	    """
	    cleaned = []
	    for position, text in enumerate(items, start=1):
	        match = _NUMBERING_PREFIX.match(text)
	        if match and int(match.group(1)) == position:
	            text = text[match.end():]
	        cleaned.append(text)
	    return cleaned


	def _google_translate_fallback(segments: list[dict], target_language: str) -> list[dict]:
	    """Translate each segment's text one at a time with deep_translator's GoogleTranslator."""
	    # Imported lazily so the dependency is only loaded when the fallback runs.
	    from deep_translator import GoogleTranslator
	    try:
	        translator = GoogleTranslator(source='auto', target=target_language.lower())
	    except Exception as e2:
	        print(f"[s3] Fallback failed to init translator ({e2})")
	        raise

	    result = [
	        {**seg, "translated_text": translator.translate(seg["text"])}
	        for seg in segments
	    ]

	    print("[s3] Translation via fallback complete ✓")
	    return result


	def _translate_batch(segments: list[dict], target_language: str) -> list[dict]:
	    """Translate a batch of segments into target_language.

	    The batch is sent as one numbered request to the chat-completions client.
	    The reply must be a JSON array with exactly one item per segment; a
	    wrong-sized reply is retried once. Any error on that path (including a
	    wrong-sized reply after the last attempt) switches to the fallback:
	    bedrock_fallback when get_fallback_mode() returns "bedrock", otherwise
	    Google Translate.

	    Args:
	        segments: Dicts with at least 'start', 'end' and 'text' keys.
	        target_language: Language name, e.g. "Spanish".

	    Returns:
	        New dicts copying each segment with 'translated_text' added. An empty
	        input is returned as-is.
	    """
	    if not segments:
	        return segments

	    # Build single-shot batch: include duration so the LLM can match spoken length
	    numbered = "\n".join(
	        f"{i+1}. [{s['end'] - s['start']:.1f}s] {s['text']}"
	        for i, s in enumerate(segments)
	    )

	    # Let language-specific handler override the generic prompt if needed
	    system_prompt = get_translation_prompt(
	        target_language, _default_dubbing_prompt(target_language)
	    )

	    expected = len(segments)
	    strict_prompt = (
	        system_prompt
	        + f"\n\nCRITICAL: You MUST return exactly {expected} items in the JSON array "
	        "— one per input line. Do NOT merge, skip, or split any lines."
	    )

	    # Built outside the try on purpose: a client construction error propagates
	    # to the caller instead of silently triggering the fallback.
	    client = build_client()
	    max_attempts = 2
	    try:
	        for attempt in range(1, max_attempts + 1):
	            response = client.chat.completions.create(
	                model=MODEL,
	                messages=[
	                    {"role": "system", "content": strict_prompt},
	                    {"role": "user", "content": numbered},
	                ],
	                temperature=0.2,
	            )

	            raw = response.choices[0].message.content.strip()
	            log_llm_call(
	                step="s3_translate", provider="pollinations", model=MODEL,
	                system_prompt=strict_prompt, user_prompt=numbered,
	                response=raw, temperature=0.2,
	            )
	            translated_list = parse_json_array(raw)

	            if len(translated_list) == expected:
	                break

	            print(f"[s3] Pollinations returned {len(translated_list)}/{expected} items (attempt {attempt}/{max_attempts})")
	            if attempt == max_attempts:
	                # Caught by the handler below, which routes to the fallback.
	                raise ValueError(
	                    f"Translation returned {len(translated_list)} items but expected {expected} after {max_attempts} attempts"
	                )

	        cleaned = _strip_echoed_numbering(translated_list)

	        result = [
	            {**seg, "translated_text": translated_text}
	            for seg, translated_text in zip(segments, cleaned)
	        ]

	        print("[s3] Translating via Pollinations complete ✓")
	        return result

	    except Exception as e:
	        print(f"[s3] Pollinations translation error ({e}) — using fallback.")

	        # Language-specific fallback routing
	        if get_fallback_mode(target_language) == "bedrock":
	            return bedrock_fallback(segments, numbered, system_prompt)

	        # Default: Google Translate
	        return _google_translate_fallback(segments, target_language)


	def translate(segments: list[dict], target_language: str, *, batch_size: int = 15) -> list[dict]:
	    """
	    Translate the text of each segment into target_language in batches.

	    Segments are split into consecutive slices of at most batch_size items,
	    each slice is handed to _translate_batch(), and the combined result is
	    passed through post_translate() for language-specific post-processing.

	    Args:
	        segments: List of {start, end, text} dicts.
	        target_language: Full language name, e.g. "Spanish", "French", "Hindi".
	        batch_size: Maximum number of segments per translation request.
	            Defaults to 15.

	    Returns:
	        The segments with 'translated_text' added to each one.
	        Language-specific fields (e.g. 'tts_text') may also be added by
	        post_translate(). An empty input is returned unchanged.

	    Raises:
	        ValueError: If batch_size is smaller than 1.
	    """
	    # Fail early with a clear message instead of range()'s "arg 3 must not be
	    # zero" error, or silently translating nothing for a negative step.
	    if batch_size < 1:
	        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

	    if not segments:
	        return segments

	    total = len(segments)
	    print(f"[s3] Translating {total} segments → {target_language} (in batches)...")

	    # Per-batch progress lines are only useful when there is more than one batch.
	    multiple_batches = total > batch_size
	    final_result = []

	    for start in range(0, total, batch_size):
	        batch = segments[start:start + batch_size]
	        if multiple_batches:
	            print(f"[s3] Processing batch {start//batch_size + 1} ({len(batch)} items)...")
	        final_result.extend(_translate_batch(batch, target_language))

	    # Run language-specific post-processing (e.g., Urdu transliteration)
	    return post_translate(final_result, target_language)