# videovoice/steps/s3_translate.py
# github-actions[bot]
# deploy: switch to chatterbox requirements @ c80cad4
# ffee483
"""
Step 4: Translate segment texts using Pollinations chat completions API
(OpenAI-compatible endpoint, no extra API key needed beyond POLLEN_API_KEY).
"""
import re
from .lang._shared import build_client, bedrock_fallback, parse_json_array, MODEL, log_llm_call
from .lang import get_translation_prompt, get_fallback_mode, post_translate
# Matches a list number echoed back by the LLM at the start of a line:
# "3. ", "3) ", "3- ". The negative lookahead keeps decimals ("3.5 million")
# and ranges ("2-3 days") intact — the separator must not be followed by a digit.
_LINE_NUMBER_RE = re.compile(r"^(\d+)[.)\-](?!\d)\s*")


def _strip_line_number(text: str, position: int) -> str:
    """Remove an echoed list number from the start of *text*.

    The prefix is removed only when its number equals *position* (the line's
    1-based index in the batch), so a translation that legitimately starts
    with a different number (a year, a quantity) is left alone.
    """
    match = _LINE_NUMBER_RE.match(text)
    if match and int(match.group(1)) == position:
        return text[match.end():]
    return text


def _build_default_prompt(target_language: str) -> str:
    """Return the generic dubbing system prompt for *target_language*."""
    return (
        "You are a voice-over dubbing writer — not a translator. "
        f"Your job is to write what a native {target_language} speaker would *actually say out loud* "
        "in a casual, natural conversation. Forget the source words. Capture the meaning, tone, and energy.\n\n"
        "INPUT FORMAT:\n"
        "Numbered lines with a spoken duration in brackets, e.g.: 1. [4.6s] Hello there\n\n"
        "OUTPUT FORMAT:\n"
        f"A JSON array of {target_language} strings — one per input line, in order. "
        "No numbering, no brackets, no extra text.\n"
        f'Shape: ["<first line translated into {target_language}>", "<second line translated into {target_language}>"]\n\n'
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "SCORING RUBRIC — evaluate every line against these before outputting:\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        "[1] NATURALNESS — weight: HIGH\n"
        " Would a native speaker actually say this in real life?\n"
        " ✗ Fail: dictionary phrasing, formal register, textbook grammar\n"
        " ✓ Pass: contractions, colloquial rhythm, everyday vocabulary\n"
        " Ask yourself: 'Would I hear this in a TV show or on the street?' If no → rewrite.\n\n"
        "[2] SPOKEN FIT — weight: CRITICAL\n"
        " The line will be read by TTS within the duration shown in brackets.\n"
        " Fewer words is almost always safer. Aim for 70–80% of the original word count.\n"
        " ✗ Fail: translation is longer or same length as the English\n"
        " ✓ Pass: shorter, with no loss of core meaning or emotional tone\n"
        " Trick: cut filler, merge ideas, use contractions and short-form spoken words.\n\n"
        "[3] TTS READABILITY — weight: HIGH\n"
        " Long sentences with multiple commas trip up TTS engines.\n"
        " ✗ Fail: 'She met him, her true love, on a rainy evening, in the city she once fled.'\n"
        " ✓ Pass: 'She met him on a rainy evening. Her true love. In the city she once fled.'\n"
        " Short beats. Natural pauses. Each sentence punches clean.\n\n"
        "[4] EMOTIONAL REGISTER — weight: HIGH\n"
        " Match the tone of the original: casual, urgent, tender, funny, sarcastic — whatever it is.\n"
        " ✗ Fail: a sarcastic line becomes polite; a tender moment becomes clinical\n"
        " ✓ Pass: the emotional texture is preserved even if the words are completely different\n\n"
        "[5] TRANSLATION PURITY — weight: MEDIUM\n"
        f" Every word in the output must be {target_language}. No words from the original "
        "language should leak through.\n"
        " This includes: filler words (Oh, Hmm, Well, So, Right when not native to "
        f"{target_language}), names used as exclamations, brand-style interjections. "
        f"Find the {target_language} equivalent every time.\n\n"
        "[6] WORD-FOR-WORD TRAP — weight: HIGH (avoid this)\n"
        " Do NOT translate word by word. No one speaks that way.\n"
        " ✗ Fail: a literal one-to-one rendering that preserves the source word order\n"
        f" ✓ Pass: a restructured line that reads naturally in {target_language} "
        "while keeping the same meaning\n"
        f" Restructure freely. {target_language} has its own natural word order — use it.\n\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "BEFORE RETURNING OUTPUT:\n"
        "For each line, silently run this checklist:\n"
        " ░ Would a native speaker say this naturally out loud?\n"
        " ░ Is it shorter than the English original?\n"
        " ░ Are there any commas that create awkward TTS pauses? → break into short sentences\n"
        " ░ Does the emotional tone match?\n"
        " ░ Are there any English words hiding in the output?\n"
        "If any box fails → rewrite that line. Then output.\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        "Return ONLY the JSON array. No preamble, no explanation, no duration prefixes."
    )


def _google_translate_fallback(segments: list[dict], target_language: str) -> list[dict]:
    """Translate each segment's text with deep_translator's GoogleTranslator."""
    # Imported lazily so the package is only required when this fallback runs.
    from deep_translator import GoogleTranslator

    try:
        translator = GoogleTranslator(source="auto", target=target_language.lower())
    except Exception as e2:
        print(f"[s3] Fallback failed to init translator ({e2})")
        raise

    result = [
        {**seg, "translated_text": translator.translate(seg["text"])}
        for seg in segments
    ]
    print("[s3] Translation via fallback complete ✓")
    return result


def _translate_batch(segments: list[dict], target_language: str) -> list[dict]:
    """Translate a batch of segments into target_language.

    The batch is sent to the chat-completions client as one numbered list,
    each line carrying its spoken duration, and the reply is expected to be a
    JSON array with exactly one string per input line. A reply with the wrong
    number of items is retried (two attempts in total). Any exception raised
    during those attempts triggers the fallback: `bedrock_fallback` when
    `get_fallback_mode(target_language)` returns "bedrock", otherwise Google
    Translate via deep_translator.

    Args:
        segments: Dicts with at least 'start', 'end' and 'text' keys.
        target_language: Language name interpolated into the prompt and,
            lower-cased, passed to the Google Translate fallback.

    Returns:
        A new list of dicts, each a copy of the input segment with a
        'translated_text' key added. An empty input is returned unchanged.

    Raises:
        Exception: Errors from `build_client()` and from the fallback path
            itself are not caught here and propagate to the caller.
    """
    if not segments:
        return segments

    # Single-shot batch: include the duration so the LLM can match spoken length.
    numbered = "\n".join(
        f"{i}. [{seg['end'] - seg['start']:.1f}s] {seg['text']}"
        for i, seg in enumerate(segments, start=1)
    )

    # A language-specific handler may override the generic prompt.
    system_prompt = get_translation_prompt(
        target_language, _build_default_prompt(target_language)
    )

    expected = len(segments)
    strict_prompt = (
        system_prompt
        + f"\n\nCRITICAL: You MUST return exactly {expected} items in the JSON array "
        "— one per input line. Do NOT merge, skip, or split any lines."
    )

    client = build_client()
    max_retries = 2

    try:
        for attempt in range(1, max_retries + 1):
            response = client.chat.completions.create(
                model=MODEL,
                messages=[
                    {"role": "system", "content": strict_prompt},
                    {"role": "user", "content": numbered},
                ],
                temperature=0.2,
            )
            # The message content can be None (e.g. an empty completion);
            # treat it as empty text instead of failing on None.strip().
            raw = (response.choices[0].message.content or "").strip()
            log_llm_call(
                step="s3_translate", provider="pollinations", model=MODEL,
                system_prompt=strict_prompt, user_prompt=numbered,
                response=raw, temperature=0.2,
            )
            translated_list = parse_json_array(raw)
            if len(translated_list) == expected:
                break
            print(f"[s3] Pollinations returned {len(translated_list)}/{expected} items (attempt {attempt}/{max_retries})")
        else:
            # Every attempt returned the wrong number of items.
            raise ValueError(
                f"Translation returned {len(translated_list)} items but expected {expected} after {max_retries} attempts"
            )

        # Drop list numbers the model echoed back despite the instructions.
        result = [
            {**seg, "translated_text": _strip_line_number(text, position)}
            for position, (seg, text) in enumerate(zip(segments, translated_list), start=1)
        ]
        print("[s3] Translating via Pollinations complete ✓")
        return result

    except Exception as e:
        print(f"[s3] Pollinations translation error ({e}) — using fallback.")

        # Language-specific fallback routing. The non-strict prompt is passed on,
        # as before; what bedrock_fallback does with it is defined elsewhere.
        if get_fallback_mode(target_language) == "bedrock":
            return bedrock_fallback(segments, numbered, system_prompt)

        # Default: Google Translate
        return _google_translate_fallback(segments, target_language)
def translate(
    segments: list[dict],
    target_language: str,
    *,
    batch_size: int = 15,
) -> list[dict]:
    """
    Translate the text of each segment into target_language in batches.

    Segments are split into consecutive slices of at most `batch_size`, each
    slice is passed to `_translate_batch`, and the combined results are handed
    to `post_translate` for language-specific post-processing.

    Args:
        segments: List of {start, end, text} dicts.
        target_language: Full language name, e.g. "Spanish", "French", "Hindi".
        batch_size: Maximum number of segments sent per translation request.
            Must be at least 1. Defaults to 15.

    Returns:
        The value returned by `post_translate` for the combined batch results;
        each segment is expected to carry 'translated_text', and
        language-specific fields (e.g. 'tts_text') may also be added.
        An empty `segments` is returned unchanged.

    Raises:
        ValueError: If `batch_size` is less than 1.
    """
    if not segments:
        return segments
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    print(f"[s3] Translating {len(segments)} segments → {target_language} (in batches)...")

    # Per-batch progress lines are only useful when there is more than one batch.
    multiple_batches = len(segments) > batch_size

    final_result = []
    batch_starts = range(0, len(segments), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = segments[start:start + batch_size]
        if multiple_batches:
            print(f"[s3] Processing batch {batch_number} ({len(batch)} items)...")
        final_result.extend(_translate_batch(batch, target_language))

    # Run language-specific post-processing (e.g., Urdu transliteration)
    return post_translate(final_result, target_language)