videovoice / steps /s6_captions.py
github-actions[bot]
deploy: switch to chatterbox requirements @ 0fae627
96e0666
"""
Step 6b: Generate ASS subtitle file from translated segments.
Produces OpusClip-style karaoke captions: max 4 words per line,
with word-by-word highlight using ASS \\kf tags.
"""
from pathlib import Path
# Target-language names that switch caption generation into right-to-left
# mode. Lookup is an exact, case-sensitive string match against this set.
_RTL_LANGUAGES = {"Arabic", "Hebrew", "Urdu", "Farsi", "Persian"}
def _format_ass_time(seconds: float) -> str:
"""Convert seconds to ASS timestamp format: H:MM:SS.cc"""
h = int(seconds // 3600)
m = int((seconds % 3600) // 60)
s = int(seconds % 60)
cs = int(round((seconds - int(seconds)) * 100))
return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
def _build_karaoke_chunks(
text: str,
seg_start: float,
seg_end: float,
max_words: int = 4,
) -> list[dict]:
"""Split text into timed word chunks for karaoke display.
Distributes the segment duration across words proportionally
to character count, then groups into chunks of max_words.
Returns list of {"words": [(word, duration_cs), ...], "start": float, "end": float}.
"""
raw_words = text.split()
if not raw_words:
return []
total_duration = max(seg_end - seg_start, 0.1)
total_chars = sum(max(len(w), 1) for w in raw_words)
# Character-weighted durations
word_durations = []
for w in raw_words:
frac = max(len(w), 1) / total_chars
dur = total_duration * frac
word_durations.append(max(dur, 0.05))
# Normalize so they sum to total_duration exactly
dur_sum = sum(word_durations)
word_durations = [d * total_duration / dur_sum for d in word_durations]
# Build absolute timestamps per word
timestamps = []
t = seg_start
for dur in word_durations:
timestamps.append((t, t + dur))
t += dur
# Group into chunks
chunks = []
for i in range(0, len(raw_words), max_words):
chunk_words = raw_words[i:i + max_words]
chunk_durs = word_durations[i:i + max_words]
chunk_start = timestamps[i][0]
chunk_end = timestamps[min(i + max_words, len(raw_words)) - 1][1]
words_with_timing = []
for w, dur in zip(chunk_words, chunk_durs):
cs = max(round(dur * 100), 1) # centiseconds, minimum 1
words_with_timing.append((w, cs))
chunks.append({
"words": words_with_timing,
"start": chunk_start,
"end": chunk_end,
})
return chunks
def _format_karaoke_line(
    chunk: dict,
    style_name: str = "Karaoke",
    is_rtl: bool = False,
    highlight_color: str = "00FFFF",
) -> str:
    """Format a karaoke chunk as an ASS Dialogue line.

    For LTR chunks each word is prefixed with a ``\\kf<cs>`` tag.

    For RTL chunks the words are emitted in reversed order so they read
    right-to-left, and ``\\kf`` is replaced by per-word ``\\t()`` colour
    transitions so the highlight timing follows spoken order instead of
    source order. With plain ``\\kf`` the highlight would fill in source
    order, i.e. left-to-right in the reversed layout, which is the wrong
    direction for RTL speech.

    Args:
        chunk: Dict with ``"words"`` (list of ``(word, duration_cs)``),
            ``"start"`` and ``"end"`` (seconds).
        style_name: ASS style name written into the Dialogue line.
        is_rtl: Select the reversed ``\\t()`` layout described above.
        highlight_color: Hex colour inserted into the ``\\1c&H...&`` target
            of the RTL transition.

    Returns:
        A single ``Dialogue: ...`` line without a trailing newline.
    """
    start = _format_ass_time(chunk["start"])
    end = _format_ass_time(chunk["end"])

    # Escape caption text so a literal "{" / "}" in a word cannot open or
    # close an override block. Same escaping the plain (non-karaoke) caption
    # path applies to its text.
    words = [
        (
            word.replace("\\", "\\\\").replace("{", "\\{").replace("}", "\\}"),
            cs,
        )
        for word, cs in chunk["words"]
    ]

    if is_rtl:
        # Build parts in spoken order, each with its own highlight slot in ms
        # relative to line start (cs * 10), then reverse for visual order.
        # No per-word RLE/PDF wraps: each override block already splits
        # libass into a separate BiDi run, so wraps are redundant and caused
        # layout drift during \t() colour animation.
        parts = []
        t_ms = 0
        for word, cs in words:
            t_end = t_ms + cs * 10
            parts.append(
                f"{{\\1c&HFFFFFF&\\t({t_ms},{t_end},\\1c&H{highlight_color}&)}}"
                f"{word}"
            )
            t_ms = t_end
        parts.reverse()
    else:
        parts = [f"{{\\kf{cs}}}{word}" for word, cs in words]

    karaoke_text = " ".join(parts)
    return f"Dialogue: 0,{start},{end},{style_name},,0,0,0,,{karaoke_text}"
def generate_captions(
    segments: list[dict],
    output_path: str = "tmp/captions.ass",
    max_words_per_line: int = 4,
    highlight_color: str = "00FFFF",
    target_language: str = "",
) -> str:
    """
    Generate an ASS subtitle file with karaoke-style word highlights.

    Args:
        segments: List of dicts with ``start`` and ``end`` (seconds) and the
            caption text under ``caption_text``, else ``translated_text``,
            else ``text``. Segments with blank text are skipped. A truthy
            ``words`` entry selects karaoke output for that segment; only its
            presence is checked here, its contents are not read.
        output_path: Where to write the .ass file. Parent directories are
            created if missing.
        max_words_per_line: Max words per caption chunk (default 4).
        highlight_color: BGR hex color for karaoke fill (default yellow).
        target_language: Target language name; RTL mode is enabled when it
            is one of ``_RTL_LANGUAGES``.

    Returns:
        Path to the generated ASS file (``output_path`` unchanged).
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    is_rtl = target_language in _RTL_LANGUAGES
    wrap_style = 2 if is_rtl else 0
    # Tahoma has reliable Arabic/Urdu shaping across macOS/Windows/Linux ffmpeg
    # builds; Arial often lacks the glyph coverage on headless Linux.
    font = "Tahoma" if is_rtl else "Noto Sans"
    # Encoding 178 = Windows Arabic codepage — hints libass font selection.
    encoding = 178 if is_rtl else 0
    # NOTE(review): with \kf, renderers draw not-yet-sung text in
    # SecondaryColour and sung text in PrimaryColour, so the LTR karaoke path
    # sweeps highlight_color -> white, while the RTL path animates
    # white -> highlight_color. Confirm the LTR direction is intended.
    header = f"""\
[Script Info]
Title: VideoVoice Captions
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
WrapStyle: {wrap_style}
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},52,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,1,2,40,40,50,{encoding}
Style: Karaoke,{font},58,&H00FFFFFF,&H00{highlight_color},&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,60,{encoding}
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    lines = [header]
    for seg in segments:
        text = seg.get("caption_text", seg.get("translated_text", seg.get("text", "")))
        if not text or not text.strip():
            continue

        if seg.get("words"):
            chunks = _build_karaoke_chunks(text, seg["start"], seg["end"], max_words_per_line)
            lines.extend(
                _format_karaoke_line(
                    chunk, is_rtl=is_rtl, highlight_color=highlight_color
                )
                for chunk in chunks
            )
            continue

        start = _format_ass_time(seg["start"])
        end = _format_ass_time(seg["end"])
        safe_text = text.replace("\\", "\\\\").replace("{", "\\{").replace("}", "\\}")
        # An ASS event must sit on one physical line: a raw newline inside the
        # text would cut the Dialogue entry short and leave a stray line in
        # the file. Drop edge newlines and turn inner ones into the ASS hard
        # line break (\N).
        safe_text = (
            safe_text.strip("\r\n")
            .replace("\r\n", "\n")
            .replace("\r", "\n")
            .replace("\n", "\\N")
        )
        if is_rtl:
            # Wrap in RLE ... PDF so the whole line is laid out right-to-left.
            safe_text = f"\u202B{safe_text}\u202C"
        lines.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{safe_text}")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
    print(f"[s6b] Captions generated → {output_path} ✓")
    return output_path