""" Step 6b: Generate ASS subtitle file from translated segments. Produces OpusClip-style karaoke captions: max 4 words per line, with word-by-word highlight using ASS \\kf tags. """ from pathlib import Path _RTL_LANGUAGES = {"Arabic", "Hebrew", "Urdu", "Farsi", "Persian"} def _format_ass_time(seconds: float) -> str: """Convert seconds to ASS timestamp format: H:MM:SS.cc""" h = int(seconds // 3600) m = int((seconds % 3600) // 60) s = int(seconds % 60) cs = int(round((seconds - int(seconds)) * 100)) return f"{h}:{m:02d}:{s:02d}.{cs:02d}" def _build_karaoke_chunks( text: str, seg_start: float, seg_end: float, max_words: int = 4, ) -> list[dict]: """Split text into timed word chunks for karaoke display. Distributes the segment duration across words proportionally to character count, then groups into chunks of max_words. Returns list of {"words": [(word, duration_cs), ...], "start": float, "end": float}. """ raw_words = text.split() if not raw_words: return [] total_duration = max(seg_end - seg_start, 0.1) total_chars = sum(max(len(w), 1) for w in raw_words) # Character-weighted durations word_durations = [] for w in raw_words: frac = max(len(w), 1) / total_chars dur = total_duration * frac word_durations.append(max(dur, 0.05)) # Normalize so they sum to total_duration exactly dur_sum = sum(word_durations) word_durations = [d * total_duration / dur_sum for d in word_durations] # Build absolute timestamps per word timestamps = [] t = seg_start for dur in word_durations: timestamps.append((t, t + dur)) t += dur # Group into chunks chunks = [] for i in range(0, len(raw_words), max_words): chunk_words = raw_words[i:i + max_words] chunk_durs = word_durations[i:i + max_words] chunk_start = timestamps[i][0] chunk_end = timestamps[min(i + max_words, len(raw_words)) - 1][1] words_with_timing = [] for w, dur in zip(chunk_words, chunk_durs): cs = max(round(dur * 100), 1) # centiseconds, minimum 1 words_with_timing.append((w, cs)) chunks.append({ "words": words_with_timing, "start": chunk_start, "end": chunk_end, }) return chunks def _format_karaoke_line( chunk: dict, style_name: str = "Karaoke", is_rtl: bool = False, highlight_color: str = "00FFFF", ) -> str: """Format a karaoke chunk as an ASS Dialogue line. For RTL chunks: reverse the segment order so words read right-to-left, AND swap \\kf for \\t() color transitions so highlight timing follows spoken order instead of source order. With plain \\kf the highlight would fill in source order — i.e. left-to-right in the reversed layout, which is the wrong direction for RTL speech. """ start = _format_ass_time(chunk["start"]) end = _format_ass_time(chunk["end"]) if is_rtl: # Compute each word's highlight time slot in spoken order, in ms # relative to line start (\kf cs * 10). words = chunk["words"] time_slots = [] t_ms = 0 for _, cs in words: time_slots.append((t_ms, t_ms + cs * 10)) t_ms += cs * 10 # Emit segments in REVERSED visual order. Each carries its own # \t() so highlight timing stays tied to spoken order. No per-word # RLE/PDF wraps: each override block already splits libass into a # separate BiDi run, so wraps are redundant and caused layout drift # during \t() color animation. Urdu/Arabic chars are strongly RTL # by Unicode property and shape correctly within each word without # explicit marks. parts = [] for i in reversed(range(len(words))): word, _ = words[i] t_start, t_end = time_slots[i] parts.append( f"{{\\1c&HFFFFFF&\\t({t_start},{t_end},\\1c&H{highlight_color}&)}}" f"{word}" ) karaoke_text = " ".join(parts) else: parts = [f"{{\\kf{cs}}}{word}" for word, cs in chunk["words"]] karaoke_text = " ".join(parts) return f"Dialogue: 0,{start},{end},{style_name},,0,0,0,,{karaoke_text}" def generate_captions( segments: list[dict], output_path: str = "tmp/captions.ass", max_words_per_line: int = 4, highlight_color: str = "00FFFF", target_language: str = "", ) -> str: """ Generate an ASS subtitle file with karaoke-style word highlights. Args: segments: List of dicts with {start, end, translated_text, words?}. output_path: Where to write the .ass file. max_words_per_line: Max words per caption chunk (default 4). highlight_color: BGR hex color for karaoke fill (default yellow). target_language: Target language for RTL detection. Returns: Path to the generated ASS file. """ Path(output_path).parent.mkdir(parents=True, exist_ok=True) is_rtl = target_language in _RTL_LANGUAGES wrap_style = 2 if is_rtl else 0 # Tahoma has reliable Arabic/Urdu shaping across macOS/Windows/Linux ffmpeg # builds; Arial often lacks the glyph coverage on headless Linux. font = "Tahoma" if is_rtl else "Noto Sans" # Encoding 178 = Windows Arabic codepage — hints libass font selection. encoding = 178 if is_rtl else 0 header = f"""\ [Script Info] Title: VideoVoice Captions ScriptType: v4.00+ PlayResX: 1920 PlayResY: 1080 WrapStyle: {wrap_style} [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding Style: Default,{font},52,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,1,2,40,40,50,{encoding} Style: Karaoke,{font},58,&H00FFFFFF,&H00{highlight_color},&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,60,{encoding} [Events] Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text """ lines = [header] for seg in segments: text = seg.get("caption_text", seg.get("translated_text", seg.get("text", ""))) if not text or not text.strip(): continue has_words = bool(seg.get("words")) if has_words: chunks = _build_karaoke_chunks(text, seg["start"], seg["end"], max_words_per_line) for chunk in chunks: lines.append(_format_karaoke_line( chunk, is_rtl=is_rtl, highlight_color=highlight_color )) else: start = _format_ass_time(seg["start"]) end = _format_ass_time(seg["end"]) safe_text = text.replace("\\", "\\\\").replace("{", "\\{").replace("}", "\\}") if is_rtl: safe_text = f"\u202B{safe_text}\u202C" lines.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{safe_text}") with open(output_path, "w", encoding="utf-8") as f: f.write("\n".join(lines) + "\n") print(f"[s6b] Captions generated → {output_path} ✓") return output_path