Spaces:
Running on Zero
Running on Zero
| """ | |
| Step 6b: Generate ASS subtitle file from translated segments. | |
| Produces OpusClip-style karaoke captions: max 4 words per line, | |
| with word-by-word highlight using ASS \\kf tags. | |
| """ | |
| from pathlib import Path | |
| _RTL_LANGUAGES = {"Arabic", "Hebrew", "Urdu", "Farsi", "Persian"} | |
| def _format_ass_time(seconds: float) -> str: | |
| """Convert seconds to ASS timestamp format: H:MM:SS.cc""" | |
| h = int(seconds // 3600) | |
| m = int((seconds % 3600) // 60) | |
| s = int(seconds % 60) | |
| cs = int(round((seconds - int(seconds)) * 100)) | |
| return f"{h}:{m:02d}:{s:02d}.{cs:02d}" | |
| def _build_karaoke_chunks( | |
| text: str, | |
| seg_start: float, | |
| seg_end: float, | |
| max_words: int = 4, | |
| ) -> list[dict]: | |
| """Split text into timed word chunks for karaoke display. | |
| Distributes the segment duration across words proportionally | |
| to character count, then groups into chunks of max_words. | |
| Returns list of {"words": [(word, duration_cs), ...], "start": float, "end": float}. | |
| """ | |
| raw_words = text.split() | |
| if not raw_words: | |
| return [] | |
| total_duration = max(seg_end - seg_start, 0.1) | |
| total_chars = sum(max(len(w), 1) for w in raw_words) | |
| # Character-weighted durations | |
| word_durations = [] | |
| for w in raw_words: | |
| frac = max(len(w), 1) / total_chars | |
| dur = total_duration * frac | |
| word_durations.append(max(dur, 0.05)) | |
| # Normalize so they sum to total_duration exactly | |
| dur_sum = sum(word_durations) | |
| word_durations = [d * total_duration / dur_sum for d in word_durations] | |
| # Build absolute timestamps per word | |
| timestamps = [] | |
| t = seg_start | |
| for dur in word_durations: | |
| timestamps.append((t, t + dur)) | |
| t += dur | |
| # Group into chunks | |
| chunks = [] | |
| for i in range(0, len(raw_words), max_words): | |
| chunk_words = raw_words[i:i + max_words] | |
| chunk_durs = word_durations[i:i + max_words] | |
| chunk_start = timestamps[i][0] | |
| chunk_end = timestamps[min(i + max_words, len(raw_words)) - 1][1] | |
| words_with_timing = [] | |
| for w, dur in zip(chunk_words, chunk_durs): | |
| cs = max(round(dur * 100), 1) # centiseconds, minimum 1 | |
| words_with_timing.append((w, cs)) | |
| chunks.append({ | |
| "words": words_with_timing, | |
| "start": chunk_start, | |
| "end": chunk_end, | |
| }) | |
| return chunks | |
def _format_karaoke_line(
    chunk: dict,
    style_name: str = "Karaoke",
    is_rtl: bool = False,
    highlight_color: str = "00FFFF",
) -> str:
    r"""Format a karaoke chunk as an ASS Dialogue line.

    LTR chunks emit one ``{\kf<cs>}word`` per word, in order.

    RTL chunks emit the words in reversed order so they read right-to-left,
    and replace ``\kf`` with a per-word ``\t()`` colour transition whose
    time window is computed in spoken order. With plain ``\kf`` the fill
    would follow the order the words appear in the line, i.e. left-to-right
    in the reversed layout, which is the wrong direction for RTL speech.

    Literal ``{`` and ``}`` inside a word are escaped so caption text cannot
    open or close an override block.

    Args:
        chunk: Dict with ``"start"`` and ``"end"`` (seconds) and ``"words"``,
            a list of ``(word, duration_cs)`` pairs.
        style_name: ASS style name written into the Dialogue line.
        is_rtl: Use the reversed, ``\t()``-based layout.
        highlight_color: Hex colour inserted after ``&H`` in the RTL
            transition tag.

    Returns:
        A single ``Dialogue: ...`` line (no trailing newline).
    """
    start = _format_ass_time(chunk["start"])
    end = _format_ass_time(chunk["end"])

    # Escape braces the same way the plain-caption path in this file does.
    # NOTE(review): that path also doubles backslashes; not mirrored here
    # because a doubled backslash may render literally — confirm intent.
    words = [
        (word.replace("{", "\\{").replace("}", "\\}"), cs)
        for word, cs in chunk["words"]
    ]

    if is_rtl:
        # Highlight window of each word in spoken order, in milliseconds
        # relative to the line start (centiseconds * 10).
        timed = []
        offset_ms = 0
        for word, cs in words:
            span_ms = cs * 10
            timed.append((word, offset_ms, offset_ms + span_ms))
            offset_ms += span_ms

        # Emit in REVERSED visual order; each word carries its own \t() so
        # the highlight timing stays tied to spoken order. No per-word
        # RLE/PDF wraps are added: per the original author's notes they were
        # redundant and caused layout drift during the colour animation.
        parts = [
            f"{{\\1c&HFFFFFF&\\t({t_start},{t_end},\\1c&H{highlight_color}&)}}"
            f"{word}"
            for word, t_start, t_end in reversed(timed)
        ]
    else:
        parts = [f"{{\\kf{cs}}}{word}" for word, cs in words]

    karaoke_text = " ".join(parts)
    return f"Dialogue: 0,{start},{end},{style_name},,0,0,0,,{karaoke_text}"
def generate_captions(
    segments: list[dict],
    output_path: str = "tmp/captions.ass",
    max_words_per_line: int = 4,
    highlight_color: str = "00FFFF",
    target_language: str = "",
) -> str:
    """
    Write an ASS subtitle file with karaoke-style word highlights.

    Segments that carry a truthy "words" entry are rendered as karaoke
    chunks in the "Karaoke" style; all other segments become a single plain
    Dialogue line in the "Default" style. Segments whose text is empty or
    whitespace-only are skipped.

    Args:
        segments: List of dicts with "start" and "end" plus the caption text,
            looked up as "caption_text", then "translated_text", then "text".
        output_path: Where to write the .ass file; parent directories are
            created if missing.
        max_words_per_line: Max words per karaoke chunk (default 4).
        highlight_color: BGR hex color for the karaoke fill (default yellow).
        target_language: Language name; RTL layout is used when it is a
            member of _RTL_LANGUAGES.

    Returns:
        The output_path that was written.
    """
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    is_rtl = target_language in _RTL_LANGUAGES
    if is_rtl:
        # Tahoma has reliable Arabic/Urdu shaping across macOS/Windows/Linux
        # ffmpeg builds; encoding 178 is the Windows Arabic codepage and
        # hints libass font selection.
        wrap_style, font, encoding = 2, "Tahoma", 178
    else:
        wrap_style, font, encoding = 0, "Noto Sans", 0

    header = f"""\
[Script Info]
Title: VideoVoice Captions
ScriptType: v4.00+
PlayResX: 1920
PlayResY: 1080
WrapStyle: {wrap_style}
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,{font},52,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,1,2,40,40,50,{encoding}
Style: Karaoke,{font},58,&H00FFFFFF,&H00{highlight_color},&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,60,{encoding}
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""

    output_lines = [header]
    for seg in segments:
        # Resolve the caption text; a key that is present wins even if empty.
        fallback = seg.get("translated_text", seg.get("text", ""))
        text = seg.get("caption_text", fallback)
        if not (text and text.strip()):
            continue

        if seg.get("words"):
            # Karaoke path: one Dialogue line per chunk of words.
            for chunk in _build_karaoke_chunks(
                text, seg["start"], seg["end"], max_words_per_line
            ):
                output_lines.append(
                    _format_karaoke_line(
                        chunk, is_rtl=is_rtl, highlight_color=highlight_color
                    )
                )
            continue

        # Plain path: one Dialogue line for the whole segment.
        start = _format_ass_time(seg["start"])
        end = _format_ass_time(seg["end"])
        safe_text = text.replace("\\", "\\\\")
        safe_text = safe_text.replace("{", "\\{")
        safe_text = safe_text.replace("}", "\\}")
        if is_rtl:
            # Wrap in RLE ... PDF so the whole line is laid out right-to-left.
            safe_text = f"\u202B{safe_text}\u202C"
        output_lines.append(
            f"Dialogue: 0,{start},{end},Default,,0,0,0,,{safe_text}"
        )

    payload = "\n".join(output_lines) + "\n"
    with open(output_path, "w", encoding="utf-8") as handle:
        handle.write(payload)

    print(f"[s6b] Captions generated → {output_path} ✓")
    return output_path