Spaces:

Rafii
/

videovoice

Running on Zero

github-actions[bot]

deploy: switch to chatterbox requirements @ 0fae627

96e0666 about 1 month ago

7.29 kB

	"""
	Step 6b: Generate ASS subtitle file from translated segments.

	Produces OpusClip-style karaoke captions: max 4 words per line,
	with word-by-word highlight using ASS \\kf tags.
	"""
	from pathlib import Path


	# Target-language names that are laid out right-to-left. generate_captions
	# tests its target_language argument for membership in this set to decide
	# whether to use the RTL font, wrap style, encoding and highlight method.
	_RTL_LANGUAGES = {"Arabic", "Hebrew", "Urdu", "Farsi", "Persian"}


	def _format_ass_time(seconds: float) -> str:
	"""Convert seconds to ASS timestamp format: H:MM:SS.cc"""
	h = int(seconds // 3600)
	m = int((seconds % 3600) // 60)
	s = int(seconds % 60)
	cs = int(round((seconds - int(seconds)) * 100))
	return f"{h}:{m:02d}:{s:02d}.{cs:02d}"


	def _build_karaoke_chunks(
	text: str,
	seg_start: float,
	seg_end: float,
	max_words: int = 4,
	) -> list[dict]:
	"""Split text into timed word chunks for karaoke display.

	Distributes the segment duration across words proportionally
	to character count, then groups into chunks of max_words.

	Returns list of {"words": [(word, duration_cs), ...], "start": float, "end": float}.
	"""
	raw_words = text.split()
	if not raw_words:
	return []

	total_duration = max(seg_end - seg_start, 0.1)
	total_chars = sum(max(len(w), 1) for w in raw_words)

	# Character-weighted durations
	word_durations = []
	for w in raw_words:
	frac = max(len(w), 1) / total_chars
	dur = total_duration * frac
	word_durations.append(max(dur, 0.05))

	# Normalize so they sum to total_duration exactly
	dur_sum = sum(word_durations)
	word_durations = [d * total_duration / dur_sum for d in word_durations]

	# Build absolute timestamps per word
	timestamps = []
	t = seg_start
	for dur in word_durations:
	timestamps.append((t, t + dur))
	t += dur

	# Group into chunks
	chunks = []
	for i in range(0, len(raw_words), max_words):
	chunk_words = raw_words[i:i + max_words]
	chunk_durs = word_durations[i:i + max_words]
	chunk_start = timestamps[i][0]
	chunk_end = timestamps[min(i + max_words, len(raw_words)) - 1][1]

	words_with_timing = []
	for w, dur in zip(chunk_words, chunk_durs):
	cs = max(round(dur * 100), 1) # centiseconds, minimum 1
	words_with_timing.append((w, cs))

	chunks.append({
	"words": words_with_timing,
	"start": chunk_start,
	"end": chunk_end,
	})

	return chunks


	def _format_karaoke_line(
	    chunk: dict,
	    style_name: str = "Karaoke",
	    is_rtl: bool = False,
	    highlight_color: str = "00FFFF",
	) -> str:
	    """Format a karaoke chunk as an ASS Dialogue line.

	    LTR chunks emit one \\kf block per word. For RTL chunks the segment order
	    is reversed so words read right-to-left, AND \\kf is swapped for \\t()
	    color transitions so highlight timing follows spoken order instead of
	    source order. With plain \\kf the highlight would fill in source order,
	    i.e. left-to-right in the reversed layout, which is the wrong direction
	    for RTL speech.

	    Literal "{" and "}" inside a word are written as \\{ and \\} so that
	    caption text cannot open or close an override block.

	    Args:
	        chunk: Dict with "start" and "end" (seconds) and "words", a list of
	            (word, duration_cs) pairs in spoken order.
	        style_name: Name of the ASS style to reference on the line.
	        is_rtl: Use the reversed \\t() layout instead of \\kf.
	        highlight_color: Hex color digits placed after "&H" in the RTL
	            \\t() target color; not used for LTR lines.

	    Returns:
	        One "Dialogue: ..." line without a trailing newline.
	    """
	    start = _format_ass_time(chunk["start"])
	    end = _format_ass_time(chunk["end"])

	    # An unescaped brace would be parsed as the start/end of an override
	    # block and swallow or corrupt the surrounding caption text.
	    words = [
	        (word.replace("{", "\\{").replace("}", "\\}"), cs)
	        for word, cs in chunk["words"]
	    ]

	    if is_rtl:
	        # Each word's highlight slot is computed in spoken order, in ms
	        # relative to line start (centiseconds * 10), and carried in its own
	        # \t() so the timing stays tied to spoken order after the reversal.
	        # No per-word RLE/PDF wraps: each override block already splits
	        # libass into a separate BiDi run, so wraps are redundant and caused
	        # layout drift during \t() color animation. Urdu/Arabic chars are
	        # strongly RTL by Unicode property and shape correctly within each
	        # word without explicit marks.
	        parts = []
	        slot_start = 0
	        for word, cs in words:
	            slot_end = slot_start + cs * 10
	            parts.append(
	                f"{{\\1c&HFFFFFF&\\t({slot_start},{slot_end},\\1c&H{highlight_color}&)}}"
	                f"{word}"
	            )
	            slot_start = slot_end
	        # Emit segments in REVERSED visual order.
	        parts.reverse()
	    else:
	        parts = [f"{{\\kf{cs}}}{word}" for word, cs in words]

	    karaoke_text = " ".join(parts)
	    return f"Dialogue: 0,{start},{end},{style_name},,0,0,0,,{karaoke_text}"


	def generate_captions(
	    segments: list[dict],
	    output_path: str = "tmp/captions.ass",
	    max_words_per_line: int = 4,
	    highlight_color: str = "00FFFF",
	    target_language: str = "",
	) -> str:
	    """
	    Generate an ASS subtitle file with karaoke-style word highlights.

	    Segments that carry a non-empty "words" entry are rendered as karaoke
	    lines (style "Karaoke"); all other segments are rendered as one plain
	    line (style "Default"). Segments whose text is empty or whitespace are
	    skipped.

	    Args:
	        segments: List of dicts with "start" and "end" (seconds), an optional
	            "words" entry, and the caption text under "caption_text",
	            falling back to "translated_text", then "text".
	        output_path: Where to write the .ass file. Missing parent
	            directories are created.
	        max_words_per_line: Max words per caption chunk (default 4).
	        highlight_color: BGR hex color for karaoke fill (default yellow).
	        target_language: Target language for RTL detection. Compared with
	            the names in _RTL_LANGUAGES ignoring case and surrounding
	            whitespace.

	    Returns:
	        Path to the generated ASS file.
	    """
	    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

	    # Ignore case and stray whitespace so that e.g. "arabic" or " Urdu " is
	    # not silently rendered with the LTR layout.
	    rtl_names = {name.casefold() for name in _RTL_LANGUAGES}
	    is_rtl = target_language.strip().casefold() in rtl_names
	    wrap_style = 2 if is_rtl else 0
	    # Tahoma has reliable Arabic/Urdu shaping across macOS/Windows/Linux ffmpeg
	    # builds; Arial often lacks the glyph coverage on headless Linux.
	    font = "Tahoma" if is_rtl else "Noto Sans"
	    # Encoding 178 = Windows Arabic codepage — hints libass font selection.
	    encoding = 178 if is_rtl else 0

	    # Karaoke style: \kf sweeps each word from SecondaryColour (before it is
	    # spoken) to PrimaryColour (once spoken), so white is the secondary
	    # colour and highlight_color the primary one. This matches the RTL
	    # lines, which animate from white to highlight_color with \t().
	    header = f"""\
	[Script Info]
	Title: VideoVoice Captions
	ScriptType: v4.00+
	PlayResX: 1920
	PlayResY: 1080
	WrapStyle: {wrap_style}

	[V4+ Styles]
	Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
	Style: Default,{font},52,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,1,2,40,40,50,{encoding}
	Style: Karaoke,{font},58,&H00{highlight_color},&H00FFFFFF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,60,{encoding}

	[Events]
	Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
	"""

	    lines = [header]
	    for seg in segments:
	        text = seg.get("caption_text", seg.get("translated_text", seg.get("text", "")))
	        if not text or not text.strip():
	            continue

	        if seg.get("words"):
	            # Only the presence of "words" matters here; per-word timing is
	            # derived from the segment start/end by _build_karaoke_chunks.
	            chunks = _build_karaoke_chunks(
	                text, seg["start"], seg["end"], max_words_per_line
	            )
	            lines.extend(
	                _format_karaoke_line(
	                    chunk, is_rtl=is_rtl, highlight_color=highlight_color
	                )
	                for chunk in chunks
	            )
	            continue

	        start = _format_ass_time(seg["start"])
	        end = _format_ass_time(seg["end"])
	        safe_text = text.replace("\\", "\\\\").replace("{", "\\{").replace("}", "\\}")
	        if is_rtl:
	            # Wrap in RLE ... PDF so the whole line is laid out right-to-left.
	            safe_text = f"\u202B{safe_text}\u202C"
	        lines.append(f"Dialogue: 0,{start},{end},Default,,0,0,0,,{safe_text}")

	    with open(output_path, "w", encoding="utf-8") as f:
	        f.write("\n".join(lines) + "\n")

	    print(f"[s6b] Captions generated → {output_path} ✓")
	    return output_path