# Source path: clipforge/src/humeo/cutter.py
# Last commit (468d2a3, verified) by moonlantern1:
#   "Center native highlight captions by line anchor"
"""Subtitle helpers for the product pipeline."""
import logging
import math
import os
import re
from pathlib import Path
from humeo_core.schemas import Clip, RenderTheme, TranscriptWord
from humeo.transcript_align import (
clip_subtitle_words,
clip_words_to_srt_lines,
format_ass,
format_srt,
group_words_to_cue_chunks,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Font family written into the ASS styles of the native-highlight theme.
_NATIVE_HIGHLIGHT_FONT_NAME = "League Spartan"
# Outline/box colour of the "Highlight" style, in ASS &HAABBGGRR notation
# (alpha 00, blue F6, green 5C, red 8B).
_NATIVE_HIGHLIGHT_PURPLE = "&H00F65C8B"
# Default number of seconds a word's highlight starts ahead of its timestamp.
_NATIVE_HIGHLIGHT_LEAD_SEC = 0.06
# Default minimum time, in seconds, a word's highlight aims to stay visible.
_NATIVE_HIGHLIGHT_MIN_DWELL_SEC = 0.16
# Word durations outside [MIN, MAX] seconds are treated as suspicious timings.
_NATIVE_HIGHLIGHT_MIN_VALID_WORD_SEC = 0.035
_NATIVE_HIGHLIGHT_MAX_VALID_WORD_SEC = 1.65
# Upper bound on a caption line's width, as a fraction of PlayResX.
_NATIVE_HIGHLIGHT_MAX_LINE_WIDTH_RATIO = 0.62
# Horizontal safe margin in script pixels (further capped at 12% of PlayResX).
_NATIVE_HIGHLIGHT_SAFE_MARGIN_X = 150
# ASS override tag appended to the highlighted word's style reset.
_NATIVE_HIGHLIGHT_ROUNDING_OVERRIDE = r"\blur3.0"
# Lower-cased words that are never considered worth highlighting on their own.
_NATIVE_HIGHLIGHT_STOPWORDS = {
    "a",
    "all",
    "an",
    "and",
    "are",
    "as",
    "at",
    "be",
    "but",
    "by",
    "for",
    "from",
    "i",
    "if",
    "in",
    "is",
    "it",
    "of",
    "on",
    "or",
    "so",
    "that",
    "the",
    "their",
    "there",
    "they",
    "this",
    "to",
    "was",
    "we",
    "with",
    "you",
    "your",
    "has",
    "have",
    "had",
    "been",
    "being",
}
def _balance_reference_caption(text: str) -> str:
words = text.split()
if len(words) <= 5 and len(text) <= 28:
return text
best_idx = 1
best_delta = 10**9
for idx in range(1, len(words)):
left = " ".join(words[:idx])
right = " ".join(words[idx:])
line_penalty = 0
if len(words[:idx]) < 2 or len(words[idx:]) < 2:
line_penalty += 1000
delta = abs(len(left) - len(right)) + abs(len(words[:idx]) - len(words[idx:])) * 6 + line_penalty
if delta < best_delta:
best_delta = delta
best_idx = idx
return " ".join(words[:best_idx]) + "\n" + " ".join(words[best_idx:])
def _native_line_width(font, words) -> float:
    """Measure the width of ``words`` rendered as one space-separated line."""
    line_text = " ".join(entry.word for entry in words)
    return _text_width(font, line_text)
def _native_highlight_partition_penalty(lines, font, max_line_width: float) -> float:
    """Return a cost for laying a cue out as ``lines`` (lower is better).

    The cost combines, in decreasing weight: width overflowing
    ``max_line_width``, the number of lines, imbalance in word counts,
    single-word lines (only when the cue has more than three words) and
    imbalance in rendered widths.
    """
    line_widths = []
    sizes = []
    for line in lines:
        line_widths.append(_native_line_width(font, line))
        sizes.append(len(line))

    excess = sum(max(0.0, width - max_line_width) for width in line_widths)
    word_total = sum(sizes)

    if len(line_widths) > 1:
        width_spread = max(line_widths) - min(line_widths)
    else:
        width_spread = 0.0
    if len(sizes) > 1:
        size_spread = max(sizes) - min(sizes)
    else:
        size_spread = 0

    # Orphaned single-word lines are only penalised for cues of 4+ words.
    lonely_lines = sum(1 for size in sizes if size == 1) if word_total > 3 else 0

    return (
        excess * 80.0
        + len(lines) * 120.0
        + width_spread * 0.16
        + size_spread * 120.0
        + 260 * lonely_lines
    )
def _candidate_native_highlight_partitions(words, max_lines: int):
n = len(words)
if n == 0:
return []
if max_lines <= 1 or n == 1:
return [[list(words)]]
out = [[list(words)]]
for first_break in range(1, n):
out.append([list(words[:first_break]), list(words[first_break:])])
if max_lines >= 3 and n >= 3:
for first_break in range(1, n - 1):
for second_break in range(first_break + 1, n):
out.append(
[
list(words[:first_break]),
list(words[first_break:second_break]),
list(words[second_break:]),
]
)
return out
def _split_native_highlight_lines(words, *, font=None, max_line_width: float | None = None):
if len(words) <= 3 and len(" ".join(word.word for word in words)) <= 22:
return [list(words)]
if len(words) < 2:
return [list(words)]
if font is not None and max_line_width is not None:
candidates = _candidate_native_highlight_partitions(words, max_lines=3)
return min(
candidates,
key=lambda lines: _native_highlight_partition_penalty(
lines,
font,
max_line_width,
),
)
best_idx = 1
best_delta = 10**9
for idx in range(1, len(words)):
left_words = words[:idx]
right_words = words[idx:]
left = " ".join(word.word for word in left_words)
right = " ".join(word.word for word in right_words)
line_penalty = 0
if len(left_words) < 2 or len(right_words) < 2:
line_penalty += 800
delta = abs(len(left) - len(right)) + abs(len(left_words) - len(right_words)) * 7 + line_penalty
if delta < best_delta:
best_delta = delta
best_idx = idx
return [list(words[:best_idx]), list(words[best_idx:])]
def _clean_native_highlight_token(text: str) -> str:
return re.sub(r"(^[^A-Za-z0-9$%#]+|[^A-Za-z0-9$%#]+$)", "", text or "")
def _native_highlight_span_score(words) -> float:
    """Score how worthwhile it is to highlight a span of words.

    Tokens are cleaned of surrounding punctuation first. A span with no
    usable tokens, or made only of stopwords, scores ``-1e9``. Otherwise each
    token adds points for being a non-stopword, for containing a digit,
    ``$`` or ``%``, for being six or more characters long, and for being
    upper-case. Two-token spans get an extra adjustment, and spans whose
    joined text exceeds 18 characters are penalised.

    Returns:
        The score; higher means a better highlight candidate.
    """
    cleaned = [_clean_native_highlight_token(word.word) for word in words]
    cleaned = [token for token in cleaned if token]
    if not cleaned:
        return -1e9
    if all(token.lower() in _NATIVE_HIGHLIGHT_STOPWORDS for token in cleaned):
        return -1e9
    score = 0.0
    # NOTE(review): the three bonuses after the stopword check are treated as
    # independent of it; confirm they were not meant to be nested under it.
    for token in cleaned:
        lower = token.lower()
        if lower not in _NATIVE_HIGHLIGHT_STOPWORDS:
            score += 2.0
        # Numbers, money and percentages.
        if any(ch.isdigit() for ch in token) or "$" in token or "%" in token:
            score += 3.0
        if len(token) >= 6:
            score += 0.8
        # Upper-case tokens of two or more characters.
        if token.isupper() and len(token) > 1:
            score += 0.6
    if len(cleaned) == 2:
        # Base cost for two-token spans, partly offset below.
        score -= 0.55
        if any(any(ch.isdigit() for ch in token) or "$" in token or "%" in token for token in cleaned):
            score += 1.1
        elif cleaned[0].lower() in _NATIVE_HIGHLIGHT_STOPWORDS or cleaned[1].lower() in _NATIVE_HIGHLIGHT_STOPWORDS:
            score -= 0.6
        else:
            score += 0.3
    # NOTE(review): applied to spans of any size here; confirm it was not
    # meant to apply to two-token spans only.
    if len(" ".join(cleaned)) > 18:
        score -= 0.6
    return score
def _should_render_native_highlight_group(words) -> bool:
    """Tell whether a word group contains anything worth highlighting.

    Returns ``True`` as soon as one word, once cleaned of surrounding
    punctuation, is non-empty and not a stopword; ``False`` otherwise
    (including for an empty group).
    """
    for entry in words:
        token = _clean_native_highlight_token(entry.word)
        if token and token.lower() not in _NATIVE_HIGHLIGHT_STOPWORDS:
            return True
    return False
def _native_highlight_font_path() -> Path | None:
try:
import humeo_core
bundled = (
Path(humeo_core.__file__).resolve().parent
/ "assets"
/ "fonts"
/ "LeagueSpartan-Bold.ttf"
)
if bundled.is_file():
return bundled
except Exception:
pass
windows_fonts = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts"
for filename in ("arialbd.ttf", "Arialbd.ttf", "ARIALBD.TTF", "arial.ttf"):
path = windows_fonts / filename
if path.is_file():
return path
return None
def _text_width(font, text: str) -> float:
if not text:
return 0.0
if hasattr(font, "getlength"):
return float(font.getlength(text))
bbox = font.getbbox(text)
return float(bbox[2] - bbox[0])
def _text_height(font) -> int:
bbox = font.getbbox("Ag")
return max(1, int(round(bbox[3] - bbox[1])))
def _escape_ass_text(text: str) -> str:
return (
text.replace("\\", r"\\")
.replace("{", r"\{")
.replace("}", r"\}")
.replace("\n", r"\N")
)
def _native_highlight_overlay_text(line_words, highlight_idx: int) -> str:
    """Build the overlay text for one line with a single word highlighted.

    The word at ``highlight_idx`` is wrapped in a reset to the ``Highlight``
    style (plus the rounding override) followed by a reset to the
    ``Invisible`` style; every other word is emitted as escaped plain text.
    Words are joined with single spaces.
    """
    open_tag = f"{{\\rHighlight{_NATIVE_HIGHLIGHT_ROUNDING_OVERRIDE}}}"
    close_tag = "{\\rInvisible}"
    rendered: list[str] = []
    for position, entry in enumerate(line_words):
        escaped = _escape_ass_text(entry.word)
        if position == highlight_idx:
            rendered.append(f"{open_tag}{escaped}{close_tag}")
        else:
            rendered.append(escaped)
    return " ".join(rendered)
def _word_timing_weight(word: TranscriptWord) -> float:
    """Return the word's relative share of time when timings are interpolated.

    The weight grows with the length of the cleaned token (or of the raw
    word when cleaning leaves nothing) and is clamped to ``[0.65, 2.2]``.
    """
    token = _clean_native_highlight_token(word.word)
    raw_weight = len(token or word.word) / 5.5
    return max(0.65, min(2.2, raw_weight))
def _suspicious_native_highlight_timing(
words: list[TranscriptWord],
idx: int,
*,
clip_duration: float,
) -> bool:
word = words[idx]
start = float(word.start_time)
end = float(word.end_time)
if not (math.isfinite(start) and math.isfinite(end)):
return True
if start < -0.01 or end > clip_duration + 0.25:
return True
duration = end - start
if duration < _NATIVE_HIGHLIGHT_MIN_VALID_WORD_SEC:
return True
if duration > _NATIVE_HIGHLIGHT_MAX_VALID_WORD_SEC:
return True
if idx > 0:
prev = words[idx - 1]
if start < float(prev.start_time) - 0.03:
return True
if start < float(prev.end_time) - 0.35:
return True
if idx + 1 < len(words):
nxt = words[idx + 1]
if float(nxt.start_time) < start - 0.03:
return True
return False
def _repair_native_highlight_timings(
    words: list[TranscriptWord],
    *,
    clip_duration: float,
) -> list[TranscriptWord]:
    """Repair obvious ASR word timestamp glitches before per-word highlighting.

    This is intentionally conservative: clean Whisper/ElevenLabs timings pass
    through almost unchanged, while zero-length, reversed, huge, or badly
    overlapping word timings get interpolated between neighboring reliable words.

    Args:
        words: Clip-relative words in spoken order.
        clip_duration: Clip length in seconds; timings are clamped to it.

    Returns:
        A new list of ``TranscriptWord`` objects, one per input word, with
        repaired start/end times. An empty input yields an empty list.
    """
    if not words:
        return []
    clip_duration = max(0.0, float(clip_duration))
    # Pass 1: clamp every timing into the clip and flag suspicious words.
    records: list[dict[str, object]] = []
    for idx, word in enumerate(words):
        start = max(0.0, min(clip_duration, float(word.start_time)))
        end = max(0.0, min(clip_duration, float(word.end_time)))
        records.append(
            {
                "word": word.word,
                "start": start,
                "end": end,
                # Judged on the original (unclamped) timings.
                "bad": _suspicious_native_highlight_timing(
                    words,
                    idx,
                    clip_duration=clip_duration,
                ),
                "weight": _word_timing_weight(word),
            }
        )
    # Pass 2: re-time each run of consecutive bad words between the end of
    # the word before the run and the start of the word after it.
    idx = 0
    while idx < len(records):
        if not records[idx]["bad"]:
            idx += 1
            continue
        run_start = idx
        while idx < len(records) and records[idx]["bad"]:
            idx += 1
        run_end = idx - 1
        count = run_end - run_start + 1
        # Without a left/right neighbour, fall back to the run's own times.
        left_time = (
            float(records[run_start - 1]["end"])
            if run_start > 0
            else max(0.0, float(records[run_start]["start"]))
        )
        right_time = (
            float(records[run_end + 1]["start"])
            if run_end + 1 < len(records)
            else min(clip_duration, max(left_time, float(records[run_end]["end"])))
        )
        # Widen the gap when it is too small for the run, limited by the clip end.
        weight_span = sum(float(r["weight"]) for r in records[run_start : run_end + 1]) * 0.13
        min_span = max(0.11 * count, weight_span)
        if right_time <= left_time + min_span:
            right_time = min(clip_duration, left_time + min_span)
        if right_time <= left_time:
            right_time = min(clip_duration, left_time + max(0.08, 0.12 * count))
        span = max(0.001, right_time - left_time)
        weights = [float(r["weight"]) for r in records[run_start : run_end + 1]]
        total_weight = max(0.001, sum(weights))
        # Share the span between the words in proportion to their weights;
        # the last word of the run ends exactly at right_time.
        cursor = left_time
        for offset, weight in enumerate(weights):
            rec = records[run_start + offset]
            next_cursor = (
                right_time
                if offset == count - 1
                else cursor + span * (weight / total_weight)
            )
            rec["start"] = cursor
            rec["end"] = max(cursor + 0.04, next_cursor)
            cursor = float(rec["end"])
    # Pass 3: enforce minimum durations, limit overlap with earlier words
    # and keep every word inside the clip.
    repaired: list[TranscriptWord] = []
    prev_end = 0.0
    for rec in records:
        start = max(0.0, float(rec["start"]))
        end = max(start + 0.02, float(rec["end"]))
        if start < prev_end - 0.02:
            # Overlaps an earlier word by more than 20 ms: push it back.
            start = prev_end
            # NOTE(review): assumed to apply only when the start was moved;
            # confirm against the intended nesting.
            end = max(end, start + 0.04)
        if clip_duration > 0.0:
            end = min(clip_duration, end)
        if end <= start:
            # Clamping to the clip collapsed the word; pull the start back.
            start = max(0.0, min(start, clip_duration - 0.02))
            end = min(clip_duration, start + 0.04)
        repaired.append(TranscriptWord(word=str(rec["word"]), start_time=start, end_time=end))
        prev_end = max(prev_end, end)
    return repaired
def _native_highlight_word_windows(
words: list[TranscriptWord],
*,
lead_sec: float,
min_dwell_sec: float,
) -> list[tuple[float, float]]:
if not words:
return []
lead_sec = max(0.0, float(lead_sec))
min_dwell_sec = max(0.02, float(min_dwell_sec))
cue_start = max(0.0, words[0].start_time - lead_sec)
cue_end = max(words[-1].end_time, words[-1].start_time + min_dwell_sec)
starts: list[float] = []
for idx, word in enumerate(words):
start = max(cue_start, float(word.start_time) - lead_sec)
if idx > 0:
start = max(start, starts[-1] + 0.01)
starts.append(start)
windows: list[tuple[float, float]] = []
for idx, word in enumerate(words):
start = starts[idx]
natural_end = max(float(word.end_time), start + min_dwell_sec)
limit = starts[idx + 1] if idx + 1 < len(starts) else cue_end
end = min(natural_end, limit)
if end <= start:
end = min(limit, start + 0.01)
windows.append((start, max(start + 0.01, end)))
return windows
def _fmt_ass_time(seconds: float) -> str:
seconds = max(0.0, seconds)
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = seconds % 60
whole = int(secs)
cs = int(round((secs - whole) * 100))
if cs >= 100:
cs = 99
return f"{hours:d}:{minutes:02d}:{whole:02d}.{cs:02d}"
def _format_native_highlight_ass(
    cue_chunks,
    *,
    play_res_x: int,
    play_res_y: int,
    font_size: int,
    margin_v: int,
    font_name: str,
    highlight_lead_sec: float = _NATIVE_HIGHLIGHT_LEAD_SEC,
    highlight_min_dwell_sec: float = _NATIVE_HIGHLIGHT_MIN_DWELL_SEC,
) -> str:
    """Render cue chunks as an ASS script with a per-word highlight box.

    For every line of a cue, one ``Base`` dialogue (layer 1) shows the text
    for the whole cue. For every word that has content once punctuation is
    stripped, an ``Invisible`` dialogue (layer 0) repeats the line at the same
    position during that word's window, with only the active word switched to
    the ``Highlight`` style so its box appears behind the visible text.

    Args:
        cue_chunks: Iterable of cues, each a sequence of word objects.
        play_res_x: Script width written to ``PlayResX``.
        play_res_y: Script height written to ``PlayResY``.
        font_size: Font size for all three styles and for measuring text.
        margin_v: Distance from the bottom of the frame to the caption block.
        font_name: Font family written into the styles.
        highlight_lead_sec: Lead time passed to the window computation.
        highlight_min_dwell_sec: Minimum dwell passed to the window computation.

    Returns:
        The complete ASS document as a string.
    """
    # Imported here so Pillow is only required for this theme.
    from PIL import ImageFont
    # The font is used for measuring only; drawing is left to the ASS renderer.
    font_path = _native_highlight_font_path()
    if font_path is not None:
        font = ImageFont.truetype(str(font_path), size=font_size)
    else:
        font = ImageFont.load_default()
    line_height = max(font_size, _text_height(font) + 6)
    line_gap = max(8, int(round(font_size * 0.08)))
    # Y coordinate of the bottom edge of the caption block.
    bottom_anchor = play_res_y - margin_v
    safe_margin_x = min(
        int(round(play_res_x * 0.12)),
        max(24, _NATIVE_HIGHLIGHT_SAFE_MARGIN_X),
    )
    max_line_width = min(
        play_res_x * _NATIVE_HIGHLIGHT_MAX_LINE_WIDTH_RATIO,
        play_res_x - (safe_margin_x * 2),
    )
    # Three styles: "Base" (white text with a dark outline), "Highlight"
    # (BorderStyle 3, i.e. a box in the highlight colour) and "Invisible"
    # (fully transparent colours and no outline).
    header = (
        "[Script Info]\n"
        "ScriptType: v4.00+\n"
        f"PlayResX: {play_res_x}\n"
        f"PlayResY: {play_res_y}\n"
        "WrapStyle: 0\n"
        "ScaledBorderAndShadow: yes\n"
        "YCbCr Matrix: None\n"
        "\n"
        "[V4+ Styles]\n"
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, "
        "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, "
        "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, "
        "Alignment, MarginL, MarginR, MarginV, Encoding\n"
        f"Style: Base,{font_name},{font_size},&H00FFFFFF,&H000000FF,&H00101010,&H00000000,-1,0,0,0,100,100,-1,0,1,4,0,8,0,0,0,0\n"
        f"Style: Highlight,{font_name},{font_size},&H00FFFFFF,&H000000FF,{_NATIVE_HIGHLIGHT_PURPLE},&H00000000,-1,0,0,0,100,100,-1,0,3,4,0,8,0,0,0,0\n"
        f"Style: Invisible,{font_name},{font_size},&HFF000000,&H000000FF,&HFF000000,&HFF000000,-1,0,0,0,100,100,-1,0,1,0,0,8,0,0,0,0\n"
        "\n"
        "[Events]\n"
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    )
    events: list[str] = []
    for cue_words in cue_chunks:
        if not cue_words:
            continue
        lines = _split_native_highlight_lines(
            cue_words,
            font=font,
            max_line_width=max_line_width,
        )
        # One window per word of the cue, indexed across all of its lines.
        cue_windows = _native_highlight_word_windows(
            cue_words,
            lead_sec=highlight_lead_sec,
            min_dwell_sec=highlight_min_dwell_sec,
        )
        # Stack the lines upwards from the bottom anchor.
        block_height = len(lines) * line_height + max(0, len(lines) - 1) * line_gap
        block_top = bottom_anchor - block_height
        cue_start = cue_windows[0][0] if cue_windows else cue_words[0].start_time
        cue_end = cue_windows[-1][1] if cue_windows else cue_words[-1].end_time
        # Index into cue_windows of the first word of the current line.
        word_offset = 0
        line_center_x = play_res_x / 2.0
        for line_idx, line_words in enumerate(lines):
            if not line_words:
                continue
            line_text = " ".join(word.word for word in line_words)
            line_top = block_top + line_idx * (line_height + line_gap)
            # Visible text: top-centre anchored (\an8) at the line's position.
            events.append(
                "Dialogue: 1,"
                f"{_fmt_ass_time(cue_start)},{_fmt_ass_time(cue_end)},Base,,0,0,0,,"
                f"{{\\an8\\pos({line_center_x:.1f},{line_top:.1f})}}{_escape_ass_text(line_text)}"
            )
            for word_idx, word in enumerate(line_words):
                # Tokens that are pure punctuation get no highlight box.
                cleaned = _clean_native_highlight_token(word.word)
                if not cleaned:
                    continue
                word_start, word_end = cue_windows[word_offset + word_idx]
                # Same text and position as the visible line, so the box
                # lines up with the word it belongs to.
                events.append(
                    "Dialogue: 0,"
                    f"{_fmt_ass_time(word_start)},{_fmt_ass_time(word_end)},Invisible,,0,0,0,,"
                    f"{{\\an8\\pos({line_center_x:.1f},{line_top:.1f})}}"
                    f"{_native_highlight_overlay_text(line_words, word_idx)}"
                )
            word_offset += len(line_words)
    return header + "\n".join(events) + ("\n" if events else "")
def generate_srt(
    clip: Clip,
    transcript: dict,
    output_dir: Path,
    *,
    max_words_per_cue: int = 8,
    max_cue_sec: float = 4.0,
) -> Path:
    """Write an SRT file for ``clip`` from word-level ASR timings.

    ``transcript`` is the persisted ``transcript.json`` (segments with
    optional per-word timestamps). Times are shifted so 0 = clip in-point.

    Args:
        clip: The clip to caption; its ``clip_id`` names the output file.
        transcript: Transcript data covering the clip.
        output_dir: Directory that receives ``clip_<clip_id>.srt``.
        max_words_per_cue: Upper bound on words per subtitle cue.
        max_cue_sec: Upper bound on the duration of a cue, in seconds.

    Returns:
        Path of the written SRT file.
    """
    destination = output_dir / f"clip_{clip.clip_id}.srt"
    clip_words = clip_subtitle_words(transcript, clip).words
    cues = clip_words_to_srt_lines(
        clip_words,
        max_words_per_cue=max_words_per_cue,
        max_cue_sec=max_cue_sec,
    )
    srt_body = format_srt(cues)
    destination.write_text(srt_body, encoding="utf-8")
    logger.info("Generated SRT: %s (%d cues)", destination, len(cues))
    return destination
def generate_ass(
    clip: Clip,
    transcript: dict,
    output_dir: Path,
    *,
    max_words_per_cue: int = 4,
    max_cue_sec: float = 2.2,
    play_res_x: int = 1080,
    play_res_y: int = 1920,
    font_size: int = 48,
    margin_v: int = 160,
    margin_h: int = 60,
    font_name: str = "Arial",
    render_theme: RenderTheme = RenderTheme.LEGACY,
    native_highlight_lead_sec: float = _NATIVE_HIGHLIGHT_LEAD_SEC,
    native_highlight_min_dwell_sec: float = _NATIVE_HIGHLIGHT_MIN_DWELL_SEC,
    repair_word_timings: bool = True,
) -> Path:
    """Generate an ASS caption file tuned for direct libass rendering.

    Unlike SRT → libass (default PlayResY=288), an ASS file with
    ``PlayResY = output_height`` means libass' scale factor is 1.0, so the
    ``font_size`` / ``margin_v`` arguments below are honest output pixels.
    This is the root-cause fix for the "captions rendering in the middle of
    the frame, four times too large" bug the user reported.

    The theme adjusts the cue grouping and layout:

    * ``REFERENCE_LOWER_THIRD`` raises the cue limits and font size, lowers
      the margin, balances captions over two lines and always uses the
      "Source Sans 3" font (``font_name`` is ignored).
    * ``NATIVE_HIGHLIGHT`` uses fixed cue limits (4 words, 1.45 s), a larger
      font and margin, optionally repairs word timings, and renders per-word
      highlights (``font_name`` and ``margin_h`` are ignored).
    * Any other theme passes the arguments through to ``format_ass``.

    Args:
        clip: The clip to caption; its ``clip_id`` names the output file.
        transcript: Transcript data covering the clip.
        output_dir: Directory that receives ``clip_<clip_id>.ass``.
        max_words_per_cue: Upper bound on words per cue.
        max_cue_sec: Upper bound on cue duration, in seconds.
        play_res_x: Script width (``PlayResX``).
        play_res_y: Script height (``PlayResY``).
        font_size: Requested font size.
        margin_v: Requested vertical margin.
        margin_h: Horizontal margin passed to ``format_ass``.
        font_name: Font family for themes that honour it.
        render_theme: Caption theme to render.
        native_highlight_lead_sec: Highlight lead time (native theme only).
        native_highlight_min_dwell_sec: Highlight dwell (native theme only).
        repair_word_timings: Whether to repair word timings (native theme only).

    Returns:
        Path of the written ASS file.
    """
    ass_path = output_dir / f"clip_{clip.clip_id}.ass"
    aligned = clip_subtitle_words(transcript, clip)
    # Effective settings; start from the arguments, then apply the theme.
    cue_words = max_words_per_cue
    cue_sec = max_cue_sec
    cue_font_size = font_size
    cue_margin_v = margin_v
    prefer_break_on_punctuation = False
    min_words_before_break = 1
    if render_theme == RenderTheme.REFERENCE_LOWER_THIRD:
        # Longer cues, at least 52 px text, at most 136 px from the bottom.
        cue_words = max(max_words_per_cue, 7)
        cue_sec = max(max_cue_sec, 2.6)
        cue_font_size = max(font_size, 52)
        cue_margin_v = min(margin_v, 136)
        prefer_break_on_punctuation = True
        min_words_before_break = 5
    elif render_theme == RenderTheme.NATIVE_HIGHLIGHT:
        # Fixed cue limits here; the caller's max_words_per_cue and
        # max_cue_sec are not used for this theme.
        cue_words = 4
        cue_sec = 1.45
        cue_font_size = max(font_size, 80)
        cue_margin_v = max(margin_v, 300)
        prefer_break_on_punctuation = True
        min_words_before_break = 3
    aligned_words = aligned.words
    if render_theme == RenderTheme.NATIVE_HIGHLIGHT and repair_word_timings:
        # Per-word highlights depend on word timings, so repair glitches first.
        aligned_words = _repair_native_highlight_timings(
            aligned_words,
            clip_duration=clip.duration_sec,
        )
    cue_chunks = group_words_to_cue_chunks(
        aligned_words,
        max_words_per_cue=cue_words,
        max_cue_sec=cue_sec,
        prefer_break_on_punctuation=prefer_break_on_punctuation,
        min_words_before_break=min_words_before_break,
    )
    # (start, end, text) per cue; also used for the cue count in the log line.
    lines = [
        (chunk[0].start_time, chunk[-1].end_time, " ".join(word.word for word in chunk))
        for chunk in cue_chunks
    ]
    if render_theme == RenderTheme.REFERENCE_LOWER_THIRD:
        lines = [(start, end, _balance_reference_caption(text)) for start, end, text in lines]
        ass_text = format_ass(
            lines,
            play_res_x=play_res_x,
            play_res_y=play_res_y,
            font_size=cue_font_size,
            margin_v=cue_margin_v,
            margin_h=margin_h,
            font_name="Source Sans 3",
            render_theme=render_theme,
        )
    elif render_theme == RenderTheme.NATIVE_HIGHLIGHT:
        # Works on the word chunks directly; it needs per-word timings.
        ass_text = _format_native_highlight_ass(
            cue_chunks,
            play_res_x=play_res_x,
            play_res_y=play_res_y,
            font_size=cue_font_size,
            margin_v=cue_margin_v,
            font_name=_NATIVE_HIGHLIGHT_FONT_NAME,
            highlight_lead_sec=native_highlight_lead_sec,
            highlight_min_dwell_sec=native_highlight_min_dwell_sec,
        )
    else:
        ass_text = format_ass(
            lines,
            play_res_x=play_res_x,
            play_res_y=play_res_y,
            font_size=cue_font_size,
            margin_v=cue_margin_v,
            margin_h=margin_h,
            font_name=font_name,
            render_theme=render_theme,
        )
    ass_path.write_text(ass_text, encoding="utf-8")
    logger.info("Generated ASS: %s (%d cues)", ass_path, len(lines))
    return ass_path