# studiox-reel-cutter / highlights.py
# Rajan18's picture
# as
# 19002fb verified
"""
pipeline/highlights.py
Sends the structured transcript to an LLM via OpenRouter and asks it
to pick the top N highlight segments with precise timestamps.
Uses the OpenAI-compatible OpenRouter API so you can swap models
(Claude, Gemini, Qwen, Llama) just by changing the OPENROUTER_MODEL
env var without touching code.
"""
import json
import os
import re
import textwrap
from typing import Callable, List, Optional
from openai import OpenAI
from utils import Segment, format_duration, log
class HighlightModelError(RuntimeError):
    """Raised once every configured OpenRouter model attempt has failed."""
# ── OpenRouter client (lazy-initialised) ────────────────────────────────────
_client: Optional[OpenAI] = None
def _get_client() -> OpenAI:
    """Return the shared OpenRouter client, creating it on first use.

    Raises:
        RuntimeError: if the OPENROUTER_API_KEY env var is absent/empty.
    """
    global _client
    if _client is not None:
        return _client
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if not key:
        raise RuntimeError(
            "OPENROUTER_API_KEY env var is not set. "
            "Add it to your HF Space Secrets."
        )
    # OpenRouter speaks the OpenAI wire protocol, so the stock client works.
    _client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=key,
    )
    return _client
# ── Transcript formatter ─────────────────────────────────────────────────────
def _build_transcript_block(transcript_data: dict, max_sentences: int = 300) -> str:
    """
    Render the transcript as a compact text block for the LLM prompt.
    Each line: [MM:SS – MM:SS] (SENTIMENT) sentence text
    """
    all_sentences = transcript_data.get("sentences", [])
    total = len(all_sentences)
    # Down-sample evenly so very long transcripts stay within prompt budget
    if total > max_sentences:
        stride = total / max_sentences
        all_sentences = [all_sentences[int(i * stride)] for i in range(max_sentences)]
    out = []
    for item in all_sentences:
        t0 = format_duration(item["start_ms"] / 1000)
        t1 = format_duration(item["end_ms"] / 1000)
        mood = item.get("sentiment", "NEUTRAL")
        out.append(f"[{t0} – {t1}] ({mood}) {item['text']}")
    chapter_list = transcript_data.get("chapters", [])
    if chapter_list:
        out.append("\n--- AUTO-DETECTED CHAPTERS ---")
        for ch in chapter_list:
            t0 = format_duration(ch["start_ms"] / 1000)
            t1 = format_duration(ch["end_ms"] / 1000)
            out.append(f"[{t0} – {t1}] {ch['gist']}: {ch['summary']}")
    return "\n".join(out)
# ── Overlap removal ──────────────────────────────────────────────────────────
def _remove_overlaps(segments: List[Segment]) -> List[Segment]:
    """Greedily keep segments by descending score, dropping any that overlap one already kept."""
    kept: List[Segment] = []
    for cand in sorted(segments, key=lambda s: s.score, reverse=True):
        # Strict interval overlap: touching endpoints do not count.
        collides = any(cand.start < k.end and cand.end > k.start for k in kept)
        if not collides:
            kept.append(cand)
    return kept
def _candidate_models() -> List[str]:
    """
    Build the ordered model fallback chain, primary model first.

    Env vars:
        OPENROUTER_MODEL           – primary model id (optional).
        OPENROUTER_FALLBACK_MODELS – comma-separated fallback ids (optional).

    Returns:
        De-duplicated list of model ids, insertion order preserved.
    """
    primary = os.environ.get(
        "OPENROUTER_MODEL",
        "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
    )
    raw_fallbacks = os.environ.get(
        "OPENROUTER_FALLBACK_MODELS",
        "inclusionai/ling-2.6-1t:free",
    )
    # Strip whitespace around each entry so values like "a, b" parse into
    # valid model ids (the old code kept the leading space, producing " b").
    fallbacks = [m.strip() for m in raw_fallbacks.split(",") if m.strip()]
    # dict.fromkeys removes duplicates while preserving order.
    return list(dict.fromkeys([primary, *fallbacks]))
# ── Main function ────────────────────────────────────────────────────────────
def detect_highlights(
    transcript_data : dict,
    video_duration : float,
    num_reels : int = 5,
    min_duration : float = 10,
    max_duration : float = 30,
    progress_cb : Optional[Callable[[str, int], None]] = None,
) -> List[Segment]:
    """
    Ask the LLM to identify the most engaging segments of the video.

    Tries the primary model first, then each configured fallback, until one
    returns usable text. The reply is parsed as JSON, validated, clamped to
    [0, video_duration], de-overlapped, and sorted by start time.

    Args:
        transcript_data : structured transcript dict ("sentences", "chapters").
        video_duration  : total video length in seconds.
        num_reels       : how many highlight segments to request.
        min_duration    : minimum clip length in seconds.
        max_duration    : maximum clip length in seconds.
        progress_cb     : optional callback(stage_name, percent_complete).

    Returns:
        Sorted, non-overlapping list of Segment objects.

    Raises:
        HighlightModelError: if every model attempt fails, or the reply is
            not valid JSON, or the JSON is not a list of segments.
    """
    models = _candidate_models()
    model = models[0]
    transcript_text = _build_transcript_block(transcript_data)
    total_dur_str = format_duration(video_duration)
    system_prompt = textwrap.dedent("""
        You are an expert video editor and social media strategist specialising
        in short-form content for Instagram Reels and YouTube Shorts.
        Your task: analyse a video transcript and identify the most compelling
        highlight segments that will perform well as vertical reels.
        SELECTION CRITERIA (priority order):
        1. Emotional peaks — excitement, humour, surprise, anger, inspiration
        2. Strong hooks — opening lines that immediately grab attention
        3. Punchlines & payoffs — satisfying end of a story or argument
        4. Key insights — the single most important takeaway from a section
        5. High-energy moments — fast speech, emphasis, laughter
        STRICT RULES:
        - Times must be in SECONDS (decimal, e.g. 45.5) — NOT MM:SS format
        - Each segment must start at a sentence boundary
        - Segments must NOT overlap
        - Segments must NOT start before 0 or end after the video duration
        - Prefer segments that open with a strong hook
        - Avoid mostly-silent or filler-word segments
        - Spread picks across the whole video
        OUTPUT FORMAT — respond with ONLY valid JSON, nothing else:
        [
        {
        "start" : <float seconds>,
        "end" : <float seconds>,
        "reason" : "<one sentence why this moment is engaging>",
        "score" : <float 0.0–10.0>
        }
        ]
    """)
    user_prompt = textwrap.dedent(f"""
        VIDEO DURATION : {total_dur_str} ({video_duration:.1f} seconds)
        REELS NEEDED : {num_reels}
        MIN LENGTH : {min_duration} seconds
        MAX LENGTH : {max_duration} seconds
        TRANSCRIPT:
        {transcript_text}
        Identify exactly {num_reels} highlight segments.
        Respond with pure JSON only — no markdown fences, no explanation.
    """)
    log("🤖", f"Detecting highlights with primary model {model}…")
    if progress_cb:
        progress_cb("detecting_highlights", 10)
    raw: Optional[str] = None
    errors: List[str] = []
    for attempt, candidate in enumerate(models, start=1):
        try:
            log("🤖", f"LLM attempt {attempt}/{len(models)} using {candidate}")
            response = _get_client().chat.completions.create(
                model = candidate,
                temperature = 0.3,
                max_tokens = 1024,
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            content = response.choices[0].message.content
            # Some providers return None content on refusals / empty output.
            if not isinstance(content, str):
                errors.append(f"{candidate}: Invalid response content: {content}")
                log("⚠️", f"LLM returned None content from {candidate}, trying fallback")
                continue
            raw = content
            model = candidate
            break
        except Exception as exc:  # network / auth / availability errors
            msg = str(exc)
            errors.append(f"{candidate}: {msg}")
            if "No endpoints found" in msg:
                log("⚠️", f"Model unavailable on OpenRouter: {candidate}")
            else:
                # Non-endpoint errors still fall through to the next model.
                log("⚠️", f"LLM attempt failed on {candidate}: {msg}")
            continue
    if raw is None:
        raise HighlightModelError(
            "OpenRouter model selection failed. "
            "Set OPENROUTER_MODEL to a currently available model, or configure "
            "OPENROUTER_FALLBACK_MODELS. "
            f"Tried: {', '.join(models)}. "
            f"Reasons: {' | '.join(errors[-3:])}"
        )
    # Strip surrounding whitespace first so the fence regexes anchor correctly,
    # then remove markdown fences if the model added them.
    raw = raw.strip()
    raw = re.sub(r"^```(?:json)?\s*", "", raw)
    raw = re.sub(r"\s*```$", "", raw)
    try:
        raw_segs = json.loads(raw)
    except json.JSONDecodeError as e:
        raise HighlightModelError(
            f"LLM ({model}) returned invalid JSON while selecting highlights"
        ) from e
    # Some models wrap the array in an object, e.g. {"segments": [...]} —
    # unwrap the first list-valued field instead of crashing.
    if isinstance(raw_segs, dict):
        for value in raw_segs.values():
            if isinstance(value, list):
                raw_segs = value
                break
    if not isinstance(raw_segs, list):
        raise HighlightModelError(
            f"LLM ({model}) returned invalid JSON while selecting highlights"
        )
    if progress_cb:
        progress_cb("detecting_highlights", 80)
    # ── Validate, clamp, deduplicate ─────────────────────────────────────────
    segments: List[Segment] = []
    for i, r in enumerate(raw_segs):
        if not isinstance(r, dict):
            log("⚠️", f"Segment {i+1} is not a JSON object — skipped")
            continue
        try:
            start = float(r.get("start", 0))
            end = float(r.get("end", 0))
            score = float(r.get("score", 0))
        except (TypeError, ValueError):
            log("⚠️", f"Segment {i+1} has non-numeric fields — skipped")
            continue
        # Clamp into [0, duration] while leaving room for a min-length clip.
        start = max(0.0, min(start, video_duration - min_duration))
        end = min(end, video_duration)
        if end - start < min_duration:
            end = min(start + min_duration, video_duration)
        if end - start > max_duration:
            end = start + max_duration
        if end - start < min_duration / 2:
            log("⚠️", f"Segment {i+1} too short after clamping — skipped")
            continue
        segments.append(Segment(
            index = i + 1,
            start = round(start, 2),
            end = round(end, 2),
            reason = str(r.get("reason", "")),
            score = score,
        ))
    segments = _remove_overlaps(segments)
    segments.sort(key=lambda s: s.start)
    log("✅", f"Detected {len(segments)} highlight segments")
    for seg in segments:
        log("  📍", f"#{seg.index} [{seg.start:.1f}s–{seg.end:.1f}s] "
                    f"({seg.duration:.0f}s) — {seg.reason}")
    if progress_cb:
        progress_cb("detecting_highlights", 100)
    return segments