# studiox-reel-cutter / highlights.py
# Rajan18's picture
# as
# 19002fb verified
"""
pipeline/highlights.py
Sends the structured transcript to an LLM via OpenRouter and asks it
to pick the top N highlight segments with precise timestamps.
Uses the OpenAI-compatible OpenRouter API so you can swap models
(Claude, Gemini, Qwen, Llama) just by changing the OPENROUTER_MODEL
env var without touching code.
"""
import json
import os
import re
import textwrap
from typing import Callable, List, Optional
from openai import OpenAI
from utils import Segment, format_duration, log
class HighlightModelError(RuntimeError):
    """Raised once every configured OpenRouter model attempt has failed."""
# ── OpenRouter client (lazy-initialised) ────────────────────────────────────
_client: Optional[OpenAI] = None
def _get_client() -> OpenAI:
    """Return the shared OpenRouter client, creating it on first use.

    Raises:
        RuntimeError: if the OPENROUTER_API_KEY env var is absent/empty.
    """
    global _client
    if _client is not None:
        return _client
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if not key:
        raise RuntimeError(
            "OPENROUTER_API_KEY env var is not set. "
            "Add it to your HF Space Secrets."
        )
    # OpenRouter speaks the OpenAI wire protocol, so the stock client works.
    _client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=key,
    )
    return _client
# ── Transcript formatter ─────────────────────────────────────────────────────
def _build_transcript_block(transcript_data: dict, max_sentences: int = 300) -> str:
    """
    Render the transcript as a compact text block for the LLM prompt.
    Each line: [MM:SS – MM:SS] (SENTIMENT) sentence text
    """
    all_sentences = transcript_data.get("sentences", [])
    total = len(all_sentences)
    # Down-sample evenly so very long transcripts stay within prompt budget
    if total > max_sentences:
        stride = total / max_sentences
        all_sentences = [all_sentences[int(i * stride)] for i in range(max_sentences)]
    out = []
    for item in all_sentences:
        t0 = format_duration(item["start_ms"] / 1000)
        t1 = format_duration(item["end_ms"] / 1000)
        mood = item.get("sentiment", "NEUTRAL")
        out.append(f"[{t0} – {t1}] ({mood}) {item['text']}")
    chapter_list = transcript_data.get("chapters", [])
    if chapter_list:
        out.append("\n--- AUTO-DETECTED CHAPTERS ---")
        for ch in chapter_list:
            t0 = format_duration(ch["start_ms"] / 1000)
            t1 = format_duration(ch["end_ms"] / 1000)
            out.append(f"[{t0} – {t1}] {ch['gist']}: {ch['summary']}")
    return "\n".join(out)
# ── Overlap removal ──────────────────────────────────────────────────────────
def _remove_overlaps(segments: List[Segment]) -> List[Segment]:
    """Greedily keep segments by descending score, dropping any that overlap one already kept."""
    kept: List[Segment] = []
    for cand in sorted(segments, key=lambda s: s.score, reverse=True):
        # Strict interval overlap: touching endpoints do not count.
        collides = any(cand.start < k.end and cand.end > k.start for k in kept)
        if not collides:
            kept.append(cand)
    return kept
def _candidate_models() -> List[str]:
    """
    Build the ordered model fallback chain, primary model first.

    Env vars:
        OPENROUTER_MODEL           – primary model id (optional).
        OPENROUTER_FALLBACK_MODELS – comma-separated fallback ids (optional).

    Returns:
        De-duplicated list of model ids, insertion order preserved.
    """
    primary = os.environ.get(
        "OPENROUTER_MODEL",
        "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
    )
    raw_fallbacks = os.environ.get(
        "OPENROUTER_FALLBACK_MODELS",
        "inclusionai/ling-2.6-1t:free",
    )
    # Strip whitespace around each entry so values like "a, b" parse into
    # valid model ids (the old code kept the leading space, producing " b").
    fallbacks = [m.strip() for m in raw_fallbacks.split(",") if m.strip()]
    # dict.fromkeys removes duplicates while preserving order.
    return list(dict.fromkeys([primary, *fallbacks]))
# ── Main function ────────────────────────────────────────────────────────────
def detect_highlights(
    transcript_data : dict,
    video_duration : float,
    num_reels : int = 5,
    min_duration : float = 10,
    max_duration : float = 30,
    progress_cb : Optional[Callable[[str, int], None]] = None,
) -> List[Segment]:
    """
    Ask the LLM to identify the most engaging segments of the video.

    Tries the primary model first, then each configured fallback, until one
    returns usable text. The reply is parsed as JSON, validated, clamped to
    [0, video_duration], de-overlapped, and sorted by start time.

    Args:
        transcript_data : structured transcript dict ("sentences", "chapters").
        video_duration  : total video length in seconds.
        num_reels       : how many highlight segments to request.
        min_duration    : minimum clip length in seconds.
        max_duration    : maximum clip length in seconds.
        progress_cb     : optional callback(stage_name, percent_complete).

    Returns:
        Sorted, non-overlapping list of Segment objects.

    Raises:
        HighlightModelError: if every model attempt fails, or the reply is
            not valid JSON, or the JSON is not a list of segments.
    """
    models = _candidate_models()
    model = models[0]
    transcript_text = _build_transcript_block(transcript_data)
    total_dur_str = format_duration(video_duration)
    system_prompt = textwrap.dedent("""
        You are an expert video editor and social media strategist specialising
        in short-form content for Instagram Reels and YouTube Shorts.
        Your task: analyse a video transcript and identify the most compelling
        highlight segments that will perform well as vertical reels.
        SELECTION CRITERIA (priority order):
        1. Emotional peaks — excitement, humour, surprise, anger, inspiration
        2. Strong hooks — opening lines that immediately grab attention
        3. Punchlines & payoffs — satisfying end of a story or argument
        4. Key insights — the single most important takeaway from a section
        5. High-energy moments — fast speech, emphasis, laughter
        STRICT RULES:
        - Times must be in SECONDS (decimal, e.g. 45.5) — NOT MM:SS format
        - Each segment must start at a sentence boundary
        - Segments must NOT overlap
        - Segments must NOT start before 0 or end after the video duration
        - Prefer segments that open with a strong hook
        - Avoid mostly-silent or filler-word segments
        - Spread picks across the whole video
        OUTPUT FORMAT — respond with ONLY valid JSON, nothing else:
        [
        {
        "start" : <float seconds>,
        "end" : <float seconds>,
        "reason" : "<one sentence why this moment is engaging>",
        "score" : <float 0.0–10.0>
        }
        ]
    """)
    user_prompt = textwrap.dedent(f"""
        VIDEO DURATION : {total_dur_str} ({video_duration:.1f} seconds)
        REELS NEEDED : {num_reels}
        MIN LENGTH : {min_duration} seconds
        MAX LENGTH : {max_duration} seconds
        TRANSCRIPT:
        {transcript_text}
        Identify exactly {num_reels} highlight segments.
        Respond with pure JSON only — no markdown fences, no explanation.
    """)
    log("🤖", f"Detecting highlights with primary model {model}…")
    if progress_cb:
        progress_cb("detecting_highlights", 10)
    raw: Optional[str] = None
    errors: List[str] = []
    for attempt, candidate in enumerate(models, start=1):
        try:
            log("🤖", f"LLM attempt {attempt}/{len(models)} using {candidate}")
            response = _get_client().chat.completions.create(
                model = candidate,
                temperature = 0.3,
                max_tokens = 1024,
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            content = response.choices[0].message.content
            # Some providers return None content on refusals / empty output.
            if not isinstance(content, str):
                errors.append(f"{candidate}: Invalid response content: {content}")
                log("⚠️", f"LLM returned None content from {candidate}, trying fallback")
                continue
            raw = content
            model = candidate
            break
        except Exception as exc:  # network / auth / availability errors
            msg = str(exc)
            errors.append(f"{candidate}: {msg}")
            if "No endpoints found" in msg:
                log("⚠️", f"Model unavailable on OpenRouter: {candidate}")
            else:
                # Non-endpoint errors still fall through to the next model.
                log("⚠️", f"LLM attempt failed on {candidate}: {msg}")
            continue
    if raw is None:
        raise HighlightModelError(
            "OpenRouter model selection failed. "
            "Set OPENROUTER_MODEL to a currently available model, or configure "
            "OPENROUTER_FALLBACK_MODELS. "
            f"Tried: {', '.join(models)}. "
            f"Reasons: {' | '.join(errors[-3:])}"
        )
    # Strip surrounding whitespace first so the fence regexes anchor correctly,
    # then remove markdown fences if the model added them.
    raw = raw.strip()
    raw = re.sub(r"^```(?:json)?\s*", "", raw)
    raw = re.sub(r"\s*```$", "", raw)
    try:
        raw_segs = json.loads(raw)
    except json.JSONDecodeError as e:
        raise HighlightModelError(
            f"LLM ({model}) returned invalid JSON while selecting highlights"
        ) from e
    # Some models wrap the array in an object, e.g. {"segments": [...]} —
    # unwrap the first list-valued field instead of crashing.
    if isinstance(raw_segs, dict):
        for value in raw_segs.values():
            if isinstance(value, list):
                raw_segs = value
                break
    if not isinstance(raw_segs, list):
        raise HighlightModelError(
            f"LLM ({model}) returned invalid JSON while selecting highlights"
        )
    if progress_cb:
        progress_cb("detecting_highlights", 80)
    # ── Validate, clamp, deduplicate ─────────────────────────────────────────
    segments: List[Segment] = []
    for i, r in enumerate(raw_segs):
        if not isinstance(r, dict):
            log("⚠️", f"Segment {i+1} is not a JSON object — skipped")
            continue
        try:
            start = float(r.get("start", 0))
            end = float(r.get("end", 0))
            score = float(r.get("score", 0))
        except (TypeError, ValueError):
            log("⚠️", f"Segment {i+1} has non-numeric fields — skipped")
            continue
        # Clamp into [0, duration] while leaving room for a min-length clip.
        start = max(0.0, min(start, video_duration - min_duration))
        end = min(end, video_duration)
        if end - start < min_duration:
            end = min(start + min_duration, video_duration)
        if end - start > max_duration:
            end = start + max_duration
        if end - start < min_duration / 2:
            log("⚠️", f"Segment {i+1} too short after clamping — skipped")
            continue
        segments.append(Segment(
            index = i + 1,
            start = round(start, 2),
            end = round(end, 2),
            reason = str(r.get("reason", "")),
            score = score,
        ))
    segments = _remove_overlaps(segments)
    segments.sort(key=lambda s: s.start)
    log("✅", f"Detected {len(segments)} highlight segments")
    for seg in segments:
        log("  📍", f"#{seg.index} [{seg.start:.1f}s–{seg.end:.1f}s] "
                    f"({seg.duration:.0f}s) — {seg.reason}")
    if progress_cb:
        progress_cb("detecting_highlights", 100)
    return segments