# ElevenClip-AI — backend/app/services/highlight.py
# feat(editor): subtitle-first editor + AI subtitle pipeline
# Author: JakgritB · commit 89e1dc4
import json
import re
from uuid import uuid4
from app.core.config import Settings
from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment
class QwenHighlightDetector:
    """Select short-form clip candidates from a transcript and run AI
    subtitle actions (polish / translate).

    Every public entry point follows the same pattern:

    * ``settings.demo_mode`` → a deterministic heuristic (no GPU needed),
      so the full UX is testable on any machine;
    * otherwise Qwen is called through vLLM, with the same heuristic as a
      best-effort fallback if inference fails for any reason.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        # Lazily-created vllm.LLM instance, shared by all Qwen calls.
        self._llm = None

    def detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Return clip candidates for *transcript*, tailored to *profile*."""
        if self.settings.demo_mode:
            return self._heuristic_detect(transcript, profile)
        try:
            return self._qwen_detect(transcript, profile)
        except Exception:
            # Best effort: any model/runtime failure degrades to heuristics.
            return self._heuristic_detect(transcript, profile)

    def _qwen_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Ask Qwen (via vLLM) to pick clips; raises if inference fails.

        Raises:
            RuntimeError: if vLLM is not importable (see ``_ensure_llm``).
        """
        # Reuse the shared lazy initializer instead of carrying a second
        # copy of the vLLM import + LLM(...) construction in this method.
        llm = self._ensure_llm()
        from vllm import SamplingParams

        transcript_text = "\n".join(
            f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
            for segment in transcript
        )
        niche = _effective_niche(profile)
        channel_description = profile.channel_description or "No extra channel description provided."
        # Never ask the model for more clips than the service allows.
        clip_count = min(profile.clip_count, self.settings.max_clips)
        prompt = f"""
You are selecting short-form clips for a creator.
Profile:
- niche: {niche}
- creator description: {channel_description}
- style: {profile.clip_style}
- target length seconds: {profile.clip_length_seconds}
- target number of clips: {clip_count}
- language: {profile.primary_language}
- platform: {profile.target_platform.value}
Return strict JSON only. Shape:
[
{{
"start_seconds": 12.0,
"end_seconds": 72.0,
"title": "short title",
"reason": "why this will engage viewers",
"score": 91,
"subtitle_text": "clean subtitle text"
}}
]
Transcript:
{transcript_text}
""".strip()
        sampling = SamplingParams(temperature=0.2, max_tokens=1200)
        outputs = llm.generate([prompt], sampling)
        text = outputs[0].outputs[0].text
        payload = self._parse_json_array(text)
        clips = [
            ClipCandidate(
                id=uuid4().hex,
                start_seconds=float(item["start_seconds"]),
                end_seconds=float(item["end_seconds"]),
                title=str(item.get("title") or "Highlight"),
                reason=str(item.get("reason") or "High engagement potential"),
                score=float(item.get("score") or 75),
                subtitle_text=str(item.get("subtitle_text") or ""),
                metadata={"model": self.settings.qwen_text_model_id},
            )
            for item in payload[:clip_count]
        ]
        # An empty (but parseable) model answer still yields usable clips.
        return clips or self._heuristic_detect(transcript, profile)

    def _parse_json_array(self, text: str) -> list[dict]:
        """Extract the first-to-last bracketed span of *text* as a JSON list.

        Raises:
            ValueError: if no ``[...]`` span exists or it is not a JSON list.
        """
        match = re.search(r"\[[\s\S]*\]", text)
        if not match:
            raise ValueError("No JSON array in Qwen response")
        payload = json.loads(match.group(0))
        if not isinstance(payload, list):
            raise ValueError("Qwen response is not a list")
        return payload

    # ──────────────────────────────────────────────────────────────
    # AI subtitle actions (Polish, Translate)
    # ──────────────────────────────────────────────────────────────
    def polish_subtitles(
        self, cues: list[SubtitleCue], style: str | None = None
    ) -> list[SubtitleCue]:
        """Rewrite cue text to be punchier and more readable on short-form video.

        Demo mode returns deterministic polished text so the UX is testable
        without GPU. Production mode calls Qwen2.5.
        """
        if self.settings.demo_mode:
            return self._heuristic_polish(cues, style)
        try:
            return self._qwen_polish(cues, style)
        except Exception:
            # Fall back rather than surface inference failures to the editor.
            return self._heuristic_polish(cues, style)

    def translate_subtitles(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Translate cue text to target_language while preserving timing."""
        if self.settings.demo_mode:
            return self._heuristic_translate(cues, target_language)
        try:
            return self._qwen_translate(cues, target_language)
        except Exception:
            return self._heuristic_translate(cues, target_language)

    # ──────────────────────────────────────────────────────────────
    # Demo / fallback implementations
    # ──────────────────────────────────────────────────────────────
    def _heuristic_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Apply simple text transformations that look like an AI polish."""
        polished: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            if not text:
                # Nothing to polish; keep the cue (and its timing) as-is.
                polished.append(cue.model_copy())
                continue
            # Shorten redundant phrasing (heuristic)
            text = re.sub(r"\s+", " ", text)
            text = re.sub(r"^(so|well|like|um|uh|you know|i mean)[,\s]+", "", text, flags=re.IGNORECASE)
            text = text.rstrip(" ,.;:")
            # Add light emphasis based on style
            if style and style.lower() == "dramatic" and not text.endswith("!"):
                text = text + "!"
            polished.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=text,
                )
            )
        return polished

    def _heuristic_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Demo translation: append a marker so the UX shows the action ran."""
        marker = f"[{target_language[:2].upper()}]"
        translated: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            translated.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=f"{marker} {text}" if text else "",
                )
            )
        return translated

    # ──────────────────────────────────────────────────────────────
    # Production Qwen calls (used when DEMO_MODE=false on AMD GPU)
    # ──────────────────────────────────────────────────────────────
    def _ensure_llm(self):
        """Import vLLM and build the shared LLM on first use.

        Raises:
            RuntimeError: if vLLM cannot be imported.
        """
        try:
            from vllm import LLM
        except Exception as exc:
            raise RuntimeError("vLLM with ROCm backend is required for Qwen") from exc
        if self._llm is None:
            self._llm = LLM(
                model=self.settings.qwen_text_model_id,
                dtype=self.settings.preferred_torch_dtype,
                trust_remote_code=True,
            )
        return self._llm

    def _qwen_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Qwen-backed polish: rewrite each cue line, preserving timing."""
        from vllm import SamplingParams
        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Rewrite each subtitle line to be punchier and easier to read on short-form vertical video.
Keep the same number of lines and the same approximate length per line.
Style preference: {style or 'natural'}.
Return one rewritten line per row, prefixed with the original index. No commentary.
Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.3, max_tokens=800))
        raw = outputs[0].outputs[0].text
        rewritten = self._parse_indexed_lines(raw, expected=len(cues))
        # If the model returned fewer lines than cues, keep the original text.
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=rewritten[i] if i < len(rewritten) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _qwen_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Qwen-backed translation: translate each cue line, preserving timing."""
        from vllm import SamplingParams
        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Translate each subtitle line into {target_language}. Preserve line count and order.
Return one translated line per row, prefixed with the original index. No commentary.
Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.2, max_tokens=1000))
        raw = outputs[0].outputs[0].text
        translated = self._parse_indexed_lines(raw, expected=len(cues))
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=translated[i] if i < len(translated) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]:
        """Strip "N." / "N)" prefixes from model output; stop at *expected* lines."""
        lines: list[str] = []
        for line in raw.splitlines():
            stripped = line.strip()
            if not stripped:
                continue
            match = re.match(r"^\s*\d+[.)\s-]+\s*(.*)$", stripped)
            lines.append(match.group(1).strip() if match else stripped)
            if len(lines) >= expected:
                break
        return lines

    def _heuristic_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Deterministic keyword/length scoring used in demo mode and as fallback."""
        style_terms = {
            "funny": ["react", "punchy", "mistake", "surprising"],
            "informative": ["important", "practical", "takeaway", "explanation"],
            "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
            "educational": ["question", "answer", "context", "takeaway"],
        }
        preferred_terms = style_terms.get(profile.clip_style.lower(), [])
        niche = _effective_niche(profile)
        # Short keyword list derived from the creator's niche + description.
        profile_terms = [
            term
            for term in f"{niche} {profile.channel_description}".lower().split()[:30]
            if len(term) > 2
        ]
        scored: list[tuple[float, TranscriptSegment]] = []
        for segment in transcript:
            text = segment.text.lower()
            score = 45.0  # baseline; bonuses below, capped at 100
            score += 12 if "?" in segment.text else 0
            score += 8 if any(term in text for term in preferred_terms) else 0
            score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
            score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
            score += 5 if any(term in text for term in profile_terms) else 0
            score += min(len(segment.text) / 12, 10)  # longer segments score a bit higher
            scored.append((min(score, 100), segment))
        scored.sort(key=lambda item: item[0], reverse=True)
        clips: list[ClipCandidate] = []
        clip_count = min(profile.clip_count, self.settings.max_clips)
        for score, segment in scored[:clip_count]:
            # Start a little before the segment for context; never negative.
            start = max(0.0, segment.start_seconds - 5.0)
            end = start + float(profile.clip_length_seconds)
            clips.append(
                ClipCandidate(
                    id=uuid4().hex,
                    start_seconds=start,
                    end_seconds=end,
                    title=self._title_for(segment.text),
                    reason=self._reason_for(profile, niche),
                    score=round(score, 1),
                    subtitle_text=segment.text,
                    metadata={"model": "heuristic-fallback"},
                )
            )
        # Present clips in chronological order regardless of score.
        return sorted(clips, key=lambda clip: clip.start_seconds)

    def _title_for(self, text: str) -> str:
        """Build a short clip title from segment text (max 72 chars)."""
        clean = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'")
        words = clean.split()
        if len(words) > 1:
            title = " ".join(words[:7])
        else:
            title = clean[:48]
        return title[:72].rstrip() or "Highlight"

    def _reason_for(self, profile: ChannelProfile, niche: str) -> str:
        """Explain the pick in the creator's primary language (substring match)."""
        language = profile.primary_language.lower()
        style = _localized_profile_word(profile.clip_style, language, "style")
        niche_label = _localized_profile_word(niche, language, "niche")
        if "thai" in language:
            return f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}"
        if "japanese" in language:
            return f"{niche_label} の視聴者に合う {style} スタイルの候補です。"
        if "chinese" in language:
            return f"符合 {niche_label} 受众期待的 {style} 风格。"
        if "korean" in language:
            return f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다."
        return f"Matches the {profile.clip_style} style for a {niche} audience."
def _effective_niche(profile: ChannelProfile) -> str:
if profile.niche.lower() == "other" and profile.niche_custom:
return profile.niche_custom
return profile.niche
def _localized_profile_word(value: str, language: str, group: str) -> str:
key = value.lower().replace(" ", "_")
localized = {
"thai": {
"niche": {
"education": "การศึกษา",
"gaming": "เกม",
"podcast": "พอดแคสต์",
"commentary": "เล่า/วิเคราะห์",
"cars": "รถยนต์",
"beauty": "บิวตี้",
"fitness": "ฟิตเนส",
"finance": "การเงิน",
"tech": "เทคโนโลยี",
"lifestyle": "ไลฟ์สไตล์",
"music": "ดนตรี",
},
"style": {
"informative": "ให้ข้อมูล",
"funny": "ตลก",
"dramatic": "ดราม่า",
"educational": "สอนเข้าใจง่าย",
"commentary": "วิเคราะห์",
},
},
"japanese": {
"niche": {
"education": "教育",
"gaming": "ゲーム",
"podcast": "ポッドキャスト",
"commentary": "解説",
"cars": "車",
"beauty": "美容",
"fitness": "フィットネス",
"finance": "金融",
"tech": "テック",
"lifestyle": "ライフスタイル",
"music": "音楽",
},
"style": {
"informative": "情報性の高い",
"funny": "ユーモアのある",
"dramatic": "ドラマチックな",
"educational": "学びやすい",
"commentary": "解説型の",
},
},
"chinese": {
"niche": {
"education": "教育",
"gaming": "游戏",
"podcast": "播客",
"commentary": "解说",
"cars": "汽车",
"beauty": "美妆",
"fitness": "健身",
"finance": "金融",
"tech": "科技",
"lifestyle": "生活方式",
"music": "音乐",
},
"style": {
"informative": "信息量高",
"funny": "有趣",
"dramatic": "戏剧化",
"educational": "教学型",
"commentary": "评论型",
},
},
"korean": {
"niche": {
"education": "교육",
"gaming": "게임",
"podcast": "팟캐스트",
"commentary": "해설",
"cars": "자동차",
"beauty": "뷰티",
"fitness": "피트니스",
"finance": "금융",
"tech": "테크",
"lifestyle": "라이프스타일",
"music": "음악",
},
"style": {
"informative": "정보형",
"funny": "재미있는",
"dramatic": "극적인",
"educational": "교육형",
"commentary": "해설형",
},
},
}
for language_key, groups in localized.items():
if language_key in language:
return groups.get(group, {}).get(key, value)
return value