# ElevenClip-AI — backend/app/services/highlight.py
# feat(editor): subtitle-first editor + AI subtitle pipeline
# Author: JakgritB · commit 89e1dc4
import json
import re
from uuid import uuid4
from app.core.config import Settings
from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment
class QwenHighlightDetector:
    """Select short-form clip candidates from a transcript and run AI
    subtitle actions (polish / translate).

    Every public entry point follows the same pattern:

    * ``settings.demo_mode`` → a deterministic heuristic (no GPU needed),
      so the full UX is testable on any machine;
    * otherwise Qwen is called through vLLM, with the same heuristic as a
      best-effort fallback if inference fails for any reason.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        # Lazily-created vllm.LLM instance, shared by all Qwen calls.
        self._llm = None

    def detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Return clip candidates for *transcript*, tailored to *profile*."""
        if self.settings.demo_mode:
            return self._heuristic_detect(transcript, profile)
        try:
            return self._qwen_detect(transcript, profile)
        except Exception:
            # Best effort: any model/runtime failure degrades to heuristics.
            return self._heuristic_detect(transcript, profile)

    def _qwen_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Ask Qwen (via vLLM) to pick clips; raises if inference fails.

        Raises:
            RuntimeError: if vLLM is not importable (see ``_ensure_llm``).
        """
        # Reuse the shared lazy initializer instead of carrying a second
        # copy of the vLLM import + LLM(...) construction in this method.
        llm = self._ensure_llm()
        from vllm import SamplingParams

        transcript_text = "\n".join(
            f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
            for segment in transcript
        )
        niche = _effective_niche(profile)
        channel_description = profile.channel_description or "No extra channel description provided."
        # Never ask the model for more clips than the service allows.
        clip_count = min(profile.clip_count, self.settings.max_clips)
        prompt = f"""
You are selecting short-form clips for a creator.
Profile:
- niche: {niche}
- creator description: {channel_description}
- style: {profile.clip_style}
- target length seconds: {profile.clip_length_seconds}
- target number of clips: {clip_count}
- language: {profile.primary_language}
- platform: {profile.target_platform.value}
Return strict JSON only. Shape:
[
{{
"start_seconds": 12.0,
"end_seconds": 72.0,
"title": "short title",
"reason": "why this will engage viewers",
"score": 91,
"subtitle_text": "clean subtitle text"
}}
]
Transcript:
{transcript_text}
""".strip()
        sampling = SamplingParams(temperature=0.2, max_tokens=1200)
        outputs = llm.generate([prompt], sampling)
        text = outputs[0].outputs[0].text
        payload = self._parse_json_array(text)
        clips = [
            ClipCandidate(
                id=uuid4().hex,
                start_seconds=float(item["start_seconds"]),
                end_seconds=float(item["end_seconds"]),
                title=str(item.get("title") or "Highlight"),
                reason=str(item.get("reason") or "High engagement potential"),
                score=float(item.get("score") or 75),
                subtitle_text=str(item.get("subtitle_text") or ""),
                metadata={"model": self.settings.qwen_text_model_id},
            )
            for item in payload[:clip_count]
        ]
        # An empty (but parseable) model answer still yields usable clips.
        return clips or self._heuristic_detect(transcript, profile)

    def _parse_json_array(self, text: str) -> list[dict]:
        """Extract the first-to-last bracketed span of *text* as a JSON list.

        Raises:
            ValueError: if no ``[...]`` span exists or it is not a JSON list.
        """
        match = re.search(r"\[[\s\S]*\]", text)
        if not match:
            raise ValueError("No JSON array in Qwen response")
        payload = json.loads(match.group(0))
        if not isinstance(payload, list):
            raise ValueError("Qwen response is not a list")
        return payload

    # ──────────────────────────────────────────────────────────────
    # AI subtitle actions (Polish, Translate)
    # ──────────────────────────────────────────────────────────────
    def polish_subtitles(
        self, cues: list[SubtitleCue], style: str | None = None
    ) -> list[SubtitleCue]:
        """Rewrite cue text to be punchier and more readable on short-form video.

        Demo mode returns deterministic polished text so the UX is testable
        without GPU. Production mode calls Qwen2.5.
        """
        if self.settings.demo_mode:
            return self._heuristic_polish(cues, style)
        try:
            return self._qwen_polish(cues, style)
        except Exception:
            # Fall back rather than surface inference failures to the editor.
            return self._heuristic_polish(cues, style)

    def translate_subtitles(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Translate cue text to target_language while preserving timing."""
        if self.settings.demo_mode:
            return self._heuristic_translate(cues, target_language)
        try:
            return self._qwen_translate(cues, target_language)
        except Exception:
            return self._heuristic_translate(cues, target_language)

    # ──────────────────────────────────────────────────────────────
    # Demo / fallback implementations
    # ──────────────────────────────────────────────────────────────
    def _heuristic_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Apply simple text transformations that look like an AI polish."""
        polished: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            if not text:
                # Nothing to polish; keep the cue (and its timing) as-is.
                polished.append(cue.model_copy())
                continue
            # Shorten redundant phrasing (heuristic)
            text = re.sub(r"\s+", " ", text)
            text = re.sub(r"^(so|well|like|um|uh|you know|i mean)[,\s]+", "", text, flags=re.IGNORECASE)
            text = text.rstrip(" ,.;:")
            # Add light emphasis based on style
            if style and style.lower() == "dramatic" and not text.endswith("!"):
                text = text + "!"
            polished.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=text,
                )
            )
        return polished

    def _heuristic_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Demo translation: append a marker so the UX shows the action ran."""
        marker = f"[{target_language[:2].upper()}]"
        translated: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            translated.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=f"{marker} {text}" if text else "",
                )
            )
        return translated

    # ──────────────────────────────────────────────────────────────
    # Production Qwen calls (used when DEMO_MODE=false on AMD GPU)
    # ──────────────────────────────────────────────────────────────
    def _ensure_llm(self):
        """Import vLLM and build the shared LLM on first use.

        Raises:
            RuntimeError: if vLLM cannot be imported.
        """
        try:
            from vllm import LLM
        except Exception as exc:
            raise RuntimeError("vLLM with ROCm backend is required for Qwen") from exc
        if self._llm is None:
            self._llm = LLM(
                model=self.settings.qwen_text_model_id,
                dtype=self.settings.preferred_torch_dtype,
                trust_remote_code=True,
            )
        return self._llm

    def _qwen_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Qwen-backed polish: rewrite each cue line, preserving timing."""
        from vllm import SamplingParams
        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Rewrite each subtitle line to be punchier and easier to read on short-form vertical video.
Keep the same number of lines and the same approximate length per line.
Style preference: {style or 'natural'}.
Return one rewritten line per row, prefixed with the original index. No commentary.
Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.3, max_tokens=800))
        raw = outputs[0].outputs[0].text
        rewritten = self._parse_indexed_lines(raw, expected=len(cues))
        # If the model returned fewer lines than cues, keep the original text.
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=rewritten[i] if i < len(rewritten) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _qwen_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Qwen-backed translation: translate each cue line, preserving timing."""
        from vllm import SamplingParams
        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Translate each subtitle line into {target_language}. Preserve line count and order.
Return one translated line per row, prefixed with the original index. No commentary.
Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.2, max_tokens=1000))
        raw = outputs[0].outputs[0].text
        translated = self._parse_indexed_lines(raw, expected=len(cues))
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=translated[i] if i < len(translated) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]:
        """Strip "N." / "N)" prefixes from model output; stop at *expected* lines."""
        lines: list[str] = []
        for line in raw.splitlines():
            stripped = line.strip()
            if not stripped:
                continue
            match = re.match(r"^\s*\d+[.)\s-]+\s*(.*)$", stripped)
            lines.append(match.group(1).strip() if match else stripped)
            if len(lines) >= expected:
                break
        return lines

    def _heuristic_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Deterministic keyword/length scoring used in demo mode and as fallback."""
        style_terms = {
            "funny": ["react", "punchy", "mistake", "surprising"],
            "informative": ["important", "practical", "takeaway", "explanation"],
            "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
            "educational": ["question", "answer", "context", "takeaway"],
        }
        preferred_terms = style_terms.get(profile.clip_style.lower(), [])
        niche = _effective_niche(profile)
        # Short keyword list derived from the creator's niche + description.
        profile_terms = [
            term
            for term in f"{niche} {profile.channel_description}".lower().split()[:30]
            if len(term) > 2
        ]
        scored: list[tuple[float, TranscriptSegment]] = []
        for segment in transcript:
            text = segment.text.lower()
            score = 45.0  # baseline; bonuses below, capped at 100
            score += 12 if "?" in segment.text else 0
            score += 8 if any(term in text for term in preferred_terms) else 0
            score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
            score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
            score += 5 if any(term in text for term in profile_terms) else 0
            score += min(len(segment.text) / 12, 10)  # longer segments score a bit higher
            scored.append((min(score, 100), segment))
        scored.sort(key=lambda item: item[0], reverse=True)
        clips: list[ClipCandidate] = []
        clip_count = min(profile.clip_count, self.settings.max_clips)
        for score, segment in scored[:clip_count]:
            # Start a little before the segment for context; never negative.
            start = max(0.0, segment.start_seconds - 5.0)
            end = start + float(profile.clip_length_seconds)
            clips.append(
                ClipCandidate(
                    id=uuid4().hex,
                    start_seconds=start,
                    end_seconds=end,
                    title=self._title_for(segment.text),
                    reason=self._reason_for(profile, niche),
                    score=round(score, 1),
                    subtitle_text=segment.text,
                    metadata={"model": "heuristic-fallback"},
                )
            )
        # Present clips in chronological order regardless of score.
        return sorted(clips, key=lambda clip: clip.start_seconds)

    def _title_for(self, text: str) -> str:
        """Build a short clip title from segment text (max 72 chars)."""
        clean = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'")
        words = clean.split()
        if len(words) > 1:
            title = " ".join(words[:7])
        else:
            title = clean[:48]
        return title[:72].rstrip() or "Highlight"

    def _reason_for(self, profile: ChannelProfile, niche: str) -> str:
        """Explain the pick in the creator's primary language (substring match)."""
        language = profile.primary_language.lower()
        style = _localized_profile_word(profile.clip_style, language, "style")
        niche_label = _localized_profile_word(niche, language, "niche")
        if "thai" in language:
            return f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}"
        if "japanese" in language:
            return f"{niche_label} の視聴者に合う {style} スタイルの候補です。"
        if "chinese" in language:
            return f"符合 {niche_label} 受众期待的 {style} 风格。"
        if "korean" in language:
            return f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다."
        return f"Matches the {profile.clip_style} style for a {niche} audience."
def _effective_niche(profile: ChannelProfile) -> str:
if profile.niche.lower() == "other" and profile.niche_custom:
return profile.niche_custom
return profile.niche
def _localized_profile_word(value: str, language: str, group: str) -> str:
key = value.lower().replace(" ", "_")
localized = {
"thai": {
"niche": {
"education": "การศึกษา",
"gaming": "เกม",
"podcast": "พอดแคสต์",
"commentary": "เล่า/วิเคราะห์",
"cars": "รถยนต์",
"beauty": "บิวตี้",
"fitness": "ฟิตเนส",
"finance": "การเงิน",
"tech": "เทคโนโลยี",
"lifestyle": "ไลฟ์สไตล์",
"music": "ดนตรี",
},
"style": {
"informative": "ให้ข้อมูล",
"funny": "ตลก",
"dramatic": "ดราม่า",
"educational": "สอนเข้าใจง่าย",
"commentary": "วิเคราะห์",
},
},
"japanese": {
"niche": {
"education": "教育",
"gaming": "ゲーム",
"podcast": "ポッドキャスト",
"commentary": "解説",
"cars": "車",
"beauty": "美容",
"fitness": "フィットネス",
"finance": "金融",
"tech": "テック",
"lifestyle": "ライフスタイル",
"music": "音楽",
},
"style": {
"informative": "情報性の高い",
"funny": "ユーモアのある",
"dramatic": "ドラマチックな",
"educational": "学びやすい",
"commentary": "解説型の",
},
},
"chinese": {
"niche": {
"education": "教育",
"gaming": "游戏",
"podcast": "播客",
"commentary": "解说",
"cars": "汽车",
"beauty": "美妆",
"fitness": "健身",
"finance": "金融",
"tech": "科技",
"lifestyle": "生活方式",
"music": "音乐",
},
"style": {
"informative": "信息量高",
"funny": "有趣",
"dramatic": "戏剧化",
"educational": "教学型",
"commentary": "评论型",
},
},
"korean": {
"niche": {
"education": "교육",
"gaming": "게임",
"podcast": "팟캐스트",
"commentary": "해설",
"cars": "자동차",
"beauty": "뷰티",
"fitness": "피트니스",
"finance": "금융",
"tech": "테크",
"lifestyle": "라이프스타일",
"music": "음악",
},
"style": {
"informative": "정보형",
"funny": "재미있는",
"dramatic": "극적인",
"educational": "교육형",
"commentary": "해설형",
},
},
}
for language_key, groups in localized.items():
if language_key in language:
return groups.get(group, {}).get(key, value)
return value