import json
import logging
import re
from uuid import uuid4

from app.core.config import Settings
from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment
|
|
|
|
class QwenHighlightDetector:
    """Detects highlight clips and post-processes subtitles with Qwen via vLLM.

    Every public entry point degrades gracefully: demo mode — and any
    inference failure — falls back to deterministic heuristics so the
    pipeline stays usable (and testable) without a GPU.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        # vLLM engine; created lazily by _ensure_llm() on first real inference.
        self._llm = None

    def detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Return clip candidates for the transcript, honoring the channel profile.

        Demo mode and any Qwen failure use the heuristic detector so callers
        always receive a result.
        """
        if self.settings.demo_mode:
            return self._heuristic_detect(transcript, profile)
        try:
            return self._qwen_detect(transcript, profile)
        except Exception:
            # Deliberate best-effort: record the failure instead of hiding it,
            # then degrade to the heuristic path.
            logging.getLogger(__name__).warning(
                "Qwen highlight detection failed; using heuristic fallback",
                exc_info=True,
            )
            return self._heuristic_detect(transcript, profile)

    def _qwen_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Ask Qwen to pick clips and parse its strict-JSON reply.

        Raises RuntimeError (via _ensure_llm) when vLLM is unavailable.
        Malformed items in the model reply are skipped instead of aborting.
        """
        llm = self._ensure_llm()  # shared lazy initializer (no duplicated setup)
        from vllm import SamplingParams

        transcript_text = "\n".join(
            f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
            for segment in transcript
        )
        niche = _effective_niche(profile)
        channel_description = profile.channel_description or "No extra channel description provided."
        clip_count = min(profile.clip_count, self.settings.max_clips)
        prompt = f"""
You are selecting short-form clips for a creator.
Profile:
- niche: {niche}
- creator description: {channel_description}
- style: {profile.clip_style}
- target length seconds: {profile.clip_length_seconds}
- target number of clips: {clip_count}
- language: {profile.primary_language}
- platform: {profile.target_platform.value}

Return strict JSON only. Shape:
[
  {{
    "start_seconds": 12.0,
    "end_seconds": 72.0,
    "title": "short title",
    "reason": "why this will engage viewers",
    "score": 91,
    "subtitle_text": "clean subtitle text"
  }}
]

Transcript:
{transcript_text}
""".strip()
        sampling = SamplingParams(temperature=0.2, max_tokens=1200)
        outputs = llm.generate([prompt], sampling)
        text = outputs[0].outputs[0].text
        payload = self._parse_json_array(text)
        clips: list[ClipCandidate] = []
        for item in payload[:clip_count]:
            clip = self._clip_from_item(item)
            if clip is not None:
                clips.append(clip)
        # An empty (or fully malformed) reply falls back to heuristics.
        return clips or self._heuristic_detect(transcript, profile)

    def _clip_from_item(self, item: dict) -> ClipCandidate | None:
        """Convert one JSON object from the model into a ClipCandidate.

        Returns None for malformed items (missing/non-numeric times or score,
        end <= start) so a single bad entry does not discard the whole reply.
        """
        try:
            start = float(item["start_seconds"])
            end = float(item["end_seconds"])
            score = float(item.get("score") or 75)
        except (KeyError, TypeError, ValueError):
            return None
        if end <= start:
            return None
        return ClipCandidate(
            id=uuid4().hex,
            start_seconds=start,
            end_seconds=end,
            title=str(item.get("title") or "Highlight"),
            reason=str(item.get("reason") or "High engagement potential"),
            score=score,
            subtitle_text=str(item.get("subtitle_text") or ""),
            metadata={"model": self.settings.qwen_text_model_id},
        )

    def _parse_json_array(self, text: str) -> list[dict]:
        """Extract the outermost bracketed span of *text* and parse it as JSON.

        Raises ValueError when no array is present or the payload is not a list.
        """
        match = re.search(r"\[[\s\S]*\]", text)
        if not match:
            raise ValueError("No JSON array in Qwen response")
        payload = json.loads(match.group(0))
        if not isinstance(payload, list):
            raise ValueError("Qwen response is not a list")
        return payload

    def polish_subtitles(
        self, cues: list[SubtitleCue], style: str | None = None
    ) -> list[SubtitleCue]:
        """Rewrite cue text to be punchier and more readable on short-form video.

        Demo mode returns deterministic polished text so the UX is testable
        without GPU. Production mode calls Qwen2.5.
        """
        if self.settings.demo_mode:
            return self._heuristic_polish(cues, style)
        try:
            return self._qwen_polish(cues, style)
        except Exception:
            logging.getLogger(__name__).warning(
                "Qwen subtitle polish failed; using heuristic fallback",
                exc_info=True,
            )
            return self._heuristic_polish(cues, style)

    def translate_subtitles(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Translate cue text to target_language while preserving timing."""
        if self.settings.demo_mode:
            return self._heuristic_translate(cues, target_language)
        try:
            return self._qwen_translate(cues, target_language)
        except Exception:
            logging.getLogger(__name__).warning(
                "Qwen subtitle translation failed; using heuristic fallback",
                exc_info=True,
            )
            return self._heuristic_translate(cues, target_language)

    def _heuristic_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Apply simple text transformations that look like an AI polish."""
        polished: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            if not text:
                # Keep empty cues untouched (timing placeholders).
                polished.append(cue.model_copy())
                continue
            # Collapse whitespace, drop filler openers, trim trailing punctuation.
            text = re.sub(r"\s+", " ", text)
            text = re.sub(r"^(so|well|like|um|uh|you know|i mean)[,\s]+", "", text, flags=re.IGNORECASE)
            text = text.rstrip(" ,.;:")
            if style and style.lower() == "dramatic" and not text.endswith("!"):
                text = text + "!"
            polished.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=text,
                )
            )
        return polished

    def _heuristic_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Demo translation: append a marker so the UX shows the action ran."""
        marker = f"[{target_language[:2].upper()}]"
        translated: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            translated.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=f"{marker} {text}" if text else "",
                )
            )
        return translated

    def _ensure_llm(self):
        """Create the vLLM engine on first use and cache it on the instance.

        Raises RuntimeError when vLLM (ROCm backend) cannot be imported.
        """
        try:
            from vllm import LLM
        except Exception as exc:
            raise RuntimeError("vLLM with ROCm backend is required for Qwen") from exc
        if self._llm is None:
            self._llm = LLM(
                model=self.settings.qwen_text_model_id,
                dtype=self.settings.preferred_torch_dtype,
                trust_remote_code=True,
            )
        return self._llm

    def _qwen_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Qwen-backed subtitle polish; keeps timings, rewrites only the text."""
        llm = self._ensure_llm()
        from vllm import SamplingParams

        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Rewrite each subtitle line to be punchier and easier to read on short-form vertical video.
Keep the same number of lines and the same approximate length per line.
Style preference: {style or 'natural'}.
Return one rewritten line per row, prefixed with the original index. No commentary.

Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.3, max_tokens=800))
        raw = outputs[0].outputs[0].text
        rewritten = self._parse_indexed_lines(raw, expected=len(cues))
        # Fall back to the original text for any line the model failed to return.
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=rewritten[i] if i < len(rewritten) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _qwen_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Qwen-backed subtitle translation; keeps timings, translates the text."""
        llm = self._ensure_llm()
        from vllm import SamplingParams

        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Translate each subtitle line into {target_language}. Preserve line count and order.
Return one translated line per row, prefixed with the original index. No commentary.

Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.2, max_tokens=1000))
        raw = outputs[0].outputs[0].text
        translated = self._parse_indexed_lines(raw, expected=len(cues))
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=translated[i] if i < len(translated) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]:
        """Strip "1." / "2)" style index prefixes from model output lines.

        Blank lines are skipped; parsing stops after *expected* lines so any
        trailing commentary from the model is ignored.
        """
        lines: list[str] = []
        for line in raw.splitlines():
            stripped = line.strip()
            if not stripped:
                continue
            match = re.match(r"^\s*\d+[.)\s-]+\s*(.*)$", stripped)
            lines.append(match.group(1).strip() if match else stripped)
            if len(lines) >= expected:
                break
        return lines

    def _heuristic_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Deterministic fallback detector scoring segments with keyword signals."""
        style_terms = {
            "funny": ["react", "punchy", "mistake", "surprising"],
            "informative": ["important", "practical", "takeaway", "explanation"],
            "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
            "educational": ["question", "answer", "context", "takeaway"],
        }
        preferred_terms = style_terms.get(profile.clip_style.lower(), [])
        niche = _effective_niche(profile)
        # channel_description may be None; avoid scoring against the word "none".
        description = profile.channel_description or ""
        profile_terms = [
            term
            for term in f"{niche} {description}".lower().split()[:30]
            if len(term) > 2
        ]
        scored: list[tuple[float, TranscriptSegment]] = []
        for segment in transcript:
            text = segment.text.lower()
            score = 45.0
            score += 12 if "?" in segment.text else 0
            score += 8 if any(term in text for term in preferred_terms) else 0
            score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
            score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
            score += 5 if any(term in text for term in profile_terms) else 0
            score += min(len(segment.text) / 12, 10)  # longer segments score higher, capped
            scored.append((min(score, 100), segment))

        scored.sort(key=lambda item: item[0], reverse=True)
        clips: list[ClipCandidate] = []
        clip_count = min(profile.clip_count, self.settings.max_clips)
        for score, segment in scored[:clip_count]:
            start = max(0.0, segment.start_seconds - 5.0)  # small lead-in before the hook
            end = start + float(profile.clip_length_seconds)
            clips.append(
                ClipCandidate(
                    id=uuid4().hex,
                    start_seconds=start,
                    end_seconds=end,
                    title=self._title_for(segment.text),
                    reason=self._reason_for(profile, niche),
                    score=round(score, 1),
                    subtitle_text=segment.text,
                    metadata={"model": "heuristic-fallback"},
                )
            )
        return sorted(clips, key=lambda clip: clip.start_seconds)

    def _title_for(self, text: str) -> str:
        """Derive a short clip title (<= 72 chars) from the segment text."""
        clean = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'")
        words = clean.split()
        if len(words) > 1:
            title = " ".join(words[:7])
        else:
            title = clean[:48]
        return title[:72].rstrip() or "Highlight"

    def _reason_for(self, profile: ChannelProfile, niche: str) -> str:
        """Build a localized one-line justification for a heuristic clip."""
        language = profile.primary_language.lower()
        style = _localized_profile_word(profile.clip_style, language, "style")
        niche_label = _localized_profile_word(niche, language, "niche")
        if "thai" in language:
            return f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}"
        if "japanese" in language:
            return f"{niche_label} の視聴者に合う {style} スタイルの候補です。"
        if "chinese" in language:
            return f"符合 {niche_label} 受众期待的 {style} 风格。"
        if "korean" in language:
            return f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다."
        return f"Matches the {profile.clip_style} style for a {niche} audience."
|
|
|
|
def _effective_niche(profile: ChannelProfile) -> str:
    """Resolve the niche label to use for prompts and scoring.

    The free-text custom niche wins only when the profile picked the
    "other" preset and actually filled one in; otherwise the preset stands.
    """
    custom = profile.niche_custom
    if custom and profile.niche.lower() == "other":
        return custom
    return profile.niche
|
|
|
|
# Translation tables for profile vocabulary, keyed by a language-name fragment
# (matched by substring against the user's language string). Built once at
# import time so _localized_profile_word does not rebuild the dict per call.
_PROFILE_WORDS: dict[str, dict[str, dict[str, str]]] = {
    "thai": {
        "niche": {
            "education": "การศึกษา",
            "gaming": "เกม",
            "podcast": "พอดแคสต์",
            "commentary": "เล่า/วิเคราะห์",
            "cars": "รถยนต์",
            "beauty": "บิวตี้",
            "fitness": "ฟิตเนส",
            "finance": "การเงิน",
            "tech": "เทคโนโลยี",
            "lifestyle": "ไลฟ์สไตล์",
            "music": "ดนตรี",
        },
        "style": {
            "informative": "ให้ข้อมูล",
            "funny": "ตลก",
            "dramatic": "ดราม่า",
            "educational": "สอนเข้าใจง่าย",
            "commentary": "วิเคราะห์",
        },
    },
    "japanese": {
        "niche": {
            "education": "教育",
            "gaming": "ゲーム",
            "podcast": "ポッドキャスト",
            "commentary": "解説",
            "cars": "車",
            "beauty": "美容",
            "fitness": "フィットネス",
            "finance": "金融",
            "tech": "テック",
            "lifestyle": "ライフスタイル",
            "music": "音楽",
        },
        "style": {
            "informative": "情報性の高い",
            "funny": "ユーモアのある",
            "dramatic": "ドラマチックな",
            "educational": "学びやすい",
            "commentary": "解説型の",
        },
    },
    "chinese": {
        "niche": {
            "education": "教育",
            "gaming": "游戏",
            "podcast": "播客",
            "commentary": "解说",
            "cars": "汽车",
            "beauty": "美妆",
            "fitness": "健身",
            "finance": "金融",
            "tech": "科技",
            "lifestyle": "生活方式",
            "music": "音乐",
        },
        "style": {
            "informative": "信息量高",
            "funny": "有趣",
            "dramatic": "戏剧化",
            "educational": "教学型",
            "commentary": "评论型",
        },
    },
    "korean": {
        "niche": {
            "education": "교육",
            "gaming": "게임",
            "podcast": "팟캐스트",
            "commentary": "해설",
            "cars": "자동차",
            "beauty": "뷰티",
            "fitness": "피트니스",
            "finance": "금융",
            "tech": "테크",
            "lifestyle": "라이프스타일",
            "music": "음악",
        },
        "style": {
            "informative": "정보형",
            "funny": "재미있는",
            "dramatic": "극적인",
            "educational": "교육형",
            "commentary": "해설형",
        },
    },
}


def _localized_profile_word(value: str, language: str, group: str) -> str:
    """Translate a profile word into the user's language.

    Args:
        value: the English profile word (e.g. "gaming", "funny").
        language: lower-cased language description; matched by substring
            against the table keys ("thai", "japanese", "chinese", "korean").
        group: which vocabulary to use, "niche" or "style".

    Returns:
        The localized word, or *value* unchanged when no table or entry matches.
    """
    key = value.lower().replace(" ", "_")
    for language_key, groups in _PROFILE_WORDS.items():
        if language_key in language:
            return groups.get(group, {}).get(key, value)
    return value
|
|