import json
import re
from uuid import uuid4

from app.core.config import Settings
from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment


class QwenHighlightDetector:
    """Select highlight clips and post-process subtitles using Qwen via vLLM.

    Every public entry point follows the same three-tier strategy:

    1. ``demo_mode`` -> deterministic heuristic (testable UX without a GPU),
    2. otherwise try Qwen inference through vLLM (ROCm backend),
    3. on any inference failure, silently fall back to the heuristic so the
       pipeline never hard-fails on model problems.
    """

    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        # Lazily-constructed vLLM engine; model load is expensive, so it is
        # created once on first use (see _ensure_llm) and cached here.
        self._llm = None

    # ──────────────────────────────────────────────────────────────
    # Clip detection
    # ──────────────────────────────────────────────────────────────

    def detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Return clip candidates for *transcript*, tailored to *profile*.

        Demo mode always uses the heuristic; otherwise Qwen is tried first
        and the heuristic is a best-effort fallback on any exception.
        """
        if self.settings.demo_mode:
            return self._heuristic_detect(transcript, profile)
        try:
            return self._qwen_detect(transcript, profile)
        except Exception:
            # Deliberate best-effort: any model/runtime failure degrades to
            # the heuristic rather than failing the request.
            return self._heuristic_detect(transcript, profile)

    def _qwen_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Ask Qwen for clip selections and parse its JSON reply.

        Raises RuntimeError when vLLM is unavailable and ValueError when the
        model reply contains no parseable JSON array; callers (detect) catch
        these and fall back to the heuristic.
        """
        try:
            from vllm import SamplingParams
        except Exception as exc:
            raise RuntimeError("vLLM with ROCm backend is required for Qwen inference") from exc

        # Reuse the shared lazy engine instead of duplicating construction.
        llm = self._ensure_llm()

        transcript_text = "\n".join(
            f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
            for segment in transcript
        )
        niche = _effective_niche(profile)
        channel_description = profile.channel_description or "No extra channel description provided."
        clip_count = min(profile.clip_count, self.settings.max_clips)
        prompt = f"""
You are selecting short-form clips for a creator.

Profile:
- niche: {niche}
- creator description: {channel_description}
- style: {profile.clip_style}
- target length seconds: {profile.clip_length_seconds}
- target number of clips: {clip_count}
- language: {profile.primary_language}
- platform: {profile.target_platform.value}

Return strict JSON only.
Shape:
[
  {{
    "start_seconds": 12.0,
    "end_seconds": 72.0,
    "title": "short title",
    "reason": "why this will engage viewers",
    "score": 91,
    "subtitle_text": "clean subtitle text"
  }}
]

Transcript:
{transcript_text}
""".strip()
        sampling = SamplingParams(temperature=0.2, max_tokens=1200)
        outputs = llm.generate([prompt], sampling)
        text = outputs[0].outputs[0].text
        payload = self._parse_json_array(text)
        clips = [
            ClipCandidate(
                id=uuid4().hex,
                start_seconds=float(item["start_seconds"]),
                end_seconds=float(item["end_seconds"]),
                title=str(item.get("title") or "Highlight"),
                reason=str(item.get("reason") or "High engagement potential"),
                score=float(item.get("score") or 75),
                subtitle_text=str(item.get("subtitle_text") or ""),
                metadata={"model": self.settings.qwen_text_model_id},
            )
            for item in payload[:clip_count]
        ]
        # An empty (but well-formed) reply still degrades to the heuristic.
        return clips or self._heuristic_detect(transcript, profile)

    def _parse_json_array(self, text: str) -> list[dict]:
        """Extract the first-to-last bracketed span of *text* as a JSON list.

        Raises ValueError when no array is present or the payload is not a
        list; json.loads errors propagate to the caller's fallback handler.
        """
        match = re.search(r"\[[\s\S]*\]", text)
        if not match:
            raise ValueError("No JSON array in Qwen response")
        payload = json.loads(match.group(0))
        if not isinstance(payload, list):
            raise ValueError("Qwen response is not a list")
        return payload

    # ──────────────────────────────────────────────────────────────
    # AI subtitle actions (Polish, Translate)
    # ──────────────────────────────────────────────────────────────

    def polish_subtitles(
        self, cues: list[SubtitleCue], style: str | None = None
    ) -> list[SubtitleCue]:
        """Rewrite cue text to be punchier and more readable on short-form video.

        Demo mode returns deterministic polished text so the UX is testable
        without GPU. Production mode calls Qwen2.5.
        """
        if self.settings.demo_mode:
            return self._heuristic_polish(cues, style)
        try:
            return self._qwen_polish(cues, style)
        except Exception:
            return self._heuristic_polish(cues, style)

    def translate_subtitles(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Translate cue text to target_language while preserving timing."""
        if self.settings.demo_mode:
            return self._heuristic_translate(cues, target_language)
        try:
            return self._qwen_translate(cues, target_language)
        except Exception:
            return self._heuristic_translate(cues, target_language)

    # ──────────────────────────────────────────────────────────────
    # Demo / fallback implementations
    # ──────────────────────────────────────────────────────────────

    def _heuristic_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Apply simple text transformations that look like an AI polish."""
        polished: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            if not text:
                # Empty cue: keep timing, nothing to rewrite.
                polished.append(cue.model_copy())
                continue
            # Shorten redundant phrasing (heuristic)
            text = re.sub(r"\s+", " ", text)
            text = re.sub(
                r"^(so|well|like|um|uh|you know|i mean)[,\s]+", "", text, flags=re.IGNORECASE
            )
            text = text.rstrip(" ,.;:")
            # Add light emphasis based on style
            if style and style.lower() == "dramatic" and not text.endswith("!"):
                text = text + "!"
            polished.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=text,
                )
            )
        return polished

    def _heuristic_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Demo translation: append a marker so the UX shows the action ran."""
        marker = f"[{target_language[:2].upper()}]"
        translated: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            translated.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=f"{marker} {text}" if text else "",
                )
            )
        return translated

    # ──────────────────────────────────────────────────────────────
    # Production Qwen calls (used when DEMO_MODE=false on AMD GPU)
    # ──────────────────────────────────────────────────────────────

    def _ensure_llm(self):
        """Create the vLLM engine once and cache it on the instance.

        Raises RuntimeError (chained from the import error) when vLLM is
        not installed or its ROCm backend is unavailable.
        """
        try:
            from vllm import LLM
        except Exception as exc:
            raise RuntimeError("vLLM with ROCm backend is required for Qwen") from exc
        if self._llm is None:
            self._llm = LLM(
                model=self.settings.qwen_text_model_id,
                dtype=self.settings.preferred_torch_dtype,
                trust_remote_code=True,
            )
        return self._llm

    def _qwen_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Rewrite each cue with Qwen; keep original text for unparsed rows."""
        from vllm import SamplingParams

        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Rewrite each subtitle line to be punchier and easier to read on short-form vertical video.
Keep the same number of lines and the same approximate length per line.
Style preference: {style or 'natural'}.
Return one rewritten line per row, prefixed with the original index. No commentary.

Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.3, max_tokens=800))
        raw = outputs[0].outputs[0].text
        rewritten = self._parse_indexed_lines(raw, expected=len(cues))
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                # Fall back to the original text for any row the model dropped.
                text=rewritten[i] if i < len(rewritten) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _qwen_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Translate each cue with Qwen; keep original text for unparsed rows."""
        from vllm import SamplingParams

        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Translate each subtitle line into {target_language}.
Preserve line count and order.
Return one translated line per row, prefixed with the original index. No commentary.

Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.2, max_tokens=1000))
        raw = outputs[0].outputs[0].text
        translated = self._parse_indexed_lines(raw, expected=len(cues))
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=translated[i] if i < len(translated) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]:
        """Strip "1. ", "2) " style prefixes from *raw*, up to *expected* rows."""
        lines = []
        for line in raw.splitlines():
            stripped = line.strip()
            if not stripped:
                continue
            match = re.match(r"^\s*\d+[.)\s-]+\s*(.*)$", stripped)
            lines.append(match.group(1).strip() if match else stripped)
            if len(lines) >= expected:
                break
        return lines

    def _heuristic_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Score transcript segments with keyword heuristics and emit clips.

        Scores start at 45 and accumulate bonuses for questions, style- and
        profile-related terms, and segment length, capped at 100. The top
        segments become clips, returned in chronological order.
        """
        style_terms = {
            "funny": ["react", "punchy", "mistake", "surprising"],
            "informative": ["important", "practical", "takeaway", "explanation"],
            "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
            "educational": ["question", "answer", "context", "takeaway"],
        }
        preferred_terms = style_terms.get(profile.clip_style.lower(), [])
        niche = _effective_niche(profile)
        # Words (>2 chars) drawn from the profile, so clips lean toward the
        # creator's own vocabulary. Only the first 30 tokens are considered.
        profile_terms = [
            term
            for term in f"{niche} {profile.channel_description}".lower().split()[:30]
            if len(term) > 2
        ]
        scored: list[tuple[float, TranscriptSegment]] = []
        for segment in transcript:
            text = segment.text.lower()
            score = 45.0
            score += 12 if "?" in segment.text else 0
            score += 8 if any(term in text for term in preferred_terms) else 0
            score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
            score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
            score += 5 if any(term in text for term in profile_terms) else 0
            # Longer segments get a small bonus, capped at +10.
            score += min(len(segment.text) / 12, 10)
            scored.append((min(score, 100), segment))
        scored.sort(key=lambda item: item[0], reverse=True)
        clips: list[ClipCandidate] = []
        clip_count = min(profile.clip_count, self.settings.max_clips)
        for score, segment in scored[:clip_count]:
            # Start a little before the segment for context; clamp at 0.
            start = max(0.0, segment.start_seconds - 5.0)
            end = start + float(profile.clip_length_seconds)
            clips.append(
                ClipCandidate(
                    id=uuid4().hex,
                    start_seconds=start,
                    end_seconds=end,
                    title=self._title_for(segment.text),
                    reason=self._reason_for(profile, niche),
                    score=round(score, 1),
                    subtitle_text=segment.text,
                    metadata={"model": "heuristic-fallback"},
                )
            )
        return sorted(clips, key=lambda clip: clip.start_seconds)

    def _title_for(self, text: str) -> str:
        """Derive a short (<=72 char) title from segment text."""
        clean = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'")
        words = clean.split()
        if len(words) > 1:
            title = " ".join(words[:7])
        else:
            title = clean[:48]
        return title[:72].rstrip() or "Highlight"

    def _reason_for(self, profile: ChannelProfile, niche: str) -> str:
        """Build a localized one-line justification for a heuristic clip."""
        language = profile.primary_language.lower()
        style = _localized_profile_word(profile.clip_style, language, "style")
        niche_label = _localized_profile_word(niche, language, "niche")
        if "thai" in language:
            return f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}"
        if "japanese" in language:
            return f"{niche_label} の視聴者に合う {style} スタイルの候補です。"
        if "chinese" in language:
            return f"符合 {niche_label} 受众期待的 {style} 风格。"
        if "korean" in language:
            return f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다."
        return f"Matches the {profile.clip_style} style for a {niche} audience."


def _effective_niche(profile: ChannelProfile) -> str:
    """Return the custom niche when the profile's niche is "other"."""
    if profile.niche.lower() == "other" and profile.niche_custom:
        return profile.niche_custom
    return profile.niche


def _localized_profile_word(value: str, language: str, group: str) -> str:
    """Localize a niche/style keyword; return *value* unchanged when unknown.

    *group* selects the table ("niche" or "style"); *language* is matched by
    substring (e.g. "thai" in "thai (th)").
    """
    key = value.lower().replace(" ", "_")
    localized = {
        "thai": {
            "niche": {
                "education": "การศึกษา",
                "gaming": "เกม",
                "podcast": "พอดแคสต์",
                "commentary": "เล่า/วิเคราะห์",
                "cars": "รถยนต์",
                "beauty": "บิวตี้",
                "fitness": "ฟิตเนส",
                "finance": "การเงิน",
                "tech": "เทคโนโลยี",
                "lifestyle": "ไลฟ์สไตล์",
                "music": "ดนตรี",
            },
            "style": {
                "informative": "ให้ข้อมูล",
                "funny": "ตลก",
                "dramatic": "ดราม่า",
                "educational": "สอนเข้าใจง่าย",
                "commentary": "วิเคราะห์",
            },
        },
        "japanese": {
            "niche": {
                "education": "教育",
                "gaming": "ゲーム",
                "podcast": "ポッドキャスト",
                "commentary": "解説",
                "cars": "車",
                "beauty": "美容",
                "fitness": "フィットネス",
                "finance": "金融",
                "tech": "テック",
                "lifestyle": "ライフスタイル",
                "music": "音楽",
            },
            "style": {
                "informative": "情報性の高い",
                "funny": "ユーモアのある",
                "dramatic": "ドラマチックな",
                "educational": "学びやすい",
                "commentary": "解説型の",
            },
        },
        "chinese": {
            "niche": {
                "education": "教育",
                "gaming": "游戏",
                "podcast": "播客",
                "commentary": "解说",
                "cars": "汽车",
                "beauty": "美妆",
                "fitness": "健身",
                "finance": "金融",
                "tech": "科技",
                "lifestyle": "生活方式",
                "music": "音乐",
            },
            "style": {
                "informative": "信息量高",
                "funny": "有趣",
                "dramatic": "戏剧化",
                "educational": "教学型",
                "commentary": "评论型",
            },
        },
        "korean": {
            "niche": {
                "education": "교육",
                "gaming": "게임",
                "podcast": "팟캐스트",
                "commentary": "해설",
                "cars": "자동차",
                "beauty": "뷰티",
                "fitness": "피트니스",
                "finance": "금융",
                "tech": "테크",
                "lifestyle": "라이프스타일",
                "music": "음악",
            },
            "style": {
                "informative": "정보형",
                "funny": "재미있는",
                "dramatic": "극적인",
                "educational": "교육형",
                "commentary": "해설형",
            },
        },
    }
    for language_key, groups in localized.items():
        if language_key in language:
            return groups.get(group, {}).get(key, value)
    return value