# Source: Hugging Face Space "lablab-ai-amd-developer-hackathon/ElevenClip-AI"
# (Space status: Running).
# File size: 17,760 bytes.

import json
import logging
import re
from uuid import uuid4

from app.core.config import Settings
from app.models.schemas import ChannelProfile, ClipCandidate, SubtitleCue, TranscriptSegment


class QwenHighlightDetector:
    def __init__(self, settings: Settings) -> None:
        self.settings = settings
        self._llm = None

    def detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Pick highlight clips from a transcript for the given channel profile.

        In demo mode the heuristic scorer is used directly. Otherwise Qwen is
        tried first and any failure falls back to the heuristic scorer.

        Args:
            transcript: Timed transcript segments of the source video.
            profile: Creator preferences (style, clip length, clip count, ...).

        Returns:
            Clip candidates from Qwen, or from the heuristic fallback.
        """
        if self.settings.demo_mode:
            return self._heuristic_detect(transcript, profile)

        try:
            return self._qwen_detect(transcript, profile)
        except Exception:
            # Deliberate best-effort boundary: keep the fallback, but record
            # why the model path failed instead of hiding it.
            logging.getLogger(__name__).warning(
                "Qwen highlight detection failed; using heuristic fallback",
                exc_info=True,
            )
            return self._heuristic_detect(transcript, profile)

    def _qwen_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        try:
            from vllm import LLM, SamplingParams
        except Exception as exc:
            raise RuntimeError("vLLM with ROCm backend is required for Qwen inference") from exc

        if self._llm is None:
            self._llm = LLM(
                model=self.settings.qwen_text_model_id,
                dtype=self.settings.preferred_torch_dtype,
                trust_remote_code=True,
            )

        transcript_text = "\n".join(
            f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
            for segment in transcript
        )
        niche = _effective_niche(profile)
        channel_description = profile.channel_description or "No extra channel description provided."
        clip_count = min(profile.clip_count, self.settings.max_clips)
        prompt = f"""
You are selecting short-form clips for a creator.
Profile:
- niche: {niche}
- creator description: {channel_description}
- style: {profile.clip_style}
- target length seconds: {profile.clip_length_seconds}
- target number of clips: {clip_count}
- language: {profile.primary_language}
- platform: {profile.target_platform.value}

Return strict JSON only. Shape:
[
  {{
    "start_seconds": 12.0,
    "end_seconds": 72.0,
    "title": "short title",
    "reason": "why this will engage viewers",
    "score": 91,
    "subtitle_text": "clean subtitle text"
  }}
]

Transcript:
{transcript_text}
""".strip()
        sampling = SamplingParams(temperature=0.2, max_tokens=1200)
        outputs = self._llm.generate([prompt], sampling)
        text = outputs[0].outputs[0].text
        payload = self._parse_json_array(text)
        clips = [
            ClipCandidate(
                id=uuid4().hex,
                start_seconds=float(item["start_seconds"]),
                end_seconds=float(item["end_seconds"]),
                title=str(item.get("title") or "Highlight"),
                reason=str(item.get("reason") or "High engagement potential"),
                score=float(item.get("score") or 75),
                subtitle_text=str(item.get("subtitle_text") or ""),
                metadata={"model": self.settings.qwen_text_model_id},
            )
            for item in payload[:clip_count]
        ]
        return clips or self._heuristic_detect(transcript, profile)

    def _parse_json_array(self, text: str) -> list[dict]:
        match = re.search(r"\[[\s\S]*\]", text)
        if not match:
            raise ValueError("No JSON array in Qwen response")
        payload = json.loads(match.group(0))
        if not isinstance(payload, list):
            raise ValueError("Qwen response is not a list")
        return payload

    # ──────────────────────────────────────────────────────────────
    # AI subtitle actions (Polish, Translate)
    # ──────────────────────────────────────────────────────────────

    def polish_subtitles(
        self, cues: list[SubtitleCue], style: str | None = None
    ) -> list[SubtitleCue]:
        """Rewrite cue text to be punchier and more readable on short-form video.

        Demo mode returns deterministic polished text so the UX is testable
        without GPU. Production mode calls Qwen2.5.
        """
        if self.settings.demo_mode:
            return self._heuristic_polish(cues, style)
        try:
            return self._qwen_polish(cues, style)
        except Exception:
            return self._heuristic_polish(cues, style)

    def translate_subtitles(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Translate cue text to target_language while preserving timing."""
        if self.settings.demo_mode:
            return self._heuristic_translate(cues, target_language)
        try:
            return self._qwen_translate(cues, target_language)
        except Exception:
            return self._heuristic_translate(cues, target_language)

    # ──────────────────────────────────────────────────────────────
    # Demo / fallback implementations
    # ──────────────────────────────────────────────────────────────

    def _heuristic_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        """Apply simple text transformations that look like an AI polish."""
        polished: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            if not text:
                polished.append(cue.model_copy())
                continue
            # Shorten redundant phrasing (heuristic)
            text = re.sub(r"\s+", " ", text)
            text = re.sub(r"^(so|well|like|um|uh|you know|i mean)[,\s]+", "", text, flags=re.IGNORECASE)
            text = text.rstrip(" ,.;:")
            # Add light emphasis based on style
            if style and style.lower() == "dramatic" and not text.endswith("!"):
                text = text + "!"
            polished.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=text,
                )
            )
        return polished

    def _heuristic_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        """Demo translation: append a marker so the UX shows the action ran."""
        marker = f"[{target_language[:2].upper()}]"
        translated: list[SubtitleCue] = []
        for cue in cues:
            text = (cue.text or "").strip()
            translated.append(
                SubtitleCue(
                    start_seconds=cue.start_seconds,
                    end_seconds=cue.end_seconds,
                    text=f"{marker} {text}" if text else "",
                )
            )
        return translated

    # ──────────────────────────────────────────────────────────────
    # Production Qwen calls (used when DEMO_MODE=false on AMD GPU)
    # ──────────────────────────────────────────────────────────────

    def _ensure_llm(self):
        """Return the shared vLLM engine, creating it on the first call.

        The engine is built from ``settings.qwen_text_model_id`` and
        ``settings.preferred_torch_dtype`` and cached on the instance.

        Raises:
            RuntimeError: If the ``vllm`` package cannot be imported.
        """
        try:
            from vllm import LLM as engine_cls
        except Exception as import_error:
            raise RuntimeError("vLLM with ROCm backend is required for Qwen") from import_error

        if self._llm is not None:
            return self._llm

        cfg = self.settings
        self._llm = engine_cls(
            model=cfg.qwen_text_model_id,
            dtype=cfg.preferred_torch_dtype,
            trust_remote_code=True,
        )
        return self._llm

    def _qwen_polish(
        self, cues: list[SubtitleCue], style: str | None
    ) -> list[SubtitleCue]:
        from vllm import SamplingParams

        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Rewrite each subtitle line to be punchier and easier to read on short-form vertical video.
Keep the same number of lines and the same approximate length per line.
Style preference: {style or 'natural'}.
Return one rewritten line per row, prefixed with the original index. No commentary.

Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.3, max_tokens=800))
        raw = outputs[0].outputs[0].text
        rewritten = self._parse_indexed_lines(raw, expected=len(cues))
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=rewritten[i] if i < len(rewritten) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _qwen_translate(
        self, cues: list[SubtitleCue], target_language: str
    ) -> list[SubtitleCue]:
        from vllm import SamplingParams

        llm = self._ensure_llm()
        joined = "\n".join(f"{i + 1}. {cue.text}" for i, cue in enumerate(cues))
        prompt = f"""
Translate each subtitle line into {target_language}. Preserve line count and order.
Return one translated line per row, prefixed with the original index. No commentary.

Input:
{joined}
""".strip()
        outputs = llm.generate([prompt], SamplingParams(temperature=0.2, max_tokens=1000))
        raw = outputs[0].outputs[0].text
        translated = self._parse_indexed_lines(raw, expected=len(cues))
        return [
            SubtitleCue(
                start_seconds=cue.start_seconds,
                end_seconds=cue.end_seconds,
                text=translated[i] if i < len(translated) else cue.text,
            )
            for i, cue in enumerate(cues)
        ]

    def _parse_indexed_lines(self, raw: str, expected: int) -> list[str]:
        lines = []
        for line in raw.splitlines():
            stripped = line.strip()
            if not stripped:
                continue
            match = re.match(r"^\s*\d+[.)\s-]+\s*(.*)$", stripped)
            lines.append(match.group(1).strip() if match else stripped)
            if len(lines) >= expected:
                break
        return lines

    def _heuristic_detect(
        self, transcript: list[TranscriptSegment], profile: ChannelProfile
    ) -> list[ClipCandidate]:
        """Score transcript segments with keyword rules and build clips from the best.

        Every segment starts at 45 points and earns bonuses for a question
        mark, keywords tied to the clip style, generic hook words, words taken
        from the niche and channel description, and text length; the total is
        capped at 100. The highest-scoring segments (at most
        ``min(profile.clip_count, settings.max_clips)``) become clips that
        start 5 seconds before the segment (never below 0) and last
        ``profile.clip_length_seconds`` seconds.

        Args:
            transcript: Timed transcript segments of the source video.
            profile: Creator preferences driving keywords, count and length.

        Returns:
            Clip candidates sorted by start time.
        """
        style_terms = {
            "funny": ["react", "punchy", "mistake", "surprising"],
            "informative": ["important", "practical", "takeaway", "explanation"],
            "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
            "educational": ["question", "answer", "context", "takeaway"],
        }
        preferred_terms = style_terms.get(profile.clip_style.lower(), [])
        niche = _effective_niche(profile)
        # An unset description must not be formatted as the literal "None",
        # which would otherwise turn "none" into a scoring keyword.
        description = profile.channel_description or ""
        profile_terms = [
            term
            for term in f"{niche} {description}".lower().split()[:30]
            if len(term) > 2
        ]
        scored: list[tuple[float, TranscriptSegment]] = []
        for segment in transcript:
            text = segment.text.lower()
            score = 45.0
            score += 12 if "?" in segment.text else 0
            score += 8 if any(term in text for term in preferred_terms) else 0
            score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
            score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
            score += 5 if any(term in text for term in profile_terms) else 0
            # Longer segments earn up to 10 extra points.
            score += min(len(segment.text) / 12, 10)
            scored.append((min(score, 100), segment))

        scored.sort(key=lambda item: item[0], reverse=True)
        clips: list[ClipCandidate] = []
        clip_count = min(profile.clip_count, self.settings.max_clips)
        for score, segment in scored[:clip_count]:
            # Lead in 5 seconds before the segment, clamped to the video start.
            start = max(0.0, segment.start_seconds - 5.0)
            end = start + float(profile.clip_length_seconds)
            clips.append(
                ClipCandidate(
                    id=uuid4().hex,
                    start_seconds=start,
                    end_seconds=end,
                    title=self._title_for(segment.text),
                    reason=self._reason_for(profile, niche),
                    score=round(score, 1),
                    subtitle_text=segment.text,
                    metadata={"model": "heuristic-fallback"},
                )
            )
        return sorted(clips, key=lambda clip: clip.start_seconds)

    def _title_for(self, text: str) -> str:
        clean = re.sub(r"\s+", " ", text).strip(" \t\r\n.,!?;:()[]{}\"'")
        words = clean.split()
        if len(words) > 1:
            title = " ".join(words[:7])
        else:
            title = clean[:48]
        return title[:72].rstrip() or "Highlight"

    def _reason_for(self, profile: ChannelProfile, niche: str) -> str:
        """Build the one-sentence explanation attached to a heuristic clip.

        The sentence is written in Thai, Japanese, Chinese or Korean when the
        lower-cased ``profile.primary_language`` contains that language name,
        using localized labels for the style and the niche. Any other
        language gets the English sentence with the raw style and niche.
        """
        language = profile.primary_language.lower()
        style = _localized_profile_word(profile.clip_style, language, "style")
        niche_label = _localized_profile_word(niche, language, "niche")

        # Checked in this order; the first language name found wins.
        localized_sentences = (
            ("thai", f"ตรงกับสไตล์ {style} สำหรับผู้ชมช่องแนว {niche_label}"),
            ("japanese", f"{niche_label} の視聴者に合う {style} スタイルの候補です。"),
            ("chinese", f"符合 {niche_label} 受众期待的 {style} 风格。"),
            ("korean", f"{niche_label} 시청자에게 맞는 {style} 스타일의 후보입니다."),
        )
        for language_name, sentence in localized_sentences:
            if language_name in language:
                return sentence
        return f"Matches the {profile.clip_style} style for a {niche} audience."


def _effective_niche(profile: ChannelProfile) -> str:
    """Return the niche to use for a profile.

    When the niche is "other" (in any letter case) and a custom niche is set,
    the custom value is returned; otherwise the niche itself.
    """
    picked_other = profile.niche.lower() == "other"
    if not picked_other:
        return profile.niche
    return profile.niche_custom or profile.niche


# Display labels for niche and style values, keyed by language name, then by
# group ("niche" / "style"), then by the normalized value. Built once at
# import time instead of on every lookup.
_LOCALIZED_PROFILE_WORDS: dict[str, dict[str, dict[str, str]]] = {
    "thai": {
        "niche": {
            "education": "การศึกษา",
            "gaming": "เกม",
            "podcast": "พอดแคสต์",
            "commentary": "เล่า/วิเคราะห์",
            "cars": "รถยนต์",
            "beauty": "บิวตี้",
            "fitness": "ฟิตเนส",
            "finance": "การเงิน",
            "tech": "เทคโนโลยี",
            "lifestyle": "ไลฟ์สไตล์",
            "music": "ดนตรี",
        },
        "style": {
            "informative": "ให้ข้อมูล",
            "funny": "ตลก",
            "dramatic": "ดราม่า",
            "educational": "สอนเข้าใจง่าย",
            "commentary": "วิเคราะห์",
        },
    },
    "japanese": {
        "niche": {
            "education": "教育",
            "gaming": "ゲーム",
            "podcast": "ポッドキャスト",
            "commentary": "解説",
            "cars": "車",
            "beauty": "美容",
            "fitness": "フィットネス",
            "finance": "金融",
            "tech": "テック",
            "lifestyle": "ライフスタイル",
            "music": "音楽",
        },
        "style": {
            "informative": "情報性の高い",
            "funny": "ユーモアのある",
            "dramatic": "ドラマチックな",
            "educational": "学びやすい",
            "commentary": "解説型の",
        },
    },
    "chinese": {
        "niche": {
            "education": "教育",
            "gaming": "游戏",
            "podcast": "播客",
            "commentary": "解说",
            "cars": "汽车",
            "beauty": "美妆",
            "fitness": "健身",
            "finance": "金融",
            "tech": "科技",
            "lifestyle": "生活方式",
            "music": "音乐",
        },
        "style": {
            "informative": "信息量高",
            "funny": "有趣",
            "dramatic": "戏剧化",
            "educational": "教学型",
            "commentary": "评论型",
        },
    },
    "korean": {
        "niche": {
            "education": "교육",
            "gaming": "게임",
            "podcast": "팟캐스트",
            "commentary": "해설",
            "cars": "자동차",
            "beauty": "뷰티",
            "fitness": "피트니스",
            "finance": "금융",
            "tech": "테크",
            "lifestyle": "라이프스타일",
            "music": "음악",
        },
        "style": {
            "informative": "정보형",
            "funny": "재미있는",
            "dramatic": "극적인",
            "educational": "교육형",
            "commentary": "해설형",
        },
    },
}


def _localized_profile_word(value: str, language: str, group: str) -> str:
    """Return the localized label for a niche or style value.

    Args:
        value: The niche or style as stored on the profile. It is matched
            ignoring letter case and surrounding whitespace, with spaces
            replaced by underscores.
        language: Language description. The first table language whose name
            occurs in it (ignoring letter case) is used.
        group: Which table to read, "niche" or "style".

    Returns:
        The localized label, or ``value`` unchanged when the language, the
        group or the value is not in the table.
    """
    key = value.strip().lower().replace(" ", "_")
    language = language.lower()
    for language_key, groups in _LOCALIZED_PROFILE_WORDS.items():
        if language_key in language:
            return groups.get(group, {}).get(key, value)
    return value