JakgritB committed
Commit dbc3c35 · 1 Parent(s): 12a024d

feat(backend): add modular video processing API

Add FastAPI job endpoints, ROCm-aware configuration, Whisper and Qwen service boundaries, ffmpeg clip rendering, subtitle generation, timing logs, and file-backed MVP job storage.
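
The endpoints land in backend/app/main.py below; here is a minimal usage sketch against a local dev server (the base URL, video URL, and polling cadence are assumptions for illustration, not part of the commit):

    import time

    import httpx

    BASE = "http://localhost:8000"  # assumed local uvicorn instance

    profile = {
        "niche": "education",
        "clip_style": "informative",
        "clip_length_seconds": 60,
        "primary_language": "Thai",
        "target_platform": "tiktok",
    }

    # Create a job; the pipeline then runs as a FastAPI background task.
    job = httpx.post(
        f"{BASE}/api/jobs/youtube",
        json={"youtube_url": "https://www.youtube.com/watch?v=example", "profile": profile},
    ).json()

    # Poll the job snapshot until the background pipeline settles.
    while job["status"] in {"queued", "running"}:
        time.sleep(2)
        job = httpx.get(f"{BASE}/api/jobs/{job['id']}").json()

    print(job["status"], [clip["title"] for clip in job["clips"]])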

backend/Dockerfile ADDED
@@ -0,0 +1,17 @@
+ ARG ROCM_PYTORCH_IMAGE=rocm/pytorch:latest
+ FROM ${ROCM_PYTORCH_IMAGE}
+
+ WORKDIR /app
+
+ RUN apt-get update \
+     && apt-get install -y --no-install-recommends ffmpeg git curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY pyproject.toml ./
+ ARG INSTALL_EXTRAS=.
+ RUN pip install --upgrade pip && pip install -e "${INSTALL_EXTRAS}"
+
+ COPY app ./app
+
+ EXPOSE 8000
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
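
The INSTALL_EXTRAS build arg controls which optional dependency groups from backend/pyproject.toml get baked into the image; the default `.` installs only the base API. A GPU-capable image would presumably be built with something like `docker build --build-arg INSTALL_EXTRAS=".[ai,rocm-inference]" -t elevenclip-backend backend/` (the tag name is illustrative; the extras names come from the pyproject.toml at the end of this commit).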
backend/app/__init__.py ADDED
@@ -0,0 +1 @@
+ """AI Clip Studio backend."""
backend/app/core/__init__.py ADDED
@@ -0,0 +1 @@
+ """Core configuration and instrumentation."""
backend/app/core/config.py ADDED
@@ -0,0 +1,68 @@
+ from functools import lru_cache
+ import os
+ from pathlib import Path
+
+ from pydantic import Field
+ from pydantic import BaseModel
+
+
+ class Settings(BaseModel):
+     app_name: str = "ElevenClip.AI"
+     demo_mode: bool = True
+     storage_dir: Path = Path("data")
+     frontend_origin: str = "http://localhost:5173"
+
+     whisper_model_id: str = "openai/whisper-large-v3"
+     qwen_text_model_id: str = "Qwen/Qwen2.5-7B-Instruct"
+     qwen_vl_model_id: str = "Qwen/Qwen2-VL-7B-Instruct"
+     hf_token: str | None = None
+     preferred_torch_dtype: str = "bfloat16"
+
+     target_clip_count: int = Field(default=5, ge=1, le=20)
+     max_clips: int = Field(default=10, ge=1, le=50)
+
+     ffmpeg_binary: str = "ffmpeg"
+     ffprobe_binary: str = "ffprobe"
+     ffmpeg_video_codec: str = "h264_amf"
+     ffmpeg_cpu_codec: str = "libx264"
+
+     redis_url: str = "redis://redis:6379/0"
+     celery_enabled: bool = False
+
+
+ @lru_cache
+ def get_settings() -> Settings:
+     settings = Settings(
+         demo_mode=_bool_env("DEMO_MODE", True),
+         storage_dir=Path(os.getenv("STORAGE_DIR", "data")),
+         frontend_origin=os.getenv("FRONTEND_ORIGIN", "http://localhost:5173"),
+         whisper_model_id=os.getenv("WHISPER_MODEL_ID", "openai/whisper-large-v3"),
+         qwen_text_model_id=os.getenv("QWEN_TEXT_MODEL_ID", "Qwen/Qwen2.5-7B-Instruct"),
+         qwen_vl_model_id=os.getenv("QWEN_VL_MODEL_ID", "Qwen/Qwen2-VL-7B-Instruct"),
+         hf_token=os.getenv("HF_TOKEN") or None,
+         preferred_torch_dtype=os.getenv("TORCH_DTYPE", "bfloat16"),
+         target_clip_count=_int_env("TARGET_CLIP_COUNT", 5),
+         max_clips=_int_env("MAX_CLIPS", 10),
+         ffmpeg_binary=os.getenv("FFMPEG_BINARY", "ffmpeg"),
+         ffprobe_binary=os.getenv("FFPROBE_BINARY", "ffprobe"),
+         ffmpeg_video_codec=os.getenv("FFMPEG_VIDEO_CODEC", "h264_amf"),
+         ffmpeg_cpu_codec=os.getenv("FFMPEG_CPU_CODEC", "libx264"),
+         redis_url=os.getenv("REDIS_URL", "redis://redis:6379/0"),
+         celery_enabled=_bool_env("CELERY_ENABLED", False),
+     )
+     settings.storage_dir.mkdir(parents=True, exist_ok=True)
+     return settings
+
+
+ def _bool_env(name: str, default: bool) -> bool:
+     value = os.getenv(name)
+     if value is None:
+         return default
+     return value.strip().lower() in {"1", "true", "yes", "on"}
+
+
+ def _int_env(name: str, default: int) -> int:
+     value = os.getenv(name)
+     if value is None:
+         return default
+     return int(value)
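
Since get_settings() is wrapped in lru_cache, the environment is read once on first call and the same Settings instance is reused afterwards. A minimal sketch of the override semantics (env values illustrative):

    import os

    # Must be set before the first get_settings() call, since the result is cached.
    os.environ["DEMO_MODE"] = "off"          # not in {"1", "true", "yes", "on"} -> False
    os.environ["TARGET_CLIP_COUNT"] = "8"

    from app.core.config import get_settings

    settings = get_settings()
    assert settings.demo_mode is False
    assert settings.target_clip_count == 8
    assert get_settings() is settings  # cached singleton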
backend/app/core/timing.py ADDED
@@ -0,0 +1,20 @@
+ from collections.abc import Iterator
+ from contextlib import contextmanager
+ from time import perf_counter
+
+
+ class TimingLog:
+     def __init__(self) -> None:
+         self._steps: dict[str, float] = {}
+
+     @contextmanager
+     def measure(self, name: str) -> Iterator[None]:
+         started = perf_counter()
+         try:
+             yield
+         finally:
+             self._steps[name] = round(perf_counter() - started, 3)
+
+     def to_dict(self) -> dict[str, float]:
+         total = round(sum(self._steps.values()), 3)
+         return {**self._steps, "total": total}
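
Usage sketch, mirroring how the pipeline below wraps each stage (the sleeps stand in for real work):

    from time import sleep

    from app.core.timing import TimingLog

    timings = TimingLog()
    with timings.measure("transcription"):
        sleep(0.1)
    with timings.measure("clip_generation"):
        sleep(0.2)

    # to_dict() appends a synthetic "total" over all measured steps, e.g.
    # {"transcription": 0.1, "clip_generation": 0.2, "total": 0.3}
    print(timings.to_dict())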
backend/app/main.py ADDED
@@ -0,0 +1,122 @@
+ from fastapi import BackgroundTasks, FastAPI, File, Form, HTTPException, UploadFile
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import FileResponse
+ from fastapi.staticfiles import StaticFiles
+
+ from app.core.config import get_settings
+ from app.models.schemas import (
+     ChannelProfile,
+     ClipCandidate,
+     ClipPatch,
+     HealthResponse,
+     JobSnapshot,
+     RegenerateClipRequest,
+     YoutubeJobRequest,
+ )
+ from app.services.pipeline import VideoPipeline
+ from app.services.video_input import save_upload
+ from app.storage import JobStore
+ from app.utils.rocm import detect_accelerator
+
+ settings = get_settings()
+ store = JobStore(settings)
+ pipeline = VideoPipeline(settings, store)
+
+ app = FastAPI(title=settings.app_name, version="0.1.0")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[settings.frontend_origin, "http://localhost:5173", "http://127.0.0.1:5173"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+ app.mount("/media", StaticFiles(directory=settings.storage_dir), name="media")
+
+
+ @app.get("/health", response_model=HealthResponse)
+ async def health() -> HealthResponse:
+     return HealthResponse(
+         ok=True,
+         app=settings.app_name,
+         demo_mode=settings.demo_mode,
+         accelerator=detect_accelerator(),
+     )
+
+
+ @app.post("/api/jobs/youtube", response_model=JobSnapshot)
+ async def create_youtube_job(
+     request: YoutubeJobRequest, background_tasks: BackgroundTasks
+ ) -> JobSnapshot:
+     snapshot = store.create_job(
+         request.profile, {"kind": "youtube", "url": str(request.youtube_url)}
+     )
+     background_tasks.add_task(
+         pipeline.process_source, snapshot.id, "youtube", str(request.youtube_url), request.profile
+     )
+     return snapshot
+
+
+ @app.post("/api/jobs/upload", response_model=JobSnapshot)
+ async def create_upload_job(
+     background_tasks: BackgroundTasks,
+     profile_json: str = Form(...),
+     file: UploadFile = File(...),
+ ) -> JobSnapshot:
+     try:
+         profile = ChannelProfile.model_validate_json(profile_json)
+     except Exception as exc:
+         raise HTTPException(status_code=422, detail=f"Invalid profile JSON: {exc}") from exc
+
+     snapshot = store.create_job(profile, {"kind": "upload", "filename": file.filename})
+     source_path = await save_upload(file, store.job_dir(snapshot.id))
+     background_tasks.add_task(pipeline.process_source, snapshot.id, "upload", str(source_path), profile)
+     return snapshot
+
+
+ @app.get("/api/jobs/{job_id}", response_model=JobSnapshot)
+ async def get_job(job_id: str) -> JobSnapshot:
+     try:
+         return store.get_job(job_id)
+     except FileNotFoundError as exc:
+         raise HTTPException(status_code=404, detail="Job not found") from exc
+
+
+ @app.patch("/api/jobs/{job_id}/clips/{clip_id}", response_model=ClipCandidate)
+ async def update_clip(job_id: str, clip_id: str, patch: ClipPatch) -> ClipCandidate:
+     try:
+         return pipeline.patch_clip(job_id, clip_id, patch.model_dump())
+     except FileNotFoundError as exc:
+         raise HTTPException(status_code=404, detail="Job not found") from exc
+     except KeyError as exc:
+         raise HTTPException(status_code=404, detail="Clip not found") from exc
+
+
+ @app.post("/api/jobs/{job_id}/clips/{clip_id}/regenerate", response_model=ClipCandidate)
+ async def regenerate_clip(
+     job_id: str, clip_id: str, request: RegenerateClipRequest
+ ) -> ClipCandidate:
+     try:
+         return pipeline.regenerate_clip(
+             job_id,
+             clip_id,
+             clip_style=request.clip_style,
+             clip_length_seconds=request.clip_length_seconds,
+             subtitle_text=request.subtitle_text,
+         )
+     except FileNotFoundError as exc:
+         raise HTTPException(status_code=404, detail="Source video not found") from exc
+     except KeyError as exc:
+         raise HTTPException(status_code=404, detail="Clip not found") from exc
+
+
+ @app.get("/api/jobs/{job_id}/clips/{clip_id}/download")
+ async def download_clip(job_id: str, clip_id: str) -> FileResponse:
+     snapshot = store.get_job(job_id)
+     clip = next((item for item in snapshot.clips if item.id == clip_id), None)
+     if clip is None or clip.download_url is None:
+         raise HTTPException(status_code=404, detail="Clip not found")
+     filename = clip.download_url.rsplit("/", 1)[-1]
+     path = store.job_dir(job_id) / filename
+     if not path.exists():
+         raise HTTPException(status_code=404, detail="Clip file not found")
+     return FileResponse(path, media_type="video/mp4", filename=filename)
backend/app/models/__init__.py ADDED
@@ -0,0 +1 @@
+ """Pydantic models."""
backend/app/models/schemas.py ADDED
@@ -0,0 +1,92 @@
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, Field, HttpUrl, field_validator
+
+
+ def utc_now() -> datetime:
+     return datetime.now(timezone.utc)
+
+
+ class TargetPlatform(str, Enum):
+     tiktok = "tiktok"
+     youtube_shorts = "youtube_shorts"
+     instagram_reels = "instagram_reels"
+
+
+ class ChannelProfile(BaseModel):
+     niche: str = Field(default="education", min_length=2, max_length=80)
+     clip_style: str = Field(default="informative", min_length=2, max_length=80)
+     clip_length_seconds: int = Field(default=60, ge=15, le=180)
+     primary_language: str = Field(default="Thai", min_length=2, max_length=40)
+     target_platform: TargetPlatform = TargetPlatform.tiktok
+
+     @field_validator("niche", "clip_style", "primary_language")
+     @classmethod
+     def clean_text(cls, value: str) -> str:
+         return value.strip()
+
+
+ class YoutubeJobRequest(BaseModel):
+     youtube_url: HttpUrl
+     profile: ChannelProfile
+
+
+ class TranscriptSegment(BaseModel):
+     id: str
+     start_seconds: float = Field(ge=0)
+     end_seconds: float = Field(ge=0)
+     text: str
+     language: str | None = None
+
+
+ class ClipCandidate(BaseModel):
+     id: str
+     start_seconds: float = Field(ge=0)
+     end_seconds: float = Field(ge=0)
+     title: str
+     reason: str
+     score: float = Field(ge=0, le=100)
+     subtitle_text: str = ""
+     video_url: str | None = None
+     download_url: str | None = None
+     approved: bool = False
+     deleted: bool = False
+     metadata: dict[str, Any] = Field(default_factory=dict)
+
+
+ class ClipPatch(BaseModel):
+     start_seconds: float | None = Field(default=None, ge=0)
+     end_seconds: float | None = Field(default=None, ge=0)
+     subtitle_text: str | None = None
+     approved: bool | None = None
+     deleted: bool | None = None
+
+
+ class RegenerateClipRequest(BaseModel):
+     clip_style: str | None = None
+     clip_length_seconds: int | None = Field(default=None, ge=15, le=180)
+     subtitle_text: str | None = None
+
+
+ class JobSnapshot(BaseModel):
+     id: str
+     status: Literal["queued", "running", "completed", "failed"]
+     progress: float = Field(ge=0, le=1)
+     message: str
+     source: dict[str, Any]
+     profile: ChannelProfile
+     transcript: list[TranscriptSegment] = Field(default_factory=list)
+     clips: list[ClipCandidate] = Field(default_factory=list)
+     timings: dict[str, float] = Field(default_factory=dict)
+     error: str | None = None
+     created_at: datetime = Field(default_factory=utc_now)
+     updated_at: datetime = Field(default_factory=utc_now)
+
+
+ class HealthResponse(BaseModel):
+     ok: bool
+     app: str
+     demo_mode: bool
+     accelerator: dict[str, Any]
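
A short sketch of what these models buy the endpoints; model_validate_json is exactly what the upload route calls on profile_json (the inputs here are illustrative):

    from pydantic import ValidationError

    from app.models.schemas import ChannelProfile

    # Whitespace is stripped by the clean_text validator; omitted fields take defaults.
    profile = ChannelProfile.model_validate_json(
        '{"niche": "  cooking  ", "clip_length_seconds": 45}'
    )
    assert profile.niche == "cooking"
    assert profile.clip_style == "informative"

    # Out-of-range lengths are rejected at the boundary (ge=15, le=180).
    try:
        ChannelProfile(clip_length_seconds=5)
    except ValidationError as exc:
        print(exc.errors()[0]["type"])  # "greater_than_equal"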
backend/app/services/__init__.py ADDED
@@ -0,0 +1 @@
+ """Pipeline services."""
backend/app/services/clips.py ADDED
@@ -0,0 +1,112 @@
+ import shutil
+ import subprocess
+ from pathlib import Path
+
+ from app.core.config import Settings
+ from app.models.schemas import ChannelProfile, ClipCandidate, TranscriptSegment
+ from app.services.subtitles import write_single_caption_srt, write_srt
+ from app.storage import JobStore
+
+
+ class ClipGenerator:
+     def __init__(self, settings: Settings, store: JobStore) -> None:
+         self.settings = settings
+         self.store = store
+
+     def generate(
+         self,
+         job_id: str,
+         video_path: Path,
+         clips: list[ClipCandidate],
+         transcript: list[TranscriptSegment],
+         profile: ChannelProfile,
+     ) -> list[ClipCandidate]:
+         rendered: list[ClipCandidate] = []
+         for index, clip in enumerate(clips, start=1):
+             rendered.append(self.render_one(job_id, video_path, clip, transcript, profile, index))
+         return rendered
+
+     def render_one(
+         self,
+         job_id: str,
+         video_path: Path,
+         clip: ClipCandidate,
+         transcript: list[TranscriptSegment],
+         profile: ChannelProfile,
+         index: int = 1,
+     ) -> ClipCandidate:
+         job_dir = self.store.job_dir(job_id)
+         output_name = f"clip_{index:02}_{clip.id[:8]}.mp4"
+         subtitle_name = f"clip_{index:02}_{clip.id[:8]}.srt"
+         output_path = job_dir / output_name
+         subtitle_path = job_dir / subtitle_name
+
+         duration = max(1.0, clip.end_seconds - clip.start_seconds)
+         if clip.subtitle_text.strip():
+             write_single_caption_srt(subtitle_path, duration, clip.subtitle_text)
+         else:
+             write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript)
+         self._run_ffmpeg(video_path, output_path, subtitle_path, clip, profile)
+
+         clip.video_url = self.store.media_url(job_id, output_name)
+         clip.download_url = clip.video_url
+         clip.metadata["subtitle_file"] = self.store.media_url(job_id, subtitle_name)
+         return clip
+
+     def _run_ffmpeg(
+         self,
+         video_path: Path,
+         output_path: Path,
+         subtitle_path: Path,
+         clip: ClipCandidate,
+         profile: ChannelProfile,
+     ) -> None:
+         ffmpeg = shutil.which(self.settings.ffmpeg_binary)
+         if not ffmpeg or not video_path.exists() or video_path.stat().st_size == 0:
+             output_path.write_bytes(b"")
+             return
+
+         duration = max(1.0, clip.end_seconds - clip.start_seconds)
+         filters = [self._platform_filter(profile), self._subtitle_filter(subtitle_path)]
+         command = [
+             ffmpeg,
+             "-y",
+             "-ss",
+             f"{clip.start_seconds:.3f}",
+             "-i",
+             str(video_path),
+             "-t",
+             f"{duration:.3f}",
+             "-vf",
+             ",".join(filters),
+             "-c:v",
+             self.settings.ffmpeg_video_codec,
+             "-c:a",
+             "aac",
+             "-b:a",
+             "160k",
+             "-movflags",
+             "+faststart",
+             str(output_path),
+         ]
+         try:
+             subprocess.run(command, check=True, capture_output=True, text=True, timeout=180)
+             return
+         except Exception:
+             fallback = command.copy()
+             fallback[fallback.index(self.settings.ffmpeg_video_codec)] = self.settings.ffmpeg_cpu_codec
+             try:
+                 subprocess.run(fallback, check=True, capture_output=True, text=True, timeout=180)
+                 return
+             except Exception:
+                 output_path.write_bytes(b"")
+
+     def _platform_filter(self, profile: ChannelProfile) -> str:
+         if profile.target_platform.value in {"tiktok", "youtube_shorts", "instagram_reels"}:
+             return "scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920"
+         return "scale=1280:720:force_original_aspect_ratio=decrease,pad=1280:720:(ow-iw)/2:(oh-ih)/2"
+
+     def _subtitle_filter(self, subtitle_path: Path) -> str:
+         escaped = str(subtitle_path.resolve()).replace("\\", "/").replace(":", "\\:")
+         style = "Fontname=Arial,Fontsize=18,PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000"
+         return f"subtitles='{escaped}':force_style='{style}'"
backend/app/services/highlight.py ADDED
@@ -0,0 +1,139 @@
+ import json
+ import re
+ from uuid import uuid4
+
+ from app.core.config import Settings
+ from app.models.schemas import ChannelProfile, ClipCandidate, TranscriptSegment
+
+
+ class QwenHighlightDetector:
+     def __init__(self, settings: Settings) -> None:
+         self.settings = settings
+         self._llm = None
+
+     def detect(
+         self, transcript: list[TranscriptSegment], profile: ChannelProfile
+     ) -> list[ClipCandidate]:
+         if self.settings.demo_mode:
+             return self._heuristic_detect(transcript, profile)
+
+         try:
+             return self._qwen_detect(transcript, profile)
+         except Exception:
+             return self._heuristic_detect(transcript, profile)
+
+     def _qwen_detect(
+         self, transcript: list[TranscriptSegment], profile: ChannelProfile
+     ) -> list[ClipCandidate]:
+         try:
+             from vllm import LLM, SamplingParams
+         except Exception as exc:
+             raise RuntimeError("vLLM with ROCm backend is required for Qwen inference") from exc
+
+         if self._llm is None:
+             self._llm = LLM(
+                 model=self.settings.qwen_text_model_id,
+                 dtype=self.settings.preferred_torch_dtype,
+                 trust_remote_code=True,
+             )
+
+         transcript_text = "\n".join(
+             f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
+             for segment in transcript
+         )
+         prompt = f"""
+ You are selecting short-form clips for a creator.
+ Profile:
+ - niche: {profile.niche}
+ - style: {profile.clip_style}
+ - target length seconds: {profile.clip_length_seconds}
+ - language: {profile.primary_language}
+ - platform: {profile.target_platform.value}
+
+ Return strict JSON only. Shape:
+ [
+   {{
+     "start_seconds": 12.0,
+     "end_seconds": 72.0,
+     "title": "short title",
+     "reason": "why this will engage viewers",
+     "score": 91,
+     "subtitle_text": "clean subtitle text"
+   }}
+ ]
+
+ Transcript:
+ {transcript_text}
+ """.strip()
+         sampling = SamplingParams(temperature=0.2, max_tokens=1200)
+         outputs = self._llm.generate([prompt], sampling)
+         text = outputs[0].outputs[0].text
+         payload = self._parse_json_array(text)
+         clips = [
+             ClipCandidate(
+                 id=uuid4().hex,
+                 start_seconds=float(item["start_seconds"]),
+                 end_seconds=float(item["end_seconds"]),
+                 title=str(item.get("title") or "Highlight"),
+                 reason=str(item.get("reason") or "High engagement potential"),
+                 score=float(item.get("score") or 75),
+                 subtitle_text=str(item.get("subtitle_text") or ""),
+                 metadata={"model": self.settings.qwen_text_model_id},
+             )
+             for item in payload[: self.settings.max_clips]
+         ]
+         return clips or self._heuristic_detect(transcript, profile)
+
+     def _parse_json_array(self, text: str) -> list[dict]:
+         match = re.search(r"\[[\s\S]*\]", text)
+         if not match:
+             raise ValueError("No JSON array in Qwen response")
+         payload = json.loads(match.group(0))
+         if not isinstance(payload, list):
+             raise ValueError("Qwen response is not a list")
+         return payload
+
+     def _heuristic_detect(
+         self, transcript: list[TranscriptSegment], profile: ChannelProfile
+     ) -> list[ClipCandidate]:
+         style_terms = {
+             "funny": ["react", "punchy", "mistake", "surprising"],
+             "informative": ["important", "practical", "takeaway", "explanation"],
+             "dramatic": ["problem", "surprising", "before-and-after", "stop scrolling"],
+             "educational": ["question", "answer", "context", "takeaway"],
+         }
+         preferred_terms = style_terms.get(profile.clip_style.lower(), [])
+         scored: list[tuple[float, TranscriptSegment]] = []
+         for segment in transcript:
+             text = segment.text.lower()
+             score = 45.0
+             score += 12 if "?" in segment.text else 0
+             score += 8 if any(term in text for term in preferred_terms) else 0
+             score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
+             score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
+             score += min(len(segment.text) / 12, 10)
+             scored.append((min(score, 100), segment))
+
+         scored.sort(key=lambda item: item[0], reverse=True)
+         clips: list[ClipCandidate] = []
+         for score, segment in scored[: self.settings.target_clip_count]:
+             start = max(0.0, segment.start_seconds - 5.0)
+             end = start + float(profile.clip_length_seconds)
+             clips.append(
+                 ClipCandidate(
+                     id=uuid4().hex,
+                     start_seconds=start,
+                     end_seconds=end,
+                     title=self._title_for(segment.text),
+                     reason=f"Matches the {profile.clip_style} style for a {profile.niche} audience.",
+                     score=round(score, 1),
+                     subtitle_text=segment.text,
+                     metadata={"model": "heuristic-fallback"},
+                 )
+             )
+         return sorted(clips, key=lambda clip: clip.start_seconds)
+
+     def _title_for(self, text: str) -> str:
+         words = re.sub(r"[^A-Za-z0-9 ]+", "", text).split()
+         title = " ".join(words[:7])
+         return title or "Highlight"
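
The extraction in _parse_json_array leans on a greedy regex so chatty prose around the array is tolerated; a quick sketch (the response text is made up):

    import json
    import re

    text = (
        "Sure! Here are the clips:\n"
        '[{"start_seconds": 12.0, "end_seconds": 72.0, "title": "Hook", "score": 91}]\n'
        "Hope that helps."
    )
    match = re.search(r"\[[\s\S]*\]", text)  # greedy: first '[' to last ']'
    payload = json.loads(match.group(0))
    print(payload[0]["title"])  # Hook

Because the match is greedy, it spans from the first "[" to the last "]", which is fine for single-array responses but would swallow intervening text if the model ever emitted two arrays.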
backend/app/services/multimodal.py ADDED
@@ -0,0 +1,18 @@
+ from app.core.config import Settings
+ from app.models.schemas import ClipCandidate
+
+
+ class QwenVisualAnalyzer:
+     def __init__(self, settings: Settings) -> None:
+         self.settings = settings
+
+     def enrich(self, video_path: str, clips: list[ClipCandidate]) -> list[ClipCandidate]:
+         if self.settings.demo_mode:
+             return clips
+
+         # Hook for Qwen2-VL frame/audio scoring on AMD ROCm.
+         # Keep this side-effect free until the hackathon demo has stable frame sampling assets.
+         for clip in clips:
+             clip.metadata["visual_model"] = self.settings.qwen_vl_model_id
+             clip.metadata["visual_status"] = "not_configured"
+         return clips
backend/app/services/pipeline.py ADDED
@@ -0,0 +1,157 @@
+ import asyncio
+ from pathlib import Path
+
+ from app.core.config import Settings
+ from app.core.timing import TimingLog
+ from app.models.schemas import ChannelProfile, ClipCandidate
+ from app.services.clips import ClipGenerator
+ from app.services.highlight import QwenHighlightDetector
+ from app.services.multimodal import QwenVisualAnalyzer
+ from app.services.transcription import WhisperTranscriber
+ from app.services.video_input import resolve_youtube_url
+ from app.storage import JobStore
+
+
+ class VideoPipeline:
+     def __init__(self, settings: Settings, store: JobStore) -> None:
+         self.settings = settings
+         self.store = store
+         self.transcriber = WhisperTranscriber(settings)
+         self.highlight_detector = QwenHighlightDetector(settings)
+         self.visual_analyzer = QwenVisualAnalyzer(settings)
+         self.clip_generator = ClipGenerator(settings, store)
+
+     async def process_source(
+         self,
+         job_id: str,
+         source_kind: str,
+         source_value: str,
+         profile: ChannelProfile,
+     ) -> None:
+         timings = TimingLog()
+         try:
+             self.store.update_job(
+                 job_id, status="running", progress=0.05, message="Preparing video input"
+             )
+             with timings.measure("input"):
+                 if source_kind == "youtube":
+                     video_path = await resolve_youtube_url(
+                         source_value, self.store.job_dir(job_id), self.settings
+                     )
+                 else:
+                     video_path = Path(source_value)
+
+             self.store.update_job(
+                 job_id, progress=0.25, message="Transcribing with Whisper Large V3"
+             )
+             with timings.measure("transcription"):
+                 transcript = await asyncio.to_thread(
+                     self.transcriber.transcribe, str(video_path), profile
+                 )
+             self.store.write_json(
+                 job_id,
+                 "transcript.json",
+                 [segment.model_dump(mode="json") for segment in transcript],
+             )
+             self.store.update_job(job_id, transcript=transcript, timings=timings.to_dict())
+
+             self.store.update_job(job_id, progress=0.55, message="Scoring highlights with Qwen")
+             with timings.measure("highlight_detection"):
+                 clips = await asyncio.to_thread(self.highlight_detector.detect, transcript, profile)
+
+             self.store.update_job(job_id, progress=0.65, message="Checking visual highlights")
+             with timings.measure("multimodal_analysis"):
+                 clips = await asyncio.to_thread(self.visual_analyzer.enrich, str(video_path), clips)
+
+             self.store.update_job(job_id, progress=0.78, message="Generating clips and subtitles")
+             with timings.measure("clip_generation"):
+                 rendered = await asyncio.to_thread(
+                     self.clip_generator.generate, job_id, video_path, clips, transcript, profile
+                 )
+
+             self.store.write_json(
+                 job_id, "clips.json", [clip.model_dump(mode="json") for clip in rendered]
+             )
+             self.store.update_job(
+                 job_id,
+                 status="completed",
+                 progress=1,
+                 message="Clips ready",
+                 transcript=transcript,
+                 clips=rendered,
+                 timings=timings.to_dict(),
+             )
+         except Exception as exc:
+             self.store.update_job(
+                 job_id,
+                 status="failed",
+                 progress=1,
+                 message="Processing failed",
+                 error=str(exc),
+                 timings=timings.to_dict(),
+             )
+
+     def patch_clip(self, job_id: str, clip_id: str, updates: dict) -> ClipCandidate:
+         snapshot = self.store.get_job(job_id)
+         patched: ClipCandidate | None = None
+         clips: list[ClipCandidate] = []
+         for clip in snapshot.clips:
+             if clip.id == clip_id:
+                 clean_updates = {key: value for key, value in updates.items() if value is not None}
+                 clip = clip.model_copy(update=clean_updates)
+                 if clip.end_seconds <= clip.start_seconds:
+                     clip = clip.model_copy(update={"end_seconds": clip.start_seconds + 1})
+                 patched = clip
+             clips.append(clip)
+         if patched is None:
+             raise KeyError(clip_id)
+         self.store.update_job(job_id, clips=clips)
+         return patched
+
+     def regenerate_clip(
+         self,
+         job_id: str,
+         clip_id: str,
+         clip_style: str | None = None,
+         clip_length_seconds: int | None = None,
+         subtitle_text: str | None = None,
+     ) -> ClipCandidate:
+         snapshot = self.store.get_job(job_id)
+         source_path = self._source_path(job_id)
+         clips: list[ClipCandidate] = []
+         regenerated: ClipCandidate | None = None
+         for index, clip in enumerate(snapshot.clips, start=1):
+             if clip.id == clip_id:
+                 profile = snapshot.profile.model_copy(
+                     update={
+                         key: value
+                         for key, value in {
+                             "clip_style": clip_style,
+                             "clip_length_seconds": clip_length_seconds,
+                         }.items()
+                         if value is not None
+                     }
+                 )
+                 if clip_length_seconds is not None:
+                     clip = clip.model_copy(
+                         update={"end_seconds": clip.start_seconds + clip_length_seconds}
+                     )
+                 if subtitle_text is not None:
+                     clip = clip.model_copy(update={"subtitle_text": subtitle_text})
+                 clip = self.clip_generator.render_one(
+                     job_id, source_path, clip, snapshot.transcript, profile, index
+                 )
+                 clip.metadata["regenerated"] = True
+                 regenerated = clip
+             clips.append(clip)
+         if regenerated is None:
+             raise KeyError(clip_id)
+         self.store.update_job(job_id, clips=clips)
+         return regenerated
+
+     def _source_path(self, job_id: str) -> Path:
+         job_dir = self.store.job_dir(job_id)
+         matches = sorted(job_dir.glob("source.*"))
+         if not matches:
+             raise FileNotFoundError("source video missing")
+         return matches[0]
backend/app/services/subtitles.py ADDED
@@ -0,0 +1,44 @@
+ from pathlib import Path
+
+ from app.models.schemas import TranscriptSegment
+
+
+ def seconds_to_srt_time(value: float) -> str:
+     millis = int(round(value * 1000))
+     hours, remainder = divmod(millis, 3_600_000)
+     minutes, remainder = divmod(remainder, 60_000)
+     seconds, millis = divmod(remainder, 1000)
+     return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
+
+
+ def write_srt(path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]) -> None:
+     rows: list[str] = []
+     index = 1
+     for segment in segments:
+         if segment.end_seconds < clip_start or segment.start_seconds > clip_end:
+             continue
+         start = max(0.0, segment.start_seconds - clip_start)
+         end = min(clip_end - clip_start, segment.end_seconds - clip_start)
+         rows.extend(
+             [
+                 str(index),
+                 f"{seconds_to_srt_time(start)} --> {seconds_to_srt_time(max(end, start + 0.8))}",
+                 segment.text.strip(),
+                 "",
+             ]
+         )
+         index += 1
+     if not rows:
+         rows = ["1", "00:00:00,000 --> 00:00:03,000", "", ""]
+     path.write_text("\n".join(rows), encoding="utf-8")
+
+
+ def write_single_caption_srt(path: Path, duration: float, text: str) -> None:
+     safe_duration = max(duration, 1.0)
+     rows = [
+         "1",
+         f"00:00:00,000 --> {seconds_to_srt_time(safe_duration)}",
+         text.strip(),
+         "",
+     ]
+     path.write_text("\n".join(rows), encoding="utf-8")
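
Timestamps are rebased to the clip's start, so a transcript segment at 65-72 s inside a clip starting at 60 s renders as 5-12 s. A quick sketch (segment values illustrative):

    from pathlib import Path

    from app.models.schemas import TranscriptSegment
    from app.services.subtitles import seconds_to_srt_time, write_srt

    assert seconds_to_srt_time(65.25) == "00:01:05,250"

    segment = TranscriptSegment(id="s1", start_seconds=65.0, end_seconds=72.0, text="Key takeaway here.")
    write_srt(Path("demo.srt"), clip_start=60.0, clip_end=120.0, segments=[segment])
    print(Path("demo.srt").read_text(encoding="utf-8"))
    # 1
    # 00:00:05,000 --> 00:00:12,000
    # Key takeaway here.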
backend/app/services/transcription.py ADDED
@@ -0,0 +1,95 @@
+ from uuid import uuid4
+
+ from app.core.config import Settings
+ from app.models.schemas import ChannelProfile, TranscriptSegment
+ from app.utils.rocm import torch_device_index
+
+
+ class WhisperTranscriber:
+     def __init__(self, settings: Settings) -> None:
+         self.settings = settings
+         self._pipeline = None
+
+     def transcribe(self, video_path: str, profile: ChannelProfile) -> list[TranscriptSegment]:
+         if self.settings.demo_mode:
+             return self._demo_transcript(profile)
+
+         try:
+             from transformers import pipeline
+         except Exception as exc:
+             raise RuntimeError("transformers is required for Whisper transcription") from exc
+
+         if self._pipeline is None:
+             self._pipeline = pipeline(
+                 task="automatic-speech-recognition",
+                 model=self.settings.whisper_model_id,
+                 device=torch_device_index(),
+                 token=self.settings.hf_token,
+                 chunk_length_s=30,
+                 return_timestamps=True,
+             )
+
+         generate_kwargs = {"task": "transcribe"}
+         if profile.primary_language and profile.primary_language.lower() != "auto":
+             generate_kwargs["language"] = profile.primary_language.lower()
+
+         result = self._pipeline(str(video_path), generate_kwargs=generate_kwargs)
+         chunks = result.get("chunks") or []
+         if not chunks:
+             text = result.get("text", "").strip()
+             return [
+                 TranscriptSegment(
+                     id=uuid4().hex,
+                     start_seconds=0,
+                     end_seconds=max(profile.clip_length_seconds, 15),
+                     text=text,
+                     language=profile.primary_language,
+                 )
+             ]
+
+         segments: list[TranscriptSegment] = []
+         for chunk in chunks:
+             timestamp = chunk.get("timestamp") or (0, 0)
+             start = float(timestamp[0] or 0)
+             end = float(timestamp[1] or start + 5)
+             text = (chunk.get("text") or "").strip()
+             if text:
+                 segments.append(
+                     TranscriptSegment(
+                         id=uuid4().hex,
+                         start_seconds=start,
+                         end_seconds=max(end, start + 1),
+                         text=text,
+                         language=profile.primary_language,
+                     )
+                 )
+         return segments
+
+     def _demo_transcript(self, profile: ChannelProfile) -> list[TranscriptSegment]:
+         style = profile.clip_style.lower()
+         niche = profile.niche.lower()
+         lines = [
+             "This opening sets up the main problem creators face when a long video hides the best moments.",
+             "Here is the surprising mistake most teams make when they choose clips only by view count.",
+             "The important question is simple: which moment would make someone stop scrolling right now?",
+             f"For a {niche} channel, the answer changes because the audience expects a {style} rhythm.",
+             "This section has the clearest explanation and a strong before-and-after contrast.",
+             "Then the guest reacts with a punchy line that works well as a short hook.",
+             "A practical takeaway lands here, with enough context to stand alone as a sixty second clip.",
+             "The final segment wraps the idea with a direct callout that is easy to subtitle.",
+         ]
+         segments: list[TranscriptSegment] = []
+         cursor = 0.0
+         for line in lines:
+             end = cursor + 15.0
+             segments.append(
+                 TranscriptSegment(
+                     id=uuid4().hex,
+                     start_seconds=cursor,
+                     end_seconds=end,
+                     text=line,
+                     language=profile.primary_language,
+                 )
+             )
+             cursor = end
+         return segments
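
The chunk-to-segment mapping assumes the return shape of the transformers ASR pipeline with return_timestamps=True, roughly as follows (values illustrative):

    # Shape the transcriber consumes (illustrative values):
    result = {
        "text": "Hello world. Second sentence.",
        "chunks": [
            {"timestamp": (0.0, 2.5), "text": " Hello world."},
            {"timestamp": (2.5, None), "text": " Second sentence."},  # open-ended final chunk
        ],
    }
    # For the second chunk: start=2.5, and `timestamp[1] or start + 5`
    # backfills the missing end as 7.5; max(end, start + 1) guards inverted spans.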
backend/app/services/video_input.py ADDED
@@ -0,0 +1,80 @@
+ import asyncio
+ import shutil
+ import subprocess
+ from pathlib import Path
+
+ from fastapi import UploadFile
+
+ from app.core.config import Settings
+
+
+ async def save_upload(upload: UploadFile, job_dir: Path) -> Path:
+     suffix = Path(upload.filename or "upload.mp4").suffix or ".mp4"
+     destination = job_dir / f"source{suffix.lower()}"
+     with destination.open("wb") as handle:
+         while chunk := await upload.read(1024 * 1024):
+             handle.write(chunk)
+     return destination
+
+
+ async def resolve_youtube_url(url: str, job_dir: Path, settings: Settings) -> Path:
+     if settings.demo_mode:
+         return await asyncio.to_thread(create_demo_video, job_dir, settings)
+
+     try:
+         import yt_dlp
+     except Exception as exc:
+         raise RuntimeError("yt-dlp is required for YouTube ingestion") from exc
+
+     output_template = str(job_dir / "source.%(ext)s")
+     ydl_opts = {
+         "outtmpl": output_template,
+         "format": "bv*[ext=mp4]+ba[ext=m4a]/b[ext=mp4]/best",
+         "merge_output_format": "mp4",
+         "quiet": True,
+         "noprogress": True,
+     }
+
+     def download() -> Path:
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([url])
+         matches = sorted(job_dir.glob("source.*"))
+         if not matches:
+             raise RuntimeError("yt-dlp finished without producing a video")
+         return matches[0]
+
+     return await asyncio.to_thread(download)
+
+
+ def create_demo_video(job_dir: Path, settings: Settings) -> Path:
+     destination = job_dir / "source.mp4"
+     ffmpeg = shutil.which(settings.ffmpeg_binary)
+     if not ffmpeg:
+         destination.write_bytes(b"")
+         return destination
+
+     command = [
+         ffmpeg,
+         "-y",
+         "-f",
+         "lavfi",
+         "-i",
+         "testsrc2=size=1280x720:rate=30:duration=120",
+         "-f",
+         "lavfi",
+         "-i",
+         "sine=frequency=660:sample_rate=48000:duration=120",
+         "-shortest",
+         "-c:v",
+         "libx264",
+         "-pix_fmt",
+         "yuv420p",
+         "-c:a",
+         "aac",
+         str(destination),
+     ]
+     try:
+         subprocess.run(command, check=True, capture_output=True, text=True, timeout=45)
+     except Exception:
+         destination.write_bytes(b"")
+     return destination
backend/app/storage.py ADDED
@@ -0,0 +1,58 @@
+ import json
+ from pathlib import Path
+ from uuid import uuid4
+
+ from app.core.config import Settings
+ from app.models.schemas import ChannelProfile, JobSnapshot, utc_now
+
+
+ class JobStore:
+     def __init__(self, settings: Settings) -> None:
+         self.settings = settings
+         self.root = settings.storage_dir
+         self.jobs_root = self.root / "jobs"
+         self.jobs_root.mkdir(parents=True, exist_ok=True)
+
+     def create_job(self, profile: ChannelProfile, source: dict) -> JobSnapshot:
+         job_id = uuid4().hex
+         job_dir = self.job_dir(job_id)
+         job_dir.mkdir(parents=True, exist_ok=True)
+         snapshot = JobSnapshot(
+             id=job_id,
+             status="queued",
+             progress=0,
+             message="Queued",
+             source=source,
+             profile=profile,
+         )
+         self.save_job(snapshot)
+         return snapshot
+
+     def job_dir(self, job_id: str) -> Path:
+         return self.jobs_root / job_id
+
+     def media_url(self, job_id: str, filename: str) -> str:
+         return f"/media/jobs/{job_id}/{filename}"
+
+     def save_job(self, snapshot: JobSnapshot) -> JobSnapshot:
+         snapshot.updated_at = utc_now()
+         path = self.job_dir(snapshot.id) / "job.json"
+         path.write_text(snapshot.model_dump_json(indent=2), encoding="utf-8")
+         return snapshot
+
+     def get_job(self, job_id: str) -> JobSnapshot:
+         path = self.job_dir(job_id) / "job.json"
+         if not path.exists():
+             raise FileNotFoundError(job_id)
+         data = json.loads(path.read_text(encoding="utf-8"))
+         return JobSnapshot.model_validate(data)
+
+     def update_job(self, job_id: str, **updates) -> JobSnapshot:
+         snapshot = self.get_job(job_id)
+         updated = snapshot.model_copy(update=updates)
+         return self.save_job(updated)
+
+     def write_json(self, job_id: str, filename: str, payload: object) -> Path:
+         path = self.job_dir(job_id) / filename
+         path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
+         return path
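
The store is deliberately file-backed for the MVP: each job is a directory under data/jobs/<id> holding a job.json snapshot, which the /media static mount also serves from. A minimal round-trip sketch (the storage directory is illustrative):

    from pathlib import Path

    from app.core.config import Settings
    from app.models.schemas import ChannelProfile
    from app.storage import JobStore

    store = JobStore(Settings(storage_dir=Path("/tmp/clipdata")))  # illustrative dir
    snapshot = store.create_job(ChannelProfile(), {"kind": "upload", "filename": "talk.mp4"})

    # State lives on disk at /tmp/clipdata/jobs/<id>/job.json ...
    assert (store.job_dir(snapshot.id) / "job.json").exists()

    # ... and updates are read-modify-write against that file.
    store.update_job(snapshot.id, status="running", progress=0.1)
    assert store.get_job(snapshot.id).status == "running"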
backend/app/utils/__init__.py ADDED
@@ -0,0 +1 @@
+ """Runtime helpers."""
backend/app/utils/rocm.py ADDED
@@ -0,0 +1,33 @@
+ from typing import Any
+
+
+ def detect_accelerator() -> dict[str, Any]:
+     try:
+         import torch
+     except Exception as exc:
+         return {
+             "torch_available": False,
+             "cuda_api_available": False,
+             "rocm_hip_version": None,
+             "device_name": None,
+             "error": str(exc),
+         }
+
+     cuda_available = bool(torch.cuda.is_available())
+     device_name = torch.cuda.get_device_name(0) if cuda_available else None
+     return {
+         "torch_available": True,
+         "cuda_api_available": cuda_available,
+         "rocm_hip_version": getattr(torch.version, "hip", None),
+         "cuda_version": getattr(torch.version, "cuda", None),
+         "device_name": device_name,
+         "device_count": torch.cuda.device_count() if cuda_available else 0,
+     }
+
+
+ def torch_device_index() -> int:
+     try:
+         import torch
+     except Exception:
+         return -1
+     return 0 if torch.cuda.is_available() else -1
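
On ROCm builds of PyTorch the CUDA API is backed by HIP, so torch.cuda.is_available() is the right probe on AMD hardware; torch.version.hip is set while torch.version.cuda is None. An illustrative accelerator block from GET /health on such a machine (exact version and device strings depend on the install):

    # Illustrative payload, not captured output:
    {
        "torch_available": True,
        "cuda_api_available": True,
        "rocm_hip_version": "6.1.40091",        # example value
        "cuda_version": None,                   # None on ROCm builds
        "device_name": "AMD Instinct MI300X",   # example device
        "device_count": 1,
    }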
backend/app/workers/__init__.py ADDED
@@ -0,0 +1 @@
+ """Optional async workers."""
backend/app/workers/celery_app.py ADDED
@@ -0,0 +1,15 @@
+ from celery import Celery
+
+ from app.core.config import get_settings
+
+ settings = get_settings()
+
+ celery_app = Celery("ai_clip_studio", broker=settings.redis_url, backend=settings.redis_url)
+ celery_app.conf.task_serializer = "json"
+ celery_app.conf.result_serializer = "json"
+ celery_app.conf.accept_content = ["json"]
+
+
+ @celery_app.task(name="pipeline.process_job")
+ def process_job(job_id: str) -> str:
+     return f"Queued job {job_id}. FastAPI background tasks are active by default."
backend/pyproject.toml ADDED
@@ -0,0 +1,42 @@
+ [project]
+ name = "elevenclip-ai-backend"
+ version = "0.1.0"
+ description = "FastAPI backend for ElevenClip.AI on AMD ROCm"
+ requires-python = ">=3.11"
+ dependencies = [
+     "fastapi>=0.115.0",
+     "uvicorn[standard]>=0.30.0",
+     "pydantic>=2.8.0",
+     "python-multipart>=0.0.9",
+     "yt-dlp>=2025.1.15",
+     "celery[redis]>=5.4.0",
+     "redis>=5.0.0"
+ ]
+
+ [project.optional-dependencies]
+ ai = [
+     "transformers>=4.47.0",
+     "accelerate>=1.2.0",
+     "sentencepiece>=0.2.0",
+     "safetensors>=0.4.5"
+ ]
+ rocm-inference = [
+     "vllm>=0.6.6",
+     "optimum-amd>=0.1.0; platform_system == 'Linux'"
+ ]
+ dev = [
+     "pytest>=8.3.0",
+     "httpx>=0.27.0",
+     "ruff>=0.6.0"
+ ]
+
+ [build-system]
+ requires = ["setuptools>=69.0"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools.packages.find]
+ include = ["app*"]
+
+ [tool.ruff]
+ line-length = 100
+ target-version = "py311"