JakgritB committed
Commit · 5dadf47
Parent(s): 00b7145
feat(backend): enrich clipping pipeline metadata
Add channel profile description fields, per-job progress metadata, clip count support, clip-by-clip render progress, and shorter timed subtitle cues.
backend/app/models/schemas.py
CHANGED
@@ -17,12 +17,15 @@ class TargetPlatform(str, Enum):
 
 class ChannelProfile(BaseModel):
     niche: str = Field(default="education", min_length=2, max_length=80)
+    niche_custom: str = Field(default="", max_length=80)
+    channel_description: str = Field(default="", max_length=700)
     clip_style: str = Field(default="informative", min_length=2, max_length=80)
     clip_length_seconds: int = Field(default=60, ge=15, le=180)
+    clip_count: int = Field(default=5, ge=1, le=20)
     primary_language: str = Field(default="Thai", min_length=2, max_length=40)
     target_platform: TargetPlatform = TargetPlatform.tiktok
 
-    @field_validator("niche", "clip_style", "primary_language")
+    @field_validator("niche", "niche_custom", "channel_description", "clip_style", "primary_language")
     @classmethod
     def clean_text(cls, value: str) -> str:
         return value.strip()
@@ -75,6 +78,11 @@ class JobSnapshot(BaseModel):
     status: Literal["queued", "running", "completed", "failed"]
     progress: float = Field(ge=0, le=1)
     message: str
+    current_step: str = ""
+    step_index: int = Field(default=0, ge=0)
+    step_total: int = Field(default=6, ge=1)
+    active_clip_index: int = Field(default=0, ge=0)
+    active_clip_total: int = Field(default=0, ge=0)
     source: dict[str, Any]
     profile: ChannelProfile
     transcript: list[TranscriptSegment] = Field(default_factory=list)
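A minimal usage sketch of the enriched schema, assuming the backend package is importable; the field values below are illustrative placeholders, not repo defaults.

from app.models.schemas import ChannelProfile, TargetPlatform

profile = ChannelProfile(
    niche="other",
    niche_custom="board game reviews",  # hypothetical value; only consulted when niche == "other"
    channel_description="Deep-dive strategy breakdowns for casual players.",  # hypothetical
    clip_style="informative",
    clip_length_seconds=60,
    clip_count=8,  # later capped by settings.max_clips in the highlight detector
    primary_language="Thai",
    target_platform=TargetPlatform.tiktok,
)
# clean_text() strips surrounding whitespace from the validated text fields.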
backend/app/services/clips.py
CHANGED
@@ -1,6 +1,7 @@
 import shutil
 import subprocess
 from pathlib import Path
+from typing import Callable
 
 from app.core.config import Settings
 from app.models.schemas import ChannelProfile, ClipCandidate, TranscriptSegment
@@ -20,9 +21,13 @@ class ClipGenerator:
         clips: list[ClipCandidate],
         transcript: list[TranscriptSegment],
         profile: ChannelProfile,
+        progress_callback: Callable[[int, int], None] | None = None,
     ) -> list[ClipCandidate]:
         rendered: list[ClipCandidate] = []
+        total = len(clips)
         for index, clip in enumerate(clips, start=1):
+            if progress_callback:
+                progress_callback(index, total)
             rendered.append(self.render_one(job_id, video_path, clip, transcript, profile, index))
         return rendered
 
@@ -43,14 +48,15 @@ class ClipGenerator:
 
         duration = max(1.0, clip.end_seconds - clip.start_seconds)
         if clip.subtitle_text.strip():
-            write_single_caption_srt(subtitle_path, duration, clip.subtitle_text)
+            subtitle_cues = write_single_caption_srt(subtitle_path, duration, clip.subtitle_text)
         else:
-            write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript)
+            subtitle_cues = write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript)
         self._run_ffmpeg(video_path, output_path, subtitle_path, clip, profile)
 
         clip.video_url = self.store.media_url(job_id, output_name)
         clip.download_url = clip.video_url
         clip.metadata["subtitle_file"] = self.store.media_url(job_id, subtitle_name)
+        clip.metadata["subtitle_cues"] = subtitle_cues
         return clip
 
     def _run_ffmpeg(
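A small sketch of a callback that satisfies the new progress_callback parameter on ClipGenerator.generate; the print-based logger is only an illustration of the Callable[[int, int], None] contract, not code from the repo.

from typing import Callable

def log_render_progress(index: int, total: int) -> None:
    # generate() invokes the callback once per clip, just before rendering it
    print(f"rendering clip {index}/{total}")

progress_callback: Callable[[int, int], None] | None = log_render_progress
# e.g. clip_generator.generate(job_id, video_path, clips, transcript, profile, progress_callback)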
backend/app/services/highlight.py
CHANGED
@@ -41,12 +41,17 @@ class QwenHighlightDetector:
             f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
             for segment in transcript
         )
+        niche = _effective_niche(profile)
+        channel_description = profile.channel_description or "No extra channel description provided."
+        clip_count = min(profile.clip_count, self.settings.max_clips)
         prompt = f"""
 You are selecting short-form clips for a creator.
 Profile:
-- niche: {profile.niche}
+- niche: {niche}
+- creator description: {channel_description}
 - style: {profile.clip_style}
 - target length seconds: {profile.clip_length_seconds}
+- target number of clips: {clip_count}
 - language: {profile.primary_language}
 - platform: {profile.target_platform.value}
 
@@ -80,7 +85,7 @@ Transcript:
                 subtitle_text=str(item.get("subtitle_text") or ""),
                 metadata={"model": self.settings.qwen_text_model_id},
             )
-            for item in payload[: self.settings.max_clips]
+            for item in payload[:clip_count]
         ]
         return clips or self._heuristic_detect(transcript, profile)
 
@@ -103,6 +108,12 @@ Transcript:
             "educational": ["question", "answer", "context", "takeaway"],
         }
         preferred_terms = style_terms.get(profile.clip_style.lower(), [])
+        niche = _effective_niche(profile)
+        profile_terms = [
+            term
+            for term in f"{niche} {profile.channel_description}".lower().split()[:30]
+            if len(term) > 2
+        ]
         scored: list[tuple[float, TranscriptSegment]] = []
         for segment in transcript:
             text = segment.text.lower()
@@ -111,12 +122,14 @@ Transcript:
             score += 8 if any(term in text for term in preferred_terms) else 0
             score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
             score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
+            score += 5 if any(term in text for term in profile_terms) else 0
             score += min(len(segment.text) / 12, 10)
             scored.append((min(score, 100), segment))
 
         scored.sort(key=lambda item: item[0], reverse=True)
         clips: list[ClipCandidate] = []
-        for score, segment in scored[: self.settings.max_clips]:
+        clip_count = min(profile.clip_count, self.settings.max_clips)
+        for score, segment in scored[:clip_count]:
             start = max(0.0, segment.start_seconds - 5.0)
             end = start + float(profile.clip_length_seconds)
             clips.append(
@@ -125,7 +138,7 @@ Transcript:
                     start_seconds=start,
                     end_seconds=end,
                     title=self._title_for(segment.text),
-                    reason=f"Matches the {profile.clip_style} style for a {profile.niche} audience.",
+                    reason=f"Matches the {profile.clip_style} style for a {niche} audience.",
                     score=round(score, 1),
                     subtitle_text=segment.text,
                     metadata={"model": "heuristic-fallback"},
@@ -137,3 +150,9 @@ Transcript:
         words = re.sub(r"[^A-Za-z0-9 ]+", "", text).split()
         title = " ".join(words[:7])
         return title or "Highlight"
+
+
+def _effective_niche(profile: ChannelProfile) -> str:
+    if profile.niche.lower() == "other" and profile.niche_custom:
+        return profile.niche_custom
+    return profile.niche
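A standalone sketch of the niche resolution and profile-term extraction the heuristic fallback now applies; the stub profile values are illustrative, not from the repo.

niche = "gaming"  # _effective_niche(profile) returns niche_custom only when niche == "other"
channel_description = "Speedrun breakdowns and glitch explanations"  # hypothetical

profile_terms = [
    term
    for term in f"{niche} {channel_description}".lower().split()[:30]
    if len(term) > 2  # drops one- and two-letter connector words
]
# Any transcript segment containing one of these terms earns a +5 score bonus.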
backend/app/services/pipeline.py
CHANGED
@@ -31,7 +31,13 @@ class VideoPipeline:
         timings = TimingLog()
         try:
             self.store.update_job(
-                job_id,
+                job_id,
+                status="running",
+                progress=0.04,
+                message="Preparing video input",
+                current_step="input",
+                step_index=1,
+                step_total=6,
             )
             with timings.measure("input"):
                 if source_kind == "youtube":
@@ -42,7 +48,12 @@ class VideoPipeline:
                 video_path = Path(source_value)
 
             self.store.update_job(
-                job_id,
+                job_id,
+                progress=0.18,
+                message="Transcribing with Whisper Large V3",
+                current_step="transcription",
+                step_index=2,
+                step_total=6,
             )
             with timings.measure("transcription"):
                 transcript = await asyncio.to_thread(
@@ -53,22 +64,84 @@ class VideoPipeline:
                 "transcript.json",
                 [segment.model_dump(mode="json") for segment in transcript],
             )
-            self.store.update_job(
+            self.store.update_job(
+                job_id,
+                progress=0.42,
+                message="Transcript ready",
+                transcript=transcript,
+                timings=timings.to_dict(),
+            )
 
-            self.store.update_job(
+            self.store.update_job(
+                job_id,
+                progress=0.48,
+                message="Scoring highlights with Qwen",
+                current_step="highlight_detection",
+                step_index=3,
+                step_total=6,
+            )
             with timings.measure("highlight_detection"):
                 clips = await asyncio.to_thread(self.highlight_detector.detect, transcript, profile)
 
-            self.store.update_job(
+            self.store.update_job(
+                job_id,
+                progress=0.62,
+                message="Checking visual highlights",
+                current_step="multimodal_analysis",
+                step_index=4,
+                step_total=6,
+            )
             with timings.measure("multimodal_analysis"):
                 clips = await asyncio.to_thread(self.visual_analyzer.enrich, str(video_path), clips)
 
-
+            clip_total = len(clips)
+            self.store.update_job(
+                job_id,
+                progress=0.72,
+                message=f"Preparing to render {clip_total} clips",
+                current_step="clip_generation",
+                step_index=5,
+                step_total=6,
+                active_clip_index=0,
+                active_clip_total=clip_total,
+            )
+
+            def update_render_progress(index: int, total: int) -> None:
+                progress = 0.72 + (0.22 * ((index - 1) / max(total, 1)))
+                self.store.update_job(
+                    job_id,
+                    progress=min(progress, 0.94),
+                    message=f"Rendering clip {index}/{total}",
+                    current_step="clip_generation",
+                    step_index=5,
+                    step_total=6,
+                    active_clip_index=index,
+                    active_clip_total=total,
+                    timings=timings.to_dict(),
+                )
+
             with timings.measure("clip_generation"):
                 rendered = await asyncio.to_thread(
-                    self.clip_generator.generate,
+                    self.clip_generator.generate,
+                    job_id,
+                    video_path,
+                    clips,
+                    transcript,
+                    profile,
+                    update_render_progress,
                 )
 
+            self.store.update_job(
+                job_id,
+                progress=0.97,
+                message="Finalizing clips",
+                current_step="finalizing",
+                step_index=6,
+                step_total=6,
+                active_clip_index=clip_total,
+                active_clip_total=clip_total,
+                timings=timings.to_dict(),
+            )
             self.store.write_json(
                 job_id, "clips.json", [clip.model_dump(mode="json") for clip in rendered]
             )
@@ -77,6 +150,11 @@ class VideoPipeline:
                 status="completed",
                 progress=1,
                 message="Clips ready",
+                current_step="completed",
+                step_index=6,
+                step_total=6,
+                active_clip_index=clip_total,
+                active_clip_total=clip_total,
                 transcript=transcript,
                 clips=rendered,
                 timings=timings.to_dict(),
@@ -87,6 +165,7 @@ class VideoPipeline:
                 status="failed",
                 progress=1,
                 message="Processing failed",
+                current_step="failed",
                 error=str(exc),
                 timings=timings.to_dict(),
             )
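A worked check of the per-clip progress mapping in update_render_progress: clip rendering occupies the 0.72 to 0.94 band, split evenly across the clips. This is a standalone restatement of the formula above, not pipeline code.

def render_progress(index: int, total: int) -> float:
    progress = 0.72 + (0.22 * ((index - 1) / max(total, 1)))
    return min(progress, 0.94)

print(render_progress(1, 5))  # 0.72  -> "Rendering clip 1/5"
print(render_progress(5, 5))  # 0.896 -> "Rendering clip 5/5"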
backend/app/services/subtitles.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 
 from app.models.schemas import TranscriptSegment
@@ -11,7 +12,10 @@ def seconds_to_srt_time(value: float) -> str:
     return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
 
 
-def write_srt(path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]) -> None:
+def write_srt(
+    path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]
+) -> list[dict]:
+    cues: list[dict] = []
     rows: list[str] = []
     index = 1
     for segment in segments:
@@ -19,26 +23,101 @@ def write_srt(path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]
             continue
         start = max(0.0, segment.start_seconds - clip_start)
         end = min(clip_end - clip_start, segment.end_seconds - clip_start)
-        rows.extend(
-            [
-                str(index),
-                f"{seconds_to_srt_time(start)} --> {seconds_to_srt_time(end)}",
-                segment.text.strip(),
-                "",
-            ]
-        )
-        index += 1
+        for cue in split_timed_caption(segment.text, start, max(end, start + 1.2)):
+            rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"]))
+            cues.append(cue)
+            index += 1
     if not rows:
-        rows = ["1", f"{seconds_to_srt_time(0.0)} --> {seconds_to_srt_time(3.0)}", "", ""]
+        cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}]
+        rows = _srt_row(1, 0.0, 3.0, "")
     path.write_text("\n".join(rows), encoding="utf-8")
+    return cues
 
 
-def write_single_caption_srt(path: Path, duration: float, text: str) -> None:
+def write_single_caption_srt(path: Path, duration: float, text: str) -> list[dict]:
     safe_duration = max(duration, 1.0)
-    rows = [
-        "1",
-        f"{seconds_to_srt_time(0.0)} --> {seconds_to_srt_time(safe_duration)}",
+    cues = split_timed_caption(text, 0.0, safe_duration)
+    rows: list[str] = []
+    for index, cue in enumerate(cues, start=1):
+        rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"]))
+    if not rows:
+        cues = [{"start_seconds": 0.0, "end_seconds": min(safe_duration, 3.0), "text": ""}]
+        rows = _srt_row(1, cues[0]["start_seconds"], cues[0]["end_seconds"], "")
+    path.write_text("\n".join(rows), encoding="utf-8")
+    return cues
+
+
+def split_timed_caption(text: str, start: float, end: float) -> list[dict]:
+    phrases = split_caption_text(text)
+    if not phrases:
+        return []
+
+    total_duration = max(end - start, 1.2)
+    max_cues = max(1, int(total_duration / 1.2))
+    if len(phrases) > max_cues:
+        phrases = _merge_phrases(phrases, max_cues)
+
+    cue_duration = min(4.0, max(1.2, total_duration / len(phrases)))
+    cues: list[dict] = []
+    cursor = start
+    for index, phrase in enumerate(phrases):
+        remaining = len(phrases) - index
+        max_end = end - ((remaining - 1) * 1.2)
+        cue_end = min(max_end, cursor + cue_duration)
+        cue_end = max(cue_end, cursor + 1.2)
+        if index == len(phrases) - 1:
+            cue_end = end
+        cues.append(
+            {
+                "start_seconds": round(cursor, 3),
+                "end_seconds": round(max(cue_end, cursor + 0.8), 3),
+                "text": phrase,
+            }
+        )
+        cursor = cue_end
+    return cues
+
+
+def split_caption_text(text: str, max_chars: int = 42, max_words: int = 7) -> list[str]:
+    clean = re.sub(r"\s+", " ", text.strip())
+    if not clean:
+        return []
+
+    words = clean.split()
+    if len(words) <= 1:
+        return [clean[index : index + max_chars] for index in range(0, len(clean), max_chars)]
+
+    phrases: list[str] = []
+    current: list[str] = []
+    for word in words:
+        candidate = " ".join([*current, word]).strip()
+        punctuation_break = bool(current and re.search(r"[,.!?;:]$", current[-1]))
+        if current and (len(candidate) > max_chars or len(current) >= max_words or punctuation_break):
+            phrases.append(" ".join(current))
+            current = [word]
+        else:
+            current.append(word)
+    if current:
+        phrases.append(" ".join(current))
+    return phrases
+
+
+def _merge_phrases(phrases: list[str], target_count: int) -> list[str]:
+    if target_count <= 1:
+        return [" ".join(phrases)]
+    merged: list[str] = []
+    bucket_size = len(phrases) / target_count
+    for index in range(target_count):
+        start = round(index * bucket_size)
+        end = round((index + 1) * bucket_size)
+        merged.append(" ".join(phrases[start:end]).strip())
+    return [phrase for phrase in merged if phrase]
+
+
+def _srt_row(index: int, start: float, end: float, text: str) -> list[str]:
+    return [
+        str(index),
+        f"{seconds_to_srt_time(start)} --> {seconds_to_srt_time(end)}",
         text.strip(),
         "",
     ]
-    path.write_text("\n".join(rows), encoding="utf-8")
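A sketch of how the new caption splitting behaves, assuming the module is importable; the sample sentence is taken from the demo transcript. Long captions break at trailing punctuation, the 42-character limit, or 7 words, and each phrase gets a cue of at least 1.2 seconds.

from app.services.subtitles import split_caption_text, split_timed_caption

text = "The important question is simple: which moment would make someone stop scrolling right now?"

print(split_caption_text(text))
# ['The important question is simple:',
#  'which moment would make someone stop',
#  'scrolling right now?']

print(split_timed_caption(text, 0.0, 6.0))
# three cues of roughly two seconds each, e.g.
# {'start_seconds': 0.0, 'end_seconds': 2.0, 'text': 'The important question is simple:'}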
backend/app/services/transcription.py
CHANGED
@@ -67,12 +67,22 @@ class WhisperTranscriber:
 
     def _demo_transcript(self, profile: ChannelProfile) -> list[TranscriptSegment]:
         style = profile.clip_style.lower()
-        niche = profile.niche.lower()
+        niche_value = (
+            profile.niche_custom
+            if profile.niche.lower() == "other" and profile.niche_custom
+            else profile.niche
+        )
+        niche = niche_value.lower()
+        creator_context = (
+            profile.channel_description
+            or "The creator wants clips that feel useful and easy to share."
+        )
         lines = [
             "This opening sets up the main problem creators face when a long video hides the best moments.",
             "Here is the surprising mistake most teams make when they choose clips only by view count.",
             "The important question is simple: which moment would make someone stop scrolling right now?",
             f"For a {niche} channel, the answer changes because the audience expects a {style} rhythm.",
+            f"The channel context is simple: {creator_context}",
             "This section has the clearest explanation and a strong before-and-after contrast.",
             "Then the guest reacts with a punchy line that works well as a short hook.",
             "A practical takeaway lands here, with enough context to stand alone as a sixty second clip.",