JakgritB committed
Commit · 5dadf47
Parent(s): 00b7145
feat(backend): enrich clipping pipeline metadata
Add channel profile description fields, per-job progress metadata, clip count support, clip-by-clip render progress, and shorter timed subtitle cues.
backend/app/models/schemas.py
CHANGED
@@ -17,12 +17,15 @@ class TargetPlatform(str, Enum):
 
 class ChannelProfile(BaseModel):
     niche: str = Field(default="education", min_length=2, max_length=80)
+    niche_custom: str = Field(default="", max_length=80)
+    channel_description: str = Field(default="", max_length=700)
     clip_style: str = Field(default="informative", min_length=2, max_length=80)
     clip_length_seconds: int = Field(default=60, ge=15, le=180)
+    clip_count: int = Field(default=5, ge=1, le=20)
     primary_language: str = Field(default="Thai", min_length=2, max_length=40)
     target_platform: TargetPlatform = TargetPlatform.tiktok
 
-    @field_validator("niche", "clip_style", "primary_language")
+    @field_validator("niche", "niche_custom", "channel_description", "clip_style", "primary_language")
     @classmethod
     def clean_text(cls, value: str) -> str:
         return value.strip()
@@ -75,6 +78,11 @@ class JobSnapshot(BaseModel):
     status: Literal["queued", "running", "completed", "failed"]
     progress: float = Field(ge=0, le=1)
     message: str
+    current_step: str = ""
+    step_index: int = Field(default=0, ge=0)
+    step_total: int = Field(default=6, ge=1)
+    active_clip_index: int = Field(default=0, ge=0)
+    active_clip_total: int = Field(default=0, ge=0)
     source: dict[str, Any]
     profile: ChannelProfile
     transcript: list[TranscriptSegment] = Field(default_factory=list)
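A minimal usage sketch of the enriched schema, assuming the backend package is importable; the field values below are illustrative placeholders, not repo defaults.

from app.models.schemas import ChannelProfile, TargetPlatform

profile = ChannelProfile(
    niche="other",
    niche_custom="board game reviews",  # hypothetical value; only consulted when niche == "other"
    channel_description="Deep-dive strategy breakdowns for casual players.",  # hypothetical
    clip_style="informative",
    clip_length_seconds=60,
    clip_count=8,  # later capped by settings.max_clips in the highlight detector
    primary_language="Thai",
    target_platform=TargetPlatform.tiktok,
)
# clean_text() strips surrounding whitespace from the validated text fields.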
backend/app/services/clips.py
CHANGED
@@ -1,6 +1,7 @@
 import shutil
 import subprocess
 from pathlib import Path
+from typing import Callable
 
 from app.core.config import Settings
 from app.models.schemas import ChannelProfile, ClipCandidate, TranscriptSegment
@@ -20,9 +21,13 @@ class ClipGenerator:
         clips: list[ClipCandidate],
         transcript: list[TranscriptSegment],
         profile: ChannelProfile,
+        progress_callback: Callable[[int, int], None] | None = None,
     ) -> list[ClipCandidate]:
         rendered: list[ClipCandidate] = []
+        total = len(clips)
         for index, clip in enumerate(clips, start=1):
+            if progress_callback:
+                progress_callback(index, total)
             rendered.append(self.render_one(job_id, video_path, clip, transcript, profile, index))
         return rendered
 
@@ -43,14 +48,15 @@ class ClipGenerator:
 
         duration = max(1.0, clip.end_seconds - clip.start_seconds)
         if clip.subtitle_text.strip():
-            write_single_caption_srt(subtitle_path, duration, clip.subtitle_text)
+            subtitle_cues = write_single_caption_srt(subtitle_path, duration, clip.subtitle_text)
         else:
-            write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript)
+            subtitle_cues = write_srt(subtitle_path, clip.start_seconds, clip.end_seconds, transcript)
         self._run_ffmpeg(video_path, output_path, subtitle_path, clip, profile)
 
         clip.video_url = self.store.media_url(job_id, output_name)
         clip.download_url = clip.video_url
         clip.metadata["subtitle_file"] = self.store.media_url(job_id, subtitle_name)
+        clip.metadata["subtitle_cues"] = subtitle_cues
         return clip
 
     def _run_ffmpeg(
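A small sketch of a callback that satisfies the new progress_callback parameter on ClipGenerator.generate; the print-based logger is only an illustration of the Callable[[int, int], None] contract, not code from the repo.

from typing import Callable

def log_render_progress(index: int, total: int) -> None:
    # generate() invokes the callback once per clip, just before rendering it
    print(f"rendering clip {index}/{total}")

progress_callback: Callable[[int, int], None] | None = log_render_progress
# e.g. clip_generator.generate(job_id, video_path, clips, transcript, profile, progress_callback)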
backend/app/services/highlight.py
CHANGED
@@ -41,12 +41,17 @@ class QwenHighlightDetector:
             f"[{segment.start_seconds:.1f}-{segment.end_seconds:.1f}] {segment.text}"
             for segment in transcript
         )
+        niche = _effective_niche(profile)
+        channel_description = profile.channel_description or "No extra channel description provided."
+        clip_count = min(profile.clip_count, self.settings.max_clips)
         prompt = f"""
 You are selecting short-form clips for a creator.
 Profile:
-- niche: {profile.niche}
+- niche: {niche}
+- creator description: {channel_description}
 - style: {profile.clip_style}
 - target length seconds: {profile.clip_length_seconds}
+- target number of clips: {clip_count}
 - language: {profile.primary_language}
 - platform: {profile.target_platform.value}
 
@@ -80,7 +85,7 @@ Transcript:
                 subtitle_text=str(item.get("subtitle_text") or ""),
                 metadata={"model": self.settings.qwen_text_model_id},
             )
-            for item in payload[: self.settings.max_clips]
+            for item in payload[:clip_count]
         ]
         return clips or self._heuristic_detect(transcript, profile)
 
@@ -103,6 +108,12 @@ Transcript:
             "educational": ["question", "answer", "context", "takeaway"],
         }
         preferred_terms = style_terms.get(profile.clip_style.lower(), [])
+        niche = _effective_niche(profile)
+        profile_terms = [
+            term
+            for term in f"{niche} {profile.channel_description}".lower().split()[:30]
+            if len(term) > 2
+        ]
         scored: list[tuple[float, TranscriptSegment]] = []
         for segment in transcript:
             text = segment.text.lower()
@@ -111,12 +122,14 @@ Transcript:
             score += 8 if any(term in text for term in preferred_terms) else 0
             score += 8 if any(term in text for term in ["mistake", "surprising", "stop scrolling"]) else 0
             score += 6 if any(term in text for term in ["takeaway", "answer", "reacts"]) else 0
+            score += 5 if any(term in text for term in profile_terms) else 0
             score += min(len(segment.text) / 12, 10)
             scored.append((min(score, 100), segment))
 
         scored.sort(key=lambda item: item[0], reverse=True)
         clips: list[ClipCandidate] = []
-        for score, segment in scored[: self.settings.max_clips]:
+        clip_count = min(profile.clip_count, self.settings.max_clips)
+        for score, segment in scored[:clip_count]:
             start = max(0.0, segment.start_seconds - 5.0)
             end = start + float(profile.clip_length_seconds)
             clips.append(
@@ -125,7 +138,7 @@ Transcript:
                     start_seconds=start,
                     end_seconds=end,
                     title=self._title_for(segment.text),
-                    reason=f"Matches the {profile.clip_style} style for a {profile.niche} audience.",
+                    reason=f"Matches the {profile.clip_style} style for a {niche} audience.",
                     score=round(score, 1),
                     subtitle_text=segment.text,
                     metadata={"model": "heuristic-fallback"},
@@ -137,3 +150,9 @@ Transcript:
         words = re.sub(r"[^A-Za-z0-9 ]+", "", text).split()
         title = " ".join(words[:7])
         return title or "Highlight"
+
+
+def _effective_niche(profile: ChannelProfile) -> str:
+    if profile.niche.lower() == "other" and profile.niche_custom:
+        return profile.niche_custom
+    return profile.niche
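A standalone sketch of the niche resolution and profile-term extraction the heuristic fallback now applies; the stub profile values are illustrative, not from the repo.

niche = "gaming"  # _effective_niche(profile) returns niche_custom only when niche == "other"
channel_description = "Speedrun breakdowns and glitch explanations"  # hypothetical

profile_terms = [
    term
    for term in f"{niche} {channel_description}".lower().split()[:30]
    if len(term) > 2  # drops one- and two-letter connector words
]
# Any transcript segment containing one of these terms earns a +5 score bonus.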
backend/app/services/pipeline.py
CHANGED
@@ -31,7 +31,13 @@ class VideoPipeline:
         timings = TimingLog()
         try:
             self.store.update_job(
-                job_id,
+                job_id,
+                status="running",
+                progress=0.04,
+                message="Preparing video input",
+                current_step="input",
+                step_index=1,
+                step_total=6,
             )
             with timings.measure("input"):
                 if source_kind == "youtube":
@@ -42,7 +48,12 @@ class VideoPipeline:
                 video_path = Path(source_value)
 
             self.store.update_job(
-                job_id,
+                job_id,
+                progress=0.18,
+                message="Transcribing with Whisper Large V3",
+                current_step="transcription",
+                step_index=2,
+                step_total=6,
             )
             with timings.measure("transcription"):
                 transcript = await asyncio.to_thread(
@@ -53,22 +64,84 @@ class VideoPipeline:
                 "transcript.json",
                 [segment.model_dump(mode="json") for segment in transcript],
             )
-            self.store.update_job(
+            self.store.update_job(
+                job_id,
+                progress=0.42,
+                message="Transcript ready",
+                transcript=transcript,
+                timings=timings.to_dict(),
+            )
 
-            self.store.update_job(
+            self.store.update_job(
+                job_id,
+                progress=0.48,
+                message="Scoring highlights with Qwen",
+                current_step="highlight_detection",
+                step_index=3,
+                step_total=6,
+            )
             with timings.measure("highlight_detection"):
                 clips = await asyncio.to_thread(self.highlight_detector.detect, transcript, profile)
 
-            self.store.update_job(
+            self.store.update_job(
+                job_id,
+                progress=0.62,
+                message="Checking visual highlights",
+                current_step="multimodal_analysis",
+                step_index=4,
+                step_total=6,
+            )
             with timings.measure("multimodal_analysis"):
                 clips = await asyncio.to_thread(self.visual_analyzer.enrich, str(video_path), clips)
 
-
+            clip_total = len(clips)
+            self.store.update_job(
+                job_id,
+                progress=0.72,
+                message=f"Preparing to render {clip_total} clips",
+                current_step="clip_generation",
+                step_index=5,
+                step_total=6,
+                active_clip_index=0,
+                active_clip_total=clip_total,
+            )
+
+            def update_render_progress(index: int, total: int) -> None:
+                progress = 0.72 + (0.22 * ((index - 1) / max(total, 1)))
+                self.store.update_job(
+                    job_id,
+                    progress=min(progress, 0.94),
+                    message=f"Rendering clip {index}/{total}",
+                    current_step="clip_generation",
+                    step_index=5,
+                    step_total=6,
+                    active_clip_index=index,
+                    active_clip_total=total,
+                    timings=timings.to_dict(),
+                )
+
             with timings.measure("clip_generation"):
                 rendered = await asyncio.to_thread(
-                    self.clip_generator.generate,
+                    self.clip_generator.generate,
+                    job_id,
+                    video_path,
+                    clips,
+                    transcript,
+                    profile,
+                    update_render_progress,
                 )
 
+            self.store.update_job(
+                job_id,
+                progress=0.97,
+                message="Finalizing clips",
+                current_step="finalizing",
+                step_index=6,
+                step_total=6,
+                active_clip_index=clip_total,
+                active_clip_total=clip_total,
+                timings=timings.to_dict(),
+            )
             self.store.write_json(
                 job_id, "clips.json", [clip.model_dump(mode="json") for clip in rendered]
             )
@@ -77,6 +150,11 @@ class VideoPipeline:
                 status="completed",
                 progress=1,
                 message="Clips ready",
+                current_step="completed",
+                step_index=6,
+                step_total=6,
+                active_clip_index=clip_total,
+                active_clip_total=clip_total,
                 transcript=transcript,
                 clips=rendered,
                 timings=timings.to_dict(),
@@ -87,6 +165,7 @@ class VideoPipeline:
                 status="failed",
                 progress=1,
                 message="Processing failed",
+                current_step="failed",
                 error=str(exc),
                 timings=timings.to_dict(),
             )
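A worked check of the per-clip progress mapping in update_render_progress: clip rendering occupies the 0.72 to 0.94 band, split evenly across the clips. This is a standalone restatement of the formula above, not pipeline code.

def render_progress(index: int, total: int) -> float:
    progress = 0.72 + (0.22 * ((index - 1) / max(total, 1)))
    return min(progress, 0.94)

print(render_progress(1, 5))  # 0.72  -> "Rendering clip 1/5"
print(render_progress(5, 5))  # 0.896 -> "Rendering clip 5/5"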
backend/app/services/subtitles.py
CHANGED
@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 
 from app.models.schemas import TranscriptSegment
@@ -11,7 +12,10 @@ def seconds_to_srt_time(value: float) -> str:
     return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"
 
 
-def write_srt(path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]) -> None:
+def write_srt(
+    path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]
+) -> list[dict]:
+    cues: list[dict] = []
     rows: list[str] = []
     index = 1
     for segment in segments:
@@ -19,26 +23,101 @@ def write_srt(path: Path, clip_start: float, clip_end: float, segments: list[TranscriptSegment]
             continue
         start = max(0.0, segment.start_seconds - clip_start)
         end = min(clip_end - clip_start, segment.end_seconds - clip_start)
-        rows.extend(
-            [
-                str(index),
-                f"{seconds_to_srt_time(start)} --> {seconds_to_srt_time(end)}",
-                segment.text.strip(),
-                "",
-            ]
-        )
-        index += 1
+        for cue in split_timed_caption(segment.text, start, max(end, start + 1.2)):
+            rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"]))
+            cues.append(cue)
+            index += 1
     if not rows:
-        rows = ["1", f"{seconds_to_srt_time(0.0)} --> {seconds_to_srt_time(3.0)}", "", ""]
+        cues = [{"start_seconds": 0.0, "end_seconds": 3.0, "text": ""}]
+        rows = _srt_row(1, 0.0, 3.0, "")
     path.write_text("\n".join(rows), encoding="utf-8")
+    return cues
 
 
-def write_single_caption_srt(path: Path, duration: float, text: str) -> None:
+def write_single_caption_srt(path: Path, duration: float, text: str) -> list[dict]:
     safe_duration = max(duration, 1.0)
-    rows = [
-        "1",
-        f"{seconds_to_srt_time(0.0)} --> {seconds_to_srt_time(safe_duration)}",
+    cues = split_timed_caption(text, 0.0, safe_duration)
+    rows: list[str] = []
+    for index, cue in enumerate(cues, start=1):
+        rows.extend(_srt_row(index, cue["start_seconds"], cue["end_seconds"], cue["text"]))
+    if not rows:
+        cues = [{"start_seconds": 0.0, "end_seconds": min(safe_duration, 3.0), "text": ""}]
+        rows = _srt_row(1, cues[0]["start_seconds"], cues[0]["end_seconds"], "")
+    path.write_text("\n".join(rows), encoding="utf-8")
+    return cues
+
+
+def split_timed_caption(text: str, start: float, end: float) -> list[dict]:
+    phrases = split_caption_text(text)
+    if not phrases:
+        return []
+
+    total_duration = max(end - start, 1.2)
+    max_cues = max(1, int(total_duration / 1.2))
+    if len(phrases) > max_cues:
+        phrases = _merge_phrases(phrases, max_cues)
+
+    cue_duration = min(4.0, max(1.2, total_duration / len(phrases)))
+    cues: list[dict] = []
+    cursor = start
+    for index, phrase in enumerate(phrases):
+        remaining = len(phrases) - index
+        max_end = end - ((remaining - 1) * 1.2)
+        cue_end = min(max_end, cursor + cue_duration)
+        cue_end = max(cue_end, cursor + 1.2)
+        if index == len(phrases) - 1:
+            cue_end = end
+        cues.append(
+            {
+                "start_seconds": round(cursor, 3),
+                "end_seconds": round(max(cue_end, cursor + 0.8), 3),
+                "text": phrase,
+            }
+        )
+        cursor = cue_end
+    return cues
+
+
+def split_caption_text(text: str, max_chars: int = 42, max_words: int = 7) -> list[str]:
+    clean = re.sub(r"\s+", " ", text.strip())
+    if not clean:
+        return []
+
+    words = clean.split()
+    if len(words) <= 1:
+        return [clean[index : index + max_chars] for index in range(0, len(clean), max_chars)]
+
+    phrases: list[str] = []
+    current: list[str] = []
+    for word in words:
+        candidate = " ".join([*current, word]).strip()
+        punctuation_break = bool(current and re.search(r"[,.!?;:]$", current[-1]))
+        if current and (len(candidate) > max_chars or len(current) >= max_words or punctuation_break):
+            phrases.append(" ".join(current))
+            current = [word]
+        else:
+            current.append(word)
+    if current:
+        phrases.append(" ".join(current))
+    return phrases
+
+
+def _merge_phrases(phrases: list[str], target_count: int) -> list[str]:
+    if target_count <= 1:
+        return [" ".join(phrases)]
+    merged: list[str] = []
+    bucket_size = len(phrases) / target_count
+    for index in range(target_count):
+        start = round(index * bucket_size)
+        end = round((index + 1) * bucket_size)
+        merged.append(" ".join(phrases[start:end]).strip())
+    return [phrase for phrase in merged if phrase]
+
+
+def _srt_row(index: int, start: float, end: float, text: str) -> list[str]:
+    return [
+        str(index),
+        f"{seconds_to_srt_time(start)} --> {seconds_to_srt_time(end)}",
         text.strip(),
         "",
     ]
-    path.write_text("\n".join(rows), encoding="utf-8")
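A sketch of how the new caption splitting behaves, assuming the module is importable; the sample sentence is taken from the demo transcript. Long captions break at trailing punctuation, the 42-character limit, or 7 words, and each phrase gets a cue of at least 1.2 seconds.

from app.services.subtitles import split_caption_text, split_timed_caption

text = "The important question is simple: which moment would make someone stop scrolling right now?"

print(split_caption_text(text))
# ['The important question is simple:',
#  'which moment would make someone stop',
#  'scrolling right now?']

print(split_timed_caption(text, 0.0, 6.0))
# three cues of roughly two seconds each, e.g.
# {'start_seconds': 0.0, 'end_seconds': 2.0, 'text': 'The important question is simple:'}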
backend/app/services/transcription.py
CHANGED
@@ -67,12 +67,22 @@ class WhisperTranscriber:
 
     def _demo_transcript(self, profile: ChannelProfile) -> list[TranscriptSegment]:
         style = profile.clip_style.lower()
-        niche = profile.niche.lower()
+        niche_value = (
+            profile.niche_custom
+            if profile.niche.lower() == "other" and profile.niche_custom
+            else profile.niche
+        )
+        niche = niche_value.lower()
+        creator_context = (
+            profile.channel_description
+            or "The creator wants clips that feel useful and easy to share."
+        )
         lines = [
             "This opening sets up the main problem creators face when a long video hides the best moments.",
             "Here is the surprising mistake most teams make when they choose clips only by view count.",
             "The important question is simple: which moment would make someone stop scrolling right now?",
             f"For a {niche} channel, the answer changes because the audience expects a {style} rhythm.",
+            f"The channel context is simple: {creator_context}",
             "This section has the clearest explanation and a strong before-and-after contrast.",
             "Then the guest reacts with a punchy line that works well as a short hook.",
             "A practical takeaway lands here, with enough context to stand alone as a sixty second clip.",