diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..076f58049936f42603b4e1a070dc965eb4316b09 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,15 @@ +.git +.env +.env.* +!.env.example +.venv +__pycache__ +.pytest_cache +.humeo_* +.tmp_review_frames +.tmp_review_frames_ticketc +output +output* +*.log +*.zip +*.pyc diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..d1eaaae6f4b71a996fba468c461ffbf599f74deb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf filter=lfs diff=lfs merge=lfs -text diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..a78fc156804e6d7261a277a4dcfc2e829a7a7a53 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.12-slim-bookworm + +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PORT=7860 + +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y ffmpeg && \ + rm -rf /var/lib/apt/lists/* + +COPY . /app + +RUN pip install --upgrade pip && \ + pip install ./humeo-core && \ + pip install . + +EXPOSE 7860 + +CMD ["python", "app.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f2d768392bb52fd2079bb0a642954ba1bbe8688a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 NotABot + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 6838c02488f486801cce092812db9b806d4eab2d..ce8910f6ecc271fdc01677157ae28439a9c09bfa 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,199 @@ ---- -title: Clipforge -emoji: πŸ† -colorFrom: blue -colorTo: gray -sdk: docker -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +--- +title: ClipForge +sdk: docker +app_port: 7860 +--- + +# ClipForge + +Current default preset: + +- `native_highlight` captions +- OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages +- Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available +- ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set + +Long podcast or interview β†’ vertical 9:16 shorts. 
Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render. + +**Architecture (static HTML, GitHub Pages):** +[https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html) + +## Hugging Face Space + +This repo includes a Hugging Face Docker Space entrypoint in `app.py` with the ClipForge upload/link UI. + +- Paste a YouTube/video URL or upload one local video file +- Watch live pipeline progress in the ClipForge UI +- Preview and download rendered `short_*.mp4` clips from the UI +- Regenerate from the same source with a steering prompt + +Required Space secrets: + +- `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY` +- `OPENAI_API_KEY` or `ELEVENLABS_API_KEY` + +If `HUMEO_TRANSCRIBE_PROVIDER` is not set, the Space uses ElevenLabs when +`ELEVENLABS_API_KEY` exists, otherwise OpenAI Whisper. + +## Repo layout + +| Path | Role | +|------|------| +| `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters | +| `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server | + +## Pipeline (actual order) + +```text +YouTube URL + β†’ ingest (source.mp4, transcript.json) + β†’ clip selection (Gemini β†’ clips.json) + β†’ hook detection (Gemini β†’ hooks.json) + β†’ content pruning (Gemini β†’ prune.json) + β†’ keyframes + layout vision (Gemini vision β†’ layout_vision.json) + β†’ ASS subtitles + humeo-core ffmpeg render β†’ short_.mp4 +``` + +Details: **`docs/PIPELINE.md`**. + +## Five layouts + +A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**). + +## Requirements + +- **Python** β‰₯ 3.10 +- **`uv`** β€” install: [astral.sh/uv](https://docs.astral.sh/uv/) +- **`ffmpeg`** β€” on `PATH` for extract/render +- **API keys** β€” see **`docs/ENVIRONMENT.md`** + - `GOOGLE_API_KEY` or `GEMINI_API_KEY` β€” preferred for Gemini stages + - `OPENROUTER_API_KEY` β€” supported fallback for those same Gemini-like stages when Google keys are unavailable + - `OPENAI_API_KEY` β€” if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`) + +Copy **`.env.example`** β†’ **`.env`** (never commit `.env`). + +## Install + +```bash +uv venv +uv sync +``` + +Optional local WhisperX (heavy; Windows often uses OpenAI API instead): + +```bash +uv sync --extra whisper +``` + +## Run + +```bash +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" +humeo --long-to-shorts "C:\path\to\video.mp4" +``` + +Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**). + +## CLI guide (all flags) + +Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`. + +### Required + +| Flag | Meaning | +|------|---------| +| `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). | + +### Paths and cache behavior + +| Flag | Meaning | +|------|---------| +| `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). | +| `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). | +| `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. | +| `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). 
| +| `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. | + +### Model selection and stage forcing + +| Flag | Meaning | +|------|---------| +| `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). | +| `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). | +| `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. | +| `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. | +| `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. | +| `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. | +| `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). | + +### Pruning and subtitles + +| Flag | Meaning | +|------|---------| +| `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). | +| `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). | +| `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). | +| `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). | +| `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). | + +### Logging + +| Flag | Meaning | +|------|---------| +| `--verbose`, `-v` | Enable debug logging. | + +### Common command recipes + +```bash +# Basic run +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" + +# Local MP4 +humeo --long-to-shorts "C:\path\to\video.mp4" + +# Full fresh run for debugging / prompt tuning +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose + +# Re-run only clip selection after prompt edits +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection + +# Keep intermediates in a fixed local folder +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work + +# Compare different prune levels on same source +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive +``` + +## Documentation + +| Doc | Purpose | +|-----|---------| +| **`docs/README.md`** | Index of all files under `docs/` | +| **`docs/STUDY_ORDER.md`** | Read order for onboarding | +| **`docs/PIPELINE.md`** | Stages, caches, JSON contracts | +| **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout | +| **`docs/SHARING.md`** | How to share logs/docs/video without bloating git | +| **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example | +| **`docs/full_run_output.txt`** | Example full run log (text) | +| **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping Β§9 | +| **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap | +| **`docs/TODO.md`** | Backlog | +| **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) | +| **`docs/SOLUTIONS.md`** | Design rationale | +| **`TERMINOLOGY.md`** | Glossary | + +## Tests + +```bash +uv sync --extra dev +uv run pytest +``` + +## Sharing outputs + +`output/`, `*.mp4`, and `keyframes/` are **gitignored**. 
Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**. + +## License + +See **`LICENSE`** (root) and **`humeo-core/LICENSE`**. diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..cc25374683b045a589bdfd92ac0cf53a6cde8647 --- /dev/null +++ b/app.py @@ -0,0 +1,808 @@ +from __future__ import annotations + +import html +import json +import logging +import os +import queue +import re +import shutil +import subprocess +import sys +import tempfile +import threading +import time +import traceback +import uuid +from dataclasses import dataclass, field +from pathlib import Path +from typing import Annotated + + +def _bootstrap_local_paths() -> None: + repo_root = Path(__file__).resolve().parent + for candidate in (repo_root / "src", repo_root / "humeo-core" / "src"): + candidate_str = str(candidate) + if candidate.is_dir() and candidate_str not in sys.path: + sys.path.insert(0, candidate_str) + + +_bootstrap_local_paths() +if not (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip(): + os.environ["HUMEO_TRANSCRIBE_PROVIDER"] = ( + "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai" + ) + +from fastapi import FastAPI, File, Form, HTTPException, UploadFile +from fastapi.responses import FileResponse, HTMLResponse, JSONResponse + +from humeo.config import PipelineConfig +from humeo.pipeline import run_pipeline + + +APP_TITLE = "ClipForge" +LOG_FORMAT = "%(asctime)s | %(levelname)-7s | %(name)s | %(message)s" +MAX_LOG_LINES = 700 +LLM_KEY_NAMES = ("GOOGLE_API_KEY", "GEMINI_API_KEY", "OPENROUTER_API_KEY") + + +class QueueLogHandler(logging.Handler): + def __init__(self, sink: queue.Queue[str]): + super().__init__() + self._sink = sink + + def emit(self, record: logging.LogRecord) -> None: + try: + self._sink.put_nowait(self.format(record)) + except Exception: + pass + + +@dataclass +class ClipFile: + name: str + url: str + duration: str + + +@dataclass +class Job: + id: str + run_root: Path + output_dir: Path + work_dir: Path + source: str + source_path: Path | None = None + steering_note: str | None = None + status: str = "Queued" + nav_status: str = "Processing..." 
+ error: str | None = None + done: bool = False + created_at: float = field(default_factory=time.time) + logs: list[str] = field(default_factory=list) + clips: dict[str, ClipFile] = field(default_factory=dict) + steps: list[dict[str, object]] = field( + default_factory=lambda: [ + {"name": "Uploading video", "pct": 100, "state": "done"}, + {"name": "Generating transcript", "pct": 5, "state": "active"}, + {"name": "Choosing short clips", "pct": 0, "state": "pending"}, + {"name": "Producing clips", "pct": 0, "state": "pending"}, + {"name": "Adding subtitles & light edits", "pct": 0, "state": "pending"}, + ] + ) + + +JOBS: dict[str, Job] = {} +JOBS_LOCK = threading.Lock() + + +def _append_log(job: Job, line: str) -> None: + job.logs.append(line) + if len(job.logs) > MAX_LOG_LINES: + job.logs = job.logs[-MAX_LOG_LINES:] + + +def _set_step(job: Job, idx: int, pct: int, state: str = "active") -> None: + for step_idx, step in enumerate(job.steps): + if step_idx < idx: + step["pct"] = 100 + step["state"] = "done" + elif step_idx == idx: + step["pct"] = max(int(step.get("pct", 0)), min(100, pct)) + step["state"] = state + elif step.get("state") != "done": + step["state"] = "pending" + + +def _update_stage_from_log(job: Job, line: str) -> None: + if "STAGE 1: INGESTION" in line: + job.status = "Generating transcript" + _set_step(job, 1, 15) + elif "Transcribing" in line: + job.status = "Generating transcript" + _set_step(job, 1, 45) + elif "Transcript already exists" in line or "Transcription complete" in line: + _set_step(job, 1, 90) + elif "STAGE 2: CLIP SELECTION" in line: + job.status = "Choosing short clips" + _set_step(job, 2, 20) + elif "STAGE 2.25: HOOK DETECTION" in line: + job.status = "Finding hooks" + _set_step(job, 2, 55) + elif "STAGE 2.5: CONTENT PRUNING" in line: + job.status = "Tightening clip windows" + _set_step(job, 2, 78) + elif "STAGE 2.75: CLIP ASSEMBLY" in line: + job.status = "Assembling clips" + _set_step(job, 3, 18) + elif "STAGE 3: CLIP LAYOUTS" in line: + job.status = "Choosing layout" + _set_step(job, 3, 38) + elif "STAGE 4: RENDER" in line: + job.status = "Producing clips" + _set_step(job, 3, 62) + elif "reframe_clip_ffmpeg" in line: + _set_step(job, 4, min(90, 20 + len(job.clips) * 12)) + elif "RENDER QA" in line or "Render QA summary" in line: + job.status = "Checking clips" + _set_step(job, 4, 82) + elif "PIPELINE COMPLETE" in line: + job.status = "Complete" + job.nav_status = "Done" + for step in job.steps: + step["pct"] = 100 + step["state"] = "done" + + +def _install_log_handler(message_queue: queue.Queue[str]) -> tuple[logging.Handler, int, dict[str, int]]: + handler = QueueLogHandler(message_queue) + handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt="%H:%M:%S")) + + root_logger = logging.getLogger() + previous_level = root_logger.level + root_logger.addHandler(handler) + root_logger.setLevel(logging.INFO) + + previous_logger_levels: dict[str, int] = {} + for logger_name in ("urllib3", "httpx", "httpcore"): + logger = logging.getLogger(logger_name) + previous_logger_levels[logger_name] = logger.level + logger.setLevel(logging.WARNING) + + return handler, previous_level, previous_logger_levels + + +def _remove_log_handler( + handler: logging.Handler, + previous_root_level: int, + previous_logger_levels: dict[str, int], +) -> None: + root_logger = logging.getLogger() + root_logger.removeHandler(handler) + root_logger.setLevel(previous_root_level) + for logger_name, level in previous_logger_levels.items(): + logging.getLogger(logger_name).setLevel(level) + 
+ +def _duration_label(path: Path) -> str: + try: + result = subprocess.run( + [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + str(path), + ], + check=True, + capture_output=True, + text=True, + timeout=15, + ) + total = max(0, int(round(float(result.stdout.strip())))) + except Exception: + total = 0 + return f"{total // 60}:{total % 60:02d}" if total else "0:00" + + +def _publish_files(job: Job) -> None: + for path in sorted(job.output_dir.glob("short_*.mp4")): + if path.name not in job.clips and path.is_file(): + job.clips[path.name] = ClipFile( + name=path.name, + url=f"/api/jobs/{job.id}/files/{path.name}", + duration=_duration_label(path), + ) + + +def _validate_credentials() -> None: + if not any((os.environ.get(name) or "").strip() for name in LLM_KEY_NAMES): + raise HTTPException( + status_code=400, + detail="Missing LLM secret. Set GOOGLE_API_KEY, GEMINI_API_KEY, or OPENROUTER_API_KEY in the Space secrets.", + ) + + provider = (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip().lower() + if provider in {"", "auto"}: + provider = "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai" + if provider == "elevenlabs" and not (os.environ.get("ELEVENLABS_API_KEY") or "").strip(): + raise HTTPException(status_code=400, detail="Missing ELEVENLABS_API_KEY Space secret.") + if provider in {"openai", "api"} and not (os.environ.get("OPENAI_API_KEY") or "").strip(): + raise HTTPException(status_code=400, detail="Missing OPENAI_API_KEY Space secret.") + + +def _safe_url(value: str | None) -> str | None: + value = (value or "").strip() + if not value: + return None + if not re.match(r"^https?://", value, flags=re.I): + raise HTTPException(status_code=400, detail="Paste a valid http(s) video URL.") + return value + + +def _snapshot(job: Job) -> dict[str, object]: + return { + "id": job.id, + "status": job.status, + "nav_status": job.nav_status, + "done": job.done, + "error": job.error, + "logs": "\n".join(job.logs[-MAX_LOG_LINES:]), + "steps": job.steps, + "clips": [clip.__dict__ for clip in job.clips.values()], + } + + +def _run_job(job_id: str) -> None: + with JOBS_LOCK: + job = JOBS[job_id] + message_queue: queue.Queue[str] = queue.Queue() + handler, previous_root_level, previous_logger_levels = _install_log_handler(message_queue) + + def drain_queue() -> None: + with JOBS_LOCK: + local_job = JOBS[job_id] + while True: + try: + line = message_queue.get_nowait() + except queue.Empty: + break + _append_log(local_job, line) + _update_stage_from_log(local_job, line) + _publish_files(local_job) + + try: + with JOBS_LOCK: + _append_log(job, f"Prepared source: {job.source}") + _append_log(job, f"Run id: {job.id}") + _set_step(job, 1, 8) + + config = PipelineConfig( + source=job.source, + youtube_url=job.source, + output_dir=job.output_dir, + work_dir=job.work_dir, + use_video_cache=False, + clean_run=True, + interactive=False, + prune_level="balanced", + overwrite_outputs=True, + steering_notes=[job.steering_note] if job.steering_note else [], + ) + + worker_error: str | None = None + outputs: list[Path] = [] + + def pipeline_worker() -> None: + nonlocal outputs, worker_error + try: + outputs = run_pipeline(config) + except Exception as exc: + worker_error = str(exc) + for line in traceback.format_exc().splitlines(): + if line.strip(): + message_queue.put_nowait(line) + + thread = threading.Thread(target=pipeline_worker, daemon=True) + thread.start() + while thread.is_alive(): + 
drain_queue() + time.sleep(0.35) + drain_queue() + + with JOBS_LOCK: + local_job = JOBS[job_id] + for output in outputs: + if Path(output).exists(): + local_job.clips[Path(output).name] = ClipFile( + name=Path(output).name, + url=f"/api/jobs/{job_id}/files/{Path(output).name}", + duration=_duration_label(Path(output)), + ) + if worker_error: + local_job.error = worker_error + local_job.status = f"Failed: {worker_error}" + local_job.nav_status = "Failed" + else: + local_job.status = "Complete" if local_job.clips else "Complete - no clips generated" + local_job.nav_status = "Done" + for step in local_job.steps: + step["pct"] = 100 + step["state"] = "done" + local_job.done = True + finally: + _remove_log_handler(handler, previous_root_level, previous_logger_levels) + + +async def _stage_upload(uploaded_file: UploadFile, run_root: Path) -> Path: + suffix = Path(uploaded_file.filename or "input.mp4").suffix or ".mp4" + staged_path = run_root / f"input{suffix}" + with staged_path.open("wb") as handle: + while chunk := await uploaded_file.read(1024 * 1024): + handle.write(chunk) + return staged_path + + +app = FastAPI(title=APP_TITLE) + + +@app.get("/", response_class=HTMLResponse) +def index() -> str: + return INDEX_HTML + + +@app.post("/api/jobs") +async def create_job( + video_url: Annotated[str | None, Form()] = None, + regen_prompt: Annotated[str | None, Form()] = None, + source_job_id: Annotated[str | None, Form()] = None, + file: Annotated[UploadFile | None, File()] = None, +) -> JSONResponse: + _validate_credentials() + job_id = uuid.uuid4().hex[:12] + run_root = Path(tempfile.mkdtemp(prefix=f"clipforge-{job_id}-")) + work_dir = run_root / "work" + output_dir = run_root / "output" + work_dir.mkdir(parents=True, exist_ok=True) + output_dir.mkdir(parents=True, exist_ok=True) + + source_path: Path | None = None + source = _safe_url(video_url) + source_job_id = (source_job_id or "").strip() + if source_job_id: + with JOBS_LOCK: + previous = JOBS.get(source_job_id) + if previous is None: + raise HTTPException(status_code=404, detail="Previous job not found for regeneration.") + if previous.source_path and previous.source_path.exists(): + source_path = run_root / previous.source_path.name + shutil.copy2(previous.source_path, source_path) + source = str(source_path) + else: + source = previous.source + elif file is not None: + source_path = await _stage_upload(file, run_root) + source = str(source_path) + + if not source: + raise HTTPException(status_code=400, detail="Upload a video file or paste a video URL first.") + + job = Job( + id=job_id, + run_root=run_root, + output_dir=output_dir, + work_dir=work_dir, + source=source, + source_path=source_path, + steering_note=(regen_prompt or "").strip() or None, + ) + with JOBS_LOCK: + JOBS[job_id] = job + + threading.Thread(target=_run_job, args=(job_id,), daemon=True).start() + return JSONResponse(_snapshot(job)) + + +@app.get("/api/jobs/{job_id}") +def get_job(job_id: str) -> JSONResponse: + with JOBS_LOCK: + job = JOBS.get(job_id) + if job is None: + raise HTTPException(status_code=404, detail="Job not found.") + _publish_files(job) + return JSONResponse(_snapshot(job)) + + +@app.get("/api/jobs/{job_id}/files/{filename}") +def get_job_file(job_id: str, filename: str) -> FileResponse: + with JOBS_LOCK: + job = JOBS.get(job_id) + if job is None: + raise HTTPException(status_code=404, detail="Job not found.") + path = (job.output_dir / Path(filename).name).resolve(strict=False) + if job.output_dir.resolve(strict=False) not in path.parents or not 
path.is_file(): + raise HTTPException(status_code=404, detail="File not found.") + return FileResponse(path, media_type="video/mp4", filename=path.name) + + +@app.get("/health") +def health() -> dict[str, str]: + return {"ok": "true"} + + +INDEX_HTML = r""" + + + + +ClipForge - Video to Clips + + + + + + +
+<!-- Embedded single-page UI markup (tags lost in this listing). Recoverable text:
+     hero "AI Video Editor" / "Convert your long video to short clips for social media" /
+     "Paste a link or upload a file - we handle the rest"; a link-or-upload form
+     ("Click to browse or drag & drop", "MP4, MOV, AVI - up to your Space limit");
+     a progress view ("Working on it", "Your clips are being crafted",
+     "Sit back - long videos can take a little while") with five steps
+     (Uploading video, Generating transcript, Choosing short clips, Producing clips,
+     Adding subtitles & light edits); a "Tips while you wait" list (hook trimming,
+     per-clip layout choice, word-by-word subtitles, regeneration); and a
+     "Produce a different set" regenerate form ("Describe what you're looking for and
+     we'll re-cut your video") with preset chips: Key insights, Funny moments,
+     Emotional, High energy. -->
+ + + +""" + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860"))) diff --git a/humeo-core/.gitignore b/humeo-core/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5cdfd8b15451f9536bacee169a7b07d76b8f934b --- /dev/null +++ b/humeo-core/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +.pytest_cache/ +build/ +dist/ +.venv/ +.env diff --git a/humeo-core/LICENSE b/humeo-core/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f2d768392bb52fd2079bb0a642954ba1bbe8688a --- /dev/null +++ b/humeo-core/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 NotABot + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/humeo-core/README.md b/humeo-core/README.md new file mode 100644 index 0000000000000000000000000000000000000000..22ea8a681ddc2eb2b5c7d0077f4d867a5e140e5e --- /dev/null +++ b/humeo-core/README.md @@ -0,0 +1,165 @@ +# humeo-core + +**Reusable-rocket MCP server for long-video β†’ 9:16 shorts.** + +First-principles design, from the HIVE paper + Bryan's rocket analogy: +we don't build doors and windows (general subject-tracker UI, retraining +models). We build the **container** (schemas), **landing gear** (deterministic +local extraction), and **five thrusters** (the five 9:16 layouts this video +format actually uses). Everything else is pluggable. 
+ +## The rocket, in one picture + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Control panel (MCP tools) β”‚ <- any MCP client + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ strict JSON + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β–Ό β–Ό β–Ό β–Ό β–Ό + ingest classify_scenes select_clips plan_layout render_clip +(scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile, + keyframes + classifier) heuristic + pure filter dry-run safe) + transcript) LLM-ready) math) + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ LayoutKind β”‚ + β”‚ ──────────────── β”‚ + β”‚ zoom_call_center β”‚ + β”‚ sit_center β”‚ + β”‚ split_chart_personβ”‚ + β”‚ split_two_persons β”‚ + β”‚ split_two_charts β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +Only the classifier and clip-selector have optional LLM hooks; everything +else is deterministic, local, and cheap. + +## Why five layouts? (the "max 2 items" rule) + +The hard constraint for this format: **a short shows at most two on-screen +items** β€” where an "item" is a `person` (a human speaker) or a `chart` +(slide, graph, data visual, screenshare). That gives exactly five recipes: + +1. **`zoom_call_center`** β€” 1 person, tight zoom-call / webcam framing. +2. **`sit_center`** β€” 1 person, interview / seated framing. +3. **`split_chart_person`** β€” 1 chart + 1 person, stacked vertically + (default: **even 50/50** top/bottom, chart on top). +4. **`split_two_persons`** β€” 2 speakers, stacked vertically. +5. **`split_two_charts`** β€” 2 charts, stacked vertically. + +Because the geometry is bounded, we do NOT need a general subject-tracker +ML model or a drag-to-highlight UI. We need five small, correct pieces of +crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py` +is. + +See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms +used across these docs (subject, crop, band, seam, bbox, layout, etc.). + +## Install + +```bash +uv venv +uv sync +``` + +External requirements: `ffmpeg` and `ffprobe` on PATH. + +`scenedetect` requires OpenCV. Install `opencv-python-headless` or +`opencv-python` alongside `scenedetect`. + +## Use it as an MCP server + +```bash +humeo-core # stdio transport (primary console script) +# humeo-mcp # same entrypoint β€” kept so existing MCP configs keep working +``` + +Example Cursor/Claude Desktop config: + +```json +{ + "mcpServers": { + "humeo": { "command": "humeo-core" } + } +} +``` + +Tools exposed: + +| Tool | Purpose | +| --------------------------------- | --------------------------------------------------------------------------- | +| `list_layouts` | Enumerate the 5 supported layouts. | +| `ingest` | Scene detection + keyframe extraction (+ optional transcript). | +| `classify_scenes` | Pixel-heuristic per-scene layout classification. | +| `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). | +| `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. | +| `select_clips` | Heuristic clip picker over a word-level transcript. 
| +| `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. | +| `build_render_cmd` | Build the ffmpeg command (no execution) β€” review before spend. | +| `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. | + +Resource: `humeo://layouts` (JSON listing of the 5 layouts). + +### Three interchangeable region detectors + +All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used: + +``` +classify.py (pixel variance, no ML) +face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg +vision.py (multimodal LLM + OCR bboxes) +``` + +## JSON contracts (non-negotiable) + +All tools take and return Pydantic-validated JSON. The contracts live in +[`src/humeo_core/schemas.py`](src/humeo_core/schemas.py): + +- `Scene` `{scene_id, start_time, end_time, keyframe_path?}` +- `TranscriptWord` `{word, start_time, end_time}` +- `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}` +- `SceneClassification` `{scene_id, layout, confidence, reason}` +- `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized) +- `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}` +- `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}` +- `ClipPlan` `{source_path, clips[]}` +- `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}` +- `RenderRequest` / `RenderResult` + +## First-principles decisions (what we intentionally did NOT build) + +- **No giant subject-tracker ML.** The video format has 5 fixed layouts + (with a hard "max 2 items" rule); pixel-level tracking is not needed. +- **No drag-and-highlight UI.** An MCP tool is a better "UI" for an + agent-first workflow. If a human wants to override, they pass a + `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` / + `zoom`. +- **No end-to-end videoβ†’video model.** The HIVE paper's core insight is + that decomposed orchestration beats monolithic generation. We reify + that insight as six small composable tools. + +## Extending the pilot + +- Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`. +- Plug a real reasoning model into `select_clips_with_llm(text_fn)`. +- Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)` + to get per-scene bboxes + OCR text, then feed the results back through + `classify_scenes_with_vision`. This is the scene-change β†’ v3 images β†’ + LLM+OCR β†’ bbox path; see `../docs/SOLUTIONS.md Β§4` for rationale. +- All enforce strict JSON outputs, so bad model output can't corrupt + downstream stages. + +## Testing + +```bash +python -m pytest +``` + +See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale. 
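
To make the "strict JSON" contract concrete, here is a minimal sketch (not repo code) of validating an agent-supplied payload against the shipped schemas. It mirrors the first `LayoutInstruction` example in `docs/MCP_USAGE.md` and assumes the Pydantic v2 API implied by the `pydantic>=2.0` dependency; whether omitted fields such as `BoundingBox.label` have defaults is an assumption here, so check `schemas.py` for the authoritative field list.

```python
# Minimal sketch: reject malformed layout JSON before it can reach ffmpeg.
from pydantic import ValidationError

from humeo_core import LayoutInstruction

payload = {
    "clip_id": "001",
    "layout": "split_chart_person",
    "split_chart_region": {"x1": 0.00, "y1": 0.10, "x2": 0.52, "y2": 0.95},
    "split_person_region": {"x1": 0.55, "y1": 0.05, "x2": 1.00, "y2": 1.00},
    "top_band_ratio": 0.5,
}

try:
    instruction = LayoutInstruction.model_validate(payload)
except ValidationError as exc:
    # A confused model's output fails here, not halfway through a render.
    print(exc)
```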
+ +## License + +MIT diff --git a/humeo-core/docs/ARCHITECTURE.md b/humeo-core/docs/ARCHITECTURE.md new file mode 100644 index 0000000000000000000000000000000000000000..79aa008279dbe8abe25831a2bfba7a12508e0976 --- /dev/null +++ b/humeo-core/docs/ARCHITECTURE.md @@ -0,0 +1,128 @@ +# Architecture β€” Reusable Rocket + +> *"We don't need to build the door or windows β€” just a container with landing +> gear and thrusters that move in different directions."* +> β€” Bryan + +That analogy maps exactly onto this MCP: + +| Rocket part | Codebase | Purpose | +| --------------- | ---------------------------------------------------------------- | ----------------------------------------------------------------------- | +| Container | `src/humeo_core/schemas.py` | Strict JSON contracts every stage reads/writes. | +| Landing gear | `src/humeo_core/primitives/ingest.py` | Deterministic local extraction (scenes, keyframes, transcript). | +| Thrusters (Γ—5) | `src/humeo_core/primitives/layouts.py` | Five fixed 9:16 crop/compose recipes (max 2 on-screen items). | +| Pilot | `primitives/classify.py` + `primitives/select_clips.py` | Heuristic + LLM-ready decision makers. | +| Compiler | `src/humeo_core/primitives/compile.py` | Deterministic ffmpeg assembly. | +| Control panel | `src/humeo_core/server.py` | MCP tools exposing every primitive. | +| Control surface | `src/humeo_core/server.py` | MCP tool surface for agents and clients. | + +## First-principles reasoning + +The HIVE paper's core insight is that good short-video editing requires +**staged reasoning with strict intermediate artifacts**, not a single +giant model call. Three consequences flow from that: + +1. **Extraction must be local and deterministic.** No model call should + ever touch raw video bytes. `ingest.py` runs ffprobe + PySceneDetect + + ffmpeg + (optional) faster-whisper. Everything it emits is JSON or + a file path. + +2. **Reasoning must be decomposed into narrow sub-tasks.** Classifying a + scene's layout is a completely different task from selecting a viral + clip. Each has its own schema, its own prompt, its own validation. + This is why `primitives/` is five files instead of one. + +3. **Every model call must emit schema-validated JSON.** Free-form model + output is not allowed to enter the pipeline. `classify_scenes_with_llm` + and `select_clips_with_llm` both `model_validate(...)` the raw output + before returning; parse failures degrade gracefully to `SIT_CENTER` + + low confidence, not crashes. + +## Why only five layouts? + +The hard rule for this format: **a short shows at most two on-screen +items**, where an "item" is a `person` or a `chart`. That gives exactly +five recipes β€” all implemented as pure functions from +`LayoutInstruction` to an ffmpeg filtergraph string in `layouts.py`: + +| Layout | Items | Recipe | +| ---------------------- | --------------- | --------------------------------------------- | +| `zoom_call_center` | 1 person | tight centered 9:16 crop (zoom β‰₯ 1.25). | +| `sit_center` | 1 person | wider centered 9:16 crop. | +| `split_chart_person` | 1 chart + person| source partitioned L/R by bboxes, stacked. | +| `split_two_persons` | 2 persons | L/R speakers, stacked top/bottom. | +| `split_two_charts` | 2 charts | L/R charts, stacked top/bottom. | + +A general subject-tracker ML model is orders of magnitude more expensive +and less reliable than five hand-written crop recipes. 
If a new geometry +ever shows up in future source videos, adding a sixth thruster is +strictly additive: write a new `plan_*` function, add it to `_DISPATCH`, +add an enum variant. No existing code has to change. + +## 9:16 layout math + +Source is assumed 16:9 (1920Γ—1080 by default, but probed per-clip). +Target is 1080Γ—1920. For each layout: + +### `zoom_call_center` and `sit_center` + +Standard centered aspect-ratio crop to 9:16, then scale to 1080Γ—1920: + +``` +crop=cw:ch:x:y,scale=1080:1920:flags=lanczos,setsar=1[vout] +``` + +`cw`, `ch` are the largest 9:16 window that fits in the source, divided +by `zoom`. `x`, `y` center the window on `person_x_norm` / 0.5. +Dimensions are rounded to even values so libx264 is happy. The window is +clamped inside the source so a high `person_x_norm` never crops outside. + +### Split layouts (`split_chart_person`, `split_two_persons`, `split_two_charts`) + +All three splits share one recipe β€” only the items differ: + +1. **Horizontal partition.** The source is cut at a single vertical seam + so the two source strips are **complementary** (no overlap, no gap). + When both bboxes are set (Gemini vision), the seam is the midpoint + between `left.x2` and `right.x1`. Otherwise the seam defaults to + either an even 50/50 (two-of-a-kind splits) or a 2/3 | 1/3 split + (legacy `split_chart_person` fallback). +2. **Vertical crop.** Each strip's vertical extent comes from the + corresponding bbox when provided, so each item **fills** its output + band instead of being lost in full-height source context. +3. **Cover-scale to the band.** Each strip is scaled with + `force_original_aspect_ratio=increase` + center-cropped to the band + dimensions. Bands are always fully painted; no letterbox bars. +4. **Stack.** Two branches produced by `split=2` are `vstack`-ed into + the final 1080Γ—1920. + +**Band heights** are controlled by `LayoutInstruction.top_band_ratio`, +which defaults to **0.5** (even 50/50 β€” the symmetric look Bryan asked +for after the uneven Cathy Wood shorts). Legacy 60/40 is still reachable +by setting `top_band_ratio=0.6`. + +**Stack order** (for `split_chart_person`) is controlled by +`focus_stack_order`: chart-on-top (default) or person-on-top. + +## Extensibility story + +- **Smarter classifier:** implement `LLMVisionFn` with any multimodal + model and pass it to `classify_scenes_with_llm`. The fallback heuristic + stays available for offline runs and tests. +- **Smarter clip selector:** same pattern, `LLMTextFn` β†’ `select_clips_with_llm`. +- **New layout:** add a `plan_*` planner, register in `_DISPATCH`, add a + `LayoutKind` variant. Tests in `test_layouts.py` automatically iterate + over all `LayoutKind`s, so the dispatch coverage test will catch a + missing registration immediately. + +## What we intentionally did NOT build + +- Drag-and-highlight subject-selector UI. +- A general ML subject-tracker. +- A monolithic video-in-video-out model. +- Any network calls in the core library. The MCP server is stdio-only; + the CLI runs fully offline. + +This keeps the rocket **reusable**: the same primitives power the MCP +server, the CLI, a Python library, and (soon) a web UI if that's ever +warranted. diff --git a/humeo-core/docs/MCP_USAGE.md b/humeo-core/docs/MCP_USAGE.md new file mode 100644 index 0000000000000000000000000000000000000000..36ce6f56e901596a260b7656c1c0168b0f01e7ee --- /dev/null +++ b/humeo-core/docs/MCP_USAGE.md @@ -0,0 +1,100 @@ +# Using humeo-core from an MCP client + +The installed console command is **`humeo-core`**. 
For backward compatibility, +**`humeo-mcp`** is also registered (same entrypoint); either works in +`"command": ...` if both are on `PATH` from the same install. + +## 1. Add to your client + +`claude_desktop_config.json` or `.cursor/mcp.json`: + +```json +{ + "mcpServers": { + "humeo": { + "command": "humeo-core" + } + } +} +``` + +## 2. A typical agent plan + +``` +β†’ humeo.list_layouts() + # discover the 5 layouts (max 2 on-screen items per short) + +β†’ humeo.ingest(source_path="/abs/long.mp4", work_dir="/abs/work", with_transcript=true) + # IngestResult: scenes[], keyframes, transcript_words[] + +β†’ humeo.classify_scenes(scenes=) + # SceneClassification[] β€” one layout per scene + +β†’ humeo.select_clips( + source_path=..., transcript_words=..., duration_sec=..., + target_count=5, min_sec=30, max_sec=60 + ) + # ClipPlan β€” top non-overlapping clips + +# For each clip, pick the layout of the scene its midpoint falls in, +# build a LayoutInstruction, and: + +β†’ humeo.build_render_cmd(request={...}) + # dry-run: returns the exact ffmpeg argv, no execution + +β†’ humeo.render_clip(request={..., "mode": "normal"}) + # actually renders the 9:16 MP4 +``` + +## 3. Strict JSON all the way + +Every request/response is validated against the schemas in +[`schemas.py`](../src/humeo_core/schemas.py). Invalid input is rejected +*before* ffmpeg is touched, so a confused agent can't accidentally +rm-rf your disk or burn GPU hours. + +## 4. Override knobs + +`LayoutInstruction` accepts: + +- `zoom`, `person_x_norm`, `chart_x_norm` β€” single-subject knobs. +- `split_chart_region`, `split_person_region`, + `split_second_chart_region`, `split_second_person_region` β€” + normalized bboxes that drive split-layout cropping. +- `top_band_ratio` β€” fraction of output height used by the top band + (default 0.5 = even 50/50, the symmetric look). +- `focus_stack_order` β€” for `split_chart_person`, chart-on-top vs + person-on-top. + +Example: chart + person with a precise bbox crop and an even split. + +```json +{ + "clip_id": "001", + "layout": "split_chart_person", + "split_chart_region": {"x1": 0.00, "y1": 0.10, "x2": 0.52, "y2": 0.95}, + "split_person_region": {"x1": 0.55, "y1": 0.05, "x2": 1.00, "y2": 1.00}, + "top_band_ratio": 0.5, + "focus_stack_order": "chart_then_person" +} +``` + +Example: two-speaker interview. + +```json +{ + "clip_id": "002", + "layout": "split_two_persons", + "split_person_region": {"x1": 0.02, "y1": 0.05, "x2": 0.48, "y2": 0.95}, + "split_second_person_region": {"x1": 0.52, "y1": 0.05, "x2": 0.98, "y2": 0.95} +} +``` + +## 5. When to stay in dry-run + +- You want to show an approval UI before spending CPU. +- You want to diff the planned ffmpeg commands against a previous run. +- You're building tests. + +`mode="dry_run"` is always safe, never writes output, and returns the +exact argv list. 
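
As a rough illustration of how the single-subject knobs above (`zoom`, `person_x_norm`) become a crop window, here is a minimal sketch of the centered 9:16 math described in `docs/ARCHITECTURE.md`; the helper name is hypothetical and this is not the actual `primitives/layouts.py` code.

```python
def crop_window(src_w: int, src_h: int, zoom: float = 1.0, person_x_norm: float = 0.5):
    """Largest 9:16 window that fits the source, tightened by zoom, centered on the subject."""
    ch = src_h / zoom
    cw = ch * 9 / 16
    if cw > src_w:                      # unusually narrow source: width-limited instead
        cw = src_w / zoom
        ch = cw * 16 / 9
    x = min(max(person_x_norm * src_w - cw / 2, 0), src_w - cw)   # clamp inside the frame
    y = min(max(0.5 * src_h - ch / 2, 0), src_h - ch)
    even = lambda v: int(round(v / 2)) * 2                        # libx264 wants even dims
    return even(cw), even(ch), int(x), int(y)

# crop_window(1920, 1080, zoom=1.25, person_x_norm=0.83) -> (486, 864, 1350, 108),
# i.e. the cw:ch:x:y fed into crop=...,scale=1080:1920:flags=lanczos,setsar=1
```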
diff --git a/humeo-core/examples/render_request.json b/humeo-core/examples/render_request.json new file mode 100644 index 0000000000000000000000000000000000000000..038f8b4531736b7ba707d27680bfea84bd630965 --- /dev/null +++ b/humeo-core/examples/render_request.json @@ -0,0 +1,23 @@ +{ + "source_path": "/absolute/path/to/long.mp4", + "clip": { + "clip_id": "001", + "topic": "Prediction Market Explosion", + "start_time_sec": 289.0, + "end_time_sec": 331.5, + "viral_hook": "Prediction markets could explode to $5 trillion.", + "virality_score": 0.94, + "transcript": "Full text for subtitle generation...", + "suggested_overlay_title": "$5T Prediction Markets" + }, + "layout": { + "clip_id": "001", + "layout": "split_chart_person", + "zoom": 1.0, + "person_x_norm": 0.83, + "chart_x_norm": 0.0 + }, + "output_path": "/absolute/path/to/out/clip_001.mp4", + "title_text": "$5T Prediction Markets", + "mode": "dry_run" +} diff --git a/humeo-core/pyproject.toml b/humeo-core/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..73f627478b946920f687d4293ac752470841934b --- /dev/null +++ b/humeo-core/pyproject.toml @@ -0,0 +1,46 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "humeo-core" +version = "0.1.0" +description = "Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints)." +readme = "README.md" +requires-python = ">=3.10" +license = { text = "MIT" } +authors = [{ name = "Humeo" }] +keywords = ["mcp", "video", "shorts", "ffmpeg", "editing", "humeo", "hive"] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "mcp[cli]>=1.2.0", + "pydantic>=2.0", + "scenedetect>=0.6", +] + +[project.optional-dependencies] +transcribe = ["faster-whisper>=1.0"] +download = ["yt-dlp>=2024.0"] +face = ["mediapipe>=0.10", "opencv-python>=4.8"] +vision = ["Pillow>=10.0"] +dev = ["pytest>=7", "pytest-asyncio>=0.23", "Pillow>=10.0"] + +[project.scripts] +humeo-core = "humeo_core.server:main" +# Backward-compatible entry point (same module); existing MCP configs may still call `humeo-mcp`. +humeo-mcp = "humeo_core.server:main" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +humeo_core = ["assets/fonts/*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-ra -q" diff --git a/humeo-core/src/humeo_core.egg-info/PKG-INFO b/humeo-core/src/humeo_core.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..78b029cc75dce7925036f828838fcb487ea0d071 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/PKG-INFO @@ -0,0 +1,197 @@ +Metadata-Version: 2.4 +Name: humeo-core +Version: 0.1.0 +Summary: Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints). 
+Author: Humeo +License: MIT +Keywords: mcp,video,shorts,ffmpeg,editing,humeo,hive +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Requires-Python: >=3.10 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: mcp[cli]>=1.2.0 +Requires-Dist: pydantic>=2.0 +Requires-Dist: scenedetect>=0.6 +Provides-Extra: transcribe +Requires-Dist: faster-whisper>=1.0; extra == "transcribe" +Provides-Extra: download +Requires-Dist: yt-dlp>=2024.0; extra == "download" +Provides-Extra: face +Requires-Dist: mediapipe>=0.10; extra == "face" +Requires-Dist: opencv-python>=4.8; extra == "face" +Provides-Extra: vision +Requires-Dist: Pillow>=10.0; extra == "vision" +Provides-Extra: dev +Requires-Dist: pytest>=7; extra == "dev" +Requires-Dist: pytest-asyncio>=0.23; extra == "dev" +Requires-Dist: Pillow>=10.0; extra == "dev" +Dynamic: license-file + +# humeo-core + +**Reusable-rocket MCP server for long-video β†’ 9:16 shorts.** + +First-principles design, from the HIVE paper + Bryan's rocket analogy: +we don't build doors and windows (general subject-tracker UI, retraining +models). We build the **container** (schemas), **landing gear** (deterministic +local extraction), and **five thrusters** (the five 9:16 layouts this video +format actually uses). Everything else is pluggable. + +## The rocket, in one picture + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Control panel (MCP tools) β”‚ <- any MCP client + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ strict JSON + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β–Ό β–Ό β–Ό β–Ό β–Ό + ingest classify_scenes select_clips plan_layout render_clip +(scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile, + keyframes + classifier) heuristic + pure filter dry-run safe) + transcript) LLM-ready) math) + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ LayoutKind β”‚ + β”‚ ──────────────── β”‚ + β”‚ zoom_call_center β”‚ + β”‚ sit_center β”‚ + β”‚ split_chart_personβ”‚ + β”‚ split_two_persons β”‚ + β”‚ split_two_charts β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +Only the classifier and clip-selector have optional LLM hooks; everything +else is deterministic, local, and cheap. + +## Why five layouts? (the "max 2 items" rule) + +The hard constraint for this format: **a short shows at most two on-screen +items** β€” where an "item" is a `person` (a human speaker) or a `chart` +(slide, graph, data visual, screenshare). That gives exactly five recipes: + +1. **`zoom_call_center`** β€” 1 person, tight zoom-call / webcam framing. +2. **`sit_center`** β€” 1 person, interview / seated framing. +3. **`split_chart_person`** β€” 1 chart + 1 person, stacked vertically + (default: **even 50/50** top/bottom, chart on top). +4. **`split_two_persons`** β€” 2 speakers, stacked vertically. +5. **`split_two_charts`** β€” 2 charts, stacked vertically. + +Because the geometry is bounded, we do NOT need a general subject-tracker +ML model or a drag-to-highlight UI. 
We need five small, correct pieces of +crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py` +is. + +See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms +used across these docs (subject, crop, band, seam, bbox, layout, etc.). + +## Install + +```bash +uv venv +uv sync +``` + +External requirements: `ffmpeg` and `ffprobe` on PATH. + +`scenedetect` requires OpenCV. Install `opencv-python-headless` or +`opencv-python` alongside `scenedetect`. + +## Use it as an MCP server + +```bash +humeo-core # stdio transport (primary console script) +# humeo-mcp # same entrypoint β€” kept so existing MCP configs keep working +``` + +Example Cursor/Claude Desktop config: + +```json +{ + "mcpServers": { + "humeo": { "command": "humeo-core" } + } +} +``` + +Tools exposed: + +| Tool | Purpose | +| --------------------------------- | --------------------------------------------------------------------------- | +| `list_layouts` | Enumerate the 5 supported layouts. | +| `ingest` | Scene detection + keyframe extraction (+ optional transcript). | +| `classify_scenes` | Pixel-heuristic per-scene layout classification. | +| `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). | +| `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. | +| `select_clips` | Heuristic clip picker over a word-level transcript. | +| `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. | +| `build_render_cmd` | Build the ffmpeg command (no execution) β€” review before spend. | +| `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. | + +Resource: `humeo://layouts` (JSON listing of the 5 layouts). + +### Three interchangeable region detectors + +All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used: + +``` +classify.py (pixel variance, no ML) +face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg +vision.py (multimodal LLM + OCR bboxes) +``` + +## JSON contracts (non-negotiable) + +All tools take and return Pydantic-validated JSON. The contracts live in +[`src/humeo_core/schemas.py`](src/humeo_core/schemas.py): + +- `Scene` `{scene_id, start_time, end_time, keyframe_path?}` +- `TranscriptWord` `{word, start_time, end_time}` +- `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}` +- `SceneClassification` `{scene_id, layout, confidence, reason}` +- `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized) +- `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}` +- `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}` +- `ClipPlan` `{source_path, clips[]}` +- `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}` +- `RenderRequest` / `RenderResult` + +## First-principles decisions (what we intentionally did NOT build) + +- **No giant subject-tracker ML.** The video format has 5 fixed layouts + (with a hard "max 2 items" rule); pixel-level tracking is not needed. +- **No drag-and-highlight UI.** An MCP tool is a better "UI" for an + agent-first workflow. 
If a human wants to override, they pass a + `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` / + `zoom`. +- **No end-to-end videoβ†’video model.** The HIVE paper's core insight is + that decomposed orchestration beats monolithic generation. We reify + that insight as six small composable tools. + +## Extending the pilot + +- Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`. +- Plug a real reasoning model into `select_clips_with_llm(text_fn)`. +- Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)` + to get per-scene bboxes + OCR text, then feed the results back through + `classify_scenes_with_vision`. This is the scene-change β†’ v3 images β†’ + LLM+OCR β†’ bbox path; see `../docs/SOLUTIONS.md Β§4` for rationale. +- All enforce strict JSON outputs, so bad model output can't corrupt + downstream stages. + +## Testing + +```bash +python -m pytest +``` + +See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale. + +## License + +MIT diff --git a/humeo-core/src/humeo_core.egg-info/SOURCES.txt b/humeo-core/src/humeo_core.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f73f8e3bdbf2d2ae0b44262a48caa424c7ac007 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/SOURCES.txt @@ -0,0 +1,33 @@ +LICENSE +README.md +pyproject.toml +src/humeo_core/__init__.py +src/humeo_core/schemas.py +src/humeo_core/server.py +src/humeo_core.egg-info/PKG-INFO +src/humeo_core.egg-info/SOURCES.txt +src/humeo_core.egg-info/dependency_links.txt +src/humeo_core.egg-info/entry_points.txt +src/humeo_core.egg-info/requires.txt +src/humeo_core.egg-info/top_level.txt +src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf +src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt +src/humeo_core/assets/fonts/SourceSans3-OFL.txt +src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf +src/humeo_core/primitives/__init__.py +src/humeo_core/primitives/classify.py +src/humeo_core/primitives/compile.py +src/humeo_core/primitives/face_detect.py +src/humeo_core/primitives/ingest.py +src/humeo_core/primitives/layouts.py +src/humeo_core/primitives/select_clips.py +src/humeo_core/primitives/vision.py +tests/test_classify.py +tests/test_compile.py +tests/test_face_detect.py +tests/test_layout_bbox.py +tests/test_layouts.py +tests/test_schemas.py +tests/test_select_clips.py +tests/test_server_tools.py +tests/test_vision.py \ No newline at end of file diff --git a/humeo-core/src/humeo_core.egg-info/dependency_links.txt b/humeo-core/src/humeo_core.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/humeo-core/src/humeo_core.egg-info/entry_points.txt b/humeo-core/src/humeo_core.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0e9df37e2aa1fbdad3098d34e629fdfc2a2044b --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/entry_points.txt @@ -0,0 +1,3 @@ +[console_scripts] +humeo-core = humeo_core.server:main +humeo-mcp = humeo_core.server:main diff --git a/humeo-core/src/humeo_core.egg-info/requires.txt b/humeo-core/src/humeo_core.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..5660c1df773875982439273d51d35bc37d1d26b1 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/requires.txt @@ -0,0 +1,21 @@ +mcp[cli]>=1.2.0 +pydantic>=2.0 +scenedetect>=0.6 + 
+[dev] +pytest>=7 +pytest-asyncio>=0.23 +Pillow>=10.0 + +[download] +yt-dlp>=2024.0 + +[face] +mediapipe>=0.10 +opencv-python>=4.8 + +[transcribe] +faster-whisper>=1.0 + +[vision] +Pillow>=10.0 diff --git a/humeo-core/src/humeo_core.egg-info/top_level.txt b/humeo-core/src/humeo_core.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..2faab947486877e418834efb9cf6b4c9cdaa21b2 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/top_level.txt @@ -0,0 +1 @@ +humeo_core diff --git a/humeo-core/src/humeo_core/__init__.py b/humeo-core/src/humeo_core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fde979e33456746792eace3d5e494d8cb90fc515 --- /dev/null +++ b/humeo-core/src/humeo_core/__init__.py @@ -0,0 +1,49 @@ +"""humeo-core: reusable-rocket MCP primitives for long-video-to-shorts editing. + +First-principles design (rocket analogy): + Container -> schemas.py (strict JSON contracts) + Landing gear -> primitives/ingest.py, primitives/compile.py (deterministic local) + Thrusters -> primitives/layouts.py (5 fixed 9:16 layouts, max 2 items) + Pilot -> primitives/classify.py, primitives/select_clips.py (heuristic, LLM-ready) + Control panel -> server.py (FastMCP tools that expose all primitives) +""" + +from .schemas import ( + BoundingBox, + Clip, + ClipPlan, + ClipRenderSpan, + ClipSubtitleWords, + FocusStackOrder, + IngestResult, + LayoutInstruction, + LayoutKind, + RenderRequest, + RenderResult, + RenderTheme, + Scene, + SceneClassification, + SceneRegions, + TranscriptWord, +) + +__all__ = [ + "BoundingBox", + "Clip", + "ClipPlan", + "ClipRenderSpan", + "ClipSubtitleWords", + "FocusStackOrder", + "IngestResult", + "LayoutInstruction", + "LayoutKind", + "RenderRequest", + "RenderResult", + "RenderTheme", + "Scene", + "SceneClassification", + "SceneRegions", + "TranscriptWord", +] + +__version__ = "0.1.0" diff --git a/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf new file mode 100644 index 0000000000000000000000000000000000000000..bd81aef67d88acd3bf4b88d7c6ff86900b2e9ce3 Binary files /dev/null and b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf differ diff --git a/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ab04e2a11702d69277b4525bb7fb767ed9c045c --- /dev/null +++ b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt @@ -0,0 +1,93 @@ +Copyright 2020 The League Spartan Project Authors (https://github.com/theleagueof/league-spartan) + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +https://scripts.sil.org/OFL + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. 
The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. 
IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt new file mode 100644 index 0000000000000000000000000000000000000000..50ee76cf00fbfe42fb7c74a9b95c9508dec5bb8f --- /dev/null +++ b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt @@ -0,0 +1,93 @@ +Copyright 2010-2020 Adobe (http://www.adobe.com/), with Reserved Font Name 'Source'. All Rights Reserved. Source is a trademark of Adobe in the United States and/or other countries. + +This Font Software is licensed under the SIL Open Font License, Version 1.1. + +This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. 
These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf new file mode 100644 index 0000000000000000000000000000000000000000..6c16581182d07be08abe83026aa8af97e857fcb5 --- /dev/null +++ b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e3ab05ccd7cb94907c31005bb5bec1d5432f0b096a2b782976e217a540eb6c +size 395372 diff --git a/humeo-core/src/humeo_core/primitives/__init__.py b/humeo-core/src/humeo_core/primitives/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f9c96d207211f13dc058b5844885e4b0dee8b9c8 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/__init__.py @@ -0,0 +1 @@ +"""Primitives: deterministic, composable building blocks of the rocket.""" diff --git a/humeo-core/src/humeo_core/primitives/classify.py b/humeo-core/src/humeo_core/primitives/classify.py new file mode 100644 index 0000000000000000000000000000000000000000..2cf9274101c3a548149f62c06072daf2d70ed813 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/classify.py @@ -0,0 +1,232 @@ +"""Scene classifier: assigns one of the 5 layouts to each scene. + +Two backends share the same contract: + +* ``classify_scenes_heuristic`` β€” no model call. Uses keyframe pixel analysis + (edge density + color variance + face-rectangle heuristic-free approach) + to guess which of the 5 layouts fits best. Fully offline, deterministic. + Note: the heuristic only picks between ``SIT_CENTER`` / ``ZOOM_CALL_CENTER`` / + ``SPLIT_CHART_PERSON``; the two-of-a-kind splits (``SPLIT_TWO_PERSONS`` / + ``SPLIT_TWO_CHARTS``) are only selectable by the vision-LLM backend. 
+* ``classify_scenes_with_llm`` β€” pluggable LLM hook. Takes a callable + ``(image_path, prompt) -> str`` so the caller (MCP client or test) can + wire up whatever multimodal model they want. Enforces strict JSON output. + +Even without a model, the heuristic is good enough for many real inputs and +keeps the whole pipeline runnable with zero external dependencies. +""" + +from __future__ import annotations + +import json +import os +import struct +from typing import Callable, Iterable + +from ..schemas import LayoutKind, Scene, SceneClassification + + +# --------------------------------------------------------------------------- +# Tiny PNG/JPEG reader β†’ down-sampled grayscale column profile +# --------------------------------------------------------------------------- +# We intentionally avoid a hard dependency on Pillow. If Pillow is available +# we use it; otherwise we fall back to reading just PNG dimensions, which is +# enough for a coarse column-variance heuristic on any pre-decoded frame. + + +def _load_grayscale(path: str) -> tuple[list[list[int]], int, int] | None: + try: + from PIL import Image # type: ignore + + img = Image.open(path).convert("L") + w, h = img.size + # Down-sample to at most 128 cols x 72 rows for cheap analysis. + tw = min(128, w) + th = min(72, h) + img = img.resize((tw, th)) + px = list(img.getdata()) + grid = [px[i * tw : (i + 1) * tw] for i in range(th)] + return grid, tw, th + except Exception: + return None + + +def _png_dims(path: str) -> tuple[int, int] | None: + try: + with open(path, "rb") as f: + head = f.read(24) + if head[:8] != b"\x89PNG\r\n\x1a\n": + return None + w, h = struct.unpack(">II", head[16:24]) + return int(w), int(h) + except Exception: + return None + + +def _column_profile(grid: list[list[int]]) -> list[float]: + if not grid: + return [] + h = len(grid) + w = len(grid[0]) + out: list[float] = [] + for x in range(w): + s = 0 + for y in range(h): + s += grid[y][x] + out.append(s / h) + return out + + +def _variance(values: Iterable[float]) -> float: + vs = list(values) + if not vs: + return 0.0 + m = sum(vs) / len(vs) + return sum((v - m) ** 2 for v in vs) / len(vs) + + +# --------------------------------------------------------------------------- +# Heuristic classifier +# --------------------------------------------------------------------------- + + +def _classify_one_heuristic(keyframe_path: str | None) -> SceneClassification: + if not keyframe_path or not os.path.exists(keyframe_path): + return SceneClassification( + scene_id="?", + layout=LayoutKind.SIT_CENTER, + confidence=0.3, + reason="no keyframe available β€” defaulting to SIT_CENTER", + ) + + gs = _load_grayscale(keyframe_path) + if gs is None: + # Can't read pixels: still return a safe default with low confidence. + return SceneClassification( + scene_id="?", + layout=LayoutKind.SIT_CENTER, + confidence=0.25, + reason="PIL unavailable or image unreadable β€” defaulting to SIT_CENTER", + ) + + grid, w, h = gs + cols = _column_profile(grid) + + def _split_contrast(left: list[float], right: list[float]) -> float: + lm = sum(left) / max(1, len(left)) + rm = sum(right) / max(1, len(right)) + lv = _variance(left) + rv = _variance(right) + between = (lm - rm) ** 2 + within = (lv + rv) / 2.0 + 1e-6 + return between / within + + # Left/right halves β€” good for symmetric two-up scenes. + mid = max(1, w // 2) + split_halves = _split_contrast(cols[:mid], cols[mid:]) + + # Left 2/3 vs right 1/3 β€” matches explainer slides (chart + talking head). 
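+    # (Illustration, assuming the 128-column down-sample from _load_grayscale:
+    #  t = 42, so the left strip is cols[:84] and the right strip cols[84:].
+    #  A bright slide beside a darker speaker gives a large between-mean /
+    #  within-variance ratio in _split_contrast.)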
+ t = max(1, w // 3) + left_two_thirds = cols[: 2 * t] + right_one_third = cols[2 * t :] + split_thirds = _split_contrast(left_two_thirds, right_one_third) + + split_score = max(split_halves, split_thirds) + # Overall column variance: low variance β†’ flat composition (zoom call). + overall_var = _variance(cols) + + # Threshold tuned on Ark-style 2/3 chart + 1/3 speaker; "thirds" score catches + # layouts where half-vs-half contrast was too weak (e.g. clip 005 vs 004). + if split_score > 20.0: + return SceneClassification( + scene_id="?", + layout=LayoutKind.SPLIT_CHART_PERSON, + confidence=min(0.95, 0.5 + split_score / 200.0), + reason=( + f"chart/person contrast (halves={split_halves:.1f}, " + f"thirds={split_thirds:.1f} β†’ max={split_score:.1f})" + ), + ) + if overall_var < 100.0: + return SceneClassification( + scene_id="?", + layout=LayoutKind.ZOOM_CALL_CENTER, + confidence=0.7, + reason=f"low column variance ({overall_var:.1f}) β€” flat centered framing", + ) + return SceneClassification( + scene_id="?", + layout=LayoutKind.SIT_CENTER, + confidence=0.6, + reason=f"moderate composition (score={split_score:.1f}, var={overall_var:.1f})", + ) + + +def classify_scenes_heuristic(scenes: list[Scene]) -> list[SceneClassification]: + out: list[SceneClassification] = [] + for s in scenes: + r = _classify_one_heuristic(s.keyframe_path) + out.append(r.model_copy(update={"scene_id": s.scene_id})) + return out + + +# --------------------------------------------------------------------------- +# LLM-backed classifier (caller provides the model hook) +# --------------------------------------------------------------------------- + + +LLMVisionFn = Callable[[str, str], str] +"""Signature: (image_path, prompt) -> raw model string (expected JSON).""" + + +CLASSIFIER_PROMPT = """You are a scene layout classifier for a short-video editor. +Return ONLY a JSON object of the form: + {"layout": "", + "confidence": <0..1 float>, + "reason": "<=15 words"} + +Layout definitions: +- zoom_call_center: one person on a video call (webcam grid / talking head tight crop), subject centered. +- sit_center: one person sitting in frame, subject centered, wider framing than a zoom call. +- split_chart_person: an explainer scene with a chart/graphic on the LEFT (~2/3 of frame) and a person on the RIGHT (~1/3). + +Pick the single best match. No prose, no markdown, JSON only. 
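+
+Example of a valid response (values are illustrative):
+  {"layout": "split_chart_person", "confidence": 0.82, "reason": "chart fills left two-thirds, speaker right"}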
+""" + + +def classify_scenes_with_llm( + scenes: list[Scene], vision_fn: LLMVisionFn +) -> list[SceneClassification]: + out: list[SceneClassification] = [] + for s in scenes: + if not s.keyframe_path: + out.append( + SceneClassification( + scene_id=s.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.2, + reason="no keyframe", + ) + ) + continue + raw = vision_fn(s.keyframe_path, CLASSIFIER_PROMPT) + try: + data = json.loads(raw) + out.append( + SceneClassification( + scene_id=s.scene_id, + layout=LayoutKind(data["layout"]), + confidence=float(data.get("confidence", 0.5)), + reason=str(data.get("reason", ""))[:200], + ) + ) + except Exception as e: + out.append( + SceneClassification( + scene_id=s.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.25, + reason=f"LLM parse error: {e!r}", + ) + ) + return out diff --git a/humeo-core/src/humeo_core/primitives/compile.py b/humeo-core/src/humeo_core/primitives/compile.py new file mode 100644 index 0000000000000000000000000000000000000000..7b7bbfbecb46f4c8a93be706cde3af3bab76c60a --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/compile.py @@ -0,0 +1,602 @@ +"""Compiler: assemble a final 9:16 clip from source + clip + layout instruction. + +Builds the ffmpeg invocation, optionally runs it. Keeping ``dry_run`` as a +first-class mode means the MCP server can return the exact command without +executing β€” ideal for an agent that wants to review before spending CPU. + +Rendering order is fixed and intentional: + +1. **Cut + crop/compose.** ``plan_layout`` produces the base filtergraph + that takes the source, applies the layout-specific crops, and emits a + labelled ``[vout]`` at the exact output resolution (e.g. 1080x1920). +2. **Overlay title** (``drawtext``) β€” skipped for split layouts because + the source itself already has a slide/chart title and an extra overlay + just obscures content. +3. **Subtitles.** ``subtitles`` filter runs **last** so text is drawn over + the finished composition, not the source. ``original_size`` is pinned + to the output resolution so libass coordinate math (MarginV, FontSize) + is in *output pixels*, not libass's default PlayResY=288 β€” which was + the bug behind the "subtitles blocked / floating in the middle" look. +4. **Mux** with the source audio stream (``0:a:0``). +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile +from pathlib import Path + +from ..schemas import RenderRequest, RenderResult, RenderTheme, SPLIT_LAYOUTS +from .layouts import plan_layout + + +def _ensure_ffmpeg() -> str: + exe = shutil.which("ffmpeg") + if not exe: + raise RuntimeError("ffmpeg not found on PATH") + return exe + + +def _ensure_windows_fontconfig() -> dict[str, str]: + """Return subprocess env with a minimal fontconfig setup on Windows. + + Some Windows FFmpeg builds ship libass + fontconfig but do not bundle a + default fontconfig config, which makes subtitle rendering fail with: + + ``Fontconfig error: Cannot load default config file: No such file: (null)`` + + We generate a tiny config that points fontconfig at ``C:/Windows/Fonts`` and + a writable cache dir under ``%LOCALAPPDATA%/humeo``. Non-Windows platforms + pass through the existing environment unchanged. 
+ """ + env = os.environ.copy() + if os.name != "nt": + return env + if env.get("FONTCONFIG_FILE"): + return env + + local_appdata = Path( + env.get("LOCALAPPDATA", str(Path(tempfile.gettempdir()) / "humeo-local")) + ) + cfg_dir = local_appdata / "humeo" / "fontconfig" + cache_dir = local_appdata / "humeo" / "fontconfig-cache" + cfg_dir.mkdir(parents=True, exist_ok=True) + cache_dir.mkdir(parents=True, exist_ok=True) + + cfg_file = cfg_dir / "fonts.conf" + windows_fonts = Path(env.get("WINDIR", r"C:\Windows")) / "Fonts" + if not cfg_file.exists(): + cfg_file.write_text( + "\n".join( + [ + '', + "", + f" {windows_fonts.as_posix()}", + f" {cache_dir.as_posix()}", + "", + "", + ] + ), + encoding="utf-8", + ) + + env["FONTCONFIG_PATH"] = str(cfg_dir) + env["FONTCONFIG_FILE"] = str(cfg_file) + return env + + +def _escape_drawtext(text: str) -> str: + # drawtext quoting is brittle across ffmpeg builds. Keep it simple: + # collapse whitespace, drop apostrophes, and escape the characters + # that are still significant to the filter parser. + safe = " ".join(text.split()).replace("'", "") + return safe.replace("\\", "\\\\").replace(":", "\\:") + + +# --------------------------------------------------------------------------- +# Title overlay planning +# --------------------------------------------------------------------------- +# +# ffmpeg ``drawtext`` does not wrap text by itself; whatever you hand it is +# emitted as a single line. With a fixed 72px font and no width budget, the +# "Prediction Markets vs Derivatives" title on a 1080px canvas would spill +# past both edges and show up clipped (the user reported exactly this bug). +# +# The helpers below plan a title layout BEFORE it hits drawtext: +# +# 1. Short titles (fit at 72px single line): emit the existing single +# ``drawtext`` call unchanged so golden tests and previously-calibrated +# visuals stay byte-for-byte identical. +# 2. Long titles: split at the best word boundary into two balanced lines and +# emit two stacked ``drawtext`` filters at a slightly smaller font +# (60px / 52px / 44px, auto-shrinking until both lines fit). +# 3. Single-word titles that still overflow: shrink the single line until it +# fits, then hard-truncate with an ellipsis as a last resort. +# +# The character-width estimate is deliberately conservative (0.55 * fontsize) +# so mixed-case prose with wide letters like W/M still clears the margin. +# Calibrated visually against Arial Bold on 1080p output. + +_TITLE_PRIMARY_SIZE = 72 # Current "hero" title size; preserved for short titles. +_TITLE_MIN_SIZE = 44 # Readability floor at 1080x1920 output. +_TITLE_MARGIN_PX = 60 # Horizontal safe-area on each side. +_TITLE_Y_TOP = 80 # Pixel offset of the top title baseline (matches pre-P2 look). +_TITLE_CHAR_WIDTH_RATIO = 0.55 +_TITLE_LINE_SPACING_RATIO = 1.3 + +# Keep the overlay font explicit. Without a ``font=`` directive, drawtext +# falls back to fontconfig's "Sans", which resolves to a serif (Times New +# Roman) on default Windows installs β€” the "ugly serif title" bug reported +# against v1. Arial matches the ASS subtitle ``Fontname`` below so the +# title and captions read as a single typographic family. Keep this in +# sync with the ``Fontname=Arial`` in the subtitle filter if it ever +# changes. 
+_TITLE_FONT_NAME = "Arial" +_REFERENCE_TITLE_FONT_NAME = "League Spartan" +_REFERENCE_CAPTION_FONT_NAME = "Source Sans 3" +_REFERENCE_TITLE_BAR_X = 28 +_REFERENCE_TITLE_BAR_Y = 32 +_REFERENCE_TITLE_BAR_W = 1024 +_REFERENCE_TITLE_BAR_H = 148 +_REFERENCE_TITLE_TEXT_X = 72 +_REFERENCE_TITLE_TEXT_Y = 54 +_REFERENCE_TITLE_SIZE = 64 +_REFERENCE_CAPTION_BAR_X = 0 +_REFERENCE_CAPTION_BAR_W = 1080 +_REFERENCE_CAPTION_BAR_H = 120 +_REFERENCE_CAPTION_TEXT_MARGIN_L = 92 +_REFERENCE_CAPTION_TEXT_MARGIN_R = 92 + + +def _fonts_dir() -> Path: + return Path(__file__).resolve().parents[1] / "assets" / "fonts" + + +def _bundled_font_path(filename: str) -> Path | None: + path = _fonts_dir() / filename + return path if path.is_file() else None + + +def _title_char_px(size_px: int) -> float: + return size_px * _TITLE_CHAR_WIDTH_RATIO + + +def _title_fits(text: str, size_px: int, usable_w: int) -> bool: + return int(len(text) * _title_char_px(size_px)) <= usable_w + + +def _wrap_title_two_lines(text: str) -> tuple[str, str]: + """Split ``text`` at the word boundary that most balances the two halves. + + Returns ``(line1, line2)``. If ``text`` has fewer than two words, returns + ``(text, "")`` and the caller should fall back to single-line shrinking. + """ + words = text.split() + if len(words) < 2: + return text, "" + best_idx = 1 + best_delta = 10**9 + for i in range(1, len(words)): + left = " ".join(words[:i]) + right = " ".join(words[i:]) + delta = abs(len(left) - len(right)) + if delta < best_delta: + best_delta = delta + best_idx = i + return " ".join(words[:best_idx]), " ".join(words[best_idx:]) + + +def _drawtext_font_arg() -> str: + """Return a drawtext font selector that is stable on the current platform.""" + if os.name == "nt": + arial = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts" / "arial.ttf" + if arial.is_file(): + return f"fontfile='{_escape_filter_path(str(arial))}'" + return f"font={_TITLE_FONT_NAME}" + + +def _reference_title_font_arg() -> str: + bundled = _bundled_font_path("LeagueSpartan-Bold-static.ttf") or _bundled_font_path( + "LeagueSpartan-Bold.ttf" + ) + if bundled is not None: + return f"fontfile='{_escape_filter_path(str(bundled))}'" + return f"font={_REFERENCE_TITLE_FONT_NAME}" + + +def _drawtext_single(text: str, size: int, y: int) -> str: + esc = _escape_drawtext(text) + return ( + f"drawtext=text='{esc}':" + "expansion=none:" + f"{_drawtext_font_arg()}:" + f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:" + f"x=(w-text_w)/2:y={y}" + ) + + +def _drawtext_two(line1: str, line2: str, size: int, y_top: int) -> str: + """Two drawtext filters chained by comma β€” one ffmpeg filter chain, two lines.""" + esc1 = _escape_drawtext(line1) + esc2 = _escape_drawtext(line2) + y_bottom = y_top + int(round(size * _TITLE_LINE_SPACING_RATIO)) + return ( + f"drawtext=text='{esc1}':" + "expansion=none:" + f"{_drawtext_font_arg()}:" + f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:" + f"x=(w-text_w)/2:y={y_top}," + f"drawtext=text='{esc2}':" + "expansion=none:" + f"{_drawtext_font_arg()}:" + f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:" + f"x=(w-text_w)/2:y={y_bottom}" + ) + + +def plan_title_drawtext(title_text: str, out_w: int = 1080) -> str | None: + """Return the ``drawtext`` filter fragment for ``title_text`` or None to skip. + + The returned string is intended to be spliced into the main filtergraph + between the ``[v_prepad]`` and ``[vout]`` labels by + :func:`build_ffmpeg_cmd`. It does NOT include those labels itself. 
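+
+    Worked example: "Prediction Markets vs Derivatives" is 33 characters; at
+    72px the width estimate (0.55 * 72 ~= 39.6px per character) is ~1306px
+    against a 960px usable width on a 1080-wide canvas, so the title wraps at
+    the most balanced word boundary into "Prediction Markets" /
+    "vs Derivatives" and is emitted as two stacked ``drawtext`` calls at 60px.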
+ + Backward compatibility: when the title fits on one line at the original + 72px size, the output is identical to the pre-P2 single-``drawtext`` + form (same x/y/fontsize/borderw), so golden ffmpeg tests stay green. + """ + text = " ".join((title_text or "").split()) + if not text: + return None + usable_w = max(1, out_w - 2 * _TITLE_MARGIN_PX) + + if _title_fits(text, _TITLE_PRIMARY_SIZE, usable_w): + return _drawtext_single(text, _TITLE_PRIMARY_SIZE, _TITLE_Y_TOP) + + line1, line2 = _wrap_title_two_lines(text) + if line2: + for size in (60, 52, _TITLE_MIN_SIZE): + if _title_fits(line1, size, usable_w) and _title_fits(line2, size, usable_w): + return _drawtext_two(line1, line2, size, _TITLE_Y_TOP) + + for size in (64, 56, 52, _TITLE_MIN_SIZE): + if _title_fits(text, size, usable_w): + return _drawtext_single(text, size, _TITLE_Y_TOP) + + max_chars = max(4, int(usable_w / _title_char_px(_TITLE_MIN_SIZE))) + truncated = text[: max_chars - 1].rstrip() + "..." + return _drawtext_single(truncated, _TITLE_MIN_SIZE, _TITLE_Y_TOP) + + +def _reference_title_fragment(title_text: str, out_w: int = 1080) -> str: + bar_w = min(_REFERENCE_TITLE_BAR_W, max(320, out_w - 2 * _REFERENCE_TITLE_BAR_X)) + accent_w = 16 + title = " ".join((title_text or "").split()) + usable_w = max(220, bar_w - (_REFERENCE_TITLE_TEXT_X - _REFERENCE_TITLE_BAR_X) - 30) + text_filters: list[str] = [] + if title: + if _title_fits(title, _REFERENCE_TITLE_SIZE, usable_w): + esc = _escape_drawtext(title) + text_filters.append( + f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:" + f"fontcolor=white:fontsize={_REFERENCE_TITLE_SIZE}:" + "borderw=1.2:bordercolor=0x101010@0.18:" + f"x={_REFERENCE_TITLE_TEXT_X}:" + f"y={_REFERENCE_TITLE_TEXT_Y}" + ) + else: + line1, line2 = _wrap_title_two_lines(title) + two_line_size = 54 + while ( + line2 + and two_line_size > 42 + and not ( + _title_fits(line1, two_line_size, usable_w) + and _title_fits(line2, two_line_size, usable_w) + ) + ): + two_line_size -= 2 + if line2 and _title_fits(line1, two_line_size, usable_w) and _title_fits(line2, two_line_size, usable_w): + y_top = 36 + y_bottom = y_top + int(round(two_line_size * 1.08)) + for line, y in ((line1, y_top), (line2, y_bottom)): + esc = _escape_drawtext(line) + text_filters.append( + f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:" + f"fontcolor=white:fontsize={two_line_size}:" + "borderw=1.2:bordercolor=0x101010@0.18:" + f"x={_REFERENCE_TITLE_TEXT_X}:y={y}" + ) + else: + size = _REFERENCE_TITLE_SIZE + while title and not _title_fits(title, size, usable_w) and size > 38: + size -= 2 + if title and not _title_fits(title, size, usable_w): + max_chars = max(8, int(usable_w / _title_char_px(size))) + title = title[: max_chars - 1].rstrip() + "..." 
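+                    # (With out_w=1080 this keeps roughly the first 44
+                    #  characters plus an ellipsis, per the 0.55 * fontsize
+                    #  width estimate.)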
+ esc = _escape_drawtext(title) + text_filters.append( + f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:" + f"fontcolor=white:fontsize={size}:" + "borderw=1.2:bordercolor=0x101010@0.18:" + f"x={_REFERENCE_TITLE_TEXT_X}:" + f"y={_REFERENCE_TITLE_TEXT_Y}" + ) + text_filter = f",{','.join(text_filters)}" if text_filters else "" + return ( + f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:" + f"w={bar_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x1F1F1F@0.84:t=fill," + f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:" + f"w={accent_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x2A2453@0.98:t=fill" + f"{text_filter}" + ) + + +def _reference_caption_bar_fragment( + *, + out_w: int = 1080, + out_h: int = 1920, + margin_v: int = 166, + font_size: int = 38, +) -> str: + bar_w = min(_REFERENCE_CAPTION_BAR_W, max(320, out_w - 2 * _REFERENCE_CAPTION_BAR_X)) + bar_h = max(_REFERENCE_CAPTION_BAR_H, int(round(font_size * 2.05))) + bar_y = max( + _REFERENCE_TITLE_BAR_Y + _REFERENCE_TITLE_BAR_H + 36, + out_h - max(40, margin_v) - bar_h, + ) + return ( + f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:" + f"w={bar_w}:h={bar_h}:color=0x6570E6@1.0:t=fill," + f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:" + f"w={bar_w}:h=3:color=0xE4E7FF@0.14:t=fill" + ) + + +def _escape_filter_path(path: str) -> str: + return path.replace("\\", "/").replace(":", "\\:").replace("'", "\\'") + + +def _has_audio_stream(media_path: str) -> bool: + probe = shutil.which("ffprobe") + if not probe: + return False + out = subprocess.run( + [ + probe, + "-v", + "error", + "-select_streams", + "a:0", + "-show_entries", + "stream=codec_type", + "-of", + "csv=p=0", + media_path, + ], + check=False, + capture_output=True, + text=True, + ) + return out.returncode == 0 and "audio" in (out.stdout or "").lower() + + +def build_ffmpeg_cmd( + req: RenderRequest, + *, + src_w: int = 1920, + src_h: int = 1080, + include_audio: bool = True, +) -> list[str]: + exe = _ensure_ffmpeg() if req.mode != "dry_run" else "ffmpeg" + + plan = plan_layout( + req.layout, out_w=req.width, out_h=req.height, src_w=src_w, src_h=src_h + ) + fg = plan.filtergraph + + if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + chrome_parts = [ + _reference_title_fragment(req.title_text, out_w=req.width), + _reference_caption_bar_fragment( + out_w=req.width, + out_h=req.height, + margin_v=min(req.subtitle_margin_v, 136), + font_size=max(req.subtitle_font_size, 124), + ) + if req.subtitle_path + else "", + ] + fg = fg.replace( + "[vout]", + f"[v_prepad];[v_prepad]{','.join(part for part in chrome_parts if part)}[vout]", + ) + elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT: + # The native-highlight theme mirrors the reference short in + # videoplayback (12): no separate top title card, just centered + # floating captions with per-word highlight timing. + pass + else: + # Skip the drawtext title overlay on split layouts: the top band already + # shows a slide/chart with its own baked-in title, so adding an overlay + # on top of that is pure noise (and was stacking over the chart title + # in the SPLIT_CHART_PERSON Cathy Wood shorts). + title_allowed = req.layout.layout not in SPLIT_LAYOUTS + if req.title_text and title_allowed: + # ``plan_title_drawtext`` returns a full filter fragment (possibly + # two chained ``drawtext`` calls) that fits within the output width. 
+ # For short titles it is byte-identical to the pre-P2 single-line + # form, keeping existing golden tests green while fixing the + # "Prediction Markets vs Derivatives" edge-clip report. + title_fragment = plan_title_drawtext(req.title_text, out_w=req.width) + if title_fragment: + fg = fg.replace( + "[vout]", + f"[v_prepad];[v_prepad]{title_fragment}[vout]", + ) + + if req.subtitle_path: + subtitle_esc = _escape_filter_path(req.subtitle_path) + fonts_dir = _fonts_dir() + fontsdir_arg = ( + f":fontsdir='{_escape_filter_path(str(fonts_dir))}'" if fonts_dir.is_dir() else "" + ) + # ``original_size`` pins libass's PlayResY to the actual output so + # ``FontSize`` and ``MarginV`` are interpreted in output pixels. Without + # this, libass defaults to PlayResY=288 and then upscales to the real + # canvas (1920) -- blowing font sizes and pushing subtitles to the + # middle of the frame. ``WrapStyle=0`` enables smart word wrap so long + # lines break into readable stacks instead of running off-screen. + if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + force_style = ( + f"Fontname={_REFERENCE_CAPTION_FONT_NAME}," + f"FontSize={max(req.subtitle_font_size, 124)},Alignment=2," + f"MarginV={min(req.subtitle_margin_v, 136)}," + "MarginL=56,MarginR=56," + "WrapStyle=0,BorderStyle=1,Outline=2,Shadow=0," + "BackColour=&H00000000&,PrimaryColour=&H00FFFFFF&," + "Bold=1,Italic=0,Spacing=-1" + ) + subtitle_filter = ( + "[v_sub_in];" + f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:" + f"original_size={req.width}x{req.height}:" + f"force_style='{force_style}'[vout]" + ) + elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT: + subtitle_filter = ( + "[v_sub_in];" + f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:" + f"original_size={req.width}x{req.height}[vout]" + ) + else: + force_style = ( + f"Fontname=Arial," + f"FontSize={req.subtitle_font_size},Alignment=2," + f"MarginV={req.subtitle_margin_v},MarginL=60,MarginR=60," + "WrapStyle=0,BorderStyle=4," + "BackColour=&H70000000&,PrimaryColour=&H00FFFFFF&," + "Outline=0,Shadow=0,Bold=1" + ) + subtitle_filter = ( + "[v_sub_in];" + f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:" + f"original_size={req.width}x{req.height}:" + f"force_style='{force_style}'[vout]" + ) + fg = fg.replace("[vout]", subtitle_filter) + + start = req.clip.start_time_sec + dur = max(0.1, req.clip.duration_sec) + + Path(Path(req.output_path).parent).mkdir(parents=True, exist_ok=True) + + cmd: list[str] = [ + exe, + "-y", + "-ss", + f"{start:.3f}", + "-t", + f"{dur:.3f}", + "-i", + req.source_path, + "-filter_complex", + fg, + "-map", + "[vout]", + "-c:v", + "libx264", + "-preset", + "veryfast", + "-crf", + "20", + ] + + if include_audio: + cmd.extend(["-map", "0:a:0", "-c:a", "aac", "-b:a", "160k"]) + + cmd.extend(["-movflags", "+faststart", req.output_path]) + return cmd + + +def probe_source_size(source_path: str) -> tuple[int, int]: + exe = shutil.which("ffprobe") + if not exe: + return 1920, 1080 + out = subprocess.run( + [ + exe, + "-v", + "error", + "-select_streams", + "v:0", + "-show_entries", + "stream=width,height", + "-of", + "csv=p=0", + source_path, + ], + check=False, + capture_output=True, + text=True, + ) + try: + w, h = out.stdout.strip().split(",") + return int(w), int(h) + except Exception: + return 1920, 1080 + + +def render_clip(req: RenderRequest) -> RenderResult: + try: + src_w, src_h = probe_source_size(req.source_path) if req.mode != "dry_run" else (1920, 1080) + except Exception: + src_w, src_h = 1920, 1080 + + include_audio = True + if 
req.mode != "dry_run": + include_audio = _has_audio_stream(req.source_path) + if not include_audio: + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=[], + success=False, + error="Source media has no detectable audio stream (a:0).", + ) + + cmd = build_ffmpeg_cmd(req, src_w=src_w, src_h=src_h, include_audio=include_audio) + + if req.mode == "dry_run": + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=True, + ) + try: + subprocess.run(cmd, check=True, capture_output=True, env=_ensure_windows_fontconfig()) + if include_audio and not _has_audio_stream(req.output_path): + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=False, + error="Rendered output is missing audio stream (a:0).", + ) + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=True, + ) + except subprocess.CalledProcessError as e: + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=False, + error=e.stderr.decode("utf-8", errors="replace")[-4000:] if e.stderr else str(e), + ) diff --git a/humeo-core/src/humeo_core/primitives/face_detect.py b/humeo-core/src/humeo_core/primitives/face_detect.py new file mode 100644 index 0000000000000000000000000000000000000000..8bdbbe0f5d3b0b5850de1b0da15a1547a0a94432 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/face_detect.py @@ -0,0 +1,135 @@ +"""Local face-detection primitive β€” the MediaPipe path as another ``SceneRegions`` producer. + +Three detection backends share the *same output schema* (``SceneRegions``): + +* ``primitives/classify.py`` β€” pixel variance heuristic, no model. +* ``primitives/face_detect.py`` β€” MediaPipe face rectangle (this file). +* ``primitives/vision.py`` β€” multimodal LLM + OCR bboxes. + +Because all three emit ``SceneRegions``, the layout planner in +``primitives/vision.py`` (``classify_from_regions`` + ``layout_instruction_from_regions``) +works on all of them unchanged. That is the whole point of the primitive +boundary β€” the *detector* is swappable, the *renderer* is fixed. + +MediaPipe is imported lazily so it remains an optional extra. +""" + +from __future__ import annotations + +import logging +from typing import Callable + +from ..schemas import BoundingBox, Scene, SceneRegions + +logger = logging.getLogger(__name__) + + +# A bbox loader for any future cloud face API. Takes a keyframe path, +# returns a normalized face bbox or ``None``. Same shape as the MediaPipe +# wrapper below, which lets tests pass a stub and skip MediaPipe. +FaceBBoxFn = Callable[[str], BoundingBox | None] + + +def detect_face_regions( + scenes: list[Scene], + face_fn: FaceBBoxFn | None = None, + chart_split_threshold: float = 0.65, +) -> list[SceneRegions]: + """Populate ``SceneRegions.person_bbox`` (+ ``chart_bbox``) from a face detector. + + The face bbox is treated as the *person bbox*. If the face sits in the + right ``(1 - chart_split_threshold)`` of the frame, a *chart bbox* is + synthesised over the left region β€” mirroring the original + ``reframe.py`` split heuristic. + + Args: + scenes: scenes with ``keyframe_path`` populated. + face_fn: pluggable face detector. Defaults to MediaPipe (lazy + import) if not supplied. Pass a stub in tests. + chart_split_threshold: face x-center above this normalized value + triggers a synthetic chart bbox on the left. 
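+
+    Example (illustrative numbers): a face bbox spanning x in [0.72, 0.88]
+    has an x-center of 0.80 >= 0.65, so the scene gets a synthetic chart bbox
+    covering x in [0.0, 0.65] at full frame height alongside the face bbox.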
+ """ + + if face_fn is None: + face_fn = _mediapipe_face_bbox + + out: list[SceneRegions] = [] + for s in scenes: + if not s.keyframe_path: + out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available")) + continue + try: + face = face_fn(s.keyframe_path) + except Exception as e: # one bad scene should not kill the batch + logger.warning("face detector failed on %s: %r", s.keyframe_path, e) + out.append(SceneRegions(scene_id=s.scene_id, raw_reason=f"face detector error: {e!r}")) + continue + + if face is None: + out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no face detected")) + continue + + chart = None + if face.center_x >= chart_split_threshold: + # Face pushed right β†’ assume a chart occupies the left region. + chart = BoundingBox( + x1=0.0, + y1=0.0, + x2=min(chart_split_threshold, face.x1), + y2=1.0, + label="chart_inferred", + confidence=max(0.0, face.center_x - chart_split_threshold + 0.5), + ) + + out.append( + SceneRegions( + scene_id=s.scene_id, + person_bbox=face, + chart_bbox=chart, + raw_reason="face detected" + (" + synthetic chart bbox" if chart else ""), + ) + ) + + return out + + +def _mediapipe_face_bbox(keyframe_path: str) -> BoundingBox | None: + """Return the largest-confidence face as a ``BoundingBox``, or ``None``. + + Imports MediaPipe + OpenCV lazily so they remain optional dependencies + (install ``humeo-core[face]``). + """ + + try: + import cv2 # type: ignore + import mediapipe as mp # type: ignore + except ImportError as e: + raise RuntimeError( + "MediaPipe face detection requires `pip install humeo-core[face]`" + ) from e + + img = cv2.imread(keyframe_path) + if img is None: + return None + rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + with mp.solutions.face_detection.FaceDetection( + model_selection=1, min_detection_confidence=0.5 + ) as detector: + results = detector.process(rgb) + if not results.detections: + return None + best = max(results.detections, key=lambda d: d.score[0]) + box = best.location_data.relative_bounding_box + x1 = max(0.0, min(1.0, float(box.xmin))) + y1 = max(0.0, min(1.0, float(box.ymin))) + x2 = max(x1 + 1e-6, min(1.0, x1 + float(box.width))) + y2 = max(y1 + 1e-6, min(1.0, y1 + float(box.height))) + return BoundingBox( + x1=x1, + y1=y1, + x2=x2, + y2=y2, + label="face", + confidence=float(best.score[0]), + ) diff --git a/humeo-core/src/humeo_core/primitives/ingest.py b/humeo-core/src/humeo_core/primitives/ingest.py new file mode 100644 index 0000000000000000000000000000000000000000..6781af3e1529cafffcc1b7425483d06083890958 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/ingest.py @@ -0,0 +1,187 @@ +"""Landing gear: deterministic, local extraction. + +Everything here can run without a GPU, without an API key, and without the +internet (once inputs are present). This follows the HIVE guide's rule +"extraction stays local; LLMs only reason". 
+ +Functions: + probe_duration β€” ffprobe wrapper + detect_scenes β€” PySceneDetect (ContentDetector) + extract_keyframes β€” ffmpeg snapshot at each scene midpoint + transcribe_audio β€” faster-whisper (optional dependency) + ingest β€” one-shot convenience runner that returns IngestResult +""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +from pathlib import Path + +from ..schemas import IngestResult, Scene, TranscriptWord + + +class IngestError(RuntimeError): + pass + + +def _require(binary: str) -> str: + path = shutil.which(binary) + if not path: + raise IngestError( + f"Required binary not on PATH: {binary!r}. Install it or add the path." + ) + return path + + +def probe_duration(source_path: str) -> float: + ffprobe = _require("ffprobe") + out = subprocess.run( + [ + ffprobe, + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "json", + source_path, + ], + check=True, + capture_output=True, + text=True, + ) + data = json.loads(out.stdout) + return float(data["format"]["duration"]) + + +def detect_scenes( + source_path: str, threshold: float = 27.0, min_scene_sec: float = 1.0 +) -> list[Scene]: + """Use PySceneDetect's ContentDetector to split the video into scenes.""" + + try: + from scenedetect import detect, ContentDetector # type: ignore + except ModuleNotFoundError as e: + # scenedetect depends on OpenCV; surface the real missing module. + missing = getattr(e, "name", "") or str(e) + hint = "pip install 'scenedetect[opencv]'" if "cv2" in missing else "pip install scenedetect" + raise IngestError( + f"Scene detection unavailable (missing module: {missing}). Install with: {hint}" + ) from e + + result = detect( + source_path, + ContentDetector(threshold=threshold, min_scene_len=int(min_scene_sec * 24)), + ) + scenes: list[Scene] = [] + for i, (start, end) in enumerate(result): + scenes.append( + Scene( + scene_id=f"s{i:04d}", + start_time=float(start.get_seconds()), + end_time=float(end.get_seconds()), + ) + ) + # Guard: if PySceneDetect returns empty (e.g. a single long shot), + # fall back to one scene spanning the whole video. + if not scenes: + duration = probe_duration(source_path) + scenes.append(Scene(scene_id="s0000", start_time=0.0, end_time=duration)) + return scenes + + +def extract_keyframes( + source_path: str, scenes: list[Scene], out_dir: str +) -> list[Scene]: + """Extract one JPG per scene at its midpoint. Mutates nothing; returns copies.""" + + ffmpeg = _require("ffmpeg") + Path(out_dir).mkdir(parents=True, exist_ok=True) + updated: list[Scene] = [] + for s in scenes: + mid = s.start_time + (s.end_time - s.start_time) / 2.0 + out_path = os.path.join(out_dir, f"{s.scene_id}.jpg") + subprocess.run( + [ + ffmpeg, + "-y", + "-loglevel", + "error", + "-ss", + f"{mid:.3f}", + "-i", + source_path, + "-frames:v", + "1", + "-q:v", + "3", + out_path, + ], + check=True, + ) + updated.append(s.model_copy(update={"keyframe_path": out_path})) + return updated + + +def transcribe_audio( + source_path: str, model_name: str = "base", language: str | None = None +) -> list[TranscriptWord]: + """Word-level transcript via faster-whisper. Optional dependency.""" + + try: + from faster_whisper import WhisperModel # type: ignore + except ImportError as e: + raise IngestError( + "faster-whisper is not installed. 
pip install faster-whisper" + ) from e + + model = WhisperModel(model_name, device="auto", compute_type="auto") + segments, _info = model.transcribe(source_path, word_timestamps=True, language=language) + words: list[TranscriptWord] = [] + for seg in segments: + for w in getattr(seg, "words", []) or []: + if w.word is None: + continue + words.append( + TranscriptWord( + word=str(w.word).strip(), + start_time=float(w.start or 0.0), + end_time=float(w.end or 0.0), + ) + ) + return words + + +def ingest( + source_path: str, + work_dir: str, + *, + with_transcript: bool = False, + whisper_model: str = "base", +) -> IngestResult: + """Run all extraction stages and return a single ``IngestResult``.""" + + if not os.path.exists(source_path): + raise IngestError(f"source_path does not exist: {source_path}") + + Path(work_dir).mkdir(parents=True, exist_ok=True) + keyframes_dir = os.path.join(work_dir, "keyframes") + + duration = probe_duration(source_path) + scenes = detect_scenes(source_path) + scenes = extract_keyframes(source_path, scenes, keyframes_dir) + + words: list[TranscriptWord] = [] + if with_transcript: + words = transcribe_audio(source_path, model_name=whisper_model) + + return IngestResult( + source_path=os.path.abspath(source_path), + duration_sec=duration, + scenes=scenes, + transcript_words=words, + keyframes_dir=keyframes_dir, + ) diff --git a/humeo-core/src/humeo_core/primitives/layouts.py b/humeo-core/src/humeo_core/primitives/layouts.py new file mode 100644 index 0000000000000000000000000000000000000000..46c247b2f4c5079803663525534535e315b6aee6 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/layouts.py @@ -0,0 +1,707 @@ +"""The 9:16 layout thrusters β€” deterministic crop + compose math. + +First principles: this video format has a hard constraint of **at most two +on-screen items** per short (see :class:`humeo_core.schemas.LayoutKind`). That +gives exactly five recipes: + +* 1 person alone, tight β†’ ``ZOOM_CALL_CENTER`` +* 1 person alone, wider β†’ ``SIT_CENTER`` +* 1 chart + 1 person β†’ ``SPLIT_CHART_PERSON`` +* 2 persons β†’ ``SPLIT_TWO_PERSONS`` +* 2 charts β†’ ``SPLIT_TWO_CHARTS`` + +Each planner returns a pure ``ffmpeg -filter_complex`` fragment ending in +``[vout]``. The compiler (``compile.py``) glues the fragment to the cut + +audio + subtitle chain. Because every planner is a pure function that +returns a string, the whole layout system is unit-testable without ever +invoking ffmpeg. + +Split layouts share one contract: + +* Output: 9:16 frame split into a **top band** and **bottom band**. + Band heights are driven by :attr:`LayoutInstruction.top_band_ratio`. + Default is ``0.5`` (even 50/50), matching the user-requested symmetric look. +* Source strips for the two items are **complementary** β€” they partition + the source width at a single seam so the two items never overlap and + together cover the full frame width. +* Each strip is scaled to fill its output band using the "cover" + convention (``force_original_aspect_ratio=increase`` + center crop), so + the band is fully painted (no letterbox bars, no stretch). +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from ..schemas import ( + BoundingBox, + FocusStackOrder, + LayoutInstruction, + LayoutKind, + TimedCenterPoint, +) + + +# Source geometry assumption. Most podcast sources are 1920x1080; we still +# normalize everything by the actual source size so changing this is safe. 
+DEFAULT_SRC_W = 1920 +DEFAULT_SRC_H = 1080 +TRACKING_BLEND_SEC = 0.30 + + +@dataclass(frozen=True) +class FilterPlan: + """Result of planning a layout. + + ``filtergraph`` is the body of ``-filter_complex`` and ends with + ``[vout]`` as the final labelled stream. + """ + + filtergraph: str + out_label: str = "vout" + + +# --------------------------------------------------------------------------- +# Tiny pixel helpers +# --------------------------------------------------------------------------- + + +def _clamp01(v: float) -> float: + return max(0.0, min(1.0, v)) + + +def _even(v: int) -> int: + """Floor ``v`` to an even integer (ffmpeg ``crop``/``scale`` need even dims).""" + return v - (v % 2) + + +def _bbox_to_crop_pixels( + box: BoundingBox, src_w: int, src_h: int +) -> tuple[int, int, int, int]: + """Normalized bbox β†’ ``(cw, ch, x, y)`` with even dimensions for ffmpeg.""" + x1 = int(round(_clamp01(box.x1) * float(src_w))) + y1 = int(round(_clamp01(box.y1) * float(src_h))) + x2 = int(round(_clamp01(box.x2) * float(src_w))) + y2 = int(round(_clamp01(box.y2) * float(src_h))) + x1 = max(0, min(src_w - 2, x1)) + y1 = max(0, min(src_h - 2, y1)) + x2 = max(x1 + 2, min(src_w, x2)) + y2 = max(y1 + 2, min(src_h, y2)) + cw = _even(x2 - x1) + ch = _even(y2 - y1) + return max(2, cw), max(2, ch), _even(x1), _even(y1) + + +def _base_crop_size( + src_w: int, + src_h: int, + target_aspect: float, +) -> tuple[int, int]: + if src_w / src_h >= target_aspect: + base_ch = src_h + base_cw = int(round(base_ch * target_aspect)) + else: + base_cw = src_w + base_ch = int(round(base_cw / target_aspect)) + return _even(max(2, base_cw)), _even(max(2, base_ch)) + + +def _crop_box( + src_w: int, + src_h: int, + target_aspect: float, + zoom: float, + center_x_norm: float, + center_y_norm: float = 0.5, +) -> tuple[int, int, int, int]: + """Return ``(cw, ch, x, y)`` crop values for a centered aspect-ratio crop. + + ``zoom > 1`` means tighter crop (smaller window around the center). The + function always keeps the crop window fully inside the source frame. 
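+
+    Worked example (1920x1080 source, target_aspect=9/16, zoom=1.2,
+    center_x_norm=0.5): the base 9:16 window is 608x1080, zoom shrinks it to
+    506x900, and the returned crop is ``(506, 900, 706, 90)``.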
+ """ + + zoom = max(1.0, zoom) + base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect) + + cw = _even(max(2, int(round(base_cw / zoom)))) + ch = _even(max(2, int(round(base_ch / zoom)))) + + cx = int(round(_clamp01(center_x_norm) * src_w)) + cy = int(round(_clamp01(center_y_norm) * src_h)) + x = _even(max(0, min(src_w - cw, cx - cw // 2))) + y = _even(max(0, min(src_h - ch, cy - ch // 2))) + return cw, ch, x, y + + +def _center_crop_to_9x16( + src_w: int, src_h: int, zoom: float, person_x_norm: float +) -> tuple[int, int, int, int]: + return _crop_box(src_w, src_h, 9 / 16, zoom, person_x_norm, 0.5) + + +def _crop_x_from_center(src_w: int, cw: int, center_x_norm: float) -> int: + """Return an even, in-bounds crop x for a normalized horizontal center.""" + cx = int(round(_clamp01(center_x_norm) * src_w)) + return _even(max(0, min(src_w - cw, cx - cw // 2))) + + +def _tracked_value_expr( + values: list[tuple[float, float]], + *, + clamp_min: float | None = None, + clamp_max: float | None = None, + round_even: bool = False, +) -> str: + if not values: + raise ValueError("values must not be empty") + + expr = f"{float(values[-1][0]):.3f}" + for idx in range(len(values) - 2, -1, -1): + v0, t0 = float(values[idx][0]), float(values[idx][1]) + v1, t1 = float(values[idx + 1][0]), float(values[idx + 1][1]) + if t1 <= t0: + expr = f"if(lt(t\\,{t1:.3f})\\,{v0:.3f}\\,{expr})" + continue + + switch_t = (t0 + t1) / 2.0 + blend_half = TRACKING_BLEND_SEC / 2.0 + blend_start = max(t0, switch_t - blend_half) + blend_end = min(t1, switch_t + blend_half) + + if blend_end <= blend_start: + expr = f"if(lt(t\\,{switch_t:.3f})\\,{v0:.3f}\\,{expr})" + continue + + blend_expr = ( + f"{v0:.3f}+({v1 - v0:.3f})*(t-{blend_start:.3f})/({blend_end - blend_start:.3f})" + ) + expr = ( + f"if(lt(t\\,{blend_start:.3f})\\,{v0:.3f}\\," + f"if(lt(t\\,{blend_end:.3f})\\,{blend_expr}\\,{expr}))" + ) + + if clamp_min is not None: + expr = f"max({clamp_min:.3f}\\,{expr})" + if clamp_max is not None: + expr = f"min({clamp_max:.3f}\\,{expr})" + if round_even: + expr = f"floor(({expr})/2)*2" + return expr + + +def _tracked_crop_x_expr( + *, + src_w: int, + crop_w: int, + tracking: list[TimedCenterPoint], +) -> str: + """Return an ffmpeg expression for a time-varying crop x position. + + We mostly hold each framing until the midpoint between adjacent samples, + then blend over a short window. That keeps edited talk footage from + drifting for seconds after a cut while still avoiding a one-frame jump + in the crop position. 
+ """ + if not tracking: + raise ValueError("tracking must not be empty") + + center_points = [ + (_clamp01(point.x_norm) * src_w, float(point.t_sec)) + for point in tracking + ] + center_expr = _tracked_value_expr( + center_points, + clamp_min=0.0, + clamp_max=float(src_w), + ) + max_x = max(0, src_w - crop_w) + return f"floor(max(0\\,min({max_x}\\,({center_expr})-{crop_w}/2))/2)*2" + + +def _tracked_crop_exprs( + *, + src_w: int, + src_h: int, + target_aspect: float, + default_zoom: float, + center_y_norm: float, + tracking: list[TimedCenterPoint], +) -> tuple[str, str, str, str]: + if not tracking: + raise ValueError("tracking must not be empty") + + base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect) + width_points: list[tuple[float, float]] = [] + height_points: list[tuple[float, float]] = [] + center_points: list[tuple[float, float]] = [] + for point in tracking: + zoom = max(1.0, float(point.zoom if point.zoom is not None else default_zoom)) + width_points.append((float(_even(max(2, int(round(base_cw / zoom))))), float(point.t_sec))) + height_points.append((float(_even(max(2, int(round(base_ch / zoom))))), float(point.t_sec))) + center_points.append((_clamp01(point.x_norm) * src_w, float(point.t_sec))) + + w_expr = _tracked_value_expr( + width_points, + clamp_min=2.0, + clamp_max=float(base_cw), + round_even=True, + ) + h_expr = _tracked_value_expr( + height_points, + clamp_min=2.0, + clamp_max=float(base_ch), + round_even=True, + ) + center_expr = _tracked_value_expr( + center_points, + clamp_min=0.0, + clamp_max=float(src_w), + ) + center_y_px = _clamp01(center_y_norm) * src_h + x_expr = f"floor(max(0\\,min({src_w}-out_w\\,({center_expr})-out_w/2))/2)*2" + y_expr = f"floor(max(0\\,min({src_h}-out_h\\,{center_y_px:.3f}-out_h/2))/2)*2" + return w_expr, h_expr, x_expr, y_expr + + +# --------------------------------------------------------------------------- +# Split helpers β€” shared by all three split layouts +# --------------------------------------------------------------------------- + + +# Minimum source-strip width for a split, as a fraction of source width. +# Prevents a chart/person bbox that hugs one edge from starving the other. +_MIN_SPLIT_STRIP_FRAC = 0.2 +_CHART_STRIP_VERTICAL_PAD_FRAC = 0.12 + + +@dataclass(frozen=True) +class _SplitStrip: + """A source-frame crop rectangle destined for one output band.""" + + cw: int + ch: int + x: int + y: int + + def filter_crop(self, input_label: str, out_w: int, band_h: int, out_label: str) -> str: + """Return ``[input]crop=...,scale=...,crop=...,setsar=1[out_label]``. + + Uses the "cover" convention: scale so the band is fully painted, then + center-crop any overflow. Bands always get filled β€” no letterbox bars. + """ + return ( + f"[{input_label}]crop={self.cw}:{self.ch}:{self.x}:{self.y}," + f"scale={out_w}:{band_h}:force_original_aspect_ratio=increase," + f"crop={out_w}:{band_h},setsar=1[{out_label}]" + ) + + +def _bbox_strip( + box: BoundingBox | None, + *, + src_w: int, + src_h: int, + x_start: int, + x_end: int, +) -> _SplitStrip: + """Build a source crop for one band. + + Horizontal range is fixed by ``[x_start, x_end)`` (from the seam math so + strips partition the source width). Vertical range comes from ``box`` + when available β€” that's what makes the chart **fill** the output band + instead of being squashed inside full-height source context. 
+ """ + x = _even(max(0, min(src_w - 2, x_start))) + cw = _even(max(2, min(src_w - x, x_end - x))) + + if box is not None: + y1 = int(round(_clamp01(box.y1) * float(src_h))) + y2 = int(round(_clamp01(box.y2) * float(src_h))) + y = _even(max(0, min(src_h - 2, y1))) + ch = _even(max(2, min(src_h - y, y2 - y))) + else: + y = 0 + ch = _even(src_h) + + return _SplitStrip(cw=cw, ch=ch, x=x, y=y) + + +def _chart_strip_with_vertical_pad( + strip: _SplitStrip, + *, + src_h: int, + pad_frac: float = _CHART_STRIP_VERTICAL_PAD_FRAC, +) -> _SplitStrip: + """Relax chart crops vertically so cover-scaling trims fewer chart edges.""" + + pad = _even(max(0, int(round(strip.ch * max(0.0, pad_frac))))) + if pad <= 0: + return strip + + top = max(0, strip.y - pad) + bottom = min(src_h, strip.y + strip.ch + pad) + ch = _even(max(2, bottom - top)) + if ch <= strip.ch: + return strip + y = _even(max(0, min(src_h - ch, top))) + return _SplitStrip(cw=strip.cw, ch=ch, x=strip.x, y=y) + + +def _compute_seam( + *, + left_box: BoundingBox | None, + right_box: BoundingBox | None, + src_w: int, + src_h: int, + default_fraction: float = 0.5, +) -> int: + """Return an even x-coordinate that partitions the source into two strips. + + When both bboxes are known, the seam is the midpoint of the gap/overlap + between ``left_box.x2`` and ``right_box.x1``. Falls back to + ``default_fraction * src_w`` (0.5 = even) otherwise. The seam is clamped + so neither strip is thinner than :data:`_MIN_SPLIT_STRIP_FRAC` of source. + """ + if left_box is not None and right_box is not None: + _, _, left_x, _ = _bbox_to_crop_pixels(left_box, src_w, src_h) + left_cw, _, _, _ = _bbox_to_crop_pixels(left_box, src_w, src_h) + _, _, right_x, _ = _bbox_to_crop_pixels(right_box, src_w, src_h) + + left_right = left_x + left_cw + seam = int(round((left_right + right_x) / 2.0)) + else: + seam = int(round(default_fraction * float(src_w))) + + seam = _even(seam) + min_strip = _even(max(2, int(round(src_w * _MIN_SPLIT_STRIP_FRAC)))) + if min_strip * 2 >= src_w: + min_strip = _even(max(2, src_w // 4)) + return max(min_strip, min(src_w - min_strip, seam)) + + +def _band_heights(out_h: int, top_ratio: float) -> tuple[int, int]: + """Return ``(top_h, bot_h)`` even band heights that sum to ``out_h``.""" + top_h = _even(int(round(out_h * top_ratio))) + top_h = max(2, min(out_h - 2, top_h)) + bot_h = out_h - top_h + return top_h, bot_h + + +def _stack_filtergraph( + *, + top_strip: _SplitStrip, + bot_strip: _SplitStrip, + out_w: int, + top_h: int, + bot_h: int, +) -> str: + """Compose the split filter graph: ``[0:v]split=2 β†’ two crops β†’ vstack β†’ [vout]``.""" + top_fg = top_strip.filter_crop("src1", out_w, top_h, "top") + bot_fg = bot_strip.filter_crop("src2", out_w, bot_h, "bot") + return ( + f"[0:v]split=2[src1][src2];" + f"{top_fg};" + f"{bot_fg};" + f"[top][bot]vstack=inputs=2[vout]" + ) + + +# --------------------------------------------------------------------------- +# Layout: single-subject (centered) β€” 1 person +# --------------------------------------------------------------------------- + + +def plan_zoom_call_center( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """1 person, tight zoom-call framing. 
``zoom`` clamped to ``>= 1.25``.""" + zoom = max(instruction.zoom, 1.25) + cw, ch, x, y = _center_crop_to_9x16(src_w, src_h, zoom, instruction.person_x_norm) + if instruction.person_tracking: + if any(point.zoom is not None for point in instruction.person_tracking): + w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs( + src_w=src_w, + src_h=src_h, + target_aspect=9 / 16, + default_zoom=zoom, + center_y_norm=0.5, + tracking=instruction.person_tracking, + ) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={cw}:{ch}:{x_expr}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + fg = ( + f"[0:v]crop={cw}:{ch}:{x}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + return FilterPlan(filtergraph=fg) + + +def plan_sit_center( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """1 person, interview/seated framing. Vertical center biased to ``0.48`` + so faces sit slightly above the 9:16 middle instead of centered on a + subject's chest. + """ + zoom = max(instruction.zoom, 1.0) + cw, ch, x, y = _crop_box( + src_w, src_h, 9 / 16, zoom, instruction.person_x_norm, 0.48 + ) + if instruction.person_tracking: + if any(point.zoom is not None for point in instruction.person_tracking): + w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs( + src_w=src_w, + src_h=src_h, + target_aspect=9 / 16, + default_zoom=zoom, + center_y_norm=0.48, + tracking=instruction.person_tracking, + ) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={cw}:{ch}:{x_expr}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + fg = ( + f"[0:v]crop={cw}:{ch}:{x}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + return FilterPlan(filtergraph=fg) + + +# --------------------------------------------------------------------------- +# Split layouts β€” 2 items stacked vertically +# --------------------------------------------------------------------------- + + +def plan_split_chart_person( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """1 chart + 1 person. + + **Horizontal partition.** Chart occupies the left source strip, person the + right strip. When both bboxes are set (Gemini vision), the seam sits at + the midpoint between ``chart.x2`` and ``person.x1`` so the strips are + complementary (no overlap, no gap). Otherwise the seam defaults to a + 2/3 | 1/3 split (chart left, person right), matching the Ark-style + explainer-slide geometry this codebase was originally written against. + + **Vertical crop.** Each strip's vertical extent comes from the + corresponding bbox when provided β€” crucial so the chart **fills** its + output band instead of being lost inside full-height source context + (plant, background, lower-third graphics, etc.). Falls back to full + source height when bboxes are unavailable. 
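+
+    Worked seam example (illustrative bboxes, 1920-wide source): a chart
+    spanning x 0.05..0.60 and a person spanning x 0.70..0.95 put the seam at
+    pixel 1248, the midpoint of the 1152..1344 gap, so the chart strip covers
+    x 0..1248 and the person strip covers x 1248..1920.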
+ + **Output bands.** Controlled by :attr:`LayoutInstruction.top_band_ratio` + (default 0.5 = even 50/50 β€” the user-requested symmetric look). Focus + stack order picks chart-on-top (default) vs person-on-top. + """ + + top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio) + + chart_box = instruction.split_chart_region + person_box = instruction.split_person_region + + if chart_box is not None and person_box is not None: + seam = _compute_seam( + left_box=chart_box, right_box=person_box, src_w=src_w, src_h=src_h + ) + chart_start = 0 + else: + # Historical default: chart = left 2/3, person = right 1/3 (the + # Ark-style explainer-slide geometry this codebase was originally + # written against). ``chart_x_norm`` trims the chart strip from its + # left edge when we have no vision bbox to do it precisely. + seam = _even(max(2, min(src_w - 2, int(round((2.0 / 3.0) * float(src_w)))))) + trim = int(round(_clamp01(instruction.chart_x_norm) * float(seam))) + chart_start = _even(max(0, min(seam - 2, trim))) + + chart_strip = _bbox_strip( + chart_box, src_w=src_w, src_h=src_h, x_start=chart_start, x_end=seam + ) + if chart_box is not None: + chart_strip = _chart_strip_with_vertical_pad(chart_strip, src_h=src_h) + person_strip = _bbox_strip( + person_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w + ) + return _emit_split( + chart_strip=chart_strip, + person_strip=person_strip, + order=instruction.focus_stack_order, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + + +def _emit_split( + *, + chart_strip: _SplitStrip, + person_strip: _SplitStrip, + order: FocusStackOrder, + out_w: int, + top_h: int, + bot_h: int, +) -> FilterPlan: + if order == FocusStackOrder.CHART_THEN_PERSON: + fg = _stack_filtergraph( + top_strip=chart_strip, + bot_strip=person_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + else: + fg = _stack_filtergraph( + top_strip=person_strip, + bot_strip=chart_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + return FilterPlan(filtergraph=fg) + + +def plan_split_two_persons( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """2 persons (interview two-up) stacked vertically. + + First person = ``split_person_region``, second person = ``split_second_person_region``. + Seam sits at the midpoint between the two bboxes when both are known; + otherwise defaults to a centered 50/50 split. + """ + top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio) + + left_box = instruction.split_person_region + right_box = instruction.split_second_person_region + + seam = _compute_seam( + left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h + ) + + left_strip = _bbox_strip( + left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam + ) + right_strip = _bbox_strip( + right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w + ) + fg = _stack_filtergraph( + top_strip=left_strip, + bot_strip=right_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + return FilterPlan(filtergraph=fg) + + +def plan_split_two_charts( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """2 charts stacked vertically. + + First chart = ``split_chart_region``, second chart = ``split_second_chart_region``. + Uses the same seam/bbox-y-crop recipe as the other splits, so each chart + fills its output band instead of being surrounded by source context. 
+ """ + top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio) + + left_box = instruction.split_chart_region + right_box = instruction.split_second_chart_region + + seam = _compute_seam( + left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h + ) + + left_strip = _bbox_strip( + left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam + ) + if left_box is not None: + left_strip = _chart_strip_with_vertical_pad(left_strip, src_h=src_h) + right_strip = _bbox_strip( + right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w + ) + if right_box is not None: + right_strip = _chart_strip_with_vertical_pad(right_strip, src_h=src_h) + fg = _stack_filtergraph( + top_strip=left_strip, + bot_strip=right_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + return FilterPlan(filtergraph=fg) + + +_DISPATCH = { + LayoutKind.ZOOM_CALL_CENTER: plan_zoom_call_center, + LayoutKind.SIT_CENTER: plan_sit_center, + LayoutKind.SPLIT_CHART_PERSON: plan_split_chart_person, + LayoutKind.SPLIT_TWO_PERSONS: plan_split_two_persons, + LayoutKind.SPLIT_TWO_CHARTS: plan_split_two_charts, +} + + +def plan_layout( + instruction: LayoutInstruction, + *, + out_w: int = 1080, + out_h: int = 1920, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """Dispatch to one of the five thrusters. + + Exhaustive over :class:`LayoutKind` β€” adding a new layout requires adding + a planner above **and** an entry in :data:`_DISPATCH`. + """ + + fn = _DISPATCH.get(instruction.layout) + if fn is None: + raise ValueError(f"Unknown layout: {instruction.layout!r}") + return fn(instruction, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h) diff --git a/humeo-core/src/humeo_core/primitives/select_clips.py b/humeo-core/src/humeo_core/primitives/select_clips.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd915d4d7478df3fe4c719ee80a7649da271f8a --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/select_clips.py @@ -0,0 +1,150 @@ +"""Clip selection: pick the strongest 30-60s segments from a long source. + +Two backends, same contract: + +* ``select_clips_heuristic`` β€” greedy word-density scoring. Uses the + transcript alone; zero model calls. Good baseline when transcript exists. +* ``select_clips_with_llm`` β€” pluggable LLM hook. Caller provides a + ``(prompt_text) -> str`` function that must return strict JSON matching + the ``ClipPlan`` schema. We re-validate before returning. + +Both return a ``ClipPlan``. +""" + +from __future__ import annotations + +import json +from typing import Callable + +from ..schemas import Clip, ClipPlan, TranscriptWord + + +LLMTextFn = Callable[[str], str] + + +CLIP_SELECTOR_PROMPT_TEMPLATE = """You are a viral-clip selector for a podcast editor. +Return ONLY JSON matching this shape: + +{{ + "source_path": "{source_path}", + "clips": [ + {{ + "clip_id": "001", + "topic": "", + "start_time_sec": , + "end_time_sec": , + "viral_hook": "", + "virality_score": <0..1>, + "transcript": "", + "suggested_overlay_title": "<<=6 words>" + }} + ] +}} + +Pick {target_count} clips, each {min_sec}-{max_sec} seconds long, NO overlaps, sorted by virality_score desc. 
+ +Transcript (word, start, end): +{transcript} +""" + + +def _words_in_window( + words: list[TranscriptWord], start: float, end: float +) -> list[TranscriptWord]: + return [w for w in words if w.start_time >= start and w.end_time <= end] + + +def select_clips_heuristic( + source_path: str, + words: list[TranscriptWord], + duration_sec: float, + *, + target_count: int = 5, + min_sec: float = 30.0, + max_sec: float = 60.0, + step_sec: float = 5.0, +) -> ClipPlan: + """Greedy: slide a window, score by words/sec, take top non-overlapping picks.""" + + if duration_sec <= min_sec or not words: + # No sensible windowing possible; return one clip of the whole thing. + end = min(duration_sec, max_sec) if duration_sec > 0 else max_sec + return ClipPlan( + source_path=source_path, + clips=[ + Clip( + clip_id="001", + topic="Full source", + start_time_sec=0.0, + end_time_sec=max(end, 1.0), + viral_hook="", + virality_score=0.5, + transcript=" ".join(w.word for w in words), + suggested_overlay_title="Highlight", + ) + ], + ) + + candidates: list[tuple[float, float, float, str]] = [] + window = (min_sec + max_sec) / 2.0 + t = 0.0 + while t + window <= duration_sec: + ws = _words_in_window(words, t, t + window) + if ws: + density = len(ws) / window + text = " ".join(w.word for w in ws) + candidates.append((density, t, t + window, text)) + t += step_sec + + candidates.sort(key=lambda c: c[0], reverse=True) + picked: list[tuple[float, float, float, str]] = [] + for c in candidates: + if len(picked) >= target_count: + break + if all(c[2] <= p[1] or c[1] >= p[2] for p in picked): + picked.append(c) + picked.sort(key=lambda c: c[1]) + + clips: list[Clip] = [] + for i, (density, s, e, text) in enumerate(picked, start=1): + norm = min(1.0, density / 3.0) # ~3 words/sec is dense talking + clips.append( + Clip( + clip_id=f"{i:03d}", + topic=text.split(".")[0][:60] or f"Clip {i}", + start_time_sec=round(s, 2), + end_time_sec=round(e, 2), + viral_hook=text[:120], + virality_score=round(norm, 3), + transcript=text, + suggested_overlay_title=(text.split(".")[0][:40] or f"Clip {i}"), + ) + ) + return ClipPlan(source_path=source_path, clips=clips) + + +def select_clips_with_llm( + source_path: str, + words: list[TranscriptWord], + *, + target_count: int, + min_sec: float, + max_sec: float, + text_fn: LLMTextFn, +) -> ClipPlan: + transcript_lines = "\n".join( + f"{w.word}\t{w.start_time:.2f}\t{w.end_time:.2f}" for w in words + ) + prompt = CLIP_SELECTOR_PROMPT_TEMPLATE.format( + source_path=source_path, + target_count=target_count, + min_sec=min_sec, + max_sec=max_sec, + transcript=transcript_lines, + ) + raw = text_fn(prompt) + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise ValueError(f"LLM did not return JSON: {e}; raw={raw[:200]!r}") from e + return ClipPlan.model_validate(data) diff --git a/humeo-core/src/humeo_core/primitives/vision.py b/humeo-core/src/humeo_core/primitives/vision.py new file mode 100644 index 0000000000000000000000000000000000000000..938fd3951876a8f7fd1f39964bc91e14bdc61f70 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/vision.py @@ -0,0 +1,210 @@ +"""Vision-LLM + OCR primitive β€” the alt path to per-scene framing decisions. + +Design (Bryan's "big screen change -> v3 images -> LLM+OCR -> bbox" idea): + +1. Scene detection already produces one keyframe per scene (deterministic, + local, cheap). That is ``primitives/ingest.py::extract_keyframes``. +2. For each keyframe, call a pluggable vision LLM with an OCR hint. 
The + model returns normalized bboxes for the on-screen roles it cares about + (``person``, ``chart``) plus any OCR text it reads. +3. Fold those bboxes into ``LayoutInstruction`` values so the existing + layout planner (``primitives/layouts.py``) does the actual ffmpeg math. + +Why this shape: + +* **Pluggable**. Caller supplies ``LLMRegionFn``. We never hard-code a + provider. The same primitive works for Gemini, GPT-4o, internal models, + tests, or mocks. +* **Schema-validated**. Raw model output is parsed into ``SceneRegions`` + (Pydantic). Malformed output degrades to ``None`` regions rather than + crashing or corrupting downstream state. +* **Separable**. ``detect_regions_with_llm`` is one function. Mapping + regions to ``LayoutInstruction`` is another. Mapping a ``LayoutKind`` + guess from regions is a third. Each is independently testable. +""" + +from __future__ import annotations + +import json +from typing import Callable + +from ..schemas import ( + BoundingBox, + LayoutInstruction, + LayoutKind, + Scene, + SceneClassification, + SceneRegions, +) + + +LLMRegionFn = Callable[[str, str], str] +"""Signature: (keyframe_path, prompt) -> raw model string (expected JSON). + +The caller is responsible for any image encoding (base64, multipart, etc.). +The primitive only passes the path + prompt and re-validates the reply. +""" + + +REGION_PROMPT = """You are a vision+OCR system for a short-video editor. +Look at the provided keyframe and return a STRICT JSON object of this shape: + +{ + "person_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null, + "chart_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null, + "ocr_text": "", + "reason": "<= 20 words of rationale" +} + +Rules: +- All bbox coordinates are normalized to the frame (0=left/top, 1=right/bottom). +- x2 > x1, y2 > y1. +- Return null for any region that is not present (e.g. a pure talking-head + scene has no chart). +- "person_bbox" is the *speaker's* body/head region if visible. +- "chart_bbox" is any chart, graph, slide, screenshare, or diagram. +- OCR text should be the readable text on screen (titles, labels, chart + axis values). Omit subtitle captions. +- NO markdown, NO prose outside JSON. JSON only. +""" + + +# --------------------------------------------------------------------------- +# Core: detect regions per scene via pluggable LLM +# --------------------------------------------------------------------------- + + +def detect_regions_with_llm( + scenes: list[Scene], vision_fn: LLMRegionFn +) -> list[SceneRegions]: + """Call ``vision_fn`` for each scene's keyframe and return parsed regions. + + Parse failures degrade to an empty ``SceneRegions`` with ``raw_reason`` + describing the error β€” never raise β€” so a single bad scene can't take + down the whole pipeline. 
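+
+    A minimal sketch of a ``vision_fn`` stub (hypothetical; a real adapter
+    must read and encode the keyframe image itself before calling its model):
+
+        def stub_vision(keyframe_path: str, prompt: str) -> str:
+            return '{"person_bbox": null, "chart_bbox": null, "ocr_text": "", "reason": "stub"}'
+
+        regions = detect_regions_with_llm(scenes, stub_vision)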
+ """ + + out: list[SceneRegions] = [] + for s in scenes: + if not s.keyframe_path: + out.append( + SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available") + ) + continue + raw = vision_fn(s.keyframe_path, REGION_PROMPT) + out.append(_parse_region_reply(s.scene_id, raw)) + return out + + +def _parse_region_reply(scene_id: str, raw: str) -> SceneRegions: + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + return SceneRegions(scene_id=scene_id, raw_reason=f"JSON parse error: {e!r}") + + def _opt_bbox(value: object) -> BoundingBox | None: + if not value: + return None + try: + return BoundingBox.model_validate(value) + except Exception: + return None + + return SceneRegions( + scene_id=scene_id, + person_bbox=_opt_bbox(data.get("person_bbox")), + chart_bbox=_opt_bbox(data.get("chart_bbox")), + ocr_text=str(data.get("ocr_text", ""))[:4000], + raw_reason=str(data.get("reason", ""))[:400], + ) + + +# --------------------------------------------------------------------------- +# Derivation: regions -> LayoutKind / LayoutInstruction +# --------------------------------------------------------------------------- + + +# Width threshold: if the chart bbox covers this much of the frame width, it +# is wide enough to treat the scene as a split_chart_person. Tuned for the +# source videos described in the spec (chart ~2/3 of width). +_CHART_WIDTH_SPLIT_THRESHOLD = 0.45 + + +def classify_from_regions(regions: SceneRegions) -> SceneClassification: + """Pick a ``LayoutKind`` for a scene using only its ``SceneRegions``. + + Priority: + 1. If ``chart_bbox`` is present and wide, it's ``SPLIT_CHART_PERSON``. + 2. Else if ``person_bbox`` is present and tight, ``ZOOM_CALL_CENTER``. + 3. Else default to ``SIT_CENTER`` with low confidence. + + "Tight" β‰ˆ the person covers more than half the frame width (zoom-call + webcam framing). "Wide" for a chart β‰ˆ 45% of frame width or more. + """ + + if regions.chart_bbox and regions.chart_bbox.width >= _CHART_WIDTH_SPLIT_THRESHOLD: + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.SPLIT_CHART_PERSON, + confidence=float(min(1.0, 0.5 + regions.chart_bbox.width / 2.0)), + reason=f"chart bbox covers {regions.chart_bbox.width:.2f} of width", + ) + if regions.person_bbox and regions.person_bbox.width >= 0.5: + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.ZOOM_CALL_CENTER, + confidence=float(min(1.0, 0.5 + regions.person_bbox.width / 2.0)), + reason=f"person bbox wide ({regions.person_bbox.width:.2f}) β€” tight framing", + ) + if regions.person_bbox: + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.7, + reason="person present, no wide chart, wider framing", + ) + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.3, + reason=regions.raw_reason or "no regions detected β€” defaulting to sit_center", + ) + + +def layout_instruction_from_regions( + regions: SceneRegions, + classification: SceneClassification, + *, + clip_id: str | None = None, + zoom: float = 1.0, +) -> LayoutInstruction: + """Build a ``LayoutInstruction`` whose knobs are populated from bboxes. + + ``person_x_norm`` uses the person bbox center when available; falls back + to 0.5 (center). ``chart_x_norm`` uses the chart bbox left edge; falls + back to 0.0. 
+ """ + + person_x = regions.person_bbox.center_x if regions.person_bbox else 0.5 + chart_x = regions.chart_bbox.x1 if regions.chart_bbox else 0.0 + return LayoutInstruction( + clip_id=clip_id or classification.scene_id, + layout=classification.layout, + zoom=zoom, + person_x_norm=person_x, + chart_x_norm=chart_x, + ) + + +def classify_scenes_with_vision_llm( + scenes: list[Scene], vision_fn: LLMRegionFn +) -> list[tuple[SceneRegions, SceneClassification]]: + """One-shot helper: keyframes -> regions -> classifications. + + Returns ``(regions, classification)`` pairs per scene so the caller can + keep both artefacts on disk (regions = deep detail, classification = + what a renderer consumes). + """ + + regions = detect_regions_with_llm(scenes, vision_fn) + return [(r, classify_from_regions(r)) for r in regions] diff --git a/humeo-core/src/humeo_core/schemas.py b/humeo-core/src/humeo_core/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..0cee4af91664103ca45f060427ecef25e6ea4c78 --- /dev/null +++ b/humeo-core/src/humeo_core/schemas.py @@ -0,0 +1,518 @@ +"""Strict JSON contracts β€” the "container" of the rocket. + +Every primitive reads and writes these. No primitive takes or returns free-form +strings. This is the non-negotiable interface described in the HIVE paper +guide (section 7): machine-checkable intermediate artifacts at every stage. +""" + +from __future__ import annotations + +from enum import Enum +from typing import Literal + +from pydantic import BaseModel, Field, field_validator, model_serializer, model_validator + + +# --------------------------------------------------------------------------- +# Extraction artifacts +# --------------------------------------------------------------------------- + + +class Scene(BaseModel): + """A single shot/scene detected in the source video.""" + + scene_id: str + start_time: float = Field(ge=0) + end_time: float = Field(gt=0) + keyframe_path: str | None = None + + @field_validator("end_time") + @classmethod + def _end_after_start(cls, v: float, info) -> float: + start = info.data.get("start_time", 0.0) + if v <= start: + raise ValueError("end_time must be strictly greater than start_time") + return v + + @property + def duration(self) -> float: + return self.end_time - self.start_time + + +class TranscriptWord(BaseModel): + """One ASR token with times in **seconds on the source video** timeline.""" + + word: str + start_time: float = Field(ge=0) + end_time: float = Field(ge=0) + + +class ClipSubtitleWords(BaseModel): + """Words for one clip with times in **seconds relative to clip start** (t=0 at cut in-point).""" + + words: list[TranscriptWord] = Field(default_factory=list) + + +class FocusStackOrder(str, Enum): + """Vertical order for split layouts: which item occupies the top vs bottom band. + + Bands are split by :attr:`LayoutInstruction.top_band_ratio` (default 0.5 = even). + For ``SPLIT_CHART_PERSON`` this picks chart-on-top vs person-on-top. + For ``SPLIT_TWO_PERSONS`` / ``SPLIT_TWO_CHARTS`` it has no visible meaning + (both bands hold the same kind of item); the enum value is retained only + so a single stacking recipe drives all three split layouts. 
+ """ + + CHART_THEN_PERSON = "chart_then_person" + PERSON_THEN_CHART = "person_then_chart" + + +class RenderTheme(str, Enum): + """Visual treatment applied by the final renderer.""" + + LEGACY = "legacy" + REFERENCE_LOWER_THIRD = "reference_lower_third" + NATIVE_HIGHLIGHT = "native_highlight" + + +class IngestResult(BaseModel): + """Everything Stage 1 (deterministic local extraction) produces.""" + + source_path: str + duration_sec: float + scenes: list[Scene] + transcript_words: list[TranscriptWord] + keyframes_dir: str | None = None + + +# --------------------------------------------------------------------------- +# Layout system β€” the 5 "thrusters" (max 2 on-screen items per short) +# --------------------------------------------------------------------------- + + +class LayoutKind(str, Enum): + """The 9:16 layouts. A short contains **at most two** on-screen items. + + An "item" is one of ``person`` (a human speaker) or ``chart`` (slide, graph, + data visual, screenshare). Five combinations are allowed: + + - ``ZOOM_CALL_CENTER``: **1 person**, tight webcam/zoom-call framing, centered. + - ``SIT_CENTER``: **1 person**, interview/seated framing, centered. + - ``SPLIT_CHART_PERSON``: **1 chart + 1 person** β€” chart + speaker share the + source frame. Output stacks them vertically + (by default ``focus_stack_order`` = chart-on-top). + - ``SPLIT_TWO_PERSONS``: **2 persons** β€” two speakers (e.g. interview two-up). + Output stacks them vertically. + - ``SPLIT_TWO_CHARTS``: **2 charts** β€” two charts/slides side-by-side in source. + Output stacks them vertically. + + The "max 2 items" constraint is the keep-it-simple rule: every rendered short + is either one item centered, or two items stacked evenly top/bottom. + """ + + ZOOM_CALL_CENTER = "zoom_call_center" + SIT_CENTER = "sit_center" + SPLIT_CHART_PERSON = "split_chart_person" + SPLIT_TWO_PERSONS = "split_two_persons" + SPLIT_TWO_CHARTS = "split_two_charts" + + +# Layouts that stack two items vertically in the 9:16 output. +SPLIT_LAYOUTS: frozenset[LayoutKind] = frozenset( + { + LayoutKind.SPLIT_CHART_PERSON, + LayoutKind.SPLIT_TWO_PERSONS, + LayoutKind.SPLIT_TWO_CHARTS, + } +) + + +class TimedCenterPoint(BaseModel): + """Speaker x-center at a clip-relative time, used for tracked centering.""" + + t_sec: float = Field(ge=0.0) + x_norm: float = Field(ge=0.0, le=1.0) + zoom: float | None = Field( + default=None, + gt=0.0, + le=4.0, + description=( + "Optional per-sample crop zoom. When unset, the layout uses the " + "clip-level ``zoom`` value for that moment." + ), + ) + + +class ClipRenderSpan(BaseModel): + """One kept source-timeline span inside a selected clip.""" + + start_time_sec: float = Field(ge=0.0) + end_time_sec: float = Field(gt=0.0) + + @field_validator("end_time_sec") + @classmethod + def _end_after_start(cls, v: float, info) -> float: + start = info.data.get("start_time_sec", 0.0) + if v <= start: + raise ValueError("render span end_time_sec must be greater than start_time_sec") + return v + + @property + def duration_sec(self) -> float: + return self.end_time_sec - self.start_time_sec + + +class LayoutInstruction(BaseModel): + """Per-clip decision telling the compiler which layout to apply and how to crop. + + Every short is described by exactly one of these, keyed by ``clip_id``. Split + layouts additionally carry up to two normalized bounding boxes (chart/person + or two-of-a-kind) so the compiler crops source strips that **partition** the + source width without overlap or gap. 
+ """ + + clip_id: str + layout: LayoutKind + # Optional per-layout knobs. Defaults are sane for a 1920x1080 source. + zoom: float = Field(default=1.0, gt=0, le=4.0) + person_x_norm: float = Field( + default=0.5, + ge=0.0, + le=1.0, + description="Normalized x-center of the human subject in source frame (0=left, 1=right).", + ) + person_tracking: list[TimedCenterPoint] = Field( + default_factory=list, + description=( + "Optional clip-relative speaker framing samples for moving 9:16 crops. " + "Each point can shift the x-center and optionally widen/tighten the crop " + "for that moment. When empty, the compiler uses the static " + "person_x_norm/zoom settings." + ), + ) + chart_x_norm: float = Field( + default=0.0, + ge=0.0, + le=1.0, + description=( + "split_chart_person only: left-edge trim of the chart strip, as a fraction of the " + "left 2/3 pane (0 = use full chart area)." + ), + ) + focus_stack_order: FocusStackOrder = Field( + default=FocusStackOrder.CHART_THEN_PERSON, + description="For split_chart_person only: chart-on-top vs person-on-top in the 9:16 stack.", + ) + split_chart_region: BoundingBox | None = Field( + default=None, + description=( + "Optional normalized rect for the chart/slide crop (Gemini vision). " + "When set with split_person_region, the split layout uses these boxes instead of fixed 2/3|1/3." + ), + ) + split_person_region: BoundingBox | None = Field( + default=None, + description="Optional normalized rect for the speaker crop (Gemini vision).", + ) + split_second_chart_region: BoundingBox | None = Field( + default=None, + description=( + "For ``SPLIT_TWO_CHARTS`` only: second chart bbox. The first chart occupies " + "the top output band, this one occupies the bottom band." + ), + ) + split_second_person_region: BoundingBox | None = Field( + default=None, + description=( + "For ``SPLIT_TWO_PERSONS`` only: second speaker bbox. The first person " + "occupies the top output band, this one occupies the bottom band." + ), + ) + top_band_ratio: float = Field( + default=0.5, + ge=0.2, + le=0.8, + description=( + "Fraction of 9:16 output height used by the top band for split layouts. " + "0.5 = EVEN 50/50 split (default β€” the user-requested symmetric look). " + "0.6 historically matched the 'chart dominant / person small' look." + ), + ) + + + @field_validator("person_tracking") + @classmethod + def _tracking_times_non_decreasing( + cls, points: list[TimedCenterPoint] + ) -> list[TimedCenterPoint]: + last_t = -1.0 + for point in points: + if point.t_sec < last_t: + raise ValueError("person_tracking times must be non-decreasing") + last_t = point.t_sec + return points + + +class SceneClassification(BaseModel): + """Result of the classifier: which layout should a given scene use.""" + + scene_id: str + layout: LayoutKind + confidence: float = Field(ge=0.0, le=1.0) + reason: str = "" + + +# --------------------------------------------------------------------------- +# Vision bounding boxes β€” the LLM+OCR path (alt to pixel heuristics) +# --------------------------------------------------------------------------- + + +class BoundingBox(BaseModel): + """Normalized [0..1] bounding box in the source frame coordinate space. + + Normalized coords keep these outputs portable across source resolutions + and stop the model hallucinating pixel values. ``x2 > x1`` and + ``y2 > y1`` are enforced. 
+ """ + + x1: float = Field(ge=0.0, le=1.0) + y1: float = Field(ge=0.0, le=1.0) + x2: float = Field(ge=0.0, le=1.0) + y2: float = Field(ge=0.0, le=1.0) + label: str = "" + confidence: float = Field(default=1.0, ge=0.0, le=1.0) + + @field_validator("x2") + @classmethod + def _x2_after_x1(cls, v: float, info) -> float: + x1 = info.data.get("x1", 0.0) + if v <= x1: + raise ValueError("x2 must be > x1") + return v + + @field_validator("y2") + @classmethod + def _y2_after_y1(cls, v: float, info) -> float: + y1 = info.data.get("y1", 0.0) + if v <= y1: + raise ValueError("y2 must be > y1") + return v + + @property + def center_x(self) -> float: + return (self.x1 + self.x2) / 2.0 + + @property + def center_y(self) -> float: + return (self.y1 + self.y2) / 2.0 + + @property + def width(self) -> float: + return self.x2 - self.x1 + + +class SceneRegions(BaseModel): + """Vision-LLM output for a single scene keyframe. + + Flow: detect a scene change locally (cheap) -> extract one keyframe per + scene -> send that keyframe to a vision LLM with an OCR hint -> get + normalized bounding boxes for the on-screen roles (``person``, + ``chart``). Those boxes drive ``person_x_norm`` / ``chart_x_norm`` on a + ``LayoutInstruction`` without any pixel code running in Python. + """ + + scene_id: str + person_bbox: BoundingBox | None = None + chart_bbox: BoundingBox | None = None + ocr_text: str = "" + raw_reason: str = "" + + +# --------------------------------------------------------------------------- +# Clip planning +# --------------------------------------------------------------------------- + + +class Clip(BaseModel): + clip_id: str + topic: str + start_time_sec: float = Field(ge=0) + end_time_sec: float = Field(gt=0) + viral_hook: str = "" + virality_score: float = Field(default=0.0, ge=0.0, le=1.0) + transcript: str = "" + suggested_overlay_title: str = "" + layout: LayoutKind | None = None + score_breakdown: dict[str, float] | None = None + origin: Literal["text", "visual", "both"] = "text" + visual_notes: str | None = None + reasoning: str | None = None + + # Optional LLM metadata (source timeline is start_time_sec / end_time_sec). + hook_start_sec: float | None = Field( + default=None, + description="Seconds from clip in-point where the viral hook begins (0 = clip start).", + ) + hook_end_sec: float | None = Field( + default=None, + description="Seconds from clip in-point where the hook ends (exclusive upper bound).", + ) + trim_start_sec: float = Field( + default=0.0, + ge=0, + description="Seconds to remove from the start of this segment when exporting.", + ) + trim_end_sec: float = Field( + default=0.0, + ge=0, + description="Seconds to remove from the end of this segment when exporting.", + ) + render_spans: list[ClipRenderSpan] = Field( + default_factory=list, + description=( + "Optional ordered source-timeline spans to keep when exporting. " + "When present, these spans override contiguous trim_start/trim_end export." 
+ ), + ) + shorts_title: str = "" + description: str = "" + hashtags: list[str] = Field(default_factory=list) + layout_hint: LayoutKind | None = None + needs_review: bool = False + review_reason: str = "" + + @field_validator("score_breakdown") + @classmethod + def _score_breakdown_in_range( + cls, v: dict[str, float] | None + ) -> dict[str, float] | None: + if v is None: + return None + cleaned: dict[str, float] = {} + for axis, score in v.items(): + if score < 0.0: + raise ValueError(f"score_breakdown[{axis!r}] must be non-negative") + cleaned[axis] = min(score, 1.0) + return cleaned + + @model_validator(mode="after") + def _timing_consistency(self) -> "Clip": + if self.end_time_sec <= self.start_time_sec: + raise ValueError("end_time_sec must be greater than start_time_sec") + dur = self.end_time_sec - self.start_time_sec + hs, he = self.hook_start_sec, self.hook_end_sec + if (hs is None) ^ (he is None): + raise ValueError("hook_start_sec and hook_end_sec must both be set or both omitted") + if hs is not None and he is not None: + if not (0 <= hs < he <= dur): + raise ValueError( + "hook window must satisfy 0 <= hook_start_sec < hook_end_sec <= clip duration" + ) + if self.trim_start_sec + self.trim_end_sec > dur: + raise ValueError("trim_start_sec + trim_end_sec must not exceed clip duration") + last_end = None + for span in self.render_spans: + if span.start_time_sec < self.start_time_sec - 1e-6: + raise ValueError("render_spans must stay within the clip start_time_sec") + if span.end_time_sec > self.end_time_sec + 1e-6: + raise ValueError("render_spans must stay within the clip end_time_sec") + if last_end is not None and span.start_time_sec < last_end - 1e-6: + raise ValueError("render_spans must be ordered and non-overlapping") + last_end = span.end_time_sec + return self + + @model_serializer(mode="wrap") + def _serialize_without_default_extensions(self, handler): + data = handler(self) + if data.get("score_breakdown") is None: + data.pop("score_breakdown", None) + if data.get("origin") == "text": + data.pop("origin", None) + if data.get("visual_notes") is None: + data.pop("visual_notes", None) + if data.get("reasoning") is None: + data.pop("reasoning", None) + return data + + @property + def duration_sec(self) -> float: + return self.end_time_sec - self.start_time_sec + + +class ClipPlan(BaseModel): + """Output of the clip-selection stage β€” a list of clips + their layouts.""" + + source_path: str + clips: list[Clip] + + +class ApprovalResult(BaseModel): + action: Literal["proceed", "refine", "quit", "accept_all"] + selected_ids: list[str] | None = None + steering_note: str | None = None + + +class RatingFeedback(BaseModel): + rating: Literal[1, 2, 3] + issues: list[ + Literal[ + "wrong_moments", + "bad_cuts", + "boring", + "confusing", + "wrong_layout", + "length_off", + "other", + ] + ] = Field(default_factory=list) + free_text: str | None = None + + +class SessionState(BaseModel): + source_key: str = "" + iteration: int = 0 + steering_notes: list[str] = Field(default_factory=list) + last_rating: RatingFeedback | None = None + last_selected_ids: list[str] | None = None + + +# --------------------------------------------------------------------------- +# Render +# --------------------------------------------------------------------------- + + +class RenderRequest(BaseModel): + source_path: str + clip: Clip + layout: LayoutInstruction + output_path: str + width: int = 1080 + height: int = 1920 + subtitle_path: str | None = None + subtitle_font_size: int = Field( + default=48, + 
ge=10, + le=120, + description=( + "Caption font size in **output pixels** (libass is pinned to " + "``original_size=width x height`` by the compiler, so this is a " + "true pixel value, not the old PlayResY=288 unit)." + ), + ) + subtitle_margin_v: int = Field( + default=160, + ge=0, + le=800, + description="Vertical caption margin in output pixels (bottom-anchored).", + ) + title_text: str = "" + render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT + mode: Literal["normal", "dry_run"] = "normal" + + +class RenderResult(BaseModel): + clip_id: str + output_path: str + ffmpeg_cmd: list[str] + success: bool + error: str = "" diff --git a/humeo-core/src/humeo_core/server.py b/humeo-core/src/humeo_core/server.py new file mode 100644 index 0000000000000000000000000000000000000000..610b66778ad1a61f307fa67927fcb7a4bf315b7e --- /dev/null +++ b/humeo-core/src/humeo_core/server.py @@ -0,0 +1,332 @@ +"""FastMCP server β€” the control panel for the reusable rocket. + +Every primitive is exposed as a single MCP ``tool``. Each tool takes and +returns strict Pydantic-validated JSON, so an MCP client (Cursor, Claude +Desktop, etc.) can compose a full long-to-short pipeline without guessing +any interface. + +Tools: + + humeo.ingest β€” Stage 1 extraction (scenes + keyframes [+ transcript]) + humeo.classify_scenes β€” Assign one of 5 layouts to each scene (pixel heuristic) + humeo.classify_scenes_with_vision β€” Assign layouts using bboxes from a vision LLM + OCR + humeo.detect_scene_regions β€” Raw LLM bbox output per scene keyframe (OCR-assisted) + humeo.select_clips β€” Pick top clips from a transcript (heuristic) + humeo.plan_layout β€” Return the ffmpeg filtergraph for a given layout + humeo.build_render_cmd β€” Build the full ffmpeg command (dry-run safe) + humeo.render_clip β€” Build + actually run ffmpeg to produce a 9:16 clip + humeo.list_layouts β€” List the 5 available layouts (discovery) + +Resources: + + humeo://layouts β€” JSON listing of the 5 layouts + description +""" + +from __future__ import annotations + +import json +from typing import Any + +from mcp.server.fastmcp import FastMCP + +from .primitives import classify as classify_mod +from .primitives import compile as compile_mod +from .primitives import ingest as ingest_mod +from .primitives import layouts as layouts_mod +from .primitives import select_clips as select_mod +from .primitives import vision as vision_mod +from .schemas import ( + IngestResult, + LayoutInstruction, + LayoutKind, + RenderRequest, + RenderResult, + Scene, + SceneRegions, + TranscriptWord, +) + + +mcp = FastMCP( + "humeo-core", + instructions=( + "Humeo MCP: reusable primitives for turning long videos into 9:16 shorts. " + "Compose tools in this order: ingest -> classify_scenes -> select_clips -> " + "plan_layout/build_render_cmd -> render_clip. All IO is strict JSON." + ), +) + + +# --------------------------------------------------------------------------- +# Discovery +# --------------------------------------------------------------------------- + + +@mcp.tool() +def list_layouts() -> dict[str, Any]: + """Return the 5 fixed 9:16 layouts this server supports. + + Every short shows **at most two** on-screen items (person/chart), which + gives exactly five recipes. Use this to discover the set of + :class:`LayoutKind` values before classifying scenes or requesting + renders. 
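+
+    Each entry in the returned ``layouts`` list has the shape (abridged):
+
+        {"kind": "zoom_call_center", "items": ["person"], "description": "..."}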
+ """ + + return { + "layouts": [ + { + "kind": LayoutKind.ZOOM_CALL_CENTER.value, + "items": ["person"], + "description": "1 person, tight zoom-call / webcam framing, centered.", + }, + { + "kind": LayoutKind.SIT_CENTER.value, + "items": ["person"], + "description": "1 person, interview / seated framing, centered.", + }, + { + "kind": LayoutKind.SPLIT_CHART_PERSON.value, + "items": ["chart", "person"], + "description": ( + "1 chart + 1 person. Source is partitioned left/right by the chart and " + "person bboxes (falling back to a 2/3 | 1/3 split); each strip is scaled " + "to fill its output band. Bands default to an even 50/50 vertical split; " + "configurable via ``top_band_ratio`` and swappable via ``focus_stack_order``." + ), + }, + { + "kind": LayoutKind.SPLIT_TWO_PERSONS.value, + "items": ["person", "person"], + "description": ( + "2 people (interview two-up / panel). Left speaker in the top band, right " + "speaker in the bottom band; seam sits between the two person bboxes." + ), + }, + { + "kind": LayoutKind.SPLIT_TWO_CHARTS.value, + "items": ["chart", "chart"], + "description": ( + "2 charts / slides side-by-side in source. Left chart on top, right chart " + "on bottom; each is scaled to fill its band." + ), + }, + ] + } + + +@mcp.resource("humeo://layouts") +def layouts_resource() -> str: + return json.dumps(list_layouts(), indent=2) + + +# --------------------------------------------------------------------------- +# Landing gear: ingest +# --------------------------------------------------------------------------- + + +@mcp.tool() +def ingest( + source_path: str, + work_dir: str, + with_transcript: bool = False, + whisper_model: str = "base", +) -> dict[str, Any]: + """Run deterministic local extraction (scenes + keyframes, optional transcript). + + Args: + source_path: absolute path to a local video file. + work_dir: directory where keyframes/ and temp artifacts will be written. + with_transcript: if True, run faster-whisper word-level transcription. + whisper_model: whisper model name (e.g. "tiny", "base", "small"). + """ + + result: IngestResult = ingest_mod.ingest( + source_path, + work_dir, + with_transcript=with_transcript, + whisper_model=whisper_model, + ) + return result.model_dump() + + +# --------------------------------------------------------------------------- +# Pilot: classify scenes +# --------------------------------------------------------------------------- + + +@mcp.tool() +def classify_scenes(scenes: list[dict[str, Any]]) -> dict[str, Any]: + """Classify each scene into exactly one of the 5 supported layouts. + + Uses an offline pixel heuristic on each scene's keyframe. Agents that + want a smarter classifier can post-process or overwrite the result, + or call ``classify_scenes_with_vision`` with bboxes from a vision LLM. + """ + + parsed = [Scene.model_validate(s) for s in scenes] + results = classify_mod.classify_scenes_heuristic(parsed) + return {"classifications": [r.model_dump() for r in results]} + + +# --------------------------------------------------------------------------- +# Pilot (alt path): vision-LLM + OCR bbox classifier +# --------------------------------------------------------------------------- + + +@mcp.tool() +def detect_scene_regions(scenes: list[dict[str, Any]]) -> dict[str, Any]: + """Return the prompt + per-scene stubs used for LLM+OCR bbox detection. + + This tool is the *adapter* half of the vision primitive. The MCP server + itself never calls an LLM β€” the agent does. So this endpoint returns: + + 1. 
the exact ``REGION_PROMPT`` to send along with each keyframe, and + 2. a list of ``{scene_id, keyframe_path, prompt}`` jobs. + + The agent runs its own vision model for each job, then feeds the + resulting JSON back via ``classify_scenes_with_vision``. + """ + + parsed = [Scene.model_validate(s) for s in scenes] + return { + "prompt": vision_mod.REGION_PROMPT, + "jobs": [ + { + "scene_id": s.scene_id, + "keyframe_path": s.keyframe_path, + "prompt": vision_mod.REGION_PROMPT, + } + for s in parsed + ], + } + + +@mcp.tool() +def classify_scenes_with_vision(regions: list[dict[str, Any]]) -> dict[str, Any]: + """Classify scenes from already-gathered ``SceneRegions`` bbox records. + + Input is a list of ``SceneRegions`` JSON dicts (output of the agent's + vision-LLM pass). Output is a ``{classifications, layout_instructions}`` + pair β€” the layout kind per scene plus a ready-to-render + ``LayoutInstruction`` with ``person_x_norm`` / ``chart_x_norm`` already + populated from the bboxes. + """ + + parsed_regions = [SceneRegions.model_validate(r) for r in regions] + classifications = [vision_mod.classify_from_regions(r) for r in parsed_regions] + instructions = [ + vision_mod.layout_instruction_from_regions(r, c) + for r, c in zip(parsed_regions, classifications) + ] + return { + "classifications": [c.model_dump() for c in classifications], + "layout_instructions": [i.model_dump() for i in instructions], + } + + +# --------------------------------------------------------------------------- +# Pilot: select clips +# --------------------------------------------------------------------------- + + +@mcp.tool() +def select_clips( + source_path: str, + transcript_words: list[dict[str, Any]], + duration_sec: float, + target_count: int = 5, + min_sec: float = 30.0, + max_sec: float = 60.0, +) -> dict[str, Any]: + """Heuristically select top clips from a word-level transcript. + + Scoring is word-density per window. Returns a ``ClipPlan`` with up to + ``target_count`` non-overlapping clips. + """ + + words = [TranscriptWord.model_validate(w) for w in transcript_words] + plan = select_mod.select_clips_heuristic( + source_path, + words, + duration_sec, + target_count=target_count, + min_sec=min_sec, + max_sec=max_sec, + ) + return plan.model_dump() + + +# --------------------------------------------------------------------------- +# Thrusters: plan + render +# --------------------------------------------------------------------------- + + +@mcp.tool() +def plan_layout( + layout: str, + out_w: int = 1080, + out_h: int = 1920, + src_w: int = 1920, + src_h: int = 1080, + zoom: float = 1.0, + person_x_norm: float = 0.5, + chart_x_norm: float = 0.0, + clip_id: str = "preview", +) -> dict[str, Any]: + """Return the ffmpeg filter_complex fragment for one layout. + + This is the pure, deterministic function underpinning the 5 thrusters. + No rendering is performed. Useful for agents that want to preview the + filtergraph or compose it with their own ffmpeg invocation. + """ + + instr = LayoutInstruction( + clip_id=clip_id, + layout=LayoutKind(layout), + zoom=zoom, + person_x_norm=person_x_norm, + chart_x_norm=chart_x_norm, + ) + fp = layouts_mod.plan_layout(instr, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h) + return {"filtergraph": fp.filtergraph, "out_label": fp.out_label} + + +@mcp.tool() +def build_render_cmd(request: dict[str, Any]) -> dict[str, Any]: + """Build (but do NOT run) the ffmpeg command for a render request. + + ``request`` must conform to the ``RenderRequest`` schema. 
This is a + dry-run helper so an agent can review the command before executing it. + """ + + req = RenderRequest.model_validate({**request, "mode": "dry_run"}) + result = compile_mod.render_clip(req) + return result.model_dump() + + +@mcp.tool() +def render_clip(request: dict[str, Any]) -> dict[str, Any]: + """Render a single 9:16 clip with the specified layout. + + ``request`` must conform to ``RenderRequest``. If ``request.mode`` is + ``"dry_run"`` the ffmpeg command is returned without execution. + """ + + req = RenderRequest.model_validate(request) + result: RenderResult = compile_mod.render_clip(req) + return result.model_dump() + + +# --------------------------------------------------------------------------- +# Entrypoint +# --------------------------------------------------------------------------- + + +def main() -> None: + """stdio entrypoint for ``humeo-core`` console-script.""" + + mcp.run() + + +if __name__ == "__main__": + main() diff --git a/humeo-core/tests/__init__.py b/humeo-core/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/humeo-core/tests/test_classify.py b/humeo-core/tests/test_classify.py new file mode 100644 index 0000000000000000000000000000000000000000..3a6eebac4316b28571310ee7af2f218ff8690943 --- /dev/null +++ b/humeo-core/tests/test_classify.py @@ -0,0 +1,39 @@ +import json + +from humeo_core.primitives.classify import ( + classify_scenes_heuristic, + classify_scenes_with_llm, +) +from humeo_core.schemas import LayoutKind, Scene + + +def test_heuristic_no_keyframe_defaults_sit_center(): + scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path=None)] + result = classify_scenes_heuristic(scenes) + assert len(result) == 1 + assert result[0].scene_id == "s0" + assert result[0].layout == LayoutKind.SIT_CENTER + + +def test_llm_classifier_uses_callback_and_validates(): + scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")] + + def fake_vision(image_path: str, prompt: str) -> str: + return json.dumps( + {"layout": "split_chart_person", "confidence": 0.88, "reason": "chart left"} + ) + + result = classify_scenes_with_llm(scenes, fake_vision) + assert result[0].layout == LayoutKind.SPLIT_CHART_PERSON + assert result[0].confidence == 0.88 + + +def test_llm_classifier_parse_error_is_safe(): + scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")] + + def bad_vision(image_path: str, prompt: str) -> str: + return "not json" + + result = classify_scenes_with_llm(scenes, bad_vision) + assert result[0].layout == LayoutKind.SIT_CENTER + assert "parse error" in result[0].reason.lower() diff --git a/humeo-core/tests/test_compile.py b/humeo-core/tests/test_compile.py new file mode 100644 index 0000000000000000000000000000000000000000..03d39184966a484b5bccfc80c92262184cba8b6f --- /dev/null +++ b/humeo-core/tests/test_compile.py @@ -0,0 +1,329 @@ +from pathlib import Path + +from humeo_core.primitives import compile as compile_mod +from humeo_core.primitives.compile import ( + _ensure_windows_fontconfig, + build_ffmpeg_cmd, + plan_title_drawtext, +) +from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RenderRequest, RenderTheme + + +def _req(**overrides): + c = Clip(clip_id="1", topic="t", start_time_sec=10.0, end_time_sec=40.0) + li = LayoutInstruction(clip_id="1", layout=LayoutKind.SIT_CENTER) + data = dict( + source_path="/tmp/src.mp4", + clip=c, + layout=li, + 
output_path="/tmp/out.mp4", + render_theme=RenderTheme.LEGACY, + mode="dry_run", + ) + data.update(overrides) + return RenderRequest(**data) + + +def test_ffmpeg_cmd_has_ss_duration_filtergraph_output(): + cmd = build_ffmpeg_cmd(_req()) + assert "-ss" in cmd + assert "-t" in cmd + assert "-filter_complex" in cmd + # duration = 30.0 + t_idx = cmd.index("-t") + assert float(cmd[t_idx + 1]) == 30.0 + ss_idx = cmd.index("-ss") + assert float(cmd[ss_idx + 1]) == 10.0 + assert cmd[-1] == "/tmp/out.mp4" + + +def test_title_text_injects_drawtext(): + cmd = build_ffmpeg_cmd(_req(title_text="Hello: world's")) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext" in fg + # colon should be escaped + assert "Hello\\:" in fg + assert "worlds" in fg + assert "world's" not in fg + assert "expansion=none" in fg + + +def test_map_vout_and_primary_audio(): + cmd = build_ffmpeg_cmd(_req()) + assert "[vout]" in cmd + assert "0:a:0" in cmd + + +def test_subtitle_style_uses_requested_font_and_margin(): + cmd = build_ffmpeg_cmd( + _req(subtitle_path="/tmp/clip.srt", subtitle_font_size=18, subtitle_margin_v=64) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "subtitles='" in fg + assert "FontSize=18" in fg + assert "MarginV=64" in fg + # Smart word wrap so long captions break into multiple readable lines. + assert "WrapStyle=0" in fg + + +def test_subtitle_original_size_pins_libass_to_output_resolution(): + """Without original_size=W x H, libass uses PlayResY=288 and blows up fonts/margins. + + This is the root cause of the "subtitles floating in the middle of the + frame / blocked" bug the user reported. + """ + cmd = build_ffmpeg_cmd(_req(subtitle_path="/tmp/clip.srt")) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "original_size=1080x1920" in fg + + +def test_subtitles_applied_after_crop_and_title(): + """Order: crop/compose -> drawtext title -> subtitles. + + The pipeline must crop **first**, then draw text on the finished frame. + """ + cmd = build_ffmpeg_cmd( + _req(title_text="Hook", subtitle_path="/tmp/clip.srt") + ) + fg = cmd[cmd.index("-filter_complex") + 1] + crop_pos = fg.index("[0:v]crop=") + drawtext_pos = fg.index("drawtext") + subs_pos = fg.index("subtitles=") + assert crop_pos < drawtext_pos < subs_pos + + +def test_build_is_layout_specific(): + c = Clip(clip_id="1", topic="t", start_time_sec=0, end_time_sec=10) + split_req = _req( + clip=c, + layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON), + ) + cmd = build_ffmpeg_cmd(split_req) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "vstack" in fg + + +def test_title_is_suppressed_on_split_layouts(): + """Split layouts already contain a slide/chart with its own title. + + Overlaying an additional drawtext title just obscures content -- that's + what was happening in the Cathy Wood "chart overlaps subject" report. 
+ """ + for kind in ( + LayoutKind.SPLIT_CHART_PERSON, + LayoutKind.SPLIT_TWO_PERSONS, + LayoutKind.SPLIT_TWO_CHARTS, + ): + cmd = build_ffmpeg_cmd( + _req( + layout=LayoutInstruction(clip_id="1", layout=kind), + title_text="This should not render", + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext" not in fg, f"title leaked into split layout {kind}" + + +def test_title_is_drawn_on_single_subject_layouts(): + """Titles are still rendered on ZOOM_CALL_CENTER and SIT_CENTER.""" + for kind in (LayoutKind.ZOOM_CALL_CENTER, LayoutKind.SIT_CENTER): + cmd = build_ffmpeg_cmd( + _req( + layout=LayoutInstruction(clip_id="1", layout=kind), + title_text="Hook title", + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext=text='Hook title'" in fg + + +# --------------------------------------------------------------------------- +# Title wrapping / auto-shrink (P2: fixes the "Prediction Markets vs +# Derivatives" clipped-title bug reported against the Cathy Wood run). +# --------------------------------------------------------------------------- + + +def test_plan_title_short_stays_single_line_at_72px(): + """Backward compat: short titles keep the pre-P2 single-drawtext form. + + Byte-identical output for short titles is important because it keeps + previously-calibrated visual output unchanged and avoids needless cache + churn on existing renders. + """ + frag = plan_title_drawtext("Hook title", out_w=1080) + assert frag is not None + assert frag.count("drawtext=") == 1 + assert "fontsize=72" in frag + assert "y=80" in frag + assert "drawtext=text='Hook title'" in frag + + +def test_plan_title_long_wraps_to_two_lines_below_72px(): + """Long titles wrap at the best word boundary and shrink to fit. + + "Prediction Markets vs Derivatives" is 33 chars β€” it overflows a 1080px + canvas at 72px. It must wrap into "Prediction Markets" / "vs Derivatives" + (balanced halves) at a smaller font. + """ + frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080) + assert frag is not None + assert frag.count("drawtext=") == 2, "long titles must split into two drawtext calls" + assert "drawtext=text='Prediction Markets'" in frag + assert "drawtext=text='vs Derivatives'" in frag + assert "fontsize=72" not in frag, "two-line layout must use a smaller font" + # Both lines share the same shrunken fontsize. + import re + + sizes = re.findall(r"fontsize=(\d+)", frag) + assert len(sizes) == 2 and sizes[0] == sizes[1] + assert 44 <= int(sizes[0]) <= 64 + + +def test_plan_title_empty_returns_none(): + assert plan_title_drawtext("", out_w=1080) is None + assert plan_title_drawtext(" ", out_w=1080) is None + + +def test_plan_title_single_huge_word_shrinks_instead_of_wrapping(): + """A single word cannot be word-wrapped; it must shrink to fit.""" + frag = plan_title_drawtext("Supercalifragilisticexpialidocious", out_w=1080) + assert frag is not None + assert frag.count("drawtext=") == 1 # no wrap possible + assert "fontsize=72" not in frag + + +def test_title_uses_arial_font_not_default_serif(): + """Titles must render in Arial (matching the ASS subtitle font), not the + platform default which is Times New Roman on Windows. + + Regression test for the "ugly serif title on the finance short" bug. + Both the single-line and the two-line drawtext variants must carry a + ``font=Arial`` directive so fontconfig resolves to the same family as + the subtitle ``Fontname=Arial``. 
+ """ + short = plan_title_drawtext("Hook title", out_w=1080) + assert short is not None + assert "font=Arial" in short or "fontfile='" in short + + long_frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080) + assert long_frag is not None + if "font=Arial" in long_frag: + assert long_frag.count("font=Arial") == 2 + else: + assert long_frag.count("fontfile='") == 2 + + +def test_title_font_matches_subtitle_font_family(): + """Title overlay and subtitle captions must read as one typographic + family. Both routes through ``build_ffmpeg_cmd`` should carry the same + Arial reference. + """ + cmd = build_ffmpeg_cmd( + _req( + title_text="Hook title", + subtitle_path="/tmp/clip.ass", + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "font=Arial" in fg or "fontfile='" in fg + assert "Fontname=Arial" in fg + + +def test_long_title_pipes_through_build_ffmpeg_cmd(): + """End-to-end: a long title routed through the full command builder + produces a valid filtergraph with two drawtext filters and no syntax + errors ffmpeg would choke on. + """ + cmd = build_ffmpeg_cmd(_req(title_text="Prediction Markets vs Derivatives")) + fg = cmd[cmd.index("-filter_complex") + 1] + assert fg.count("drawtext=") == 2 + assert "[v_prepad]drawtext=text='Prediction Markets'" in fg + assert "[vout]" in fg + assert ";;" not in fg # no empty chain links + assert ",," not in fg # no stray commas + + +def test_reference_theme_draws_title_and_caption_bars(): + cmd = build_ffmpeg_cmd( + _req( + title_text="A Multi-Trillion Dollar Opportunity", + subtitle_path="/tmp/clip.ass", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawbox=x=28:y=32" in fg + assert "drawbox=x=0:y=" in fg + assert "Fontname=Source Sans 3" in fg + assert "Alignment=2" in fg + assert "Outline=2" in fg + + +def test_reference_theme_wraps_long_titles_inside_the_title_bar(): + cmd = build_ffmpeg_cmd( + _req( + title_text="12% Youth Unemployment? Start a Business With AI", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert fg.count("drawtext=") >= 2 + assert "..." 
not in fg + + +def test_reference_theme_draws_frosted_caption_ribbon_when_subtitles_exist(): + cmd = build_ffmpeg_cmd( + _req( + title_text="Hook title", + subtitle_path="/tmp/clip.ass", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawbox=x=0:y=" in fg + + +def test_reference_theme_allows_titles_on_split_layouts(): + cmd = build_ffmpeg_cmd( + _req( + layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON), + title_text="Hook title", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext=" in fg + + +def test_native_highlight_theme_skips_title_card_and_keeps_ass_styles(): + cmd = build_ffmpeg_cmd( + _req( + title_text="This title should not render", + subtitle_path="/tmp/clip.ass", + render_theme=RenderTheme.NATIVE_HIGHLIGHT, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext" not in fg + assert "subtitles='" in fg + assert "force_style='" not in fg + + +def test_ensure_windows_fontconfig_is_noop_off_windows(): + env = _ensure_windows_fontconfig() + assert isinstance(env, dict) + + +def test_ensure_windows_fontconfig_creates_config(monkeypatch, tmp_path): + monkeypatch.setattr(compile_mod.os, "name", "nt", raising=False) + monkeypatch.delenv("FONTCONFIG_FILE", raising=False) + monkeypatch.setenv("LOCALAPPDATA", str(tmp_path / "localappdata")) + monkeypatch.setenv("WINDIR", str(tmp_path / "winroot")) + + env = _ensure_windows_fontconfig() + + cfg_file = Path(env["FONTCONFIG_FILE"]) + assert cfg_file.is_file() + text = cfg_file.read_text(encoding="utf-8") + assert (tmp_path / "winroot" / "Fonts").as_posix() in text + assert "fontconfig-cache" in text diff --git a/humeo-core/tests/test_face_detect.py b/humeo-core/tests/test_face_detect.py new file mode 100644 index 0000000000000000000000000000000000000000..d1850da11d71efaa166e9e39c67b9f2dd5df2e1d --- /dev/null +++ b/humeo-core/tests/test_face_detect.py @@ -0,0 +1,73 @@ +"""Tests for the MediaPipe-backed face detection primitive. + +Uses a stub ``face_fn`` so MediaPipe itself is not required to run the +tests β€” the primitive contract is what we care about: *given* a face +bbox, does the primitive produce the right ``SceneRegions``. 
+""" + +from humeo_core.primitives.face_detect import detect_face_regions +from humeo_core.schemas import BoundingBox, Scene + + +def _scene(i: int, kf: str | None = "/tmp/k.jpg") -> Scene: + return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf) + + +def test_no_keyframe_returns_raw_reason(): + out = detect_face_regions([_scene(0, kf=None)], face_fn=lambda _p: None) + assert out[0].person_bbox is None + assert "no keyframe" in out[0].raw_reason.lower() + + +def test_no_face_detected_returns_raw_reason(): + out = detect_face_regions([_scene(0)], face_fn=lambda _p: None) + assert out[0].person_bbox is None + assert "no face" in out[0].raw_reason.lower() + + +def test_face_centered_produces_person_only(): + centered = BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.7, label="face", confidence=0.9) + out = detect_face_regions([_scene(0)], face_fn=lambda _p: centered) + r = out[0] + assert r.person_bbox is not None + assert r.person_bbox.center_x == centered.center_x + assert r.chart_bbox is None + + +def test_face_pushed_right_synthesises_chart_bbox(): + # face center x ~ 0.86 -> above default threshold 0.65 -> chart bbox inferred + face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9, label="face", confidence=0.95) + out = detect_face_regions([_scene(0)], face_fn=lambda _p: face) + r = out[0] + assert r.person_bbox is not None + assert r.chart_bbox is not None + assert r.chart_bbox.x1 == 0.0 + assert r.chart_bbox.x2 <= 0.75 # can't overlap the face + assert r.chart_bbox.x2 <= 0.65 # bounded by threshold too + assert "synthetic chart" in r.raw_reason + + +def test_face_detector_exception_is_isolated_per_scene(): + scenes = [_scene(0), _scene(1)] + calls: list[str] = [] + + def flaky_fn(path: str) -> BoundingBox | None: + calls.append(path) + if len(calls) == 1: + raise RuntimeError("boom") + return BoundingBox(x1=0.3, y1=0.2, x2=0.7, y2=0.8) + + out = detect_face_regions(scenes, face_fn=flaky_fn) + assert out[0].person_bbox is None + assert "error" in out[0].raw_reason.lower() + assert out[1].person_bbox is not None + + +def test_custom_threshold_prevents_false_chart_split(): + face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9) + out = detect_face_regions( + [_scene(0)], + face_fn=lambda _p: face, + chart_split_threshold=0.95, + ) + assert out[0].chart_bbox is None diff --git a/humeo-core/tests/test_layout_bbox.py b/humeo-core/tests/test_layout_bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..426b35bc291ff6b9487aa5d9507a80ef24f97839 --- /dev/null +++ b/humeo-core/tests/test_layout_bbox.py @@ -0,0 +1,17 @@ +"""Split layout uses optional normalized bbox regions (Gemini vision).""" + +from humeo_core.primitives.layouts import plan_layout +from humeo_core.schemas import BoundingBox, FocusStackOrder, LayoutInstruction, LayoutKind + + +def test_split_with_bbox_regions_not_fixed_thirds(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + focus_stack_order=FocusStackOrder.CHART_THEN_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.64, y2=1.0), + split_person_region=BoundingBox(x1=0.64, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "crop=1228:1080:0:0" in fg or "crop=1224:1080:0:0" in fg + assert "vstack=inputs=2" in fg diff --git a/humeo-core/tests/test_layouts.py b/humeo-core/tests/test_layouts.py new file mode 100644 index 0000000000000000000000000000000000000000..b11dec0e5b0736cadadfbefb9fcdd54deb18fb87 --- 
/dev/null +++ b/humeo-core/tests/test_layouts.py @@ -0,0 +1,312 @@ +import re + +from humeo_core.primitives.layouts import ( + _center_crop_to_9x16, + _crop_box, + plan_layout, +) +from humeo_core.schemas import ( + BoundingBox, + FocusStackOrder, + LayoutInstruction, + LayoutKind, + TimedCenterPoint, +) + + +def test_crop_box_aspect_exact(): + cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 1.0, 0.5, 0.5) + # 9:16 inside 1920x1080 -> height-limited: ch=1080, cw ~= 608 + assert ch == 1080 + assert abs(cw / ch - 9 / 16) < 0.01 + assert 0 <= x <= 1920 - cw + assert y == 0 + + +def test_crop_box_clamps_inside_frame(): + cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 2.0, 0.99, 0.5) + assert x + cw <= 1920 + assert y + ch <= 1080 + + +def test_crop_box_zoom_tightens(): + cw_small, ch_small, _, _ = _center_crop_to_9x16(1920, 1080, 2.0, 0.5) + cw_large, ch_large, _, _ = _center_crop_to_9x16(1920, 1080, 1.0, 0.5) + assert cw_small < cw_large + assert ch_small < ch_large + + +def test_even_dimensions(): + cw, ch, x, y = _crop_box(1921, 1081, 9 / 16, 1.3, 0.4, 0.5) + assert cw % 2 == 0 and ch % 2 == 0 + assert x % 2 == 0 and y % 2 == 0 + + +def _contains(s: str, *subs: str) -> bool: + return all(sub in s for sub in subs) + + +def test_zoom_call_layout_filtergraph_shape(): + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.5, person_x_norm=0.5 + ) + plan = plan_layout(instr, out_w=1080, out_h=1920) + fg = plan.filtergraph + assert _contains(fg, "[0:v]crop=", "scale=1080:1920", "[vout]") + + +def test_sit_center_layout_filtergraph_shape(): + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER) + plan = plan_layout(instr, out_w=1080, out_h=1920) + assert "[vout]" in plan.filtergraph + assert plan.out_label == "vout" + + +def test_sit_center_tracking_uses_dynamic_crop_expression(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=0.0, x_norm=0.2), + TimedCenterPoint(t_sec=10.0, x_norm=0.8), + ], + ) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + assert "setpts=PTS-STARTPTS" in fg + assert "[vsrc]crop=" in fg + assert "if(lt(t\\,4.850)" in fg + assert "*(t-4.850)/(0.300)" in fg + + +def test_sit_center_tracking_with_zoom_uses_dynamic_crop_window_expressions(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.28), + TimedCenterPoint(t_sec=10.0, x_norm=0.8, zoom=1.0), + ], + ) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + assert "setpts=PTS-STARTPTS" in fg + assert "[vsrc]crop=" in fg + assert "out_w/2" in fg + assert "out_h/2" in fg + assert "floor((min(" in fg + + +def test_split_layout_contains_vstack(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + person_x_norm=0.83, + chart_x_norm=0.0, + ) + plan = plan_layout(instr, out_w=1080, out_h=1920) + fg = plan.filtergraph + assert _contains(fg, "split=2", "vstack=inputs=2", "[vout]") + assert "[top]" in fg and "[bot]" in fg + + +def test_split_layout_person_crop_is_right_third(): + """Chart uses left 2/3; person uses right 1/3 (non-overlapping).""" + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + # Right third: x=1280, w=640 for 1920-wide source. 
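+    # (For a 1920 px-wide source, the chart band keeps the left 2/3 = 1280 px,
+    # leaving a 640 px person strip that starts at x=1280.)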
+ assert "crop=640:1080:1280:0" in fg + + +def test_split_layout_can_swap_stack_order(): + """PERSON_THEN_CHART puts the right-strip (person) crop into the top band.""" + chart_first = plan_layout( + LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + focus_stack_order=FocusStackOrder.CHART_THEN_PERSON, + ), + out_w=1080, + out_h=1920, + ).filtergraph + person_first = plan_layout( + LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + focus_stack_order=FocusStackOrder.PERSON_THEN_CHART, + ), + out_w=1080, + out_h=1920, + ).filtergraph + + def top_crop(fg: str) -> str: + m = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg) + assert m is not None, fg + return m.group(1) + + # chart strip = left 1280px of source (2/3 split seam). + assert top_crop(chart_first) == "1280:1080:0:0" + # person strip = right 640px -> x=1280. + assert top_crop(person_first) == "640:1080:1280:0" + assert "vstack=inputs=2" in chart_first + assert "vstack=inputs=2" in person_first + + +def test_split_layout_person_clamped(): + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, person_x_norm=1.0 + ) + plan = plan_layout(instr, out_w=1080, out_h=1920) + assert "crop=" in plan.filtergraph # no OOB math crash + + +def test_plan_layout_dispatch_covers_all_kinds(): + for k in LayoutKind: + instr = LayoutInstruction(clip_id="c", layout=k) + plan = plan_layout(instr) + assert plan.out_label == "vout" + assert plan.filtergraph.endswith("[vout]") + + +def test_default_split_is_even_50_50_bands(): + """The user-requested symmetric look: top and bottom bands are equal.""" + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + # Each strip should scale to the same height (half of 1920). + heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg) + assert len(heights) == 2 + assert heights[0] == heights[1] == "960", f"expected even 960/960, got {heights}" + + +def test_top_band_ratio_honored_for_uneven_splits(): + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, top_band_ratio=0.6 + ) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg) + assert heights == ["1152", "768"], heights + + +def test_split_seam_is_midpoint_between_bboxes(): + """When both bboxes are provided, strips partition the source -- no overlap, no gap.""" + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.50, y2=1.0), + split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + # chart.x2 = 960px, person.x1 = 1056px -> midpoint = 1008 -> even -> 1008. + # Chart strip: x=0, cw=1008. Person strip: x=1008, cw=912. 
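+    # (912 = 1920 - 1008: the two strips partition the full source width with no overlap and no gap.)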
+ top_crop = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg).group(1) + bot_crop = re.search(r"\[src2\]crop=(\d+:\d+:\d+:\d+)", fg).group(1) + assert top_crop == "1008:1080:0:0" + assert bot_crop == "912:1080:1008:0" + + +def test_split_uses_bbox_y_for_tight_band_fill(): + """Chart bboxes anchor the crop, with a little extra height for edge safety.""" + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.1, x2=0.5, y2=0.7), + split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + # Chart bbox y: 0.1..0.7 -> y=108, ch=648, then a modest 12% pad per side. + assert "crop=1008:804:0:30" in fg + + +def test_split_chart_person_adds_vertical_pad_to_reduce_chart_side_crop(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.02, y1=0.03, x2=0.58, y2=0.7), + split_person_region=BoundingBox(x1=0.585, y1=0.0, x2=0.995, y2=0.62), + top_band_ratio=0.436, + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=640, src_h=360).filtergraph + assert "[src1]crop=372:280:0:0" in fg + + +def test_split_minimum_strip_width_enforced(): + """If chart/person bboxes are pathological (seam at edge), don't starve a strip.""" + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.05, y2=1.0), + split_person_region=BoundingBox(x1=0.05, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + widths = [int(m) for m in re.findall(r"crop=(\d+):\d+:\d+:\d+", fg)] + # Min strip = 20% of 1920 = 384 px. Neither strip should be narrower. + assert all(w >= 384 for w in widths), widths + + +def test_split_two_persons_stacks_two_crops(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_TWO_PERSONS, + split_person_region=BoundingBox(x1=0.0, y1=0.05, x2=0.5, y2=0.95), + split_second_person_region=BoundingBox(x1=0.5, y1=0.05, x2=1.0, y2=0.95), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "split=2" in fg and "vstack=inputs=2" in fg + # Seam at x=960. bbox y: 0.05..0.95 -> y=54, ch=972 (even). 
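+    # (y = 0.05 * 1080 = 54 and ch = 0.90 * 1080 = 972; both values are already even, so no rounding adjustment is needed.)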
+ assert "[src1]crop=960:972:0:54" in fg + assert "[src2]crop=960:972:960:54" in fg + + +def test_split_two_charts_stacks_two_crops(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_TWO_CHARTS, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.5, y2=1.0), + split_second_chart_region=BoundingBox(x1=0.5, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "split=2" in fg and "vstack=inputs=2" in fg + assert "[src1]crop=960:1080:0:0" in fg + assert "[src2]crop=960:1080:960:0" in fg + + +def test_split_two_persons_without_bboxes_defaults_to_centered(): + """No bboxes -> centered 50/50 seam, full source height fallback.""" + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.SPLIT_TWO_PERSONS + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "[src1]crop=960:1080:0:0" in fg + assert "[src2]crop=960:1080:960:0" in fg + + +def test_split_bands_use_cover_scale_plus_center_crop(): + """Each band is painted edge-to-edge -- no letterbox bars.""" + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert fg.count("force_original_aspect_ratio=increase") == 2 + assert fg.count("setsar=1") == 2 + + +def test_zoom_tighter_means_smaller_crop_window(): + from humeo_core.primitives.layouts import plan_zoom_call_center + + wide = plan_zoom_call_center( + LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.0), + out_w=1080, + out_h=1920, + ) + tight = plan_zoom_call_center( + LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=2.0), + out_w=1080, + out_h=1920, + ) + # Parse crop=CW:CH:X:Y out of each filtergraph. 
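+    # (Expectation: the zoom=2.0 crop window is strictly smaller than the zoom=1.0 window in both width and height.)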
+ import re + + def crop(fg: str) -> tuple[int, int]: + m = re.search(r"crop=(\d+):(\d+):", fg) + assert m is not None + return int(m.group(1)), int(m.group(2)) + + wcw, wch = crop(wide.filtergraph) + tcw, tch = crop(tight.filtergraph) + assert tcw < wcw and tch < wch diff --git a/humeo-core/tests/test_schemas.py b/humeo-core/tests/test_schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..c49470eec90402a1196d29e4eef6896f19bcb12b --- /dev/null +++ b/humeo-core/tests/test_schemas.py @@ -0,0 +1,267 @@ +import pytest +from pydantic import ValidationError + +from humeo_core.schemas import ( + ApprovalResult, + Clip, + ClipPlan, + ClipSubtitleWords, + FocusStackOrder, + LayoutInstruction, + LayoutKind, + RatingFeedback, + RenderRequest, + Scene, + SessionState, + TimedCenterPoint, + TranscriptWord, +) + + +def test_scene_requires_end_after_start(): + Scene(scene_id="s1", start_time=0.0, end_time=1.0) + with pytest.raises(ValueError): + Scene(scene_id="s1", start_time=5.0, end_time=5.0) + with pytest.raises(ValueError): + Scene(scene_id="s1", start_time=5.0, end_time=1.0) + + +def test_layout_instruction_defaults_and_bounds(): + li = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER) + assert li.zoom == 1.0 + assert 0 <= li.person_x_norm <= 1 + assert li.person_tracking == [] + assert li.focus_stack_order == FocusStackOrder.CHART_THEN_PERSON + with pytest.raises(ValueError): + LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, zoom=0.0) + with pytest.raises(ValueError): + LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, person_x_norm=2.0) + + +def test_layout_instruction_accepts_sorted_tracking_points(): + li = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.25), + TimedCenterPoint(t_sec=5.0, x_norm=0.8, zoom=1.0), + ], + ) + assert [point.t_sec for point in li.person_tracking] == [0.0, 5.0] + assert li.person_tracking[0].zoom == pytest.approx(1.25) + + +def test_layout_instruction_rejects_unsorted_tracking_points(): + with pytest.raises(ValueError, match="person_tracking times"): + LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=5.0, x_norm=0.8), + TimedCenterPoint(t_sec=1.0, x_norm=0.2), + ], + ) + + +def test_clip_duration(): + c = Clip( + clip_id="1", + topic="t", + start_time_sec=10.0, + end_time_sec=42.5, + ) + assert c.duration_sec == pytest.approx(32.5) + + +def test_clip_hook_relative_to_clip_in_point(): + c = Clip( + clip_id="1", + topic="t", + start_time_sec=100.0, + end_time_sec=130.0, + hook_start_sec=0.0, + hook_end_sec=3.0, + ) + assert c.hook_end_sec == 3.0 + + +def test_clip_hook_must_be_within_duration(): + with pytest.raises(ValueError, match="hook window"): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=10.0, + hook_start_sec=0.0, + hook_end_sec=15.0, + ) + + +def test_clip_hook_both_or_neither(): + with pytest.raises(ValueError, match="hook_start_sec and hook_end_sec"): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=10.0, + hook_start_sec=1.0, + hook_end_sec=None, + ) + + +def test_clip_trim_cannot_exceed_duration(): + with pytest.raises(ValueError, match="trim"): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=10.0, + trim_start_sec=6.0, + trim_end_sec=6.0, + ) + + +def test_clip_plan_roundtrip(): + plan = ClipPlan( + source_path="/tmp/x.mp4", + clips=[ + Clip(clip_id="1", topic="t", 
start_time_sec=0.0, end_time_sec=30.0) + ], + ) + d = plan.model_dump() + assert ClipPlan.model_validate(d) == plan + + +def test_clip_roundtrip_with_extended_fields(): + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"message_wow": 0.9, "hook_emotion": 0.7}, + origin="both", + visual_notes="Speaker leans in.", + reasoning="Strong explanation and hook.", + ) + + dumped = clip.model_dump() + + assert dumped["score_breakdown"] == {"message_wow": 0.9, "hook_emotion": 0.7} + assert dumped["origin"] == "both" + assert dumped["visual_notes"] == "Speaker leans in." + assert dumped["reasoning"] == "Strong explanation and hook." + assert Clip.model_validate(dumped) == clip + + +def test_clip_defaults_validate_and_do_not_serialize_new_fields(): + clip = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0) + + assert clip.origin == "text" + assert clip.score_breakdown is None + assert clip.visual_notes is None + assert clip.reasoning is None + + dumped = clip.model_dump() + assert "score_breakdown" not in dumped + assert "origin" not in dumped + assert "visual_notes" not in dumped + assert "reasoning" not in dumped + assert Clip.model_validate(dumped) == clip + + +def test_clip_score_breakdown_validation(): + with pytest.raises(ValidationError): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"hook": -0.1}, + ) + + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"hook": 1.2}, + ) + assert clip.score_breakdown == {"hook": 1.0} + + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={}, + ) + assert clip.score_breakdown == {} + + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"hook": 0.5}, + ) + assert clip.score_breakdown == {"hook": 0.5} + + +def test_clip_subtitle_words_relative_times(): + w = ClipSubtitleWords( + words=[TranscriptWord(word="hi", start_time=0.0, end_time=0.2)] + ) + assert w.words[0].start_time == 0.0 + + +def test_render_request_modes(): + c = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0) + li = LayoutInstruction(clip_id="1", layout=LayoutKind.ZOOM_CALL_CENTER) + req = RenderRequest( + source_path="/tmp/x.mp4", + clip=c, + layout=li, + output_path="/tmp/out.mp4", + ) + assert req.mode == "normal" + req2 = RenderRequest(**{**req.model_dump(), "mode": "dry_run"}) + assert req2.mode == "dry_run" + + +def test_approval_result_roundtrip(): + result = ApprovalResult( + action="proceed", + selected_ids=["001", "003"], + steering_note="prefer emotional moments", + ) + assert ApprovalResult.model_validate(result.model_dump()) == result + + +def test_approval_result_rejects_invalid_action(): + with pytest.raises(ValidationError): + ApprovalResult(action="invalid") + + +def test_rating_feedback_roundtrip(): + feedback = RatingFeedback( + rating=2, + issues=["wrong_moments", "other"], + free_text="needs more context", + ) + assert RatingFeedback.model_validate(feedback.model_dump()) == feedback + + +def test_rating_feedback_rejects_invalid_rating(): + with pytest.raises(ValidationError): + RatingFeedback(rating=4) + + +def test_session_state_roundtrip(): + state = SessionState( + source_key="youtube:PdVv_vLkUgk", + iteration=3, + steering_notes=["be punchier"], + last_rating=RatingFeedback(rating=3), + last_selected_ids=["001", "002"], + ) + assert SessionState.model_validate(state.model_dump()) 
== state diff --git a/humeo-core/tests/test_select_clips.py b/humeo-core/tests/test_select_clips.py new file mode 100644 index 0000000000000000000000000000000000000000..bfe86ea77c7835323e902ec22f6f95a03f613a80 --- /dev/null +++ b/humeo-core/tests/test_select_clips.py @@ -0,0 +1,49 @@ +from humeo_core.primitives.select_clips import select_clips_heuristic +from humeo_core.schemas import TranscriptWord + + +def _words(start: float, end: float, n: int) -> list[TranscriptWord]: + step = (end - start) / max(1, n) + return [ + TranscriptWord(word=f"w{i}", start_time=start + i * step, end_time=start + (i + 1) * step) + for i in range(n) + ] + + +def test_no_transcript_returns_single_clip(): + plan = select_clips_heuristic("/tmp/x.mp4", [], duration_sec=600.0) + assert len(plan.clips) == 1 + + +def test_prefers_dense_windows(): + # dense between 30-90, sparse elsewhere + dense = _words(30.0, 90.0, 240) # 4 words/sec + sparse_before = _words(0.0, 30.0, 6) + sparse_after = _words(90.0, 600.0, 30) + words = sparse_before + dense + sparse_after + plan = select_clips_heuristic( + "/tmp/x.mp4", words, duration_sec=600.0, target_count=1, min_sec=30, max_sec=60 + ) + assert len(plan.clips) == 1 + c = plan.clips[0] + assert 30 <= c.start_time_sec <= 90 + assert c.end_time_sec <= 120 + + +def test_no_overlap_when_multiple_picked(): + dense_a = _words(30.0, 90.0, 240) + dense_b = _words(200.0, 260.0, 240) + words = dense_a + dense_b + plan = select_clips_heuristic( + "/tmp/x.mp4", + words, + duration_sec=400.0, + target_count=3, + min_sec=30, + max_sec=60, + ) + # Should pick both dense regions without overlap. + assert len(plan.clips) >= 2 + starts_ends = sorted((c.start_time_sec, c.end_time_sec) for c in plan.clips) + for (s1, e1), (s2, e2) in zip(starts_ends, starts_ends[1:]): + assert e1 <= s2 diff --git a/humeo-core/tests/test_server_tools.py b/humeo-core/tests/test_server_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..0ec239d7b798f94abbb8e06b815325efbd40424d --- /dev/null +++ b/humeo-core/tests/test_server_tools.py @@ -0,0 +1,93 @@ +"""Exercise the MCP server tools as plain Python callables. + +FastMCP tools are registered on the server instance, but the underlying +functions are ordinary Python functions decorated with ``@mcp.tool()``. +We import the module and invoke those functions directly to verify the +end-to-end wiring (schemas validated, dispatch correct, JSON-serializable). 
+""" + +import humeo_core.server as srv +from humeo_core.schemas import LayoutKind + + +def test_list_layouts_lists_all_three(): + result = srv.list_layouts() + kinds = {layout["kind"] for layout in result["layouts"]} + assert kinds == {k.value for k in LayoutKind} + + +def test_plan_layout_tool_returns_filtergraph(): + for k in LayoutKind: + out = srv.plan_layout(layout=k.value) + assert out["out_label"] == "vout" + assert "[vout]" in out["filtergraph"] + + +def test_build_render_cmd_dry_run(): + req = { + "source_path": "/tmp/src.mp4", + "clip": { + "clip_id": "1", + "topic": "t", + "start_time_sec": 0.0, + "end_time_sec": 30.0, + }, + "layout": {"clip_id": "1", "layout": LayoutKind.SIT_CENTER.value}, + "output_path": "/tmp/out.mp4", + } + out = srv.build_render_cmd(request=req) + assert out["success"] is True + assert out["output_path"] == "/tmp/out.mp4" + assert any("-filter_complex" == part for part in out["ffmpeg_cmd"]) + + +def test_select_clips_tool_happy_path(): + words = [ + {"word": f"w{i}", "start_time": float(i), "end_time": float(i) + 0.5} + for i in range(120) + ] + plan = srv.select_clips( + source_path="/tmp/x.mp4", + transcript_words=words, + duration_sec=120.0, + target_count=2, + min_sec=30.0, + max_sec=60.0, + ) + assert plan["source_path"] == "/tmp/x.mp4" + assert 1 <= len(plan["clips"]) <= 2 + + +def test_classify_scenes_tool_no_keyframes(): + scenes = [{"scene_id": "s0", "start_time": 0.0, "end_time": 5.0}] + out = srv.classify_scenes(scenes=scenes) + assert out["classifications"][0]["scene_id"] == "s0" + assert out["classifications"][0]["layout"] in {k.value for k in LayoutKind} + + +def test_detect_scene_regions_returns_jobs_and_prompt(): + scenes = [ + {"scene_id": "s0", "start_time": 0.0, "end_time": 5.0, "keyframe_path": "/tmp/k0.jpg"}, + {"scene_id": "s1", "start_time": 5.0, "end_time": 10.0, "keyframe_path": "/tmp/k1.jpg"}, + ] + out = srv.detect_scene_regions(scenes=scenes) + assert "STRICT JSON" in out["prompt"] + assert len(out["jobs"]) == 2 + assert out["jobs"][0]["scene_id"] == "s0" + assert out["jobs"][0]["keyframe_path"] == "/tmp/k0.jpg" + + +def test_classify_scenes_with_vision_derives_instructions(): + regions = [ + { + "scene_id": "s0", + "chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 0.66, "y2": 1.0}, + "person_bbox": {"x1": 0.72, "y1": 0.1, "x2": 0.99, "y2": 0.95}, + "ocr_text": "CPI YoY", + } + ] + out = srv.classify_scenes_with_vision(regions=regions) + assert out["classifications"][0]["layout"] == LayoutKind.SPLIT_CHART_PERSON.value + instr = out["layout_instructions"][0] + assert instr["chart_x_norm"] == 0.0 + assert 0.8 < instr["person_x_norm"] < 0.9 diff --git a/humeo-core/tests/test_vision.py b/humeo-core/tests/test_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..d370e5f455fac525c6f10afc04945e506dfcfff9 --- /dev/null +++ b/humeo-core/tests/test_vision.py @@ -0,0 +1,228 @@ +"""Tests for the scene-change + vision-LLM + OCR bbox primitive. + +Covers: +* happy path: well-formed JSON -> populated ``SceneRegions``. +* bad JSON: degrade to empty regions + raw_reason, never raise. +* bad bbox: one malformed bbox does not take down the whole scene record. +* classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT. +* layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come + from the bboxes when present, defaults when not. 
+""" + +import json + +import pytest + +from humeo_core.primitives.vision import ( + _CHART_WIDTH_SPLIT_THRESHOLD, + classify_from_regions, + classify_scenes_with_vision_llm, + detect_regions_with_llm, + layout_instruction_from_regions, +) +from humeo_core.schemas import ( + BoundingBox, + LayoutKind, + Scene, + SceneClassification, + SceneRegions, +) + + +# --------------------------------------------------------------------------- +# Schema +# --------------------------------------------------------------------------- + + +def test_bounding_box_requires_x2_gt_x1(): + BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2) + with pytest.raises(ValueError): + BoundingBox(x1=0.2, y1=0.1, x2=0.1, y2=0.2) + with pytest.raises(ValueError): + BoundingBox(x1=0.1, y1=0.2, x2=0.2, y2=0.1) + + +def test_bounding_box_center_and_width(): + b = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9) + assert b.center_x == pytest.approx(0.4) + assert b.center_y == pytest.approx(0.65) + assert b.width == pytest.approx(0.4) + + +# --------------------------------------------------------------------------- +# detect_regions_with_llm +# --------------------------------------------------------------------------- + + +def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene: + return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf) + + +def test_detect_regions_happy_path(): + scenes = [_scene(0)] + + def vision_fn(_img: str, _prompt: str) -> str: + return json.dumps( + { + "person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9}, + "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8}, + "ocr_text": "Inflation YoY", + "reason": "explainer layout", + } + ) + + out = detect_regions_with_llm(scenes, vision_fn) + assert len(out) == 1 + r = out[0] + assert r.scene_id == "s0" + assert r.person_bbox and r.person_bbox.center_x > 0.8 + assert r.chart_bbox and r.chart_bbox.width > 0.6 + assert "Inflation" in r.ocr_text + + +def test_detect_regions_bad_json_is_safe(): + scenes = [_scene(0)] + + def vision_fn(*_a) -> str: + return "not json" + + out = detect_regions_with_llm(scenes, vision_fn) + assert out[0].person_bbox is None + assert out[0].chart_bbox is None + assert "parse error" in out[0].raw_reason.lower() + + +def test_detect_regions_missing_keyframe_is_safe(): + scenes = [_scene(0, kf=None)] + + def vision_fn(*_a) -> str: # pragma: no cover - should not be called + raise AssertionError("vision_fn must not be called without a keyframe") + + out = detect_regions_with_llm(scenes, vision_fn) + assert out[0].person_bbox is None + assert "no keyframe" in out[0].raw_reason.lower() + + +def test_detect_regions_bad_bbox_degrades_gracefully(): + scenes = [_scene(0)] + + def vision_fn(*_a) -> str: + return json.dumps( + { + "person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9}, + "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95}, + "ocr_text": "", + "reason": "person bbox inverted", + } + ) + + out = detect_regions_with_llm(scenes, vision_fn) + assert out[0].person_bbox is None + assert out[0].chart_bbox is not None + + +# --------------------------------------------------------------------------- +# classify_from_regions +# --------------------------------------------------------------------------- + + +def test_classify_wide_chart_is_split(): + r = SceneRegions( + scene_id="s0", + chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0), + person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95), + ) + c = classify_from_regions(r) + 
assert c.layout == LayoutKind.SPLIT_CHART_PERSON + assert c.confidence > 0.5 + + +def test_classify_narrow_chart_not_split(): + r = SceneRegions( + scene_id="s0", + chart_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4), + person_bbox=BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95), + ) + c = classify_from_regions(r) + # chart width (0.1) is below the split threshold -> not split + assert c.layout != LayoutKind.SPLIT_CHART_PERSON + + +def test_classify_wide_person_is_zoom_call(): + r = SceneRegions( + scene_id="s0", + person_bbox=BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98), + ) + c = classify_from_regions(r) + assert c.layout == LayoutKind.ZOOM_CALL_CENTER + + +def test_classify_small_person_is_sit_center(): + r = SceneRegions( + scene_id="s0", + person_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8), + ) + c = classify_from_regions(r) + assert c.layout == LayoutKind.SIT_CENTER + + +def test_classify_nothing_detected_defaults_sit_center_low_conf(): + r = SceneRegions(scene_id="s0", raw_reason="model returned null") + c = classify_from_regions(r) + assert c.layout == LayoutKind.SIT_CENTER + assert c.confidence <= 0.5 + + +def test_chart_threshold_is_exported(): + # guard against the tuning constant silently being removed + assert 0.0 < _CHART_WIDTH_SPLIT_THRESHOLD < 1.0 + + +# --------------------------------------------------------------------------- +# layout_instruction_from_regions +# --------------------------------------------------------------------------- + + +def test_layout_instruction_from_regions_split(): + r = SceneRegions( + scene_id="s0", + chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0), + person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95), + ) + c = classify_from_regions(r) + instr = layout_instruction_from_regions(r, c) + assert instr.layout == LayoutKind.SPLIT_CHART_PERSON + # person_x_norm = center of (0.72, 0.99) = 0.855 + assert instr.person_x_norm == pytest.approx(0.855, rel=1e-3) + # chart_x_norm = left edge = 0.0 + assert instr.chart_x_norm == pytest.approx(0.0) + + +def test_layout_instruction_defaults_when_no_regions(): + r = SceneRegions(scene_id="s0") + c = SceneClassification( + scene_id="s0", layout=LayoutKind.SIT_CENTER, confidence=0.3, reason="default" + ) + instr = layout_instruction_from_regions(r, c) + assert instr.person_x_norm == 0.5 + assert instr.chart_x_norm == 0.0 + + +def test_classify_scenes_with_vision_llm_returns_pairs(): + scenes = [_scene(0)] + + def vision_fn(*_a) -> str: + return json.dumps( + { + "person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95}, + "chart_bbox": None, + "ocr_text": "", + "reason": "solo subject", + } + ) + + pairs = classify_scenes_with_vision_llm(scenes, vision_fn) + assert len(pairs) == 1 + regions, classification = pairs[0] + assert regions.person_bbox is not None + assert classification.layout == LayoutKind.ZOOM_CALL_CENTER diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..1e8fbd4bd6d827fb709ea50cc604fbcfecaa228f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "humeo" +version = "0.1.0" +description = "Automated podcast-to-shorts pipeline" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "yt-dlp>=2024.0", + "fastapi>=0.115", + "openai>=1.0", + "google-genai>=1.0", + "httpx>=0.28", + "jinja2>=3.1", + "numpy>=1.24", + "Pillow>=10.0", + "python-dotenv>=1.0", + "replicate>=0.34.2", + 
"tqdm>=4.60", + "python-multipart>=0.0.9", + "uvicorn[standard]>=0.30", + "humeo-core", +] + +[project.optional-dependencies] +dev = [ + "pytest-asyncio>=0.23", + "ruff", + "pytest", +] +whisper = [ + "whisperx @ git+https://github.com/m-bain/whisperX.git", +] + +[tool.uv.sources] +humeo-core = { path = "humeo-core", editable = true } + +[project.scripts] +humeo = "humeo.cli:main" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +humeo = ["prompts/*.jinja2"] + +[tool.pytest.ini_options] +testpaths = ["tests", "humeo-core/tests"] +addopts = "-ra -q" + +[tool.ruff] +line-length = 100 +target-version = "py310" diff --git a/src/humeo.egg-info/PKG-INFO b/src/humeo.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..dd7705a36a7c386b5f71595ef886799545abca9b --- /dev/null +++ b/src/humeo.egg-info/PKG-INFO @@ -0,0 +1,223 @@ +Metadata-Version: 2.4 +Name: humeo +Version: 0.1.0 +Summary: Automated podcast-to-shorts pipeline +Requires-Python: >=3.10 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: yt-dlp>=2024.0 +Requires-Dist: openai>=1.0 +Requires-Dist: google-genai>=1.0 +Requires-Dist: httpx>=0.28 +Requires-Dist: jinja2>=3.1 +Requires-Dist: numpy>=1.24 +Requires-Dist: Pillow>=10.0 +Requires-Dist: python-dotenv>=1.0 +Requires-Dist: replicate>=0.34.2 +Requires-Dist: tqdm>=4.60 +Requires-Dist: humeo-core +Provides-Extra: dev +Requires-Dist: pytest-asyncio>=0.23; extra == "dev" +Requires-Dist: ruff; extra == "dev" +Requires-Dist: pytest; extra == "dev" +Provides-Extra: whisper +Requires-Dist: whisperx @ git+https://github.com/m-bain/whisperX.git ; extra == "whisper" +Dynamic: license-file + +--- +title: Humeo +sdk: docker +app_port: 7860 +--- + +# Humeo + +Current default preset: + +- `native_highlight` captions +- OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages +- Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available +- ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set + +Long podcast or interview β†’ vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render. + +**Architecture (static HTML, GitHub Pages):** +[https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html) + +## Hugging Face Space + +This repo includes a Hugging Face Docker Space entrypoint in `app.py`. + +- Upload one local MP4 +- Watch live pipeline logs and stage progress +- Download rendered `short_*.mp4` clips from the UI + +Required Space secrets: + +- `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY` +- `OPENAI_API_KEY` or `ELEVENLABS_API_KEY` + +The Docker image pins `HUMEO_TRANSCRIBE_PROVIDER=openai` for the Space demo. + +## Repo layout + +| Path | Role | +|------|------| +| `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters | +| `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server | + +## Pipeline (actual order) + +```text +YouTube URL + β†’ ingest (source.mp4, transcript.json) + β†’ clip selection (Gemini β†’ clips.json) + β†’ hook detection (Gemini β†’ hooks.json) + β†’ content pruning (Gemini β†’ prune.json) + β†’ keyframes + layout vision (Gemini vision β†’ layout_vision.json) + β†’ ASS subtitles + humeo-core ffmpeg render β†’ short_.mp4 +``` + +Details: **`docs/PIPELINE.md`**. 
+ +## Five layouts + +A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**). + +## Requirements + +- **Python** β‰₯ 3.10 +- **`uv`** β€” install: [astral.sh/uv](https://docs.astral.sh/uv/) +- **`ffmpeg`** β€” on `PATH` for extract/render +- **API keys** β€” see **`docs/ENVIRONMENT.md`** + - `GOOGLE_API_KEY` or `GEMINI_API_KEY` β€” preferred for Gemini stages + - `OPENROUTER_API_KEY` β€” supported fallback for those same Gemini-like stages when Google keys are unavailable + - `OPENAI_API_KEY` β€” if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`) + +Copy **`.env.example`** β†’ **`.env`** (never commit `.env`). + +## Install + +```bash +uv venv +uv sync +``` + +Optional local WhisperX (heavy; Windows often uses OpenAI API instead): + +```bash +uv sync --extra whisper +``` + +## Run + +```bash +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" +humeo --long-to-shorts "C:\path\to\video.mp4" +``` + +Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**). + +## CLI guide (all flags) + +Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`. + +### Required + +| Flag | Meaning | +|------|---------| +| `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). | + +### Paths and cache behavior + +| Flag | Meaning | +|------|---------| +| `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). | +| `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). | +| `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. | +| `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). | +| `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. | + +### Model selection and stage forcing + +| Flag | Meaning | +|------|---------| +| `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). | +| `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). | +| `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. | +| `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. | +| `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. | +| `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. | +| `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). | + +### Pruning and subtitles + +| Flag | Meaning | +|------|---------| +| `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). | +| `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). | +| `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). | +| `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). | +| `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). | + +### Logging + +| Flag | Meaning | +|------|---------| +| `--verbose`, `-v` | Enable debug logging. 
| + +### Common command recipes + +```bash +# Basic run +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" + +# Local MP4 +humeo --long-to-shorts "C:\path\to\video.mp4" + +# Full fresh run for debugging / prompt tuning +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose + +# Re-run only clip selection after prompt edits +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection + +# Keep intermediates in a fixed local folder +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work + +# Compare different prune levels on same source +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive +``` + +## Documentation + +| Doc | Purpose | +|-----|---------| +| **`docs/README.md`** | Index of all files under `docs/` | +| **`docs/STUDY_ORDER.md`** | Read order for onboarding | +| **`docs/PIPELINE.md`** | Stages, caches, JSON contracts | +| **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout | +| **`docs/SHARING.md`** | How to share logs/docs/video without bloating git | +| **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example | +| **`docs/full_run_output.txt`** | Example full run log (text) | +| **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping Β§9 | +| **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap | +| **`docs/TODO.md`** | Backlog | +| **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) | +| **`docs/SOLUTIONS.md`** | Design rationale | +| **`TERMINOLOGY.md`** | Glossary | + +## Tests + +```bash +uv sync --extra dev +uv run pytest +``` + +## Sharing outputs + +`output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**. + +## License + +See **`LICENSE`** (root) and **`humeo-core/LICENSE`**. 
diff --git a/src/humeo.egg-info/SOURCES.txt b/src/humeo.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f037363506a416f615b401a87df964d04bc1d9f --- /dev/null +++ b/src/humeo.egg-info/SOURCES.txt @@ -0,0 +1,58 @@ +LICENSE +README.md +pyproject.toml +src/humeo/__init__.py +src/humeo/best_of.py +src/humeo/cli.py +src/humeo/clip_assembly.py +src/humeo/clip_selection_cache.py +src/humeo/clip_selector.py +src/humeo/config.py +src/humeo/content_pruning.py +src/humeo/cutter.py +src/humeo/env.py +src/humeo/gemini_generate.py +src/humeo/hook_detector.py +src/humeo/hook_library.py +src/humeo/ingest.py +src/humeo/interactive.py +src/humeo/layout_vision.py +src/humeo/pipeline.py +src/humeo/prompt_loader.py +src/humeo/reframe_ffmpeg.py +src/humeo/render_window.py +src/humeo/session_state.py +src/humeo/transcript_align.py +src/humeo/video_cache.py +src/humeo.egg-info/PKG-INFO +src/humeo.egg-info/SOURCES.txt +src/humeo.egg-info/dependency_links.txt +src/humeo.egg-info/entry_points.txt +src/humeo.egg-info/requires.txt +src/humeo.egg-info/top_level.txt +src/humeo/prompts/clip_selection_system.jinja2 +src/humeo/prompts/clip_selection_user.jinja2 +src/humeo/prompts/content_pruning_system.jinja2 +src/humeo/prompts/hook_detection_system.jinja2 +tests/test_ass_subtitles.py +tests/test_best_of.py +tests/test_clip_assembly.py +tests/test_clip_ranking.py +tests/test_clip_selection_cache.py +tests/test_clip_selector.py +tests/test_content_pruning.py +tests/test_cutter_native_highlight.py +tests/test_gemini_generate.py +tests/test_hook_detector.py +tests/test_hook_library.py +tests/test_ingest_openai_chunks.py +tests/test_interactive.py +tests/test_layout_vision_unit.py +tests/test_pipeline_interactive.py +tests/test_pipeline_quality_gate.py +tests/test_prompt_loader.py +tests/test_reframe_ffmpeg.py +tests/test_render_window.py +tests/test_session_state.py +tests/test_transcript_align.py +tests/test_video_cache.py \ No newline at end of file diff --git a/src/humeo.egg-info/dependency_links.txt b/src/humeo.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/src/humeo.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/humeo.egg-info/entry_points.txt b/src/humeo.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..6650ec9964eaf820cd23924e97064deec871f740 --- /dev/null +++ b/src/humeo.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +humeo = humeo.cli:main diff --git a/src/humeo.egg-info/requires.txt b/src/humeo.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a25622009493d8500d8404f29971a15fc047d43 --- /dev/null +++ b/src/humeo.egg-info/requires.txt @@ -0,0 +1,19 @@ +yt-dlp>=2024.0 +openai>=1.0 +google-genai>=1.0 +httpx>=0.28 +jinja2>=3.1 +numpy>=1.24 +Pillow>=10.0 +python-dotenv>=1.0 +replicate>=0.34.2 +tqdm>=4.60 +humeo-core + +[dev] +pytest-asyncio>=0.23 +ruff +pytest + +[whisper] +whisperx @ git+https://github.com/m-bain/whisperX.git diff --git a/src/humeo.egg-info/top_level.txt b/src/humeo.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..16b4994761aa79c4c9a2ec5bb765ce91cf61f3a2 --- /dev/null +++ b/src/humeo.egg-info/top_level.txt @@ -0,0 +1 @@ +humeo diff --git a/src/humeo/__init__.py b/src/humeo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f1e6ebddd25ec70dc23cb703e971b0276b7a833b 
--- /dev/null +++ b/src/humeo/__init__.py @@ -0,0 +1,3 @@ +"""Humeo - Automated podcast-to-shorts pipeline.""" + +__version__ = "0.1.0" diff --git a/src/humeo/best_of.py new file mode 100644 index 0000000000000000000000000000000000000000..921e8c6c692570f922fd3f3403c96f81e94a1dcc --- /dev/null +++ b/src/humeo/best_of.py @@ -0,0 +1,99 @@ +"""Curate a small review pack from a larger batch render.""" + +from __future__ import annotations + +import json +import re +import shutil +from pathlib import Path + +from humeo.clip_selector import clip_quality_priority_score +from humeo_core.schemas import Clip + +_SHORT_FILENAME_RE = re.compile(r"^short_(?P<clip_id>\d+)\.mp4$", re.IGNORECASE) + + +def _load_clip_map(work_dir: Path) -> dict[str, Clip]: + for filename in ("clips.json", "assembled_clips.json"): + path = work_dir / filename + if not path.is_file(): + continue + data = json.loads(path.read_text(encoding="utf-8")) + items = data.get("clips", data) if isinstance(data, dict) else data + return { + clip["clip_id"]: Clip.model_validate(clip) + for clip in items + if isinstance(clip, dict) and clip.get("clip_id") + } + return {} + + +def _default_work_dir_for_source(source_dir: Path, repo_root: Path) -> Path: + match = re.fullmatch(r"videoplayback_(\d+)", source_dir.name) + if match: + return repo_root / f".humeo_batch_videoplayback{match.group(1)}" + return repo_root / f".humeo_{source_dir.name}" + + +def build_best_of_review_pack( + batch_root: Path, + destination_dir: Path, + *, + per_source: int = 2, + repo_root: Path | None = None, + work_dir_map: dict[str, Path] | None = None, +) -> list[Path]: + batch_root = Path(batch_root) + destination_dir = Path(destination_dir) + repo_root = Path(repo_root) if repo_root is not None else batch_root.parent + destination_dir.mkdir(parents=True, exist_ok=True) + + copied: list[Path] = [] + manifest: list[dict[str, object]] = [] + for source_dir in sorted(path for path in batch_root.iterdir() if path.is_dir()): + work_dir = ( + work_dir_map[source_dir.name] + if work_dir_map is not None and source_dir.name in work_dir_map + else _default_work_dir_for_source(source_dir, repo_root) + ) + clip_map = _load_clip_map(work_dir) + ranked: list[tuple[float, Path, str, Clip | None]] = [] + for mp4_path in sorted(source_dir.glob("short_*.mp4")): + match = _SHORT_FILENAME_RE.match(mp4_path.name) + if not match: + continue + clip_id = match.group("clip_id") + clip = clip_map.get(clip_id) + score = clip_quality_priority_score(clip) if clip is not None else 0.0 + ranked.append((score, mp4_path, clip_id, clip)) + + ranked.sort( + key=lambda item: ( + item[0], + item[3].virality_score if item[3] is not None else 0.0, + -(item[3].duration_sec if item[3] is not None else 0.0), + ), + reverse=True, + ) + for rank, (score, mp4_path, clip_id, clip) in enumerate(ranked[: max(1, per_source)], start=1): + target_path = destination_dir / f"{source_dir.name}__pick{rank:02d}__{mp4_path.name}" + shutil.copy2(mp4_path, target_path) + copied.append(target_path) + manifest.append( + { + "source": source_dir.name, + "rank": rank, + "score": round(score, 4), + "output_path": str(target_path), + "original_path": str(mp4_path), + "clip_id": clip.clip_id if clip is not None else clip_id, + "title": clip.suggested_overlay_title if clip is not None else "", + "topic": clip.topic if clip is not None else "", + } + ) + + (destination_dir / "best_of_manifest.json").write_text( + json.dumps({"clips": manifest}, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + return
copied diff --git a/src/humeo/cli.py b/src/humeo/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..16bd83f4801cd6190b2b898120b32d5c6495ca10 --- /dev/null +++ b/src/humeo/cli.py @@ -0,0 +1,369 @@ +"""CLI entry point for the Humeo pipeline.""" + +import argparse +import logging +import os +import sys +from datetime import datetime +from pathlib import Path + +from humeo.config import PipelineConfig +from humeo.pipeline import run_pipeline + +DEFAULT_SEGMENTATION_PROVIDER = ( + (os.environ.get("HUMEO_SEGMENTATION_PROVIDER") or "").strip().lower() + or ("replicate" if (os.environ.get("REPLICATE_API_TOKEN") or "").strip() else "off") +) + + +def setup_logging(verbose: bool = False): + """Configure logging with a clean format.""" + level = logging.DEBUG if verbose else logging.INFO + logging.basicConfig( + level=level, + format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s", + datefmt="%H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + # Suppress noisy third-party loggers + logging.getLogger("urllib3").setLevel(logging.WARNING) + logging.getLogger("httpx").setLevel(logging.WARNING) + + +def build_parser() -> argparse.ArgumentParser: + """Build the argument parser.""" + parser = argparse.ArgumentParser( + prog="humeo", + description="Humeo - Automated podcast-to-shorts pipeline from YouTube or local MP4", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + humeo --long-to-shorts "https://youtube.com/watch?v=abc123" + humeo --long-to-shorts "C:\\Videos\\episode.mp4" + humeo --long-to-shorts "https://youtube.com/watch?v=abc123" --work-dir .humeo_work + humeo --long-to-shorts "https://youtube.com/watch?v=abc123" --gemini-model gemini-2.0-flash + """, + ) + + parser.add_argument( + "--long-to-shorts", + metavar="SOURCE", + required=True, + help="YouTube video URL or local MP4 path to process", + ) + + parser.add_argument( + "--output", "-o", + type=Path, + default=Path("output"), + help="Output directory for final shorts (default: ./output)", + ) + + parser.add_argument( + "--work-dir", + type=Path, + default=None, + help="Working directory for intermediate files. Default: per-video folder under the " + "cache root (see docs/ENVIRONMENT.md). Use this to force e.g. ./.humeo_work.", + ) + + parser.add_argument( + "--no-video-cache", + action="store_true", + help="Do not use per-video cache dirs; use ./.humeo_work unless --work-dir is set.", + ) + + parser.add_argument( + "--cache-root", + type=Path, + default=None, + help="Override cache root for manifests and per-video ingest (env: HUMEO_CACHE_ROOT).", + ) + + parser.add_argument( + "--gemini-model", + default=None, + help="Gemini model id for clip selection (default: GEMINI_MODEL env; see humeo.config).", + ) + + parser.add_argument( + "--render-theme", + choices=["legacy", "reference_lower_third", "native_highlight"], + default="native_highlight", + help="Visual theme for title/caption rendering (default: native_highlight).", + ) + + parser.add_argument( + "--hook-library-path", + type=Path, + default=None, + help="Zip or directory containing retrieved viral hook examples (env: HUMEO_HOOK_LIBRARY_PATH).", + ) + + parser.add_argument( + "--segmentation-provider", + choices=["off", "replicate"], + default=DEFAULT_SEGMENTATION_PROVIDER, + help=( + "Speaker-centering tracker. Defaults to HUMEO_SEGMENTATION_PROVIDER when set, " + "otherwise replicate if REPLICATE_API_TOKEN exists, else off." 
+ ), + ) + + parser.add_argument( + "--segmentation-model", + default="meta/sam-2-video", + help="Segmentation model id used by the fallback tracker (default: meta/sam-2-video).", + ) + + parser.add_argument( + "--force-clip-selection", + action="store_true", + help="Re-run clip-selection LLM even when clips.meta.json matches the transcript.", + ) + + parser.add_argument( + "--gemini-vision-model", + default=None, + help="Gemini model for per-keyframe layout + bbox (default: GEMINI_VISION_MODEL env or --gemini-model).", + ) + + parser.add_argument( + "--force-layout-vision", + action="store_true", + help="Re-run Gemini vision for layouts even when layout_vision.meta.json matches.", + ) + + parser.add_argument( + "--prune-level", + choices=["off", "conservative", "balanced", "aggressive"], + default="balanced", + help=( + "Stage 2.5 inner-clip content pruning aggressiveness. " + "'off' skips pruning entirely; 'conservative' trims <=10%%, " + "'balanced' <=20%%, 'aggressive' <=35%% of each clip " + "(always clamped to the MIN_CLIP_DURATION_SEC floor). Default: balanced." + ), + ) + + parser.add_argument( + "--force-content-pruning", + action="store_true", + help="Re-run content-pruning LLM even when prune.meta.json matches.", + ) + + parser.add_argument( + "--no-hook-detection", + action="store_true", + help=( + "Skip Stage 2.25 hook detection. The selector's hook window " + "(possibly the 0.0-3.0s placeholder) will be carried through. " + "Stage 2.5 content pruning still treats that exact placeholder " + "as 'no hook' so pruning is not disabled." + ), + ) + + parser.add_argument( + "--force-hook-detection", + action="store_true", + help="Re-run hook-detection LLM even when hooks.meta.json matches.", + ) + + parser.add_argument( + "--clean-run", + action="store_true", + help=( + "Run with a fresh work dir and no cache reuse. Implies --no-video-cache, " + "--force-clip-selection, --force-layout-vision, and overwrite existing outputs." + ), + ) + + parser.add_argument( + "--interactive", "-i", + action="store_true", + help="Pause after clip selection and after render for human approval.", + ) + + parser.add_argument( + "--subtitle-font-size", + type=int, + default=48, + help=( + "Caption font size in output pixels. libass is pinned to " + "original_size=1080x1920, so this is a true pixel value. " + "(default: 48)" + ), + ) + + parser.add_argument( + "--subtitle-margin-v", + type=int, + default=160, + help="Caption bottom margin in output pixels (default: 160)", + ) + + parser.add_argument( + "--subtitle-max-words", + type=int, + default=4, + help="Max words per subtitle cue (default: 4)", + ) + + parser.add_argument( + "--subtitle-max-cue-sec", + type=float, + default=2.2, + help="Max subtitle cue duration in seconds (default: 2.2)", + ) + + parser.add_argument( + "--caption-highlight-lead-ms", + type=float, + default=60.0, + help="Native-highlight word box lead time in milliseconds (default: 60)", + ) + + parser.add_argument( + "--caption-highlight-min-dwell-ms", + type=float, + default=160.0, + help=( + "Minimum native-highlight word box dwell in milliseconds " + "when timing allows (default: 160)" + ), + ) + + parser.add_argument( + "--no-caption-timing-repair", + action="store_true", + help="Disable conservative repair of suspicious word-level ASR timings.", + ) + + parser.add_argument( + "--no-subtitles", + action="store_true", + help="Skip burning subtitles. 
Useful when the source already has captions baked in.", + ) + + parser.add_argument( + "--no-render-qa", + action="store_true", + help="Skip automatic render QA contact sheets, scores, and debug overlays.", + ) + + parser.add_argument( + "--qa-reference-video", + type=Path, + default=None, + help="Optional reference video for automatic A/B contact-sheet comparison.", + ) + + parser.add_argument( + "--no-qa-debug-overlay", + action="store_true", + help="Skip low-res crop/debug overlay videos in render QA.", + ) + + parser.add_argument( + "--rerender-clip", + action="append", + default=[], + metavar="CLIP_ID", + help="Rerender only this clip id, e.g. 002 or short_002. Can be repeated.", + ) + + parser.add_argument( + "--rerender-warned-only", + action="store_true", + help="Rerender only clips flagged in the existing render_qa/qa_manifest.json.", + ) + + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Enable debug logging", + ) + + return parser + + +def main(): + """CLI entry point.""" + parser = build_parser() + args = parser.parse_args() + setup_logging(args.verbose) + + use_video_cache = not args.no_video_cache + force_clip_selection = args.force_clip_selection + force_layout_vision = args.force_layout_vision + force_content_pruning = args.force_content_pruning + force_hook_detection = args.force_hook_detection + detect_hooks = not args.no_hook_detection + overwrite_outputs = False + work_dir = args.work_dir + + if args.clean_run: + use_video_cache = False + force_clip_selection = True + force_layout_vision = True + force_content_pruning = True + force_hook_detection = True + overwrite_outputs = True + if work_dir is None: + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + work_dir = Path(f".humeo_work_clean_{stamp}") + + config = PipelineConfig( + youtube_url=args.long_to_shorts, + output_dir=args.output, + work_dir=work_dir, + use_video_cache=use_video_cache, + cache_root=args.cache_root, + gemini_model=args.gemini_model, + gemini_vision_model=args.gemini_vision_model, + render_theme=args.render_theme, + hook_library_path=args.hook_library_path, + segmentation_provider=args.segmentation_provider, + segmentation_model=args.segmentation_model, + force_clip_selection=force_clip_selection, + force_layout_vision=force_layout_vision, + clean_run=args.clean_run, + overwrite_outputs=overwrite_outputs, + interactive=args.interactive, + prune_level=args.prune_level, + force_content_pruning=force_content_pruning, + detect_hooks=detect_hooks, + force_hook_detection=force_hook_detection, + subtitle_font_size=args.subtitle_font_size, + subtitle_margin_v=args.subtitle_margin_v, + subtitle_max_words_per_cue=args.subtitle_max_words, + subtitle_max_cue_sec=args.subtitle_max_cue_sec, + burn_subtitles=not args.no_subtitles, + subtitle_highlight_lead_sec=max(0.0, args.caption_highlight_lead_ms / 1000.0), + subtitle_highlight_min_dwell_sec=max( + 0.02, + args.caption_highlight_min_dwell_ms / 1000.0, + ), + repair_subtitle_word_timings=not args.no_caption_timing_repair, + render_qa=not args.no_render_qa, + qa_reference_video=args.qa_reference_video, + qa_debug_overlay=not args.no_qa_debug_overlay, + rerender_clip_ids=args.rerender_clip, + rerender_warned_only=args.rerender_warned_only, + ) + + try: + outputs = run_pipeline(config) + print(f"\nDone. 
{len(outputs)} shorts generated in: {config.output_dir}") + for p in outputs: + print(f" -> {p}") + except KeyboardInterrupt: + print("\nPipeline interrupted.") + sys.exit(1) + except Exception as e: + logging.getLogger(__name__).error("Pipeline failed: %s", e, exc_info=True) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/humeo/clip_assembly.py b/src/humeo/clip_assembly.py new file mode 100644 index 0000000000000000000000000000000000000000..4a5e33575be3f40e06c54e0640f35ceb41f8af2b --- /dev/null +++ b/src/humeo/clip_assembly.py @@ -0,0 +1,303 @@ +"""Hard-cut filler/silence cleanup by assembling multiple kept spans.""" + +from __future__ import annotations + +import json +import logging +import re +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from humeo_core.schemas import Clip, ClipPlan, ClipRenderSpan + +from humeo.render_window import effective_export_bounds + +logger = logging.getLogger(__name__) + +_SPAN_BREAK_MIN_GAP_SEC = 0.55 +_SPAN_EDGE_PAD_SEC = 0.05 +_SPAN_MIN_DURATION_SEC = 0.30 +_FILLER_SPAN_MIN_DURATION_SEC = 0.12 +_SEGMENT_BREAK_MIN_GAP_SEC = 0.65 +_SEGMENT_MAX_DURATION_SEC = 6.0 +_SEGMENT_MAX_WORDS = 18 +_FILLER_CUT_PAD_SEC = 0.02 +_FILLER_WORD_RE = re.compile(r"^(u+h+|u+m+|e+h+|e+r+|a+h+|h+m+|m+m+)$", re.IGNORECASE) +_FILLER_WORDS = { + "ah", + "eh", + "er", + "hmm", + "mm", + "uh", + "uhh", + "uhm", + "um", + "umm", +} + + +@dataclass(frozen=True) +class AssembledClip: + source_path: Path + clip: Clip + transcript: dict + spans: list[ClipRenderSpan] + + +def _iter_words(transcript: dict) -> list[dict]: + words: list[dict] = [] + for seg in transcript.get("segments", []) or []: + for raw in seg.get("words", []) or []: + try: + word = { + "word": str(raw.get("word", "")).strip(), + "start": float(raw["start"]), + "end": float(raw["end"]), + } + except (KeyError, TypeError, ValueError): + continue + if not word["word"] or word["end"] <= word["start"]: + continue + words.append(word) + return words + + +def _clean_word_token(text: str) -> str: + return re.sub(r"(^[^A-Za-z]+|[^A-Za-z]+$)", "", text or "").lower() + + +def _looks_like_filler_word(text: str) -> bool: + token = _clean_word_token(text) + if not token: + return False + return token in _FILLER_WORDS or bool(_FILLER_WORD_RE.fullmatch(token)) + + +def derive_render_spans(clip: Clip, transcript: dict) -> list[ClipRenderSpan]: + if clip.render_spans: + return list(clip.render_spans) + + start_sec, end_sec = effective_export_bounds(clip) + words = [ + word + for word in _iter_words(transcript) + if word["end"] > start_sec and word["start"] < end_sec + ] + if not words: + return [ClipRenderSpan(start_time_sec=start_sec, end_time_sec=end_sec)] + + spans: list[ClipRenderSpan] = [] + span_start: float | None = None + prev_end: float | None = None + resume_after = start_sec + + for word in words: + word_start = float(word["start"]) + word_end = float(word["end"]) + if _looks_like_filler_word(str(word["word"])): + if span_start is not None and prev_end is not None: + span_end = min(end_sec, max(span_start, word_start - _FILLER_CUT_PAD_SEC)) + if span_end - span_start >= _FILLER_SPAN_MIN_DURATION_SEC: + spans.append(ClipRenderSpan(start_time_sec=span_start, end_time_sec=span_end)) + span_start = None + prev_end = None + resume_after = min(end_sec, word_end + _FILLER_CUT_PAD_SEC) + continue + if span_start is None: + span_start = max(start_sec, word_start - _SPAN_EDGE_PAD_SEC, resume_after) + prev_end = word_end + continue + if prev_end is not None and 
word_start - prev_end >= _SPAN_BREAK_MIN_GAP_SEC: + span_end = min(end_sec, prev_end + _SPAN_EDGE_PAD_SEC) + if span_end - span_start >= _SPAN_MIN_DURATION_SEC: + spans.append(ClipRenderSpan(start_time_sec=span_start, end_time_sec=span_end)) + span_start = max(start_sec, word_start - _SPAN_EDGE_PAD_SEC) + prev_end = word_end + + if span_start is None or prev_end is None: + if not spans: + spans.append(ClipRenderSpan(start_time_sec=start_sec, end_time_sec=end_sec)) + return spans + + final_end = min(end_sec, prev_end + _SPAN_EDGE_PAD_SEC) + if final_end - span_start >= _SPAN_MIN_DURATION_SEC: + spans.append(ClipRenderSpan(start_time_sec=span_start, end_time_sec=final_end)) + + if not spans: + spans.append(ClipRenderSpan(start_time_sec=start_sec, end_time_sec=end_sec)) + return spans + + +def apply_render_spans(clips: list[Clip], transcript: dict) -> list[Clip]: + out: list[Clip] = [] + for clip in clips: + spans = derive_render_spans(clip, transcript) + out.append(clip.model_copy(update={"render_spans": spans})) + return out + + +def _segment_local_words(words: list[dict], *, language: str) -> dict: + segments: list[dict] = [] + chunk: list[dict] = [] + + def flush() -> None: + if not chunk: + return + segments.append( + { + "start": chunk[0]["start"], + "end": chunk[-1]["end"], + "text": " ".join(str(word["word"]) for word in chunk).strip(), + "words": list(chunk), + } + ) + chunk.clear() + + for word in words: + if chunk: + gap = float(word["start"]) - float(chunk[-1]["end"]) + dur = float(word["end"]) - float(chunk[0]["start"]) + if ( + gap >= _SEGMENT_BREAK_MIN_GAP_SEC + or dur >= _SEGMENT_MAX_DURATION_SEC + or len(chunk) >= _SEGMENT_MAX_WORDS + ): + flush() + chunk.append(word) + flush() + return {"segments": segments, "language": language} + + +def build_assembled_transcript(clip: Clip, transcript: dict) -> dict: + words = _iter_words(transcript) + local_words: list[dict] = [] + current_offset = 0.0 + for span in derive_render_spans(clip, transcript): + for word in words: + if word["end"] <= span.start_time_sec or word["start"] >= span.end_time_sec: + continue + if _looks_like_filler_word(str(word["word"])): + continue + local_words.append( + { + "word": word["word"], + "start": max(word["start"], span.start_time_sec) - span.start_time_sec + current_offset, + "end": min(word["end"], span.end_time_sec) - span.start_time_sec + current_offset, + } + ) + current_offset += span.duration_sec + language = str(transcript.get("language") or "en") + return _segment_local_words(local_words, language=language) + + +def _ffmpeg_concat_filter(spans: list[ClipRenderSpan]) -> str: + parts: list[str] = [] + for idx, span in enumerate(spans): + parts.append( + f"[0:v]trim=start={span.start_time_sec:.3f}:end={span.end_time_sec:.3f},setpts=PTS-STARTPTS[v{idx}]" + ) + parts.append( + f"[0:a]atrim=start={span.start_time_sec:.3f}:end={span.end_time_sec:.3f},asetpts=PTS-STARTPTS[a{idx}]" + ) + concat_inputs = "".join(f"[v{idx}][a{idx}]" for idx in range(len(spans))) + parts.append(f"{concat_inputs}concat=n={len(spans)}:v=1:a=1[vout][aout]") + return ";".join(parts) + + +def assemble_clip( + source_path: Path, + clip: Clip, + transcript: dict, + output_dir: Path, +) -> AssembledClip: + spans = derive_render_spans(clip, transcript) + output_dir.mkdir(parents=True, exist_ok=True) + assembled_path = output_dir / f"clip_{clip.clip_id}.mp4" + + ffmpeg = shutil.which("ffmpeg") + if not ffmpeg: + raise RuntimeError("ffmpeg not found on PATH") + + cmd = [ + ffmpeg, + "-y", + "-i", + str(source_path), + 
"-filter_complex", + _ffmpeg_concat_filter(spans), + "-map", + "[vout]", + "-map", + "[aout]", + "-c:v", + "libx264", + "-preset", + "veryfast", + "-crf", + "20", + "-c:a", + "aac", + "-b:a", + "160k", + "-movflags", + "+faststart", + str(assembled_path), + ] + subprocess.run(cmd, check=True, capture_output=True) + + assembled_transcript = build_assembled_transcript(clip, transcript) + assembled_transcript_path = output_dir / f"clip_{clip.clip_id}.transcript.json" + assembled_transcript_path.write_text( + json.dumps(assembled_transcript, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + timeline_path = output_dir / f"clip_{clip.clip_id}.timeline.json" + timeline_path.write_text( + json.dumps( + { + "clip_id": clip.clip_id, + "source_spans": [span.model_dump() for span in spans], + "assembled_duration_sec": sum(span.duration_sec for span in spans), + }, + indent=2, + ) + + "\n", + encoding="utf-8", + ) + + assembled_duration = sum(span.duration_sec for span in spans) + assembled_clip = clip.model_copy( + update={ + "start_time_sec": 0.0, + "end_time_sec": assembled_duration, + "trim_start_sec": 0.0, + "trim_end_sec": 0.0, + "hook_start_sec": None, + "hook_end_sec": None, + "render_spans": [], + } + ) + logger.info( + "Assembled clip %s into %d span(s): %.1fs -> %.1fs", + clip.clip_id, + len(spans), + clip.duration_sec, + assembled_duration, + ) + return AssembledClip( + source_path=assembled_path, + clip=assembled_clip, + transcript=assembled_transcript, + spans=spans, + ) + + +def write_clip_plan(path: Path, clips: list[Clip]) -> Path: + path.write_text( + ClipPlan(source_path="", clips=clips).model_dump_json(indent=2) + "\n", + encoding="utf-8", + ) + return path diff --git a/src/humeo/clip_selection_cache.py b/src/humeo/clip_selection_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..a84d84d5e8d3ffeb1009a73ca17c4856d3ceb246 --- /dev/null +++ b/src/humeo/clip_selection_cache.py @@ -0,0 +1,85 @@ +"""Persist Gemini clip-selection output and skip re-inference when transcript matches.""" + +from __future__ import annotations + +import hashlib +import json +import logging +from pathlib import Path +from typing import Any + +from humeo.config import GEMINI_MODEL, PipelineConfig +from humeo.env import current_llm_provider +from humeo.hook_library import hook_library_fingerprint, resolve_hook_library_path + +logger = logging.getLogger(__name__) + +# v3: includes hook-library fingerprint for retrieval-augmented prompts. 
+CURRENT_META_VERSION = 3 +META_FILENAME = "clips.meta.json" +RAW_FILENAME = "clip_selection_raw.json" + + +def transcript_fingerprint(transcript: dict) -> str: + payload = json.dumps(transcript, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def resolved_gemini_model(config: PipelineConfig) -> str: + return (config.gemini_model or GEMINI_MODEL).strip() + + +def load_meta(work_dir: Path) -> dict[str, Any] | None: + path = work_dir / META_FILENAME + if not path.is_file(): + return None + with open(path, encoding="utf-8") as f: + return json.load(f) + + +def cache_valid(meta: dict[str, Any], fingerprint: str, config: PipelineConfig) -> bool: + if meta.get("transcript_sha256") != fingerprint: + return False + gm = resolved_gemini_model(config) + current_provider = current_llm_provider() + meta_provider = meta.get("llm_backend") + if current_provider == "openrouter": + if meta_provider != "openrouter": + return False + elif current_provider == "google": + if meta_provider not in (None, "google"): + return False + ver = meta.get("version", 1) + if ver >= CURRENT_META_VERSION: + return ( + meta.get("gemini_model") == gm + and meta.get("hook_library_sha256", "") + == hook_library_fingerprint(resolve_hook_library_path(config)) + ) + # Legacy v1: had llm_provider + model fields + if meta.get("llm_provider") == "openai": + return False + return meta.get("gemini_model") == gm + + +def write_artifacts( + work_dir: Path, + *, + transcript: dict, + config: PipelineConfig, + raw_response: str, +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + fp = transcript_fingerprint(transcript) + meta: dict[str, Any] = { + "version": CURRENT_META_VERSION, + "transcript_sha256": fp, + "gemini_model": resolved_gemini_model(config), + "llm_backend": current_llm_provider() or "google", + "hook_library_sha256": hook_library_fingerprint(resolve_hook_library_path(config)), + } + (work_dir / RAW_FILENAME).write_text(raw_response, encoding="utf-8") + with open(work_dir / META_FILENAME, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + f.write("\n") + logger.info("Wrote %s and %s", META_FILENAME, RAW_FILENAME) diff --git a/src/humeo/clip_selector.py b/src/humeo/clip_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..47887687fcd548f8d92b1374c5298be0dc0734fd --- /dev/null +++ b/src/humeo/clip_selector.py @@ -0,0 +1,674 @@ +""" +Step 2 - Clip Selection: Gemini-only LLM for viral clip identification. + +Uses the unified ``google-genai`` SDK (``from google import genai``). 
See: +https://github.com/googleapis/python-genai +""" + +from __future__ import annotations + +import json +import logging +import re +import time +from pathlib import Path +from typing import Callable, TypeVar + +from google import genai +from openai import OpenAI + +from humeo.gemini_generate import gemini_generate_config + +from humeo_core.schemas import Clip, ClipPlan + +from humeo.config import ( + GEMINI_MODEL, + MAX_CLIP_DURATION_SEC, + MIN_CLIP_DURATION_SEC, + TEXT_AXIS_WEIGHTS, + TARGET_CLIP_COUNT, +) +from humeo.env import ( + OPENROUTER_BASE_URL, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.hook_library import ( + format_hook_examples, + retrieve_hook_examples, +) +from humeo.prompt_loader import clip_selection_prompts + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + +LLM_MAX_ATTEMPTS = 3 +LLM_RETRY_DELAY_SEC = 2.0 + +# Over-generation defaults (also exposed via PipelineConfig so callers can +# override per-run without touching code). Rationale: +# +# - Ask Gemini for a *pool* of ~12 candidates at temperature 0.7 so the model +# considers a wider slice of the transcript instead of locking onto the +# first 5 obvious ones. More candidates -> more chance the actual gold +# nugget is in the list. +# - Then rank by ``virality_score`` and keep everything >= threshold, but +# always keep at least ``min_kept`` and at most ``max_kept`` clips. This +# lets a single strong clip survive a weak transcript ("keep the best 5 +# even if no one clears the bar") AND lets an exceptionally rich +# transcript ship 7-8 strong shorts instead of artificially capping at 5. +DEFAULT_CANDIDATE_COUNT = 12 +DEFAULT_QUALITY_THRESHOLD = 0.70 +DEFAULT_MIN_KEPT = TARGET_CLIP_COUNT +DEFAULT_MAX_KEPT = 8 +# Higher than the old 0.3 so the pool is meaningfully different from +# "the same five most-obvious clips every run". Still well below 1.0 so we +# do not get word-salad IDs or timestamps. 
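+# Worked example of the threshold+floor+cap policy (scores illustrative,
+# quality penalties ignored for simplicity): a pool of 12 candidates scored
+# [0.86, 0.81, 0.74, 0.69, 0.66, ...] keeps the three clips >= 0.70 outright,
+# then backfills the next two by rank to reach DEFAULT_MIN_KEPT = 5.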
+DEFAULT_CANDIDATE_TEMPERATURE = 0.7 +_TITLE_SMALL_WORDS = { + "a", + "an", + "and", + "as", + "at", + "by", + "for", + "from", + "in", + "of", + "on", + "or", + "the", + "to", + "vs", + "with", +} +_TITLE_DROP_WORDS = { + "actually", + "entirely", + "just", + "next", + "really", + "still", + "that", + "their", + "these", + "this", + "those", + "very", + "will", + "your", +} +_TITLE_BLAND_WORDS = { + "big", + "future", + "important", + "lesson", + "matter", + "matters", + "opportunity", + "reason", + "soon", + "story", + "thing", +} +_GENERIC_TITLE_PATTERNS = ( + "big opportunity", + "future of", + "important lesson", + "start a business with ai", + "why this matters", + "what this means", +) +_TITLE_TOKEN_REPLACEMENTS = { + "ai": "AI", + "agi": "AGI", + "api": "API", + "btc": "BTC", + "ev": "EV", + "evs": "EVs", + "us": "US", +} +_POWER_TITLE_TOKENS = {"$", "%", "under", "beats", "fewer", "more", "less", "vs"} +_FILLER_OPENERS = { + "actually", + "basically", + "i", + "kind", + "look", + "listen", + "now", + "okay", + "ok", + "right", + "so", + "sort", + "well", + "yeah", + "you", +} +_FILLER_OPENING_PHRASES = { + "i mean", + "kind of", + "sort of", + "you know", +} +_PREFERRED_MAX_DURATION_SEC = 72.0 + + +def _has_valid_duration(clip: Clip) -> bool: + """Return True when the clip window satisfies the product duration contract.""" + return MIN_CLIP_DURATION_SEC <= clip.duration_sec <= MAX_CLIP_DURATION_SEC + + +def _text_composite_score(clip: Clip) -> float: + """Weighted composite from the text-axis breakdown, falling back to virality_score. + + Cache compatibility note: + - New Ticket 3 clips use the three-axis rubric (message_wow / hook_emotion / catchy). + - Older caches may still contain legacy rule-name ``score_breakdown`` maps from the + pre-Ticket-3 prompt. If none of the expected axes are present, fall back cleanly + to ``virality_score`` instead of treating the legacy shape as three missing axes. 
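+
+    Worked example with the default ``TEXT_AXIS_WEIGHTS`` (0.4 / 0.35 / 0.25):
+    a breakdown of ``{"message_wow": 0.8, "hook_emotion": 0.7, "catchy": 0.6}``
+    yields ``0.8*0.4 + 0.7*0.35 + 0.6*0.25 = 0.715``.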
+ """ + if not clip.score_breakdown: + return clip.virality_score + + present_expected_axes = [axis for axis in TEXT_AXIS_WEIGHTS if axis in clip.score_breakdown] + if not present_expected_axes: + return clip.virality_score + + total = 0.0 + missing: list[str] = [] + for axis, weight in TEXT_AXIS_WEIGHTS.items(): + value = clip.score_breakdown.get(axis) + if value is None: + missing.append(axis) + continue + total += value * weight + + if missing: + logger.warning( + "Clip %s score_breakdown missing axis(es) %s; treating as 0.0.", + clip.clip_id, + ", ".join(missing), + ) + return total + + +def _title_quality_penalty(clip: Clip) -> float: + title = _tighten_overlay_title_text(clip.suggested_overlay_title or "") + if not title: + return 0.0 + penalty = 0.0 + if _looks_generic_title(title): + penalty += 0.18 + tokens = [token for token in _normalized_title(title).split() if token] + if len(tokens) < 2 or len(tokens) > 6: + penalty += 0.05 + if not any(token in title.lower() for token in _POWER_TITLE_TOKENS) and not any( + ch.isdigit() for ch in title + ): + penalty += 0.03 + return min(0.22, penalty) + + +def _hook_quality_penalty(clip: Clip) -> float: + penalty = 0.0 + if clip.hook_start_sec is not None and clip.hook_start_sec > 5.0: + penalty += min(0.18, 0.06 + (clip.hook_start_sec - 5.0) * 0.025) + opener = " ".join((clip.viral_hook or clip.transcript or "").split()).lower() + if opener: + first_words = opener.split() + first_word = first_words[0] if first_words else "" + opening_phrase = " ".join(first_words[:2]) + if first_word in _FILLER_OPENERS: + penalty += 0.14 + if opening_phrase in _FILLER_OPENING_PHRASES: + penalty += 0.06 + if len(first_words) >= 12: + penalty += 0.03 + return min(0.24, penalty) + + +def _duration_quality_penalty(clip: Clip) -> float: + if clip.duration_sec <= _PREFERRED_MAX_DURATION_SEC: + return 0.0 + drift = clip.duration_sec - _PREFERRED_MAX_DURATION_SEC + return min(0.14, 0.03 + drift * 0.01) + + +def clip_quality_penalty(clip: Clip) -> float: + return min( + 0.42, + _title_quality_penalty(clip) + + _hook_quality_penalty(clip) + + _duration_quality_penalty(clip), + ) + + +def clip_quality_priority_score(clip: Clip) -> float: + review_penalty = 0.5 if clip.needs_review else 0.0 + composite = _text_composite_score(clip) + return composite - review_penalty - clip_quality_penalty(clip) + + +def renumber_clips_dense(clips: list[Clip]) -> list[Clip]: + renumbered: list[Clip] = [] + for idx, clip in enumerate(clips, start=1): + new_id = f"{idx:03d}" + renumbered.append(clip if clip.clip_id == new_id else clip.model_copy(update={"clip_id": new_id})) + return renumbered + + +def _openai_message_text(content: object) -> str: + """Normalize OpenAI-compatible message content into plain text.""" + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + + +def _retry_llm(name: str, fn: Callable[[], T], attempts: int = LLM_MAX_ATTEMPTS) -> T: + last: Exception | None = None + for i in range(attempts): + try: + return fn() + except Exception as e: + last = e + if i < attempts - 1: + logger.warning("%s attempt %d/%d failed: %s", name, i + 1, attempts, e) + time.sleep(LLM_RETRY_DELAY_SEC * (i + 1)) + assert last is not None + raise last + + +def _headline_case_title(text: str) -> str: + words = text.split() + if not words: + return "" 
+ out: list[str] = [] + for idx, word in enumerate(words): + if any(ch.isdigit() for ch in word) or word.startswith("$"): + out.append(word) + continue + raw = re.sub(r"^[^A-Za-z]+|[^A-Za-z]+$", "", word) + lower = raw.lower() + if lower in _TITLE_TOKEN_REPLACEMENTS: + out.append(word.replace(raw, _TITLE_TOKEN_REPLACEMENTS[lower])) + continue + if idx not in (0, len(words) - 1) and lower in _TITLE_SMALL_WORDS: + out.append(word.replace(raw, lower)) + continue + out.append(word.replace(raw, raw.capitalize())) + return " ".join(out) + + +def _normalized_title(text: str) -> str: + return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9$% ]+", " ", (text or "").lower())).strip() + + +def _looks_generic_title(text: str) -> bool: + normalized = _normalized_title(text) + if not normalized: + return True + if any(pattern in normalized for pattern in _GENERIC_TITLE_PATTERNS): + return True + tokens = [token for token in normalized.split() if token] + bland_count = sum(token in _TITLE_BLAND_WORDS for token in tokens) + return bland_count >= 2 + + +def _tighten_overlay_title_text(text: str) -> str: + title = " ".join((text or "").replace("—", "-").split()).strip(" .,!?:;-") + if not title: + return "" + title = re.sub(r"\bwill cost less than\b", "under", title, flags=re.IGNORECASE) + title = re.sub(r"\bless than\b", "under", title, flags=re.IGNORECASE) + title = re.sub(r"\bmade your\b", "", title, flags=re.IGNORECASE) + title = re.sub(r"\bis still\b", "is", title, flags=re.IGNORECASE) + title = re.sub(r"\bis creating\b", "creates", title, flags=re.IGNORECASE) + title = re.sub(r"\bthere are\b", "", title, flags=re.IGNORECASE) + title = re.sub(r"\bentirely\b", "", title, flags=re.IGNORECASE) + words = title.split() + while len(words) > 6: + filtered = [word for word in words if word.lower() not in _TITLE_DROP_WORDS] + if len(filtered) == len(words): + break + words = filtered + if len(words) > 4: + words = [word for word in words if word.lower() not in {"your", "next"} or len(words) <= 4] + if len(words) > 6 and words[0].lower() in {"why", "how", "when"}: + words = words[1:] + if len(words) > 6: + words = words[:6] + return _headline_case_title(" ".join(words).strip(" .,!?:;-")) + + +def _polish_overlay_title(clip: Clip) -> str: + current = _tighten_overlay_title_text(clip.suggested_overlay_title or "") + if current and not _looks_generic_title(current): + return current + for candidate in (clip.viral_hook or "", clip.topic or ""): + polished = _tighten_overlay_title_text(candidate) + if polished and not _looks_generic_title(polished): + return polished + return current + + +def _polish_clip_metadata(clip: Clip) -> Clip: + title = _polish_overlay_title(clip) + if not title or title == clip.suggested_overlay_title: + return clip + return clip.model_copy(update={"suggested_overlay_title": title}) + + +def build_prompt( + transcript: dict, + *, + candidate_count: int = DEFAULT_CANDIDATE_COUNT, + steering_notes: list[str] | None = None, + hook_library_path: Path | None = None, +) -> tuple[str, str]: + """Return ``(system_prompt, user_message)`` for the clip-selector LLM call. + + ``candidate_count`` is the size of the candidate POOL we ask Gemini for. + A downstream ranker (``rank_and_filter_clips``) then keeps the top + clips that clear the quality threshold. Defaults preserve the previous + visible output (5 clips) when the pool is narrow.
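+
+    Typical call (sketch)::
+
+        system, user = build_prompt(transcript, candidate_count=12)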
+ """ + lines = [] + for seg in transcript.get("segments", []): + start = seg.get("start", 0) + end = seg.get("end", 0) + text = seg.get("text", "").strip() + lines.append(f"[{start:.1f}s - {end:.1f}s] {text}") + + transcript_text = "\n".join(lines) + + hook_examples = format_hook_examples( + retrieve_hook_examples( + transcript_text[:8000], + path=hook_library_path, + limit=8, + ) + ) + + system, user = clip_selection_prompts( + transcript_text=transcript_text, + min_dur=MIN_CLIP_DURATION_SEC, + max_dur=MAX_CLIP_DURATION_SEC, + count=candidate_count, + steering_notes=steering_notes, + hook_examples=hook_examples, + ) + return system, user + + +def rank_and_filter_clips( + clips: list[Clip], + *, + threshold: float = DEFAULT_QUALITY_THRESHOLD, + min_kept: int = DEFAULT_MIN_KEPT, + max_kept: int = DEFAULT_MAX_KEPT, +) -> list[Clip]: + """Rank ``clips`` by text composite (or legacy ``virality_score``) and apply + the threshold+floor+cap. + + Rules (in order, with clear precedence): + + 1. Sort descending by the text composite score when the Ticket 3 + three-axis ``score_breakdown`` is present; otherwise fall back to the + legacy ``virality_score``. + 2. Keep clips whose active score signal is ``>= threshold`` (or + ``needs_review`` cleared). Reviewed-out clips (``needs_review=True``) + are always sent to the back of the priority queue. + 3. If fewer than ``min_kept`` clips passed the threshold, fill up from + the remaining clips in rank order until we reach ``min_kept`` (or + run out of candidates). + 4. Cap the final list at ``max_kept`` entries. + 5. Renumber ``clip_id`` to ``001``, ``002``, ... so downstream artifacts + (keyframes, subtitles, output filenames) stay dense and ordered. + + This is the "threshold with a floor" policy the user asked for: quality + first, but never ship zero shorts when the transcript is weak. + """ + if not clips: + return [] + + score_signal = {id(c): _text_composite_score(c) for c in clips} + priority_signal = {id(c): clip_quality_priority_score(c) for c in clips} + + def _priority(c: Clip) -> tuple[float, float]: + return (priority_signal[id(c)], score_signal[id(c)]) + + valid: list[Clip] = [] + invalid: list[Clip] = [] + for clip in clips: + if _has_valid_duration(clip): + valid.append(clip) + else: + invalid.append(clip) + logger.warning( + "Clip %s dropped before ranking: duration %.1fs is outside [%ds, %ds] - %s", + clip.clip_id, + clip.duration_sec, + MIN_CLIP_DURATION_SEC, + MAX_CLIP_DURATION_SEC, + clip.topic, + ) + + if not valid: + logger.warning( + "Clip ranking: 0 valid candidates remain after duration filtering (dropped=%d).", + len(invalid), + ) + return [] + + ordered = sorted(valid, key=_priority, reverse=True) + + strong = [c for c in ordered if priority_signal[id(c)] >= threshold and not c.needs_review] + kept = list(strong) + + if len(kept) < min_kept: + backfill = [c for c in ordered if c not in kept] + for c in backfill: + if len(kept) >= min_kept: + break + kept.append(c) + + if len(kept) < min_kept: + logger.warning( + "Clip ranking: only %d valid candidates remain after duration filtering; " + "cannot satisfy min_kept=%d without invalid clips.", + len(kept), + min_kept, + ) + + if len(kept) > max_kept: + kept = kept[:max_kept] + + # Renumber clip_ids so consumers (filenames, layout vision, subtitles) + # always see 001..NNN in rank order regardless of what the LLM returned. 
+ renumbered = renumber_clips_dense(kept) + + dropped = len(valid) - len(kept) + len(invalid) + logger.info( + "Clip ranking: kept %d / %d candidates (threshold=%.2f, min=%d, max=%d, dropped=%d).", + len(renumbered), + len(clips), + threshold, + min_kept, + max_kept, + dropped, + ) + for c in renumbered: + logger.info( + " [%s] score=%.2f priority=%.2f penalty=%.2f %s %s", + c.clip_id, + c.virality_score, + clip_quality_priority_score(c), + clip_quality_penalty(c), + "(review)" if c.needs_review else "", + c.topic, + ) + return renumbered + + +def select_clips( + transcript: dict, + *, + gemini_model: str | None = None, + hook_library_path: Path | None = None, + candidate_count: int = DEFAULT_CANDIDATE_COUNT, + quality_threshold: float = DEFAULT_QUALITY_THRESHOLD, + min_kept: int = DEFAULT_MIN_KEPT, + max_kept: int = DEFAULT_MAX_KEPT, + temperature: float = DEFAULT_CANDIDATE_TEMPERATURE, + steering_notes: list[str] | None = None, +) -> tuple[list[Clip], str]: + """ + Call Gemini to select clips. Returns ``(clips, raw_json)`` for caching / debugging. + + The returned clip list has already been ranked + filtered by + :func:`rank_and_filter_clips`. ``raw_json`` is the untouched LLM + response so the cache artifact reflects the entire candidate pool for + audit / re-ranking without another LLM call. + + Uses ``google.genai.Client`` and ``GenerateContentConfig`` (see Google + Gen AI SDK for Python). + """ + provider = resolve_llm_provider() + model_name = model_name_for_provider((gemini_model or GEMINI_MODEL).strip(), provider) + system_prompt, user_text = build_prompt( + transcript, + candidate_count=candidate_count, + steering_notes=steering_notes, + hook_library_path=hook_library_path, + ) + + def _call() -> str: + logger.info( + "%s clip selection (model=%s, candidate_pool=%d, temp=%.2f)...", + provider, + model_name, + candidate_count, + temperature, + ) + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=model_name, + contents=user_text, + config=gemini_generate_config( + system_instruction=system_prompt, + temperature=temperature, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini returned empty response text") + return response.text + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_text}, + ], + temperature=temperature, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter returned empty response text") + return text + + raw = _retry_llm("Gemini clip selection", _call) + candidates = _parse_clips(raw) + # The ranker can only backfill from the pool Gemini returned. If Gemini + # under-delivered (e.g. returned 2 of a requested 12), the min_kept floor + # is unenforceable -- warn loudly so we do not silently ship fewer shorts + # than the caller expected. + if len(candidates) < min_kept: + logger.warning( + "Clip selection: Gemini returned only %d candidates (requested %d, floor %d). 
" + "Output will be capped at %d shorts -- check prompt or transcript length.", + len(candidates), + candidate_count, + min_kept, + len(candidates), + ) + elif len(candidates) < candidate_count: + logger.info( + "Clip selection: Gemini returned %d of %d requested candidates " + "(pool still >= floor of %d).", + len(candidates), + candidate_count, + min_kept, + ) + clips = rank_and_filter_clips( + candidates, + threshold=quality_threshold, + min_kept=min_kept, + max_kept=max_kept, + ) + return clips, raw + + +def _parse_clips(raw_json: str) -> list[Clip]: + """Parse and validate the LLM's JSON response into Clip objects.""" + data = json.loads(raw_json) + clips_data = data.get("clips", data) if isinstance(data, dict) else data + + clips: list[Clip] = [] + for item in clips_data: + payload = dict(item) + payload.pop("duration_sec", None) + clip = _polish_clip_metadata(Clip.model_validate(payload)) + + actual_dur = clip.end_time_sec - clip.start_time_sec + stated_dur = item.get("duration_sec") + if stated_dur is not None and abs(actual_dur - float(stated_dur)) > 1.0: + logger.warning( + "Clip %s: stated duration %.1fs doesn't match (%.1f-%.1f = %.1f).", + clip.clip_id, float(stated_dur), + clip.start_time_sec, clip.end_time_sec, actual_dur, + ) + clips.append(clip) + + logger.info("Parsed %d clips from LLM response", len(clips)) + return clips + + +def save_clips(clips: list[Clip], output_path: Path) -> Path: + """Persist clips to a JSON file using the shared Pydantic schema.""" + plan = ClipPlan(source_path="", clips=list(clips)) + with open(output_path, "w", encoding="utf-8") as f: + f.write(plan.model_dump_json(indent=2)) + logger.info("Saved %d clips to %s", len(clips), output_path) + return output_path + + +def load_clips(clips_path: Path) -> list[Clip]: + """Load clips from a previously saved JSON file.""" + with open(clips_path, "r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict) and "clips" in data: + return [Clip.model_validate(c) for c in data["clips"]] + return [Clip.model_validate(c) for c in data] diff --git a/src/humeo/config.py b/src/humeo/config.py new file mode 100644 index 0000000000000000000000000000000000000000..053e719db08b1f25ef1ea9a80093b68f2f44dcd3 --- /dev/null +++ b/src/humeo/config.py @@ -0,0 +1,159 @@ +"""Configuration for the product pipeline.""" + +import os +from dataclasses import dataclass, field +from pathlib import Path + +from humeo_core.schemas import RenderTheme + +from humeo.env import bootstrap_env + +bootstrap_env() + +# --------------------------------------------------------------------------- +# Video Output +# --------------------------------------------------------------------------- +TARGET_WIDTH = 1080 +TARGET_HEIGHT = 1920 +TARGET_ASPECT = 9 / 16 + +# --------------------------------------------------------------------------- +# Clip Selection +# --------------------------------------------------------------------------- +# Clip length bounds for Gemini (also referenced in prompts/clip_selection_system.jinja2). +MIN_CLIP_DURATION_SEC = 50 +MAX_CLIP_DURATION_SEC = 90 +TARGET_CLIP_COUNT = 5 +TEXT_AXIS_WEIGHTS: dict[str, float] = { + "message_wow": 0.4, + "hook_emotion": 0.35, + "catchy": 0.25, +} + +# Gemini model id (override with GEMINI_MODEL in .env or shell). See docs/ENVIRONMENT.md. +GEMINI_MODEL = (os.environ.get("GEMINI_MODEL") or "google/gemini-2.5-pro").strip() or "google/gemini-2.5-pro" +# Optional *only* when layout vision should use a different id than clip selection +# (e.g. cheaper model per keyframe). 
Empty/unset → ``resolved_vision_model`` uses +# ``GEMINI_MODEL`` / ``PipelineConfig.gemini_model`` (same multimodal stack). +GEMINI_VISION_MODEL = (os.environ.get("GEMINI_VISION_MODEL") or "").strip() or None +DEFAULT_SEGMENTATION_PROVIDER = ( + (os.environ.get("HUMEO_SEGMENTATION_PROVIDER") or "").strip().lower() + or ("replicate" if (os.environ.get("REPLICATE_API_TOKEN") or "").strip() else "off") +) + +# --------------------------------------------------------------------------- +@dataclass +class PipelineConfig: + """Runtime configuration for a single pipeline run.""" + + youtube_url: str | None = None + source: str | None = None + output_dir: Path = field(default_factory=lambda: Path("output")) + # None = auto: per-video dir under the cache root (see docs/ENVIRONMENT.md). + work_dir: Path | None = None + use_video_cache: bool = True + # None = default from env (HUMEO_CACHE_ROOT) or platform default. + cache_root: Path | None = None + + # None = use GEMINI_MODEL from env / module default (Gemini-only clip selection). + gemini_model: str | None = None + # None = GEMINI_VISION_MODEL env or same as gemini_model (per-keyframe layout + bbox). + gemini_vision_model: str | None = None + render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT + hook_library_path: Path | None = None + segmentation_provider: str = DEFAULT_SEGMENTATION_PROVIDER + segmentation_model: str = "meta/sam-2-video" + # When True, always re-run clip-selection LLM (ignore clips.meta.json match). + force_clip_selection: bool = False + # When True, always re-run Gemini vision for layouts (ignore layout_vision.meta.json). + force_layout_vision: bool = False + # When True, use an isolated work dir and force all stages to recompute. + clean_run: bool = False + # When True, render stage overwrites existing output files. + overwrite_outputs: bool = False + # When True, pause after clip selection and after render for human approval. + interactive: bool = False + # Interactive steering notes injected into the clip-selection prompt on reruns. + steering_notes: list[str] = field(default_factory=list) + # Hard cap on interactive reruns. + max_iterations: int = 5 + + # Stage 2.25 - hook detection. The clip selector is unreliable at + # localising the hook sentence and tends to echo the 0.0-3.0s placeholder + # from the prompt verbatim. This dedicated stage reads each candidate + # window and returns a real hook window per clip, which Stage 2.5 then + # uses to clamp pruning safely. When False, the clip-selection hook + # (possibly a placeholder) is carried through unchanged. + detect_hooks: bool = True + # When True, re-run the hook-detection LLM even when hooks.meta.json matches. + force_hook_detection: bool = False + + # Stage 2.5 - inner-clip content pruning (HIVE "irrelevant content pruning" + # applied at clip scale). One of: off | conservative | balanced | aggressive. + # See ``src/humeo/content_pruning.py`` for the caps and the prompt. + prune_level: str = "balanced" + # When True, re-run the pruning LLM even when prune.meta.json matches. + force_content_pruning: bool = False + + # Stage 2 - candidate over-generation. The selector now asks Gemini for a + # pool of candidates (``clip_selection_candidate_count``), scores them, + # and keeps the top ones that pass ``clip_selection_quality_threshold``. + # We always keep at least ``clip_selection_min_kept`` clips even when + # none pass the threshold, so rendering never blocks on a weak transcript. + # See ``src/humeo/clip_selector.py`` for the ranking logic.
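+    # Per-run override sketch (values illustrative):
+    #
+    #   PipelineConfig(source="...", clip_selection_candidate_count=16,
+    #                  clip_selection_quality_threshold=0.75)
+    #
+    # widens the pool while raising the bar; the floor/cap below still apply.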
+ clip_selection_candidate_count: int = 12 + clip_selection_quality_threshold: float = 0.70 + clip_selection_min_kept: int = 5 + clip_selection_max_kept: int = 8 + + # Subtitle rendering / cue shaping. + # Values are in **output pixels** for a 1080x1920 short: libass is pinned to + # the output resolution via ``original_size``, so ``FontSize`` and ``MarginV`` + # mean what they say. 48px font with a 160px bottom margin lands the caption + # in the lower third with a readable-but-not-shouting size. + subtitle_font_size: int = 38 + subtitle_margin_v: int = 166 + subtitle_max_words_per_cue: int = 10 + subtitle_max_cue_sec: float = 2.8 + burn_subtitles: bool = True + subtitle_highlight_lead_sec: float = 0.06 + subtitle_highlight_min_dwell_sec: float = 0.16 + repair_subtitle_word_timings: bool = True + + # Render QA. Best-effort: failures write warnings and do not fail a render. + render_qa: bool = True + qa_reference_video: Path | None = None + qa_debug_overlay: bool = True + rerender_clip_ids: list[str] = field(default_factory=list) + rerender_warned_only: bool = False + + def __post_init__(self): + youtube_url = (self.youtube_url or "").strip() or None + source = (self.source or "").strip() or None + + if source is None and youtube_url is None: + raise ValueError("PipelineConfig requires either source or youtube_url.") + if source is not None and youtube_url is not None and source != youtube_url: + raise ValueError("PipelineConfig source and youtube_url must match when both are set.") + if source is None: + source = youtube_url + if youtube_url is None: + youtube_url = source + + self.source = source + self.youtube_url = youtube_url + if isinstance(self.render_theme, str): + self.render_theme = RenderTheme(self.render_theme) + self.segmentation_provider = (self.segmentation_provider or "off").strip().lower() + self.output_dir = Path(self.output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + if self.cache_root is not None: + self.cache_root = Path(self.cache_root) + if self.work_dir is not None: + self.work_dir = Path(self.work_dir) + self.work_dir.mkdir(parents=True, exist_ok=True) + if self.hook_library_path is not None: + self.hook_library_path = Path(self.hook_library_path) + if self.qa_reference_video is not None: + self.qa_reference_video = Path(self.qa_reference_video) + self.rerender_clip_ids = [str(clip_id).strip() for clip_id in self.rerender_clip_ids if str(clip_id).strip()] diff --git a/src/humeo/content_pruning.py b/src/humeo/content_pruning.py new file mode 100644 index 0000000000000000000000000000000000000000..72d06c3d8628b57a61f33dec0c109f7977d8ed7e --- /dev/null +++ b/src/humeo/content_pruning.py @@ -0,0 +1,1144 @@ +"""Stage 2.5 - Content pruning inside each selected clip. + +This is the HIVE "irrelevant content pruning" sub-task, applied at the +*inner-clip* scale rather than the scene scale. After the clip selector has +chosen 5 x 50-90s windows, we ask Gemini to tighten each window by dropping +weak lead-in (throat-clears, false starts, slow setup) and weak tail content +(trailing ramble, fade-out talk). + +Design choices kept deliberately minimal: + +- **No schema changes.** The existing ``Clip.trim_start_sec`` / + ``Clip.trim_end_sec`` fields already feed ``humeo.render_window`` and + ``humeo_core.primitives.compile`` via ``-ss`` / ``-t``. Writing the pruned + in / out points into those fields tightens the exported window for free. +- **Contiguous trimming only** (V1). We move the in-point forward and the + out-point backward; we do not cut in the middle. 
That keeps subtitles and + layout vision untouched. +- **Strict clamping** after the LLM returns, so the final duration always + respects ``MIN_CLIP_DURATION_SEC`` and any declared hook window is + preserved. +- **Never fatal.** Any failure (API error, malformed JSON, missing clip_id) + degrades to no-op trims (0.0 / 0.0) for that clip. The pipeline still + produces output identical to the pre-Stage-2.5 behaviour. +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Literal, TypeVar + +from google import genai +from openai import OpenAI +from pydantic import BaseModel, Field, ValidationError + +from humeo_core.schemas import Clip + +from humeo.config import ( + GEMINI_MODEL, + MAX_CLIP_DURATION_SEC, + MIN_CLIP_DURATION_SEC, + PipelineConfig, +) +from humeo.env import ( + OPENROUTER_BASE_URL, + current_llm_provider, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.gemini_generate import gemini_generate_config +from humeo.prompt_loader import content_pruning_system_prompt + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + +PRUNE_META_VERSION = 1 +PRUNE_META_FILENAME = "prune.meta.json" +PRUNE_RAW_FILENAME = "prune_raw.json" +PRUNE_ARTIFACT_FILENAME = "prune.json" + +LLM_MAX_ATTEMPTS = 3 +LLM_RETRY_DELAY_SEC = 2.0 + +PruneLevel = Literal["off", "conservative", "balanced", "aggressive"] + +VALID_LEVELS: tuple[PruneLevel, ...] = ("off", "conservative", "balanced", "aggressive") + + +def _openai_message_text(content: object) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + +# The clip-selection prompt uses `[0.0, 3.0]` as an example / fallback hook +# window. Gemini frequently copies this placeholder verbatim instead of +# localising the real hook, which silently disables Stage 2.5 start-trims for +# every clip (the hook clamp below refuses to trim past `hook_start_sec`, so +# any `trim_start_sec > 0` returned by the prune LLM gets zeroed). +# +# Treat this exact fingerprint as "no real hook" for clamp purposes. The real +# fix is the Stage 2.25 hook detector (``humeo.hook_detector``) which +# overwrites the clip's hook fields with a localised window before pruning +# runs. This constant is the belt-and-suspenders guard for the case where +# hook detection is disabled, fails, or cache-hits stale data. +_DEFAULT_HOOK_FINGERPRINT: tuple[float, float] = (0.0, 3.0) +_DEFAULT_HOOK_EPS: float = 1e-3 + + +def _looks_like_default_hook(hook_start: float | None, hook_end: float | None) -> bool: + """True when the hook window matches the prompt's 0-3s placeholder. + + This is intentionally a narrow, exact-match check so a real hook that + happens to open at t=0 with a 3.0s window is still respected. + """ + if hook_start is None or hook_end is None: + return False + return ( + abs(hook_start - _DEFAULT_HOOK_FINGERPRINT[0]) < _DEFAULT_HOOK_EPS + and abs(hook_end - _DEFAULT_HOOK_FINGERPRINT[1]) < _DEFAULT_HOOK_EPS + ) + +# Per-level cap on the fraction of the original clip the LLM is allowed to +# trim. Even if the LLM tries to be more eager, we clamp. 
Final duration is +# additionally clamped to ``MIN_CLIP_DURATION_SEC``. +_MAX_TOTAL_TRIM_PCT: dict[PruneLevel, float] = { + "off": 0.0, + "conservative": 0.10, + "balanced": 0.20, + "aggressive": 0.35, +} + + +class _PruneDecision(BaseModel): + """Per-clip decision returned by Gemini (clip-relative seconds).""" + + clip_id: str + trim_start_sec: float = Field(default=0.0, ge=0.0) + trim_end_sec: float = Field(default=0.0, ge=0.0) + reason: str = "" + + +class _PruneResponse(BaseModel): + decisions: list[_PruneDecision] = Field(default_factory=list) + + +@dataclass +class _ClampStats: + """Diagnostics for why a returned trim got reshaped.""" + + clamped_start: bool = False + clamped_end: bool = False + hook_protected: bool = False + min_duration_protected: bool = False + max_pct_protected: bool = False + + +def _retry_llm(name: str, fn: Callable[[], T], attempts: int = LLM_MAX_ATTEMPTS) -> T: + last: Exception | None = None + for i in range(attempts): + try: + return fn() + except Exception as e: + last = e + if i < attempts - 1: + logger.warning("%s attempt %d/%d failed: %s", name, i + 1, attempts, e) + time.sleep(LLM_RETRY_DELAY_SEC * (i + 1)) + assert last is not None + raise last + + +# --------------------------------------------------------------------------- +# Clamping +# --------------------------------------------------------------------------- + + +def _clamp_decision( + clip: Clip, + trim_start: float, + trim_end: float, + *, + level: PruneLevel, +) -> tuple[float, float, _ClampStats]: + """Clamp a raw (trim_start, trim_end) pair so the resulting clip is legal. + + Guarantees: + - ``trim_start`` and ``trim_end`` are non-negative. + - Final duration (``clip.duration_sec - trim_start - trim_end``) is at + least ``MIN_CLIP_DURATION_SEC`` (or the original duration, whichever is + smaller - we never *extend* a clip that was already too short). + - Combined trim does not exceed the level's allowed fraction of the + original duration. + - If ``hook_start_sec`` / ``hook_end_sec`` are set on the clip, the hook + window stays fully inside the result. + """ + stats = _ClampStats() + duration = clip.duration_sec + + ts = max(0.0, float(trim_start)) + te = max(0.0, float(trim_end)) + if ts != trim_start: + stats.clamped_start = True + if te != trim_end: + stats.clamped_end = True + + max_pct = _MAX_TOTAL_TRIM_PCT.get(level, 0.0) + max_total_trim = duration * max_pct + if ts + te > max_total_trim: + scale = max_total_trim / max(ts + te, 1e-9) + ts = ts * scale + te = te * scale + stats.max_pct_protected = True + + # Only protect the hook when the clip carries a *real* localised hook + # window. The clip-selection LLM frequently echoes the prompt's + # 0.0-3.0s placeholder, which would otherwise lock ``trim_start`` to 0 + # for every clip and silently disable the entire pruning stage. See + # ``_looks_like_default_hook`` for the fingerprint rationale. 
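+    #
+    # Worked example (illustrative numbers): with a real hook at 12.0-16.5s
+    # inside a 70s clip, ``trim_start`` is capped at 11.75s and ``trim_end``
+    # at 70 - 16.75 = 53.25s, so the hook plus a 0.25s pad always survives.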
+ hook_is_real = ( + clip.hook_start_sec is not None + and clip.hook_end_sec is not None + and not _looks_like_default_hook(clip.hook_start_sec, clip.hook_end_sec) + ) + if hook_is_real: + hook_lo = clip.hook_start_sec # type: ignore[assignment] + hook_hi = clip.hook_end_sec # type: ignore[assignment] + if ts > max(0.0, hook_lo - 0.25): + ts = max(0.0, hook_lo - 0.25) + stats.hook_protected = True + if te > max(0.0, duration - hook_hi - 0.25): + te = max(0.0, duration - hook_hi - 0.25) + stats.hook_protected = True + + min_final = min(float(MIN_CLIP_DURATION_SEC), duration) + max_total_by_min = max(0.0, duration - min_final) + if ts + te > max_total_by_min: + overflow = ts + te - max_total_by_min + te_cut = min(te, overflow) + te -= te_cut + overflow -= te_cut + if overflow > 0: + ts = max(0.0, ts - overflow) + stats.min_duration_protected = True + + ts = max(0.0, min(ts, duration)) + te = max(0.0, min(te, duration - ts)) + return ts, te, stats + + +# Tolerance used when snapping trim boundaries to WhisperX segment edges. A +# 3s window comfortably covers "finish the current sentence" cases without +# materially deviating from what the LLM asked for. Tuned on the reported +# mid-sentence cut in clip 001 of the ``PdVv_vLkUgk`` run (6.38s trim vs a +# sentence that ended ~1.5s later). +_SEGMENT_SNAP_TOLERANCE_SEC: float = 3.0 +_BOUNDARY_GAP_SEC: float = 0.5 +_BOUNDARY_TIME_EPS_SEC: float = 0.12 +_START_BOUNDARY_WINDOW_SEC: float = 3.0 +_END_BOUNDARY_WINDOW_SEC: float = 2.0 +_TERMINAL_PUNCT: tuple[str, ...] = (".", "?", "!") +_WEAK_START_WORDS: frozenset[str] = frozenset({"and", "but", "so", "or", "then", "because"}) + + +@dataclass(frozen=True) +class _BoundaryCandidate: + """A possible snapped boundary on the source timeline.""" + + time_sec: float + clean: bool + reason: str + source: str + weak_start: bool = False + + +def _snap_trims_to_segment_boundaries( + clip: Clip, + transcript: dict, + *, + level: PruneLevel, + tolerance_sec: float = _SEGMENT_SNAP_TOLERANCE_SEC, +) -> tuple[float, float]: + """Snap an already-clamped ``(trim_start, trim_end)`` to phrase boundaries. + + WhisperX segments correspond to natural phrase / sentence groupings. + Landing cuts on segment edges eliminates the "this could be..." class of + mid-sentence truncation, even when the LLM rounds to an arbitrary + syllable. + + Direction preference: + + - ``trim_start``: prefer the nearest segment START at-or-after the + current in-point (trim a hair more to drop lead-in filler). Fallback + is the nearest segment start behind, within tolerance. + - ``trim_end``: prefer the nearest segment END at-or-after the current + out-point (let the sentence finish, keeping MORE content). Fallback + is the nearest segment end before, within tolerance. + + Safety: the snapped pair is reverted if it would violate + ``MIN_CLIP_DURATION_SEC``, exceed the level's ``max_pct`` trim cap, or + eat into a real (non-placeholder) hook window. Snapping can only + *improve* a decision, never break it. 
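+
+    Sketch of the end-trim path (illustrative numbers): if the LLM asks for a
+    6.4s end-trim that lands mid-sentence and the enclosing segment ends
+    ~1.5s past the proposed out-point, the out-point snaps forward to that
+    segment end and the applied end-trim shrinks to ~4.9s.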
+ """ + ts0 = float(clip.trim_start_sec) + te0 = float(clip.trim_end_sec) + if ts0 < 0.05 and te0 < 0.05: + return ts0, te0 + + segs = _segments_within_clip(transcript, clip) + if not segs: + return ts0, te0 + + duration = clip.duration_sec + seg_starts = [float(s["start"]) for s in segs] + seg_ends = [float(s["end"]) for s in segs] + + new_ts = ts0 + if ts0 >= 0.05: + forward = [s for s in seg_starts if s >= ts0 and (s - ts0) <= tolerance_sec] + backward = [s for s in seg_starts if s < ts0 and (ts0 - s) <= tolerance_sec] + if forward: + new_ts = min(forward) + elif backward: + new_ts = max(backward) + + new_te = te0 + if te0 >= 0.05: + out0 = duration - te0 + forward = [e for e in seg_ends if e >= out0 and (e - out0) <= tolerance_sec] + backward = [e for e in seg_ends if e < out0 and (out0 - e) <= tolerance_sec] + if forward: + new_out = min(forward) + elif backward: + new_out = max(backward) + else: + new_out = out0 + new_te = max(0.0, duration - new_out) + + new_ts = max(0.0, min(new_ts, duration)) + new_te = max(0.0, min(new_te, duration - new_ts)) + + min_final = min(float(MIN_CLIP_DURATION_SEC), duration) + if duration - new_ts - new_te < min_final - 1e-6: + return ts0, te0 + + max_pct = _MAX_TOTAL_TRIM_PCT.get(level, 0.0) + if max_pct > 0.0 and (new_ts + new_te) > duration * max_pct + 1e-6: + return ts0, te0 + + if ( + clip.hook_start_sec is not None + and clip.hook_end_sec is not None + and not _looks_like_default_hook(clip.hook_start_sec, clip.hook_end_sec) + ): + hook_lo = float(clip.hook_start_sec) + hook_hi = float(clip.hook_end_sec) + if new_ts > max(0.0, hook_lo - 0.25) + 1e-6: + return ts0, te0 + if duration - new_te < hook_hi + 0.25 - 1e-6: + return ts0, te0 + + return new_ts, new_te + + +def _flatten_transcript_words(transcript: dict) -> list[dict[str, float | str]]: + words: list[dict[str, float | str]] = [] + for seg in transcript.get("segments", []): + for word in seg.get("words", []): + if "start" not in word or "end" not in word: + continue + try: + start = float(word["start"]) + end = float(word["end"]) + except (TypeError, ValueError): + continue + words.append( + { + "word": str(word.get("word", "")), + "start": start, + "end": end, + } + ) + return words + + +def _normalized_last_char(text: str) -> str: + stripped = text.rstrip() + return stripped[-1] if stripped else "" + + +def _segment_start_hint( + segments: list[dict[str, Any]], + words: list[dict[str, float | str]], + time_sec: float, +) -> tuple[bool, str, bool]: + for idx, seg in enumerate(segments): + seg_start = float(seg.get("start", 0.0)) + if abs(seg_start - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + seg_words = seg.get("words") or [] + first_word = "" + if seg_words: + first_word = str(seg_words[0].get("word", "")).strip().lower() + weak_start = first_word in _WEAK_START_WORDS + if idx == 0: + return True, "first transcript segment", weak_start + prev_text = str(segments[idx - 1].get("text", "")).rstrip() + if _normalized_last_char(prev_text) in _TERMINAL_PUNCT: + return True, "previous segment ends with terminal punctuation", weak_start + break + + for idx, word in enumerate(words): + start = float(word["start"]) + if abs(start - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + weak_start = str(word["word"]).strip().lower() in _WEAK_START_WORDS + if idx == 0: + return True, "first transcript word", weak_start + gap_before = start - float(words[idx - 1]["end"]) + if gap_before >= _BOUNDARY_GAP_SEC: + return True, f"silence gap before boundary ({gap_before:.2f}s)", weak_start + return False, "no 
terminal punctuation or >=0.5s silence before boundary", weak_start + + return False, "no matching transcript boundary", False + + +def _segment_end_hint( + segments: list[dict[str, Any]], + words: list[dict[str, float | str]], + time_sec: float, +) -> tuple[bool, str]: + for seg in segments: + seg_end = float(seg.get("end", 0.0)) + if abs(seg_end - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + text = str(seg.get("text", "")).rstrip() + if _normalized_last_char(text) in _TERMINAL_PUNCT: + return True, "segment ends with terminal punctuation" + break + + for idx, word in enumerate(words): + end = float(word["end"]) + if abs(end - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + if idx == len(words) - 1: + return True, "last transcript word" + gap_after = float(words[idx + 1]["start"]) - end + if gap_after >= _BOUNDARY_GAP_SEC: + return True, f"silence gap after boundary ({gap_after:.2f}s)" + return False, "no terminal punctuation or >=0.5s silence after boundary" + + return False, "no matching transcript boundary" + + +def _candidate_key(time_sec: float) -> float: + return round(time_sec, 3) + + +def _gather_start_candidates( + clip: Clip, + current_start: float, + transcript: dict, +) -> list[_BoundaryCandidate]: + low = current_start - _START_BOUNDARY_WINDOW_SEC + high = current_start + _START_BOUNDARY_WINDOW_SEC + segments = list(transcript.get("segments", [])) + words = _flatten_transcript_words(transcript) + + by_time: dict[float, _BoundaryCandidate] = {} + + def add_candidate(time_sec: float, source: str) -> None: + clean, reason, weak = _segment_start_hint(segments, words, time_sec) + candidate = _BoundaryCandidate( + time_sec=float(time_sec), + clean=clean, + reason=reason, + source=source, + weak_start=weak, + ) + key = _candidate_key(candidate.time_sec) + existing = by_time.get(key) + if existing is None: + by_time[key] = candidate + return + if candidate.clean and not existing.clean: + by_time[key] = candidate + return + if candidate.clean == existing.clean and not candidate.weak_start and existing.weak_start: + by_time[key] = candidate + + add_candidate(current_start, "current") + add_candidate(clip.start_time_sec, "raw") + + for seg in segments: + seg_start = float(seg.get("start", 0.0)) + if low <= seg_start <= high: + add_candidate(seg_start, "segment") + for word in words: + word_start = float(word["start"]) + if low <= word_start <= high: + add_candidate(word_start, "word") + + return list(by_time.values()) + + +def _gather_end_candidates( + clip: Clip, + current_end: float, + transcript: dict, +) -> list[_BoundaryCandidate]: + low = current_end - _END_BOUNDARY_WINDOW_SEC + high = current_end + _END_BOUNDARY_WINDOW_SEC + segments = list(transcript.get("segments", [])) + words = _flatten_transcript_words(transcript) + + by_time: dict[float, _BoundaryCandidate] = {} + + def add_candidate(time_sec: float, source: str) -> None: + clean, reason = _segment_end_hint(segments, words, time_sec) + candidate = _BoundaryCandidate( + time_sec=float(time_sec), + clean=clean, + reason=reason, + source=source, + ) + key = _candidate_key(candidate.time_sec) + existing = by_time.get(key) + if existing is None or (candidate.clean and not existing.clean): + by_time[key] = candidate + + add_candidate(current_end, "current") + add_candidate(clip.end_time_sec, "raw") + + for seg in segments: + seg_end = float(seg.get("end", 0.0)) + if low <= seg_end <= high: + add_candidate(seg_end, "segment") + for word in words: + word_end = float(word["end"]) + if low <= word_end <= high: + 
add_candidate(word_end, "word") + + return list(by_time.values()) + + +def _candidate_priority(current_time: float, candidate: _BoundaryCandidate) -> tuple[int, int, int, float]: + source_rank = {"current": 0, "raw": 1, "segment": 2, "word": 3}.get(candidate.source, 9) + weak_rank = 1 if candidate.weak_start else 0 + clean_rank = 0 if candidate.clean else 1 + return (clean_rank, weak_rank, source_rank, abs(candidate.time_sec - current_time)) + + +def _pair_priority( + current_start: float, + current_end: float, + start_candidate: _BoundaryCandidate, + end_candidate: _BoundaryCandidate, +) -> tuple[int, int, int, float]: + good_start = start_candidate.clean and not start_candidate.weak_start + good_end = end_candidate.clean + return ( + -(int(good_start) + int(good_end)), + 1 if start_candidate.weak_start else 0, + 0 if (good_start or good_end) else 1, + abs(start_candidate.time_sec - current_start) + abs(end_candidate.time_sec - current_end), + ) + + +def snap_render_windows_to_sentence_boundaries( + clips: list[Clip], + transcript: dict, +) -> list[Clip]: + """Snap render windows to nearby complete-thought boundaries. + + This runs after Stage 2.5 pruning and operates on the *actual* render + window (`start + trim_start`, `end - trim_end`). Unlike trim snapping, it + can undo a harmful trim or move slightly beyond the original selected + window, as long as the final duration still satisfies the hard + `[MIN_CLIP_DURATION_SEC, MAX_CLIP_DURATION_SEC]` contract. + """ + if not transcript.get("segments"): + return clips + + snapped: list[Clip] = [] + for clip in clips: + current_start = clip.start_time_sec + clip.trim_start_sec + current_end = clip.end_time_sec - clip.trim_end_sec + start_candidates = sorted( + _gather_start_candidates(clip, current_start, transcript), + key=lambda c: _candidate_priority(current_start, c), + ) + end_candidates = sorted( + _gather_end_candidates(clip, current_end, transcript), + key=lambda c: _candidate_priority(current_end, c), + ) + + current_start_candidate = next(c for c in start_candidates if c.source == "current") + current_end_candidate = next(c for c in end_candidates if c.source == "current") + current_pair = (current_start_candidate, current_end_candidate) + best_pair = current_pair + best_priority = _pair_priority( + current_start, + current_end, + current_start_candidate, + current_end_candidate, + ) + + for start_candidate in start_candidates: + for end_candidate in end_candidates: + if end_candidate.time_sec <= start_candidate.time_sec: + continue + duration = end_candidate.time_sec - start_candidate.time_sec + if duration < MIN_CLIP_DURATION_SEC or duration > MAX_CLIP_DURATION_SEC: + continue + priority = _pair_priority( + current_start, + current_end, + start_candidate, + end_candidate, + ) + if priority < best_priority: + best_pair = (start_candidate, end_candidate) + best_priority = priority + + start_candidate, end_candidate = best_pair + start_improved = best_pair[0] is not current_pair[0] + end_improved = best_pair[1] is not current_pair[1] + if start_improved or end_improved: + logger.info( + "Clip %s: render window snapped %.2f-%.2f -> %.2f-%.2f " + "(start=%s; end=%s).", + clip.clip_id, + current_start, + current_end, + start_candidate.time_sec, + end_candidate.time_sec, + start_candidate.reason, + end_candidate.reason, + ) + snapped.append( + clip.model_copy( + update={ + "start_time_sec": start_candidate.time_sec, + "end_time_sec": end_candidate.time_sec, + "trim_start_sec": 0.0, + "trim_end_sec": 0.0, + "hook_start_sec": None, + 
"hook_end_sec": None, + } + ) + ) + continue + + warnings: list[str] = [] + if not current_start_candidate.clean or current_start_candidate.weak_start: + warnings.append(f"start@{current_start:.2f}s") + if not current_end_candidate.clean: + warnings.append(f"end@{current_end:.2f}s") + if warnings: + logger.warning( + "Clip %s: no valid clean sentence boundary found for %s; leaving render window unchanged.", + clip.clip_id, + ", ".join(warnings), + ) + snapped.append(clip) + + return snapped + + +def apply_prune_decisions( + clips: list[Clip], + decisions: list[_PruneDecision], + *, + level: PruneLevel, + transcript: dict | None = None, +) -> list[Clip]: + """Return new clips with trim_start / trim_end set from LLM decisions. + + Clips whose ``clip_id`` is missing from ``decisions`` are returned with + trims of 0 / 0 (no-op). Decisions are always clamped; no exception is + raised if the model returned invalid numbers. + + When ``transcript`` is provided, each clamped trim pair is additionally + snapped to the nearest WhisperX segment boundary (see + :func:`_snap_trims_to_segment_boundaries`) so cuts never land + mid-sentence. The clamp is authoritative -- snapping only ever produces + an equally-safe boundary, never a looser one. + """ + by_id = {d.clip_id: d for d in decisions} + out: list[Clip] = [] + for clip in clips: + d = by_id.get(clip.clip_id) + if d is None or level == "off": + out.append(clip.model_copy(update={"trim_start_sec": 0.0, "trim_end_sec": 0.0})) + continue + ts, te, stats = _clamp_decision( + clip, d.trim_start_sec, d.trim_end_sec, level=level + ) + # Surface every non-trivial clamp so silent degradations (e.g. a + # fake hook nuking every trim) are visible in INFO logs, not just + # buried in ``prune_raw.json``. + requested = d.trim_start_sec + d.trim_end_sec + applied = ts + te + reshaped = ( + stats.hook_protected + or stats.min_duration_protected + or stats.max_pct_protected + or (requested > 0.0 and abs(applied - requested) > 0.05) + ) + if reshaped: + logger.info( + "Clip %s: prune decision clamped (hook=%s min=%s cap=%s) " + "requested %.2f/%.2f -> applied %.2f/%.2f", + clip.clip_id, + stats.hook_protected, + stats.min_duration_protected, + stats.max_pct_protected, + d.trim_start_sec, + d.trim_end_sec, + ts, + te, + ) + candidate = clip.model_copy(update={"trim_start_sec": ts, "trim_end_sec": te}) + if transcript is not None: + snapped_ts, snapped_te = _snap_trims_to_segment_boundaries( + candidate, transcript, level=level + ) + if abs(snapped_ts - ts) > 1e-3 or abs(snapped_te - te) > 1e-3: + logger.info( + "Clip %s: prune boundaries snapped to segment edges " + "%.2f/%.2f -> %.2f/%.2f", + clip.clip_id, + ts, + te, + snapped_ts, + snapped_te, + ) + candidate = candidate.model_copy( + update={"trim_start_sec": snapped_ts, "trim_end_sec": snapped_te} + ) + out.append(candidate) + return out + + +# --------------------------------------------------------------------------- +# Prompt construction +# --------------------------------------------------------------------------- + + +def _segments_within_clip(transcript: dict, clip: Clip) -> list[dict]: + """Return transcript segments that overlap the clip window, with times + expressed as seconds relative to the clip start. 
+ """ + s0 = clip.start_time_sec + s1 = clip.end_time_sec + lines: list[dict] = [] + for seg in transcript.get("segments", []): + start = float(seg.get("start", 0.0)) + end = float(seg.get("end", start)) + if end <= s0 or start >= s1: + continue + rel_start = max(0.0, start - s0) + rel_end = min(clip.duration_sec, end - s0) + if rel_end <= rel_start: + continue + lines.append( + { + "start": rel_start, + "end": rel_end, + "text": (seg.get("text") or "").strip(), + } + ) + return lines + + +def _build_user_message(clips: list[Clip], transcript: dict) -> str: + """Render a compact textual view of every clip for the LLM user turn.""" + blocks: list[str] = [] + for clip in clips: + seg_lines = _segments_within_clip(transcript, clip) + header = ( + f"clip_id: {clip.clip_id}\n" + f"duration_sec: {clip.duration_sec:.2f}\n" + f"topic: {clip.topic}" + ) + if clip.hook_start_sec is not None and clip.hook_end_sec is not None: + header += ( + f"\nhook_window_sec: [{clip.hook_start_sec:.2f}, {clip.hook_end_sec:.2f}]" + ) + body = "\n".join( + f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}" for seg in seg_lines + ) + if not body: + body = "(no segments overlap this clip window)" + blocks.append(f"{header}\n---\n{body}") + return "\n\n===\n\n".join(blocks) + + +# --------------------------------------------------------------------------- +# Cache +# --------------------------------------------------------------------------- + + +def _clips_fingerprint(clips: list[Clip]) -> str: + """Fingerprint the clip *windows* (not trims, so the cache ignores previous + prune results when deciding whether to re-ask the LLM). + """ + payload = json.dumps( + [ + { + "id": c.clip_id, + "s": round(c.start_time_sec, 3), + "e": round(c.end_time_sec, 3), + "hs": c.hook_start_sec, + "he": c.hook_end_sec, + } + for c in clips + ], + sort_keys=True, + ensure_ascii=False, + ) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def _resolved_gemini_model(config: PipelineConfig) -> str: + return (config.gemini_model or GEMINI_MODEL).strip() + + +def _prune_meta( + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, + level: PruneLevel, +) -> dict[str, Any]: + return { + "version": PRUNE_META_VERSION, + "transcript_sha256": transcript_fp, + "clips_sha256": clips_fp, + "gemini_model": _resolved_gemini_model(config), + "prune_level": level, + "llm_backend": current_llm_provider() or "google", + } + + +def _load_cached_clips(work_dir: Path, clips: list[Clip]) -> list[Clip] | None: + artifact = work_dir / PRUNE_ARTIFACT_FILENAME + if not artifact.is_file(): + return None + try: + with open(artifact, "r", encoding="utf-8") as f: + data = json.load(f) + cached = {item["clip_id"]: item for item in data.get("clips", [])} + except Exception as e: + logger.warning("Prune cache artifact unreadable (%s); re-running.", e) + return None + out: list[Clip] = [] + for clip in clips: + cached_c = cached.get(clip.clip_id) + if cached_c is None: + return None + out.append( + clip.model_copy( + update={ + "trim_start_sec": float(cached_c.get("trim_start_sec", 0.0)), + "trim_end_sec": float(cached_c.get("trim_end_sec", 0.0)), + } + ) + ) + return out + + +def _write_cache( + work_dir: Path, + *, + pruned: list[Clip], + meta: dict[str, Any], + raw_response: str, +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + payload = { + "clips": [ + { + "clip_id": c.clip_id, + "trim_start_sec": c.trim_start_sec, + "trim_end_sec": c.trim_end_sec, + } + for c in pruned + ] + } + (work_dir / 
PRUNE_ARTIFACT_FILENAME).write_text( + json.dumps(payload, indent=2), encoding="utf-8" + ) + (work_dir / PRUNE_RAW_FILENAME).write_text(raw_response, encoding="utf-8") + with open(work_dir / PRUNE_META_FILENAME, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + f.write("\n") + logger.info( + "Wrote %s, %s and %s", + PRUNE_META_FILENAME, + PRUNE_ARTIFACT_FILENAME, + PRUNE_RAW_FILENAME, + ) + + +def _prune_cache_valid( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, + level: PruneLevel, +) -> bool: + meta_path = work_dir / PRUNE_META_FILENAME + if not meta_path.is_file(): + return False + try: + with open(meta_path, encoding="utf-8") as f: + meta = json.load(f) + except Exception: + return False + if meta.get("version") != PRUNE_META_VERSION: + return False + if meta.get("transcript_sha256") != transcript_fp: + return False + if meta.get("clips_sha256") != clips_fp: + return False + current_provider = current_llm_provider() + meta_provider = meta.get("llm_backend") + if current_provider == "openrouter": + if meta_provider != "openrouter": + return False + elif current_provider == "google": + if meta_provider not in (None, "google"): + return False + if meta.get("gemini_model") != _resolved_gemini_model(config): + return False + if meta.get("prune_level") != level: + return False + return True + + +# --------------------------------------------------------------------------- +# Gemini call +# --------------------------------------------------------------------------- + + +def _parse_decisions(raw_json: str) -> list[_PruneDecision]: + """Parse a raw JSON response into decisions; bare arrays accepted too.""" + data = json.loads(raw_json) + if isinstance(data, dict) and "decisions" in data: + try: + return _PruneResponse.model_validate(data).decisions + except ValidationError as e: + logger.warning("Prune response failed validation: %s", e) + return [] + if isinstance(data, list): + decisions: list[_PruneDecision] = [] + for item in data: + try: + decisions.append(_PruneDecision.model_validate(item)) + except ValidationError: + continue + return decisions + return [] + + +def request_prune_decisions( + clips: list[Clip], + transcript: dict, + *, + level: PruneLevel, + gemini_model: str | None = None, +) -> tuple[list[_PruneDecision], str]: + """Call Gemini for (potentially) one decision per clip. + + Returns ``(decisions, raw_response)``. ``raw_response`` is the literal + string Gemini returned (cached to ``prune_raw.json`` for audit). On + transport or parse failure this raises; callers should catch and treat as + no-op. 
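+
+    Responses are parsed as ``_PruneResponse`` JSON (a bare array of decision
+    objects is also tolerated by ``_parse_decisions``); an illustrative
+    payload::
+
+        {"decisions": [{"clip_id": "001", "trim_start_sec": 4.2,
+                        "trim_end_sec": 1.5, "reason": "slow setup"}]}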
+ """ + if level == "off" or not clips: + return [], "{\"decisions\": []}" + + system = content_pruning_system_prompt( + min_dur=MIN_CLIP_DURATION_SEC, + max_dur=MAX_CLIP_DURATION_SEC, + level=level, + ) + user_text = _build_user_message(clips, transcript) + + provider = resolve_llm_provider() + model_name = model_name_for_provider((gemini_model or GEMINI_MODEL).strip(), provider) + + def _call() -> str: + logger.info( + "%s content pruning (model=%s, level=%s, clips=%d)...", + provider, + model_name, + level, + len(clips), + ) + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=model_name, + contents=user_text, + config=gemini_generate_config( + system_instruction=system, + temperature=0.2, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini returned empty response text for content pruning") + return response.text + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user_text}, + ], + temperature=0.2, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter returned empty response text for content pruning") + return text + + raw = _retry_llm("Gemini content pruning", _call) + decisions = _parse_decisions(raw) + return decisions, raw + + +# --------------------------------------------------------------------------- +# Public stage entrypoint (used by pipeline.run_pipeline) +# --------------------------------------------------------------------------- + + +def run_content_pruning_stage( + work_dir: Path, + clips: list[Clip], + transcript: dict, + *, + transcript_fp: str, + config: PipelineConfig, +) -> list[Clip]: + """Apply Stage 2.5 pruning to ``clips`` and return the new list. + + - When ``config.prune_level == "off"``, this is a cheap no-op: returns a + copy of the clips with trim_start/end zeroed. + - Otherwise, tries the cache first, then calls Gemini. A failing call + degrades to no-op (the pipeline is never killed by Stage 2.5). 
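+
+    Minimal sketch of the call shape (variable names are illustrative; the
+    real call site is ``pipeline.run_pipeline``)::
+
+        pruned_clips = run_content_pruning_stage(
+            work_dir,
+            clips,
+            transcript,
+            transcript_fp=transcript_sha256,
+            config=config,
+        )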
+ """ + level = _validated_level(config.prune_level) + if level == "off": + logger.info("Content pruning disabled (prune_level=off); skipping Stage 2.5.") + return [ + clip.model_copy(update={"trim_start_sec": 0.0, "trim_end_sec": 0.0}) + for clip in clips + ] + + clips_fp = _clips_fingerprint(clips) + + if not config.force_content_pruning and _prune_cache_valid( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + config=config, + level=level, + ): + cached = _load_cached_clips(work_dir, clips) + if cached is not None: + logger.info( + "Content pruning cache hit (level=%s, %d clips); skipping LLM.", + level, + len(clips), + ) + return cached + + try: + decisions, raw = request_prune_decisions( + clips, transcript, level=level, gemini_model=config.gemini_model + ) + except Exception as e: + logger.warning( + "Content pruning call failed (%s); continuing with un-pruned clips.", e + ) + return [ + clip.model_copy(update={"trim_start_sec": 0.0, "trim_end_sec": 0.0}) + for clip in clips + ] + + pruned = apply_prune_decisions( + clips, decisions, level=level, transcript=transcript + ) + _log_prune_summary(pruned, clips) + + meta = _prune_meta( + transcript_fp=transcript_fp, + clips_fp=clips_fp, + config=config, + level=level, + ) + try: + _write_cache(work_dir, pruned=pruned, meta=meta, raw_response=raw) + except Exception as e: + logger.warning("Failed to write prune cache (%s); continuing.", e) + return pruned + + +def _validated_level(level: str | None) -> PruneLevel: + lvl = (level or "balanced").strip().lower() + if lvl not in VALID_LEVELS: + logger.warning("Unknown prune_level=%r; falling back to 'balanced'.", level) + return "balanced" + return lvl # type: ignore[return-value] + + +def _log_prune_summary(pruned: list[Clip], original: list[Clip]) -> None: + total_before = sum(c.duration_sec for c in original) + total_after = sum( + max(0.0, c.duration_sec - c.trim_start_sec - c.trim_end_sec) for c in pruned + ) + removed = total_before - total_after + pct = (removed / total_before * 100.0) if total_before > 0 else 0.0 + logger.info( + "Content pruning done: removed %.1fs across %d clips (%.1f%% of total).", + removed, + len(pruned), + pct, + ) + for c in pruned: + if c.trim_start_sec > 0 or c.trim_end_sec > 0: + final = c.duration_sec - c.trim_start_sec - c.trim_end_sec + logger.info( + " [%s] trim=%.2fs/%.2fs %.1fs -> %.1fs", + c.clip_id, + c.trim_start_sec, + c.trim_end_sec, + c.duration_sec, + final, + ) diff --git a/src/humeo/cutter.py b/src/humeo/cutter.py new file mode 100644 index 0000000000000000000000000000000000000000..b203cadae774dc1bae5da380a56ff6652c8102bf --- /dev/null +++ b/src/humeo/cutter.py @@ -0,0 +1,651 @@ +"""Subtitle helpers for the product pipeline.""" + +import logging +import math +import os +import re +from pathlib import Path + +from humeo_core.schemas import Clip, RenderTheme, TranscriptWord + +from humeo.transcript_align import ( + clip_subtitle_words, + clip_words_to_srt_lines, + format_ass, + format_srt, + group_words_to_cue_chunks, +) + +logger = logging.getLogger(__name__) + +_NATIVE_HIGHLIGHT_FONT_NAME = "Arial" +_NATIVE_HIGHLIGHT_PURPLE = "&H00F65C8B" +_NATIVE_HIGHLIGHT_LEAD_SEC = 0.06 +_NATIVE_HIGHLIGHT_MIN_DWELL_SEC = 0.16 +_NATIVE_HIGHLIGHT_MIN_VALID_WORD_SEC = 0.035 +_NATIVE_HIGHLIGHT_MAX_VALID_WORD_SEC = 1.65 +_NATIVE_HIGHLIGHT_MAX_LINE_WIDTH_RATIO = 0.92 +_NATIVE_HIGHLIGHT_ROUNDING_OVERRIDE = r"\blur3.0" +_NATIVE_HIGHLIGHT_STOPWORDS = { + "a", + "all", + "an", + "and", + "are", + "as", + "at", + "be", + "but", + "by", + "for", + 
"from", + "i", + "if", + "in", + "is", + "it", + "of", + "on", + "or", + "so", + "that", + "the", + "their", + "there", + "they", + "this", + "to", + "was", + "we", + "with", + "you", + "your", + "has", + "have", + "had", + "been", + "being", +} + + +def _balance_reference_caption(text: str) -> str: + words = text.split() + if len(words) <= 5 and len(text) <= 28: + return text + best_idx = 1 + best_delta = 10**9 + for idx in range(1, len(words)): + left = " ".join(words[:idx]) + right = " ".join(words[idx:]) + line_penalty = 0 + if len(words[:idx]) < 2 or len(words[idx:]) < 2: + line_penalty += 1000 + delta = abs(len(left) - len(right)) + abs(len(words[:idx]) - len(words[idx:])) * 6 + line_penalty + if delta < best_delta: + best_delta = delta + best_idx = idx + return " ".join(words[:best_idx]) + "\n" + " ".join(words[best_idx:]) + + +def _native_line_width(font, words) -> float: + return _text_width(font, " ".join(word.word for word in words)) + + +def _native_highlight_partition_penalty(lines, font, max_line_width: float) -> float: + widths = [_native_line_width(font, line) for line in lines] + overflow = sum(max(0.0, width - max_line_width) for width in widths) + word_counts = [len(line) for line in lines] + total_words = sum(word_counts) + width_balance = (max(widths) - min(widths)) if len(widths) > 1 else 0.0 + word_balance = (max(word_counts) - min(word_counts)) if len(word_counts) > 1 else 0 + single_word_penalty = sum(260 for line in lines if len(line) == 1 and total_words > 3) + return ( + overflow * 80.0 + + len(lines) * 120.0 + + width_balance * 0.16 + + word_balance * 120.0 + + single_word_penalty + ) + + +def _candidate_native_highlight_partitions(words, max_lines: int): + n = len(words) + if n == 0: + return [] + if max_lines <= 1 or n == 1: + return [[list(words)]] + + out = [[list(words)]] + for first_break in range(1, n): + out.append([list(words[:first_break]), list(words[first_break:])]) + if max_lines >= 3 and n >= 3: + for first_break in range(1, n - 1): + for second_break in range(first_break + 1, n): + out.append( + [ + list(words[:first_break]), + list(words[first_break:second_break]), + list(words[second_break:]), + ] + ) + return out + + +def _split_native_highlight_lines(words, *, font=None, max_line_width: float | None = None): + if len(words) <= 3 and len(" ".join(word.word for word in words)) <= 22: + return [list(words)] + if len(words) < 2: + return [list(words)] + if font is not None and max_line_width is not None: + candidates = _candidate_native_highlight_partitions(words, max_lines=3) + return min( + candidates, + key=lambda lines: _native_highlight_partition_penalty( + lines, + font, + max_line_width, + ), + ) + best_idx = 1 + best_delta = 10**9 + for idx in range(1, len(words)): + left_words = words[:idx] + right_words = words[idx:] + left = " ".join(word.word for word in left_words) + right = " ".join(word.word for word in right_words) + line_penalty = 0 + if len(left_words) < 2 or len(right_words) < 2: + line_penalty += 800 + delta = abs(len(left) - len(right)) + abs(len(left_words) - len(right_words)) * 7 + line_penalty + if delta < best_delta: + best_delta = delta + best_idx = idx + return [list(words[:best_idx]), list(words[best_idx:])] + + +def _clean_native_highlight_token(text: str) -> str: + return re.sub(r"(^[^A-Za-z0-9$%#]+|[^A-Za-z0-9$%#]+$)", "", text or "") + + +def _native_highlight_span_score(words) -> float: + cleaned = [_clean_native_highlight_token(word.word) for word in words] + cleaned = [token for token in cleaned if token] + if 
not cleaned: + return -1e9 + if all(token.lower() in _NATIVE_HIGHLIGHT_STOPWORDS for token in cleaned): + return -1e9 + + score = 0.0 + for token in cleaned: + lower = token.lower() + if lower not in _NATIVE_HIGHLIGHT_STOPWORDS: + score += 2.0 + if any(ch.isdigit() for ch in token) or "$" in token or "%" in token: + score += 3.0 + if len(token) >= 6: + score += 0.8 + if token.isupper() and len(token) > 1: + score += 0.6 + if len(cleaned) == 2: + score -= 0.55 + if any(any(ch.isdigit() for ch in token) or "$" in token or "%" in token for token in cleaned): + score += 1.1 + elif cleaned[0].lower() in _NATIVE_HIGHLIGHT_STOPWORDS or cleaned[1].lower() in _NATIVE_HIGHLIGHT_STOPWORDS: + score -= 0.6 + else: + score += 0.3 + if len(" ".join(cleaned)) > 18: + score -= 0.6 + return score + + +def _should_render_native_highlight_group(words) -> bool: + cleaned = [_clean_native_highlight_token(word.word) for word in words] + cleaned = [token for token in cleaned if token] + if not cleaned: + return False + return any(token.lower() not in _NATIVE_HIGHLIGHT_STOPWORDS for token in cleaned) + + +def _native_highlight_font_path() -> Path | None: + windows_fonts = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts" + for filename in ("arialbd.ttf", "Arialbd.ttf", "ARIALBD.TTF", "arial.ttf"): + path = windows_fonts / filename + if path.is_file(): + return path + return None + + +def _text_width(font, text: str) -> float: + if not text: + return 0.0 + if hasattr(font, "getlength"): + return float(font.getlength(text)) + bbox = font.getbbox(text) + return float(bbox[2] - bbox[0]) + + +def _text_height(font) -> int: + bbox = font.getbbox("Ag") + return max(1, int(round(bbox[3] - bbox[1]))) + + +def _escape_ass_text(text: str) -> str: + return ( + text.replace("\\", r"\\") + .replace("{", r"\{") + .replace("}", r"\}") + .replace("\n", r"\N") + ) + + +def _native_highlight_overlay_text(line_words, highlight_idx: int) -> str: + parts: list[str] = [] + for word_idx, word in enumerate(line_words): + if word_idx == highlight_idx: + parts.append( + f"{{\\rHighlight{_NATIVE_HIGHLIGHT_ROUNDING_OVERRIDE}}}" + f"{_escape_ass_text(word.word)}" + "{\\rInvisible}" + ) + else: + parts.append(_escape_ass_text(word.word)) + return " ".join(parts) + + +def _word_timing_weight(word: TranscriptWord) -> float: + token = _clean_native_highlight_token(word.word) + return max(0.65, min(2.2, len(token or word.word) / 5.5)) + + +def _suspicious_native_highlight_timing( + words: list[TranscriptWord], + idx: int, + *, + clip_duration: float, +) -> bool: + word = words[idx] + start = float(word.start_time) + end = float(word.end_time) + if not (math.isfinite(start) and math.isfinite(end)): + return True + if start < -0.01 or end > clip_duration + 0.25: + return True + duration = end - start + if duration < _NATIVE_HIGHLIGHT_MIN_VALID_WORD_SEC: + return True + if duration > _NATIVE_HIGHLIGHT_MAX_VALID_WORD_SEC: + return True + if idx > 0: + prev = words[idx - 1] + if start < float(prev.start_time) - 0.03: + return True + if start < float(prev.end_time) - 0.35: + return True + if idx + 1 < len(words): + nxt = words[idx + 1] + if float(nxt.start_time) < start - 0.03: + return True + return False + + +def _repair_native_highlight_timings( + words: list[TranscriptWord], + *, + clip_duration: float, +) -> list[TranscriptWord]: + """Repair obvious ASR word timestamp glitches before per-word highlighting. 
+ + This is intentionally conservative: clean Whisper/ElevenLabs timings pass + through almost unchanged, while zero-length, reversed, huge, or badly + overlapping word timings get interpolated between neighboring reliable words. + """ + + if not words: + return [] + clip_duration = max(0.0, float(clip_duration)) + records: list[dict[str, object]] = [] + for idx, word in enumerate(words): + start = max(0.0, min(clip_duration, float(word.start_time))) + end = max(0.0, min(clip_duration, float(word.end_time))) + records.append( + { + "word": word.word, + "start": start, + "end": end, + "bad": _suspicious_native_highlight_timing( + words, + idx, + clip_duration=clip_duration, + ), + "weight": _word_timing_weight(word), + } + ) + + idx = 0 + while idx < len(records): + if not records[idx]["bad"]: + idx += 1 + continue + run_start = idx + while idx < len(records) and records[idx]["bad"]: + idx += 1 + run_end = idx - 1 + count = run_end - run_start + 1 + left_time = ( + float(records[run_start - 1]["end"]) + if run_start > 0 + else max(0.0, float(records[run_start]["start"])) + ) + right_time = ( + float(records[run_end + 1]["start"]) + if run_end + 1 < len(records) + else min(clip_duration, max(left_time, float(records[run_end]["end"]))) + ) + weight_span = sum(float(r["weight"]) for r in records[run_start : run_end + 1]) * 0.13 + min_span = max(0.11 * count, weight_span) + if right_time <= left_time + min_span: + right_time = min(clip_duration, left_time + min_span) + if right_time <= left_time: + right_time = min(clip_duration, left_time + max(0.08, 0.12 * count)) + + span = max(0.001, right_time - left_time) + weights = [float(r["weight"]) for r in records[run_start : run_end + 1]] + total_weight = max(0.001, sum(weights)) + cursor = left_time + for offset, weight in enumerate(weights): + rec = records[run_start + offset] + next_cursor = ( + right_time + if offset == count - 1 + else cursor + span * (weight / total_weight) + ) + rec["start"] = cursor + rec["end"] = max(cursor + 0.04, next_cursor) + cursor = float(rec["end"]) + + repaired: list[TranscriptWord] = [] + prev_end = 0.0 + for rec in records: + start = max(0.0, float(rec["start"])) + end = max(start + 0.02, float(rec["end"])) + if start < prev_end - 0.02: + start = prev_end + end = max(end, start + 0.04) + if clip_duration > 0.0: + end = min(clip_duration, end) + if end <= start: + start = max(0.0, min(start, clip_duration - 0.02)) + end = min(clip_duration, start + 0.04) + repaired.append(TranscriptWord(word=str(rec["word"]), start_time=start, end_time=end)) + prev_end = max(prev_end, end) + return repaired + + +def _native_highlight_word_windows( + words: list[TranscriptWord], + *, + lead_sec: float, + min_dwell_sec: float, +) -> list[tuple[float, float]]: + if not words: + return [] + lead_sec = max(0.0, float(lead_sec)) + min_dwell_sec = max(0.02, float(min_dwell_sec)) + cue_start = max(0.0, words[0].start_time - lead_sec) + cue_end = max(words[-1].end_time, words[-1].start_time + min_dwell_sec) + + starts: list[float] = [] + for idx, word in enumerate(words): + start = max(cue_start, float(word.start_time) - lead_sec) + if idx > 0: + start = max(start, starts[-1] + 0.01) + starts.append(start) + + windows: list[tuple[float, float]] = [] + for idx, word in enumerate(words): + start = starts[idx] + natural_end = max(float(word.end_time), start + min_dwell_sec) + limit = starts[idx + 1] if idx + 1 < len(starts) else cue_end + end = min(natural_end, limit) + if end <= start: + end = min(limit, start + 0.01) + 
windows.append((start, max(start + 0.01, end))) + return windows + + +def _fmt_ass_time(seconds: float) -> str: + seconds = max(0.0, seconds) + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = seconds % 60 + whole = int(secs) + cs = int(round((secs - whole) * 100)) + if cs >= 100: + cs = 99 + return f"{hours:d}:{minutes:02d}:{whole:02d}.{cs:02d}" + + +def _format_native_highlight_ass( + cue_chunks, + *, + play_res_x: int, + play_res_y: int, + font_size: int, + margin_v: int, + font_name: str, + highlight_lead_sec: float = _NATIVE_HIGHLIGHT_LEAD_SEC, + highlight_min_dwell_sec: float = _NATIVE_HIGHLIGHT_MIN_DWELL_SEC, +) -> str: + from PIL import ImageFont + + font_path = _native_highlight_font_path() + if font_path is not None: + font = ImageFont.truetype(str(font_path), size=font_size) + else: + font = ImageFont.load_default() + + line_height = max(font_size, _text_height(font) + 6) + line_gap = max(8, int(round(font_size * 0.08))) + bottom_anchor = play_res_y - margin_v + max_line_width = play_res_x * _NATIVE_HIGHLIGHT_MAX_LINE_WIDTH_RATIO + + header = ( + "[Script Info]\n" + "ScriptType: v4.00+\n" + f"PlayResX: {play_res_x}\n" + f"PlayResY: {play_res_y}\n" + "WrapStyle: 0\n" + "ScaledBorderAndShadow: yes\n" + "YCbCr Matrix: None\n" + "\n" + "[V4+ Styles]\n" + "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, " + "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, " + "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, " + "Alignment, MarginL, MarginR, MarginV, Encoding\n" + f"Style: Base,{font_name},{font_size},&H00FFFFFF,&H000000FF,&H00101010,&H00000000,-1,0,0,0,100,100,-1,0,1,4,0,8,0,0,0,0\n" + f"Style: Highlight,{font_name},{font_size},&H00FFFFFF,&H000000FF,{_NATIVE_HIGHLIGHT_PURPLE},&H00000000,-1,0,0,0,100,100,-1,0,3,4,0,8,0,0,0,0\n" + f"Style: Invisible,{font_name},{font_size},&HFF000000,&H000000FF,&HFF000000,&HFF000000,-1,0,0,0,100,100,-1,0,1,0,0,8,0,0,0,0\n" + "\n" + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n" + ) + + events: list[str] = [] + for cue_words in cue_chunks: + if not cue_words: + continue + lines = _split_native_highlight_lines( + cue_words, + font=font, + max_line_width=max_line_width, + ) + cue_windows = _native_highlight_word_windows( + cue_words, + lead_sec=highlight_lead_sec, + min_dwell_sec=highlight_min_dwell_sec, + ) + block_height = len(lines) * line_height + max(0, len(lines) - 1) * line_gap + block_top = bottom_anchor - block_height + cue_start = cue_windows[0][0] if cue_windows else cue_words[0].start_time + cue_end = cue_windows[-1][1] if cue_windows else cue_words[-1].end_time + word_offset = 0 + for line_idx, line_words in enumerate(lines): + if not line_words: + continue + line_text = " ".join(word.word for word in line_words) + line_top = block_top + line_idx * (line_height + line_gap) + line_left = (play_res_x - _text_width(font, line_text)) / 2.0 + events.append( + "Dialogue: 1," + f"{_fmt_ass_time(cue_start)},{_fmt_ass_time(cue_end)},Base,,0,0,0,," + f"{{\\an7\\pos({line_left:.1f},{line_top:.1f})}}{_escape_ass_text(line_text)}" + ) + for word_idx, word in enumerate(line_words): + cleaned = _clean_native_highlight_token(word.word) + if not cleaned: + continue + word_start, word_end = cue_windows[word_offset + word_idx] + events.append( + "Dialogue: 0," + f"{_fmt_ass_time(word_start)},{_fmt_ass_time(word_end)},Invisible,,0,0,0,," + f"{{\\an7\\pos({line_left:.1f},{line_top:.1f})}}" + f"{_native_highlight_overlay_text(line_words, 
word_idx)}" + ) + word_offset += len(line_words) + + return header + "\n".join(events) + ("\n" if events else "") + + +def generate_srt( + clip: Clip, + transcript: dict, + output_dir: Path, + *, + max_words_per_cue: int = 8, + max_cue_sec: float = 4.0, +) -> Path: + """ + Build an SRT file from word-level ASR aligned to this clip's timeline. + + ``transcript`` is the persisted ``transcript.json`` (segments with optional + per-word timestamps). Times are shifted so 0 = clip in-point. + """ + srt_path = output_dir / f"clip_{clip.clip_id}.srt" + aligned = clip_subtitle_words(transcript, clip) + lines = clip_words_to_srt_lines( + aligned.words, + max_words_per_cue=max_words_per_cue, + max_cue_sec=max_cue_sec, + ) + srt_path.write_text(format_srt(lines), encoding="utf-8") + logger.info("Generated SRT: %s (%d cues)", srt_path, len(lines)) + return srt_path + + +def generate_ass( + clip: Clip, + transcript: dict, + output_dir: Path, + *, + max_words_per_cue: int = 4, + max_cue_sec: float = 2.2, + play_res_x: int = 1080, + play_res_y: int = 1920, + font_size: int = 48, + margin_v: int = 160, + margin_h: int = 60, + font_name: str = "Arial", + render_theme: RenderTheme = RenderTheme.LEGACY, + native_highlight_lead_sec: float = _NATIVE_HIGHLIGHT_LEAD_SEC, + native_highlight_min_dwell_sec: float = _NATIVE_HIGHLIGHT_MIN_DWELL_SEC, + repair_word_timings: bool = True, +) -> Path: + """Generate an ASS caption file tuned for direct libass rendering. + + Unlike SRT β†’ libass (default PlayResY=288), an ASS file with + ``PlayResY = output_height`` means libass' scale factor is 1.0, so the + ``font_size`` / ``margin_v`` arguments below are honest output pixels. + + This is the root-cause fix for the "captions rendering in the middle of + the frame, four times too large" bug the user reported. 
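+
+    Concretely: with ``play_res_y=1920`` a ``margin_v`` of 160 is 160 real
+    output pixels from the bottom edge, whereas an SRT routed through libass
+    is laid out against the default 384x288 script resolution and then
+    rescaled onto the 1080x1920 canvas, which is where the oversized,
+    mispositioned captions came from.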
+ """ + ass_path = output_dir / f"clip_{clip.clip_id}.ass" + aligned = clip_subtitle_words(transcript, clip) + cue_words = max_words_per_cue + cue_sec = max_cue_sec + cue_font_size = font_size + cue_margin_v = margin_v + prefer_break_on_punctuation = False + min_words_before_break = 1 + if render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + cue_words = max(max_words_per_cue, 7) + cue_sec = max(max_cue_sec, 2.6) + cue_font_size = max(font_size, 52) + cue_margin_v = min(margin_v, 136) + prefer_break_on_punctuation = True + min_words_before_break = 5 + elif render_theme == RenderTheme.NATIVE_HIGHLIGHT: + cue_words = 8 + cue_sec = 2.4 + cue_font_size = max(font_size, 86) + cue_margin_v = max(margin_v, 300) + prefer_break_on_punctuation = True + min_words_before_break = 4 + + aligned_words = aligned.words + if render_theme == RenderTheme.NATIVE_HIGHLIGHT and repair_word_timings: + aligned_words = _repair_native_highlight_timings( + aligned_words, + clip_duration=clip.duration_sec, + ) + + cue_chunks = group_words_to_cue_chunks( + aligned_words, + max_words_per_cue=cue_words, + max_cue_sec=cue_sec, + prefer_break_on_punctuation=prefer_break_on_punctuation, + min_words_before_break=min_words_before_break, + ) + lines = [ + (chunk[0].start_time, chunk[-1].end_time, " ".join(word.word for word in chunk)) + for chunk in cue_chunks + ] + if render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + lines = [(start, end, _balance_reference_caption(text)) for start, end, text in lines] + ass_text = format_ass( + lines, + play_res_x=play_res_x, + play_res_y=play_res_y, + font_size=cue_font_size, + margin_v=cue_margin_v, + margin_h=margin_h, + font_name="Source Sans 3", + render_theme=render_theme, + ) + elif render_theme == RenderTheme.NATIVE_HIGHLIGHT: + ass_text = _format_native_highlight_ass( + cue_chunks, + play_res_x=play_res_x, + play_res_y=play_res_y, + font_size=cue_font_size, + margin_v=cue_margin_v, + font_name=_NATIVE_HIGHLIGHT_FONT_NAME, + highlight_lead_sec=native_highlight_lead_sec, + highlight_min_dwell_sec=native_highlight_min_dwell_sec, + ) + else: + ass_text = format_ass( + lines, + play_res_x=play_res_x, + play_res_y=play_res_y, + font_size=cue_font_size, + margin_v=cue_margin_v, + margin_h=margin_h, + font_name=font_name, + render_theme=render_theme, + ) + ass_path.write_text(ass_text, encoding="utf-8") + logger.info("Generated ASS: %s (%d cues)", ass_path, len(lines)) + return ass_path diff --git a/src/humeo/env.py b/src/humeo/env.py new file mode 100644 index 0000000000000000000000000000000000000000..63a64e3bd8b4e286d59b6fd3c948b7c31c494d81 --- /dev/null +++ b/src/humeo/env.py @@ -0,0 +1,130 @@ +"""Environment bootstrap (``.env``) and cache path helpers.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Literal + +_BOOTSTRAPPED = False +LLMProvider = Literal["google", "openrouter"] +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" + + +def bootstrap_env() -> None: + """Load ``.env`` from the process cwd (non-fatal if missing). 
Safe to call twice.""" + global _BOOTSTRAPPED + if _BOOTSTRAPPED: + return + try: + from dotenv import load_dotenv + + load_dotenv() + except ImportError: + pass + _BOOTSTRAPPED = True + + +def default_humeo_cache_root() -> Path: + """Default cache root: ``~/.cache/humeo`` on Unix; ``%LOCALAPPDATA%/humeo`` on Windows.""" + override = (os.environ.get("HUMEO_CACHE_ROOT") or "").strip() + if override: + return Path(override) + if os.name == "nt": + base = Path(os.environ.get("LOCALAPPDATA", str(Path.home() / "AppData" / "Local"))) + return base / "humeo" + return Path.home() / ".cache" / "humeo" + + +def resolve_gemini_api_key() -> str: + """Return an API key for Gemini, or raise if none is configured. + + Prefer ``GOOGLE_API_KEY``; fall back to ``GEMINI_API_KEY``. Values are read from + the environment after ``bootstrap_env()`` (``.env`` in cwd). + + We require an explicit key so we do not fall back to Application Default + Credentials (e.g. ``gcloud auth application-default login``), which often + lack the Generative Language API scope and produce + ``403 ACCESS_TOKEN_SCOPE_INSUFFICIENT``. + """ + bootstrap_env() + for env_name in ("GOOGLE_API_KEY", "GEMINI_API_KEY"): + val = (os.environ.get(env_name) or "").strip() + if val: + return val + raise ValueError( + "Set GOOGLE_API_KEY or GEMINI_API_KEY for Gemini clip selection. " + "See docs/ENVIRONMENT.md. Without an API key the client may use ADC and fail " + "with insufficient scopes (403)." + ) + + +def resolve_openrouter_api_key() -> str: + """Return the OpenRouter API key, or raise if missing.""" + bootstrap_env() + val = (os.environ.get("OPENROUTER_API_KEY") or "").strip() + if val: + return val + raise ValueError( + "Set OPENROUTER_API_KEY to use OpenRouter as the backend for the Gemini stages. " + "See docs/ENVIRONMENT.md." + ) + + +def current_llm_provider() -> LLMProvider | None: + """Best-effort active backend detection from the environment. + + ``HUMEO_LLM_PROVIDER`` overrides key-based auto-detection when set. + """ + bootstrap_env() + forced = (os.environ.get("HUMEO_LLM_PROVIDER") or "auto").strip().lower() + if forced in ("google", "openrouter"): + return forced # type: ignore[return-value] + if (os.environ.get("GOOGLE_API_KEY") or "").strip(): + return "google" + if (os.environ.get("GEMINI_API_KEY") or "").strip(): + return "google" + if (os.environ.get("OPENROUTER_API_KEY") or "").strip(): + return "openrouter" + return None + + +def resolve_llm_provider() -> LLMProvider: + """Return the active backend for Gemini-like stages, or raise if none is configured.""" + provider = current_llm_provider() + if provider is not None: + if provider == "google": + resolve_gemini_api_key() + else: + resolve_openrouter_api_key() + return provider + raise ValueError( + "Set GOOGLE_API_KEY or GEMINI_API_KEY for the Google Gemini SDK, " + "or set OPENROUTER_API_KEY to route these stages through OpenRouter. " + "You can also force the backend with HUMEO_LLM_PROVIDER=google|openrouter." + ) + + +def model_name_for_provider(model_name: str, provider: LLMProvider) -> str: + """Normalize model identifiers between Google Gemini SDK and OpenRouter. + + - Google SDK expects bare Gemini ids like ``gemini-3.1-flash-lite-preview``. + - OpenRouter expects provider-qualified ids like + ``google/gemini-3.1-flash-lite-preview``. 
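+
+    For example, ``gemini-2.5-pro`` becomes ``google/gemini-2.5-pro`` when the
+    provider is ``openrouter``, and ``google/gemini-2.5-pro`` is reduced to
+    ``gemini-2.5-pro`` when the provider is ``google``.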
+ """ + name = model_name.strip() + if provider == "openrouter": + if "/" not in name and name.startswith(("gemini-", "gemma-")): + return f"google/{name}" + return name + if provider == "google" and name.startswith("google/"): + return name.split("/", 1)[1] + return name + + +def openrouter_default_headers() -> dict[str, str]: + """Headers that help identify Humeo traffic to OpenRouter.""" + return { + "HTTP-Referer": "https://github.com/frenzy2004/shortform", + "X-OpenRouter-Title": "Humeo", + } diff --git a/src/humeo/gemini_generate.py b/src/humeo/gemini_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..847b07dd4efec0ed1def268449e4bd9990bc4682 --- /dev/null +++ b/src/humeo/gemini_generate.py @@ -0,0 +1,24 @@ +"""Shared ``GenerateContentConfig`` for product Gemini calls (KISS / DRY). + +Thinking knobs live here only β€” stages pass stage-specific fields +(temperature, ``response_mime_type``, ``system_instruction``, …). +""" + +from __future__ import annotations + +from typing import Any + +from google.genai import types + +_THINKING = types.ThinkingConfig( + thinking_budget=1024, + include_thoughts=True, +) + + +def gemini_generate_config(**kwargs: Any) -> types.GenerateContentConfig: + """Return config with thinking enabled; ``kwargs`` are merged as-is.""" + return types.GenerateContentConfig( + thinking_config=_THINKING, + **kwargs, + ) diff --git a/src/humeo/hook_detector.py b/src/humeo/hook_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5f851f541ae1593b5903a86784599ca7b2e610 --- /dev/null +++ b/src/humeo/hook_detector.py @@ -0,0 +1,574 @@ +"""Stage 2.25 - Hook detection. + +The clip-selection LLM returns a ``hook_start_sec`` / ``hook_end_sec`` pair +per clip, but in practice it almost always echoes the ``[0.0, 3.0]`` +placeholder from the prompt instead of localising the real hook sentence. +That placeholder is toxic to Stage 2.5 pruning -- the clamp refuses to +trim past ``hook_start_sec``, so every ``trim_start_sec > 0`` the pruner +returns gets zeroed out silently. + +This module is a dedicated Stage 2.25 that runs between clip selection and +content pruning. For each clip it: + +1. Prepares a clip-relative segment listing (same format as pruning uses). +2. Asks Gemini, in one batched JSON call, to localise the hook sentence of + every clip with `hook_start_sec`, `hook_end_sec`, `hook_text`, `reason`. +3. Validates the returned window against the clip's duration + the "real + hook" heuristics, then overwrites ``clip.hook_start_sec`` / + ``clip.hook_end_sec`` on a copy of the clip. + +The stage is: + +- **Cached** (``hooks.json`` / ``hooks.meta.json`` in ``work_dir``) on + ``transcript_sha256 + clips_sha256 + gemini_model``. +- **Never fatal.** Any failure (API error, malformed JSON, clip not + returned, window that still looks like the 0.0-3.0 placeholder) falls + back to the original clip with its original hook -- pruning will then + skip hook protection via the fingerprint guard in + :func:`humeo.content_pruning._looks_like_default_hook`. + +The stage writes three artifacts to ``work_dir`` for audit: + +- ``hooks.meta.json``: cache key (version, fingerprints, model). +- ``hooks.json``: structured per-clip hook windows actually applied. +- ``hooks_raw.json``: verbatim Gemini response text (for prompt tuning). 
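+
+A returned window only replaces the selector's hook when it passes
+``_validate_hook_window``: a 0.00-3.00s answer is rejected as the placeholder
+fingerprint, and windows shorter than ~1s or longer than ~10s are rejected by
+the duration bounds, so the clip keeps its original hook metadata in those
+cases.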
+""" + +from __future__ import annotations + +import hashlib +import json +import logging +import time +from pathlib import Path +from typing import Any, Callable, TypeVar + +from google import genai +from openai import OpenAI +from pydantic import BaseModel, Field, ValidationError + +from humeo_core.schemas import Clip + +from humeo.config import GEMINI_MODEL, PipelineConfig +from humeo.content_pruning import _looks_like_default_hook, _segments_within_clip +from humeo.env import ( + OPENROUTER_BASE_URL, + current_llm_provider, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.gemini_generate import gemini_generate_config +from humeo.hook_library import ( + format_hook_examples, + hook_library_fingerprint, + resolve_hook_library_path, + retrieve_hook_examples, +) +from humeo.prompt_loader import hook_detection_system_prompt + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + +HOOK_META_VERSION = 2 +HOOK_META_FILENAME = "hooks.meta.json" +HOOK_ARTIFACT_FILENAME = "hooks.json" +HOOK_RAW_FILENAME = "hooks_raw.json" + +LLM_MAX_ATTEMPTS = 3 +LLM_RETRY_DELAY_SEC = 2.0 + +# Hook window validation thresholds. The prompt asks for 1.5-7.0s windows; +# we enforce 1.0-10.0s to be lenient on rounding while still rejecting +# obvious "LLM returned the whole paragraph" mistakes. +_MIN_HOOK_DURATION_SEC = 1.0 +_MAX_HOOK_DURATION_SEC = 10.0 + + +def _openai_message_text(content: object) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + + +class _HookDecision(BaseModel): + """Per-clip hook window returned by Gemini (clip-relative seconds).""" + + clip_id: str + hook_start_sec: float = Field(ge=0.0) + hook_end_sec: float = Field(ge=0.0) + hook_text: str = "" + reason: str = "" + + +class _HookResponse(BaseModel): + hooks: list[_HookDecision] = Field(default_factory=list) + + +def _retry_llm(name: str, fn: Callable[[], T], attempts: int = LLM_MAX_ATTEMPTS) -> T: + last: Exception | None = None + for i in range(attempts): + try: + return fn() + except Exception as e: # noqa: BLE001 - rethrown below + last = e + if i < attempts - 1: + logger.warning("%s attempt %d/%d failed: %s", name, i + 1, attempts, e) + time.sleep(LLM_RETRY_DELAY_SEC * (i + 1)) + assert last is not None + raise last + + +# --------------------------------------------------------------------------- +# Prompt construction +# --------------------------------------------------------------------------- + + +def _build_user_message(clips: list[Clip], transcript: dict) -> str: + """Render clip-relative segments + selector-guessed hook text for each clip.""" + blocks: list[str] = [] + for clip in clips: + segs = _segments_within_clip(transcript, clip) + header_lines = [ + f"clip_id: {clip.clip_id}", + f"duration_sec: {clip.duration_sec:.2f}", + f"topic: {clip.topic}", + ] + if clip.viral_hook: + header_lines.append(f"viral_hook_text: {clip.viral_hook}") + if clip.hook_start_sec is not None and clip.hook_end_sec is not None: + header_lines.append( + f"selector_hook_window_sec: [{clip.hook_start_sec:.2f}, " + f"{clip.hook_end_sec:.2f}] (may be a placeholder; verify)" + ) + header = "\n".join(header_lines) + body = "\n".join( + f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}" 
for seg in segs + ) + if not body: + body = "(no segments overlap this clip window)" + blocks.append(f"{header}\n---\n{body}") + return "\n\n===\n\n".join(blocks) + + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + + +def _validate_hook_window( + clip: Clip, hook_start: float, hook_end: float +) -> tuple[float, float] | None: + """Return a valid (hook_start, hook_end) or None if rejected. + + Rules: + - ``0 <= hook_start < hook_end <= duration_sec`` + - hook duration between ``_MIN_HOOK_DURATION_SEC`` and ``_MAX_HOOK_DURATION_SEC`` + - NOT the ``(0.0, 3.0)`` placeholder fingerprint (we'd rather keep the + selector's value untouched than re-apply the same fake hook). + """ + if hook_start < 0.0 or hook_end <= hook_start: + return None + if hook_end > clip.duration_sec + 1e-3: + # Clamp trailing rounding to duration; reject anything beyond. + if hook_end - clip.duration_sec > 0.5: + return None + hook_end = clip.duration_sec + dur = hook_end - hook_start + if dur < _MIN_HOOK_DURATION_SEC or dur > _MAX_HOOK_DURATION_SEC: + return None + if _looks_like_default_hook(hook_start, hook_end): + return None + return float(hook_start), float(hook_end) + + +# --------------------------------------------------------------------------- +# Apply decisions -> new clips +# --------------------------------------------------------------------------- + + +def apply_hook_decisions( + clips: list[Clip], + decisions: list[_HookDecision], +) -> list[Clip]: + """Return new clips whose hook fields reflect validated decisions. + + Clips without a matching valid decision are returned unchanged (their + original hook metadata, placeholder or not, is preserved). 
+ """ + by_id = {d.clip_id: d for d in decisions} + out: list[Clip] = [] + changed = 0 + rejected = 0 + for clip in clips: + d = by_id.get(clip.clip_id) + if d is None: + out.append(clip) + continue + validated = _validate_hook_window(clip, d.hook_start_sec, d.hook_end_sec) + if validated is None: + logger.info( + "Clip %s: rejected hook window [%.2f, %.2f] (failed validation); " + "keeping selector hook.", + clip.clip_id, + d.hook_start_sec, + d.hook_end_sec, + ) + rejected += 1 + out.append(clip) + continue + hs, he = validated + if ( + clip.hook_start_sec is not None + and clip.hook_end_sec is not None + and abs(clip.hook_start_sec - hs) < 1e-3 + and abs(clip.hook_end_sec - he) < 1e-3 + ): + out.append(clip) + continue + changed += 1 + logger.info( + "Clip %s: hook set to [%.2f, %.2f] (was [%s, %s]) -- %s", + clip.clip_id, + hs, + he, + f"{clip.hook_start_sec:.2f}" if clip.hook_start_sec is not None else "None", + f"{clip.hook_end_sec:.2f}" if clip.hook_end_sec is not None else "None", + d.reason[:120] if d.reason else "(no reason)", + ) + out.append( + clip.model_copy(update={"hook_start_sec": hs, "hook_end_sec": he}) + ) + logger.info( + "Hook detection: updated %d / %d clips (%d rejected, %d kept as-is).", + changed, + len(clips), + rejected, + len(clips) - changed - rejected, + ) + return out + + +# --------------------------------------------------------------------------- +# Cache +# --------------------------------------------------------------------------- + + +def _clips_fingerprint(clips: list[Clip]) -> str: + payload = json.dumps( + [ + {"id": c.clip_id, "s": round(c.start_time_sec, 3), "e": round(c.end_time_sec, 3)} + for c in clips + ], + sort_keys=True, + ensure_ascii=False, + ) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def _resolved_gemini_model(config: PipelineConfig) -> str: + return (config.gemini_model or GEMINI_MODEL).strip() + + +def _hook_meta( + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, +) -> dict[str, Any]: + return { + "version": HOOK_META_VERSION, + "transcript_sha256": transcript_fp, + "clips_sha256": clips_fp, + "gemini_model": _resolved_gemini_model(config), + "llm_backend": current_llm_provider() or "google", + "hook_library_sha256": hook_library_fingerprint(resolve_hook_library_path(config)), + } + + +def _hook_cache_valid( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, +) -> bool: + meta_path = work_dir / HOOK_META_FILENAME + if not meta_path.is_file(): + return False + try: + with open(meta_path, encoding="utf-8") as f: + meta = json.load(f) + except Exception: + return False + if meta.get("version") != HOOK_META_VERSION: + return False + if meta.get("transcript_sha256") != transcript_fp: + return False + if meta.get("clips_sha256") != clips_fp: + return False + current_provider = current_llm_provider() + meta_provider = meta.get("llm_backend") + if current_provider == "openrouter": + if meta_provider != "openrouter": + return False + elif current_provider == "google": + if meta_provider not in (None, "google"): + return False + if meta.get("gemini_model") != _resolved_gemini_model(config): + return False + if meta.get("hook_library_sha256", "") != hook_library_fingerprint(resolve_hook_library_path(config)): + return False + return True + + +def _load_cached_hooks( + work_dir: Path, clips: list[Clip] +) -> list[Clip] | None: + artifact = work_dir / HOOK_ARTIFACT_FILENAME + if not artifact.is_file(): + return None + try: + with open(artifact, "r", encoding="utf-8") 
as f: + data = json.load(f) + cached = {item["clip_id"]: item for item in data.get("hooks", [])} + except Exception as e: # noqa: BLE001 - surfaced as warning below + logger.warning("Hook cache artifact unreadable (%s); re-running.", e) + return None + out: list[Clip] = [] + for clip in clips: + c = cached.get(clip.clip_id) + if c is None: + out.append(clip) + continue + hs = c.get("hook_start_sec") + he = c.get("hook_end_sec") + if hs is None or he is None: + out.append(clip) + continue + out.append( + clip.model_copy( + update={"hook_start_sec": float(hs), "hook_end_sec": float(he)} + ) + ) + return out + + +def _write_cache( + work_dir: Path, + *, + clips_with_hooks: list[Clip], + decisions: list[_HookDecision], + meta: dict[str, Any], + raw_response: str, +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + reasons = {d.clip_id: d for d in decisions} + payload = { + "hooks": [ + { + "clip_id": c.clip_id, + "hook_start_sec": c.hook_start_sec, + "hook_end_sec": c.hook_end_sec, + "hook_text": (reasons.get(c.clip_id).hook_text if reasons.get(c.clip_id) else ""), + "reason": (reasons.get(c.clip_id).reason if reasons.get(c.clip_id) else ""), + } + for c in clips_with_hooks + ] + } + (work_dir / HOOK_ARTIFACT_FILENAME).write_text( + json.dumps(payload, indent=2), encoding="utf-8" + ) + (work_dir / HOOK_RAW_FILENAME).write_text(raw_response, encoding="utf-8") + with open(work_dir / HOOK_META_FILENAME, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + f.write("\n") + logger.info( + "Wrote %s, %s and %s", + HOOK_META_FILENAME, + HOOK_ARTIFACT_FILENAME, + HOOK_RAW_FILENAME, + ) + + +# --------------------------------------------------------------------------- +# Gemini call +# --------------------------------------------------------------------------- + + +def _parse_decisions(raw_json: str) -> list[_HookDecision]: + data = json.loads(raw_json) + if isinstance(data, dict) and "hooks" in data: + try: + return _HookResponse.model_validate(data).hooks + except ValidationError as e: + logger.warning("Hook response failed validation: %s", e) + return [] + if isinstance(data, list): + out: list[_HookDecision] = [] + for item in data: + try: + out.append(_HookDecision.model_validate(item)) + except ValidationError: + continue + return out + return [] + + +def request_hook_decisions( + clips: list[Clip], + transcript: dict, + *, + gemini_model: str | None = None, + hook_library_path: Path | None = None, +) -> tuple[list[_HookDecision], str]: + """Ask Gemini to localise the hook sentence for each clip. + + Returns ``(decisions, raw_response)``. ``raw_response`` is the literal + JSON text from Gemini (cached to ``hooks_raw.json`` for audit). On + transport/parse failure this raises; callers should catch and treat as + no-op. 
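+
+    The raw text is expected to parse into ``_HookResponse``; an illustrative
+    response (values invented for the example) looks like:
+
+        {"hooks": [{"clip_id": "clip_01", "hook_start_sec": 2.1,
+                    "hook_end_sec": 6.4, "hook_text": "...", "reason": "..."}]}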
+ """ + if not clips: + return [], '{"hooks": []}' + + example_query = " ".join( + filter(None, [*(clip.topic for clip in clips[:4]), *(clip.viral_hook for clip in clips[:4])]) + ) + hook_examples = format_hook_examples( + retrieve_hook_examples(example_query, path=hook_library_path, limit=8) + ) + system = hook_detection_system_prompt(hook_examples=hook_examples) + user_text = _build_user_message(clips, transcript) + + provider = resolve_llm_provider() + model_name = model_name_for_provider((gemini_model or GEMINI_MODEL).strip(), provider) + + def _call() -> str: + logger.info( + "%s hook detection (model=%s, clips=%d)...", provider, model_name, len(clips) + ) + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=model_name, + contents=user_text, + config=gemini_generate_config( + system_instruction=system, + temperature=0.2, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini returned empty response text for hook detection") + return response.text + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user_text}, + ], + temperature=0.2, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter returned empty response text for hook detection") + return text + + raw = _retry_llm("Gemini hook detection", _call) + decisions = _parse_decisions(raw) + return decisions, raw + + +# --------------------------------------------------------------------------- +# Public stage entrypoint +# --------------------------------------------------------------------------- + + +def run_hook_detection_stage( + work_dir: Path, + clips: list[Clip], + transcript: dict, + *, + transcript_fp: str, + config: PipelineConfig, +) -> list[Clip]: + """Run Stage 2.25 hook detection and return clips with localised hooks. + + - Disabled (``config.detect_hooks is False``): return clips unchanged. + - Cache hit: read ``hooks.json`` and apply cached windows. + - LLM failure: log a warning and return clips unchanged. The downstream + content pruner's fingerprint guard will treat any remaining placeholder + hooks as "no hook" so pruning still runs. + """ + if not config.detect_hooks: + logger.info("Hook detection disabled (detect_hooks=False); skipping Stage 2.25.") + return clips + if not clips: + return clips + + clips_fp = _clips_fingerprint(clips) + + if not config.force_hook_detection and _hook_cache_valid( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + config=config, + ): + cached = _load_cached_hooks(work_dir, clips) + if cached is not None: + logger.info( + "Hook detection cache hit (%d clips); skipping LLM.", len(clips) + ) + return cached + + try: + decisions, raw = request_hook_decisions( + clips, + transcript, + gemini_model=config.gemini_model, + hook_library_path=resolve_hook_library_path(config), + ) + except Exception as e: # noqa: BLE001 - pipeline must not die here + logger.warning( + "Hook detection call failed (%s); continuing with selector hooks. 
" + "Content pruning will treat any [0.0, 3.0] placeholder as 'no hook'.", + e, + ) + return clips + + updated = apply_hook_decisions(clips, decisions) + + meta = _hook_meta( + transcript_fp=transcript_fp, clips_fp=clips_fp, config=config + ) + try: + _write_cache( + work_dir, + clips_with_hooks=updated, + decisions=decisions, + meta=meta, + raw_response=raw, + ) + except Exception as e: # noqa: BLE001 - cache failure is not fatal + logger.warning("Failed to write hook cache (%s); continuing.", e) + + return updated diff --git a/src/humeo/hook_library.py b/src/humeo/hook_library.py new file mode 100644 index 0000000000000000000000000000000000000000..19da6a94a81027aef80e12ba1217694361a3b675 --- /dev/null +++ b/src/humeo/hook_library.py @@ -0,0 +1,193 @@ +"""Parse and retrieve viral hook examples from a local zip or directory.""" + +from __future__ import annotations + +import hashlib +import os +import re +import zipfile +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +from humeo.config import PipelineConfig + +_ENTRY_RE = re.compile( + r"^\s*\d+\.\s*Hook:\s*(?P.+?)
Example:\s*(?P.+?)
Psychology:\s*(?P.+?)\s*$", + re.IGNORECASE, +) +_TOKEN_RE = re.compile(r"[a-z0-9']+") + + +@dataclass(frozen=True) +class HookExample: + category: str + hook: str + example: str + psychology: str + + +_LIB_CACHE: dict[str, list[HookExample]] = {} + + +def resolve_hook_library_path(config: PipelineConfig | None = None) -> Path | None: + if config is not None and config.hook_library_path is not None: + return Path(config.hook_library_path) + raw = (os.environ.get("HUMEO_HOOK_LIBRARY_PATH") or "").strip() + if raw: + return Path(raw).expanduser() + return None + + +def require_hook_library_path(config: PipelineConfig | None = None) -> Path: + path = resolve_hook_library_path(config) + if path is None: + raise FileNotFoundError( + "HUMEO_HOOK_LIBRARY_PATH is required for the hook retrieval workflow." + ) + if not path.exists(): + raise FileNotFoundError(f"Hook library path does not exist: {path}") + return path + + +def hook_library_fingerprint(path: Path | None) -> str: + if path is None: + return "" + if not path.exists(): + return "" + hasher = hashlib.sha256() + if path.is_file(): + hasher.update(path.read_bytes()) + return hasher.hexdigest() + + for md_path in sorted(p for p in path.rglob("*.md") if p.is_file()): + hasher.update(str(md_path.relative_to(path)).encode("utf-8")) + hasher.update(md_path.read_bytes()) + return hasher.hexdigest() + + +def _tokenize(text: str) -> set[str]: + return {m.group(0) for m in _TOKEN_RE.finditer(text.lower()) if len(m.group(0)) > 2} + + +def _ordered_tokens(text: str) -> list[str]: + return [m.group(0) for m in _TOKEN_RE.finditer(text.lower()) if len(m.group(0)) > 2] + + +def _iter_markdown_files(path: Path) -> Iterable[tuple[str, str]]: + if path.is_file(): + with zipfile.ZipFile(path) as zf: + for name in sorted(n for n in zf.namelist() if n.endswith(".md")): + yield name, zf.read(name).decode("utf-8", errors="replace") + return + + for md_path in sorted(p for p in path.rglob("*.md") if p.is_file()): + yield str(md_path.relative_to(path)).replace("\\", "/"), md_path.read_text( + encoding="utf-8", errors="replace" + ) + + +def _category_from_name(name: str) -> str: + stem = Path(name).stem + stem = stem.replace("_Hooks", "").replace("_", " ").strip() + return stem + + +def _parse_examples(path: Path) -> list[HookExample]: + examples: list[HookExample] = [] + for name, content in _iter_markdown_files(path): + category = _category_from_name(name) + for raw_line in content.splitlines(): + line = raw_line.strip() + if not line or not line[0].isdigit(): + continue + match = _ENTRY_RE.match(line) + if not match: + continue + examples.append( + HookExample( + category=category, + hook=match.group("hook").strip(), + example=match.group("example").strip(), + psychology=match.group("psychology").strip(), + ) + ) + return examples + + +def load_hook_library(path: Path | None) -> list[HookExample]: + if path is None: + return [] + fingerprint = hook_library_fingerprint(path) + if not fingerprint: + return [] + cached = _LIB_CACHE.get(fingerprint) + if cached is not None: + return cached + parsed = _parse_examples(path) + _LIB_CACHE[fingerprint] = parsed + return parsed + + +def retrieve_hook_examples( + query_text: str, + *, + topic: str = "", + path: Path | None, + limit: int = 8, +) -> list[HookExample]: + items = load_hook_library(path) + if not items: + return [] + + query_tokens = _tokenize(f"{topic} {query_text}") + query_phrases = [ + " ".join(pair) + for pair in zip(_ordered_tokens(f"{topic} {query_text}"), _ordered_tokens(f"{topic} {query_text}")[1:]) + 
] + if not query_tokens: + return items[:limit] + + scored: list[tuple[tuple[int, int, int], HookExample]] = [] + for item in items: + hook_tokens = _tokenize(item.hook) + example_tokens = _tokenize(item.example) + category_tokens = _tokenize(item.category) + hook_overlap = len(query_tokens & hook_tokens) + example_overlap = len(query_tokens & example_tokens) + category_overlap = len(query_tokens & category_tokens) + overlap = hook_overlap + example_overlap + category_overlap + if overlap == 0: + continue + psychology_overlap = len(query_tokens & _tokenize(item.psychology)) + phrase_bonus = sum(1 for phrase in query_phrases if phrase in item.example.lower()) + scored.append( + ( + ( + phrase_bonus * 5 + example_overlap * 3 + hook_overlap + category_overlap, + phrase_bonus, + example_overlap, + category_overlap + psychology_overlap, + ), + item, + ) + ) + + if not scored: + return items[:limit] + + scored.sort(key=lambda pair: pair[0], reverse=True) + return [item for _, item in scored[:limit]] + + +def format_hook_examples(examples: list[HookExample]) -> str: + if not examples: + return "" + lines: list[str] = [] + for idx, item in enumerate(examples, start=1): + lines.append( + f"{idx}. [{item.category}] Hook: {item.hook}\n" + f" Example: {item.example}\n" + f" Psychology: {item.psychology}" + ) + return "\n".join(lines) diff --git a/src/humeo/ingest.py b/src/humeo/ingest.py new file mode 100644 index 0000000000000000000000000000000000000000..edbfb2a6f221acd4639f140b14f2a1f66ccf7279 --- /dev/null +++ b/src/humeo/ingest.py @@ -0,0 +1,564 @@ +""" +Step 1 - Ingestion: Download video and generate word-level transcript. + +Responsibilities: + - Download source video from YouTube using yt-dlp. + - Extract audio track for transcription. + - Generate word-level timestamped transcript. +""" + +import json +import logging +import os +import shutil +import subprocess +from math import ceil +from pathlib import Path + +import httpx + +from humeo.video_cache import local_source_matches, write_local_source_info + +logger = logging.getLogger(__name__) + +OPENAI_MAX_UPLOAD_BYTES = 25 * 1024 * 1024 +OPENAI_TARGET_UPLOAD_BYTES = 20 * 1024 * 1024 +OPENAI_MIN_CHUNK_SEC = 300.0 +ELEVENLABS_TRANSCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text" +TRANSCRIPT_META_FILENAME = "transcript.meta.json" +ELEVENLABS_SCRIBE_MODEL = "scribe_v2" +_ELEVENLABS_SEGMENT_MAX_GAP_SEC = 0.65 +_ELEVENLABS_SEGMENT_MAX_DURATION_SEC = 6.0 +_ELEVENLABS_SEGMENT_MAX_WORDS = 18 + + +def stage_local_video(source: str | Path, output_dir: Path) -> Path: + """ + Copy a local source video into ``output_dir/source.mp4`` for cacheable reruns. 
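+
+    Illustrative call (paths assumed): ``stage_local_video("~/talks/episode.mp4",
+    Path(".humeo_work/ep1"))`` copies the file to ``.humeo_work/ep1/source.mp4``,
+    records the original path for cache checks, and returns the staged path.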
+ """ + source_path = Path(source).expanduser().resolve(strict=False) + if not source_path.is_file(): + raise FileNotFoundError(f"Local source video does not exist: {source_path}") + + output_dir.mkdir(parents=True, exist_ok=True) + staged_path = output_dir / "source.mp4" + staged_resolved = staged_path.resolve(strict=False) + + if source_path == staged_resolved: + logger.info("Using local source video in place: %s", source_path) + write_local_source_info(output_dir, source_path) + return staged_path + + if staged_path.exists() and local_source_matches(output_dir, str(source_path)): + logger.info("Local source already staged at: %s", staged_path) + return staged_path + + if source_path.suffix.lower() != ".mp4": + logger.warning( + "Local source uses %s; staging it as source.mp4 anyway.", + source_path.suffix or "", + ) + + action = "Replacing" if staged_path.exists() else "Staging" + logger.info("%s local video: %s -> %s", action, source_path, staged_path) + shutil.copy2(source_path, staged_path) + write_local_source_info(output_dir, source_path) + return staged_path + + +def download_video(youtube_url: str, output_dir: Path) -> Path: + """ + Download the best quality video+audio from YouTube. + + Returns the path to the downloaded MP4 file. + """ + output_template = str(output_dir / "source.%(ext)s") + cmd = [ + "yt-dlp", + "--format", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "--merge-output-format", "mp4", + "--output", output_template, + "--no-playlist", + "--write-info-json", + "--quiet", + youtube_url, + ] + + logger.info("Downloading video: %s", youtube_url) + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + if result.stderr: + logger.warning(result.stderr.strip()) + + # yt-dlp should produce source.mp4 + video_path = output_dir / "source.mp4" + if not video_path.exists(): + # Fallback: find any mp4 in the output dir + mp4_files = list(output_dir.glob("source.*")) + if mp4_files: + video_path = mp4_files[0] + else: + raise FileNotFoundError(f"Download failed - no output found in {output_dir}") + + logger.info("Downloaded to: %s", video_path) + return video_path + + +def extract_audio(video_path: Path, output_dir: Path) -> Path: + """ + Extract audio track from video as WAV (required by most ASR models). 
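+
+    Equivalent command line (paths illustrative):
+
+        ffmpeg -y -i source.mp4 -vn -acodec pcm_s16le -ar 16000 -ac 1 source_audio.wav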
+ """ + audio_path = output_dir / "source_audio.wav" + cmd = [ + "ffmpeg", "-y", + "-i", str(video_path), + "-vn", # no video + "-acodec", "pcm_s16le", # raw PCM + "-ar", "16000", # 16kHz sample rate (standard for ASR) + "-ac", "1", # mono + str(audio_path), + ] + + logger.info("Extracting audio to: %s", audio_path) + subprocess.run(cmd, check=True, capture_output=True) + return audio_path + + +def _resolve_elevenlabs_api_key() -> str: + key = (os.environ.get("ELEVENLABS_API_KEY") or "").strip() + if key: + return key + raise ValueError("Set ELEVENLABS_API_KEY to use ElevenLabs Scribe v2 transcription.") + + +def _elevenlabs_no_verbatim_enabled() -> bool: + raw = (os.environ.get("ELEVENLABS_NO_VERBATIM") or "true").strip().lower() + return raw not in {"0", "false", "no", "off"} + + +def resolved_transcribe_settings() -> dict[str, object]: + provider = (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "elevenlabs").strip().lower() + if provider in ("", "auto"): + if (os.environ.get("ELEVENLABS_API_KEY") or "").strip(): + provider = "elevenlabs" + else: + provider = "openai" + + if provider in ("api",): + provider = "openai" + if provider in ("local",): + provider = "whisperx" + + settings: dict[str, object] = {"provider": provider} + if provider == "elevenlabs": + settings.update( + { + "model_id": ELEVENLABS_SCRIBE_MODEL, + "no_verbatim": _elevenlabs_no_verbatim_enabled(), + } + ) + return settings + + +def transcript_cache_valid(output_dir: Path) -> bool: + transcript_path = output_dir / "transcript.json" + meta_path = output_dir / TRANSCRIPT_META_FILENAME + if not transcript_path.is_file() or not meta_path.is_file(): + return False + try: + meta = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return False + return meta == resolved_transcribe_settings() + + +def _write_transcript(output_dir: Path, transcript: dict) -> None: + transcript_path = output_dir / "transcript.json" + with open(transcript_path, "w", encoding="utf-8") as f: + json.dump(transcript, f, indent=2, ensure_ascii=False) + with open(output_dir / TRANSCRIPT_META_FILENAME, "w", encoding="utf-8") as f: + json.dump(resolved_transcribe_settings(), f, indent=2, ensure_ascii=False) + f.write("\n") + + +def _normalize_elevenlabs_word(raw_word: dict) -> dict | None: + if not isinstance(raw_word, dict): + return None + if str(raw_word.get("type", "word")).strip().lower() not in {"word", ""}: + return None + text = str(raw_word.get("text", raw_word.get("word", ""))).strip() + if not text: + return None + try: + start = float(raw_word["start"]) + end = float(raw_word["end"]) + except (KeyError, TypeError, ValueError): + return None + if end <= start: + return None + return {"word": text, "start": start, "end": end} + + +def _segment_words_into_transcript(words: list[dict], *, language: str) -> dict: + segments: list[dict] = [] + chunk: list[dict] = [] + + def flush() -> None: + if not chunk: + return + segments.append( + { + "start": chunk[0]["start"], + "end": chunk[-1]["end"], + "text": " ".join(str(word["word"]) for word in chunk).strip(), + "words": list(chunk), + } + ) + chunk.clear() + + for word in words: + if chunk: + gap = float(word["start"]) - float(chunk[-1]["end"]) + dur = float(word["end"]) - float(chunk[0]["start"]) + if ( + gap >= _ELEVENLABS_SEGMENT_MAX_GAP_SEC + or dur >= _ELEVENLABS_SEGMENT_MAX_DURATION_SEC + or len(chunk) >= _ELEVENLABS_SEGMENT_MAX_WORDS + ): + flush() + chunk.append(word) + flush() + return {"segments": segments, "language": language} + + +def 
_normalize_elevenlabs_response(data: dict) -> dict: + words = [ + word + for raw_word in data.get("words", []) or [] + if (word := _normalize_elevenlabs_word(raw_word)) is not None + ] + language = str( + data.get("language_code") or data.get("language") or "en" + ).strip() or "en" + return _segment_words_into_transcript(words, language=language) + + +def _transcribe_elevenlabs_scribe(audio_path: Path) -> dict: + headers = {"xi-api-key": _resolve_elevenlabs_api_key()} + form = { + "model_id": ELEVENLABS_SCRIBE_MODEL, + "timestamps_granularity": "word", + "diarize": "false", + "tag_audio_events": "false", + "file_format": "pcm_s16le_16", + "no_verbatim": "true" if _elevenlabs_no_verbatim_enabled() else "false", + } + with audio_path.open("rb") as handle: + files = {"file": (audio_path.name, handle, "audio/wav")} + response = httpx.post( + ELEVENLABS_TRANSCRIBE_URL, + headers=headers, + data=form, + files=files, + timeout=600.0, + ) + response.raise_for_status() + return _normalize_elevenlabs_response(response.json()) + + +def _transcribe_whisperx_local(audio_path: Path) -> dict: + """Word-level transcript via WhisperX (local). Raises ImportError if not installed.""" + import whisperx + + logger.info("Transcribing with WhisperX...") + device = "cpu" # Use "cuda" if GPU available + model = whisperx.load_model("base", device=device, compute_type="int8") + audio = whisperx.load_audio(str(audio_path)) + result = model.transcribe(audio, batch_size=16) + + align_model, metadata = whisperx.load_align_model( + language_code=result["language"], device=device + ) + result = whisperx.align( + result["segments"], align_model, metadata, audio, device, + return_char_alignments=False, + ) + + logger.info("Transcription complete: %d segments", len(result["segments"])) + return result + + +def transcribe_whisperx(audio_path: Path, output_dir: Path) -> dict: + """ + Transcribe audio for word-level timestamps. + + Provider is controlled by **HUMEO_TRANSCRIBE_PROVIDER** (default ``auto``): + + - ``auto`` β€” WhisperX if installed, else OpenAI Whisper API. + - ``openai`` / ``api`` β€” OpenAI Whisper API (uses ``OPENAI_API_KEY``), even when WhisperX is installed. + - ``whisperx`` / ``local`` β€” WhisperX only; fails clearly if not installed. + + The result is written to ``output_dir / "transcript.json"``. Re-runs with an + existing transcript are skipped by the pipeline before this function runs. + """ + settings = resolved_transcribe_settings() + provider = str(settings["provider"]) + + if provider == "elevenlabs": + logger.info( + "Transcribing with ElevenLabs Scribe v2 (no_verbatim=%s).", + bool(settings.get("no_verbatim", False)), + ) + result = _transcribe_elevenlabs_scribe(audio_path) + elif provider == "openai": + logger.info( + "Transcribing with OpenAI Whisper API (HUMEO_TRANSCRIBE_PROVIDER=%s).", + provider, + ) + result = _transcribe_openai_api(audio_path) + elif provider == "whisperx": + try: + result = _transcribe_whisperx_local(audio_path) + except ImportError as e: + raise RuntimeError( + "WhisperX requested (HUMEO_TRANSCRIBE_PROVIDER=whisperx) but whisperx is not installed. " + "Install with: uv sync --extra whisper" + ) from e + else: + raise RuntimeError( + f"Unknown HUMEO_TRANSCRIBE_PROVIDER={provider!r}. " + "Use elevenlabs, openai, or whisperx." + ) + + _write_transcript(output_dir, result) + + return result + + +def _transcribe_openai_api(audio_path: Path) -> dict: + """ + Fallback transcription using OpenAI's Whisper API. + Requires OPENAI_API_KEY environment variable. 
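+
+    Worked example for the chunking below (numbers illustrative): a 60 MB
+    16 kHz mono PCM WAV (~33 minutes) exceeds the 25 MB upload limit, so the
+    planner targets ~20 MB per chunk; 60 / 20 gives 3 roughly equal ~11-minute
+    chunks, each transcribed separately and then merged with time offsets.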
+ """ + from openai import OpenAI + + client = OpenAI() + + work_dir = audio_path.parent / "openai_transcribe" + work_dir.mkdir(parents=True, exist_ok=True) + duration_sec = _probe_media_duration(audio_path) + chunk_ranges = _plan_openai_chunk_ranges( + duration_sec=duration_sec, + file_size_bytes=audio_path.stat().st_size, + ) + + if len(chunk_ranges) == 1: + return _transcribe_openai_file(client, audio_path) + + logger.info("Audio exceeds OpenAI upload limit; transcribing in %d chunks.", len(chunk_ranges)) + chunk_transcripts: list[dict] = [] + for idx, (offset_sec, chunk_duration_sec) in enumerate(chunk_ranges, start=1): + chunk_path = work_dir / f"{audio_path.stem}_part_{idx:03d}.wav" + if not chunk_path.exists(): + _extract_openai_audio_chunk( + input_path=audio_path, + output_path=chunk_path, + offset_sec=offset_sec, + duration_sec=chunk_duration_sec, + ) + logger.info( + "Transcribing chunk %d/%d (%.1fs-%.1fs)", + idx, + len(chunk_ranges), + offset_sec, + offset_sec + chunk_duration_sec, + ) + chunk_transcript = _transcribe_openai_file(client, chunk_path) + chunk_transcripts.append(_offset_transcript_timestamps(chunk_transcript, offset_sec)) + + return _merge_transcripts(chunk_transcripts) + + +def _extract_openai_audio_chunk( + input_path: Path, + output_path: Path, + offset_sec: float, + duration_sec: float, +) -> Path: + cmd = [ + "ffmpeg", + "-y", + "-loglevel", + "error", + "-ss", + f"{offset_sec:.3f}", + "-t", + f"{duration_sec:.3f}", + "-i", + str(input_path), + "-vn", + "-acodec", + "pcm_s16le", + "-ac", + "1", + "-ar", + "16000", + str(output_path), + ] + subprocess.run(cmd, check=True, capture_output=True) + return output_path + + +def _probe_media_duration(media_path: Path) -> float: + cmd = [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "json", + str(media_path), + ] + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + data = json.loads(result.stdout) + return float(data["format"]["duration"]) + + +def _plan_openai_chunk_ranges( + *, + duration_sec: float, + file_size_bytes: int, + max_upload_bytes: int = OPENAI_MAX_UPLOAD_BYTES, + target_upload_bytes: int = OPENAI_TARGET_UPLOAD_BYTES, +) -> list[tuple[float, float]]: + if file_size_bytes <= max_upload_bytes: + return [(0.0, duration_sec)] + + chunk_sec = max( + OPENAI_MIN_CHUNK_SEC, + duration_sec * (target_upload_bytes / file_size_bytes), + ) + chunk_count = max(2, ceil(duration_sec / chunk_sec)) + exact_chunk_sec = duration_sec / chunk_count + + ranges: list[tuple[float, float]] = [] + for idx in range(chunk_count): + start = idx * exact_chunk_sec + end = min(duration_sec, (idx + 1) * exact_chunk_sec) + ranges.append((round(start, 3), round(end - start, 3))) + return ranges + + +def _transcribe_openai_file(client, audio_path: Path) -> dict: + with open(audio_path, "rb") as f: + response = client.audio.transcriptions.create( + model="whisper-1", + file=f, + response_format="verbose_json", + timestamp_granularities=["word", "segment"], + ) + return _normalize_openai_response(response) + + +def _normalize_openai_response(response: object) -> dict: + data = response.model_dump() if hasattr(response, "model_dump") else response + if not isinstance(data, dict): + raise TypeError(f"Unexpected transcription payload type: {type(data)!r}") + + top_words = [_normalize_word(word) for word in data.get("words", []) or []] + segments: list[dict] = [] + word_index = 0 + + for raw_segment in data.get("segments", []) or []: + segment = raw_segment.model_dump() if 
hasattr(raw_segment, "model_dump") else raw_segment + if not isinstance(segment, dict): + continue + + start = float(segment.get("start", 0.0)) + end = float(segment.get("end", 0.0)) + text = str(segment.get("text", "")).strip() + + segment_words = [_normalize_word(word) for word in segment.get("words", []) or []] + if not segment_words and top_words: + while word_index < len(top_words) and top_words[word_index]["end"] <= start: + word_index += 1 + + probe_index = word_index + while probe_index < len(top_words) and top_words[probe_index]["start"] < end: + word = top_words[probe_index] + if word["end"] > start: + segment_words.append(word) + probe_index += 1 + word_index = probe_index + + segments.append( + { + "start": start, + "end": end, + "text": text, + "words": segment_words, + } + ) + + if not segments and top_words: + segments.append( + { + "start": top_words[0]["start"], + "end": top_words[-1]["end"], + "text": " ".join(word["word"] for word in top_words).strip(), + "words": top_words, + } + ) + + return { + "segments": segments, + "language": str(data.get("language", "en") or "en"), + } + + +def _normalize_word(raw_word: object) -> dict: + word = raw_word.model_dump() if hasattr(raw_word, "model_dump") else raw_word + if not isinstance(word, dict): + return {"word": "", "start": 0.0, "end": 0.0} + return { + "word": str(word.get("word", "")).strip(), + "start": float(word.get("start", 0.0)), + "end": float(word.get("end", 0.0)), + } + + +def _offset_transcript_timestamps(transcript: dict, offset_sec: float) -> dict: + shifted_segments = [] + for segment in transcript.get("segments", []): + shifted_segments.append( + { + "start": float(segment["start"]) + offset_sec, + "end": float(segment["end"]) + offset_sec, + "text": segment["text"], + "words": [ + { + "word": word["word"], + "start": float(word["start"]) + offset_sec, + "end": float(word["end"]) + offset_sec, + } + for word in segment.get("words", []) + ], + } + ) + return { + "segments": shifted_segments, + "language": transcript.get("language", "en"), + } + + +def _merge_transcripts(transcripts: list[dict]) -> dict: + merged_segments = [] + language = "en" + for transcript in transcripts: + merged_segments.extend(transcript.get("segments", [])) + if transcript.get("language"): + language = transcript["language"] + return { + "segments": merged_segments, + "language": language, + } diff --git a/src/humeo/interactive.py b/src/humeo/interactive.py new file mode 100644 index 0000000000000000000000000000000000000000..64e193248cdbecc27cc8713103ba3ced458f3025 --- /dev/null +++ b/src/humeo/interactive.py @@ -0,0 +1,140 @@ +"""Plain stdin interactive gates for the pipeline.""" + +from __future__ import annotations + +from pathlib import Path + +from humeo_core.schemas import ApprovalResult, Clip, RatingFeedback + +_ISSUE_MAP = { + "a": "wrong_moments", + "b": "bad_cuts", + "c": "boring", + "d": "confusing", + "e": "wrong_layout", + "f": "length_off", + "g": "other", +} + + +def _preview(text: str, limit: int = 100) -> str: + compact = " ".join(text.split()) + if len(compact) <= limit: + return compact + return compact[: limit - 3].rstrip() + "..." 
+ + +def approve_clips(clips: list[Clip]) -> ApprovalResult: + """Prompt the user to approve or refine the selected clips.""" + clip_ids = [clip.clip_id for clip in clips] + + for clip in clips: + print( + f'[{clip.clip_id}] score={clip.virality_score:.2f} ' + f'duration={clip.duration_sec:.1f}s "{clip.topic}"' + ) + print(f' "{_preview(clip.transcript)}"') + + print() + print("Actions:") + print(" numbers in order (e.g. '3,1,5') β€” select these clips to proceed") + print(" 'all' β€” accept all clips as-is") + print(" 'refine ' β€” re-run selection with steering") + print(" 'quit' β€” abort pipeline") + print() + + while True: + raw = input("> ").strip() + lowered = raw.lower() + + if lowered == "all": + return ApprovalResult(action="accept_all", selected_ids=list(clip_ids)) + if lowered == "quit": + return ApprovalResult(action="quit") + if lowered.startswith("refine"): + note = raw[6:].strip() + if not note: + print("Refine requires a note. Try: refine more emotional clips") + continue + return ApprovalResult(action="refine", steering_note=note) + + tokens = [token.strip() for token in raw.split(",") if token.strip()] + if not tokens: + print("Enter clip numbers, 'all', 'refine ', or 'quit'.") + continue + + selected_ids: list[str] = [] + seen_ids: set[str] = set() + invalid = False + for token in tokens: + clip_id: str | None = None + if token in clip_ids: + clip_id = token + elif token.isdigit(): + idx = int(token) + if 1 <= idx <= len(clips): + clip_id = clips[idx - 1].clip_id + + if clip_id is None: + print(f"Unknown clip selection: {token}") + invalid = True + break + if clip_id in seen_ids: + print(f"Duplicate clip selection: {token}") + invalid = True + break + seen_ids.add(clip_id) + selected_ids.append(clip_id) + + if invalid: + continue + + return ApprovalResult(action="proceed", selected_ids=selected_ids) + + +def rate_output(outputs: list[Path]) -> RatingFeedback: + """Prompt the user to rate the rendered outputs.""" + print("Outputs:") + for path in outputs: + print(f" {path}") + print() + print("Watch them, then rate:") + print(" 1. slop 2. good 3. great") + + while True: + rating_raw = input("> ").strip() + if rating_raw in {"1", "2", "3"}: + rating = int(rating_raw) + break + print("Enter 1, 2, or 3.") + + if rating == 3: + return RatingFeedback(rating=3) + + print("What's wrong? 
(space-separated letters, or empty for skip):") + print(" [a] wrong_moments [b] bad_cuts [c] boring [d] confusing") + print(" [e] wrong_layout [f] length_off [g] other (free text)") + + while True: + issues_raw = input("> ").strip() + if not issues_raw: + return RatingFeedback(rating=rating) + + tokens = issues_raw.lower().split() + issues: list[str] = [] + invalid = [token for token in tokens if token not in _ISSUE_MAP] + if invalid: + print(f"Unknown issue selection: {' '.join(invalid)}") + continue + + for token in tokens: + issue = _ISSUE_MAP[token] + if issue not in issues: + issues.append(issue) + + free_text = None + if "other" in issues: + other_text = input("> ").strip() + free_text = other_text or None + + return RatingFeedback(rating=rating, issues=issues, free_text=free_text) diff --git a/src/humeo/layout_vision.py b/src/humeo/layout_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..55e7c3e42689c1e4b4ee6b023d5dec3ebd5aa0a6 --- /dev/null +++ b/src/humeo/layout_vision.py @@ -0,0 +1,1582 @@ +"""Per-clip layout + bbox via Gemini vision (no pixel heuristics in the product pipeline).""" + +from __future__ import annotations + +import base64 +import hashlib +import json +import logging +import os +import struct +import subprocess +from collections.abc import Iterable +from io import BytesIO +from pathlib import Path +from typing import Any + +from google import genai +from google.genai import types +from openai import OpenAI + +from humeo_core.schemas import ( + BoundingBox, + LayoutInstruction, + LayoutKind, + Scene, + SceneClassification, + SceneRegions, + TimedCenterPoint, +) +from humeo_core.primitives.vision import layout_instruction_from_regions + +from humeo.config import GEMINI_MODEL, GEMINI_VISION_MODEL, PipelineConfig +from humeo.env import ( + OPENROUTER_BASE_URL, + current_llm_provider, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.gemini_generate import gemini_generate_config + +logger = logging.getLogger(__name__) + +LAYOUT_VISION_CACHE_VERSION = 8 +LAYOUT_VISION_META = "layout_vision.meta.json" +LAYOUT_VISION_JSON = "layout_vision.json" +TRACKING_SAMPLE_FRACTIONS = tuple(i / 10.0 for i in range(1, 10)) +TRACKING_MIN_SPREAD_NORM = 0.08 +TRACKING_OUTLIER_DELTA_NORM = 0.16 +TRACKING_OUTLIER_NEIGHBOR_MAX_NORM = 0.10 +TRACKING_DEADBAND_NORM = 0.025 +TRACKING_MIN_USABLE_POINTS = 5 +TRACKING_UNSTABLE_JUMP_NORM = 0.18 +FOCUS_SWITCH_LEAD_SEC = 0.35 +SPEAKER_FOLLOW_MAX_INTERVAL_SEC = 2.0 +TWO_SPEAKER_ACTIVE_ZOOM = 1.28 +TWO_SPEAKER_BOTH_ZOOM = 1.0 +TWO_SPEAKER_WIDE_ACTIVE_ZOOM = 1.12 +TWO_SPEAKER_BOTH_FIT_MARGIN = 0.88 +REPLICATE_SAM2_VIDEO_PINNED = ( + "meta/sam-2-video:2d7219877ca847f463d749d9b224e62f7b078fe035d60a74b58889b455d5cbad" +) +_MIN_SPLIT_STRIP_FRAC = 0.2 +_SPLIT_TOP_RATIO_MIN = 0.32 +_SPLIT_TOP_RATIO_MAX = 0.48 +_SPLIT_FACE_REGION_MIN_HEIGHT = 0.62 +_SPLIT_FACE_REGION_HEIGHT_MULT = 2.0 +_SPLIT_FACE_TOP_PAD_MULT = 0.30 + +GEMINI_LAYOUT_VISION_PROMPT = """You are framing a vertical short (9:16) from a 16:9 video frame. + +HARD RULE: the final short shows AT MOST TWO on-screen items. An "item" is one +of person (a human speaker) or chart (slide, graph, data visual, screenshare). +That gives exactly five layouts to choose from. 
+ +Return ONLY a JSON object with this exact shape: +{ + "layout": "zoom_call_center" | "sit_center" | "split_chart_person" | "split_two_persons" | "split_two_charts", + "person_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "face_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "second_person_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "second_face_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "second_chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "reason": "short rationale" +} + +Bbox rules: +- All bbox coordinates are normalized 0..1 (left/top = 0, right/bottom = 1). Require x2 > x1 and y2 > y1 when a bbox is non-null. +- person_bbox / second_person_bbox: tight box around each speaker's head AND upper body. If two speakers are visible, ``person_bbox`` is the LEFT speaker and ``second_person_bbox`` is the RIGHT speaker (by x-center). +- face_bbox / second_face_bbox: TIGHT box around the SPEAKER'S FACE ONLY (forehead to chin, ear to ear). This is NOT the full body β€” exclude torso, arms, shoulders, tank top, mug, table. The face bbox drives horizontal framing in the 9:16 crop, so putting torso or arms in it will push the face off-screen. + * If the subject is shown in profile, the face_bbox still surrounds only the visible half of the head (ear to nose, forehead to chin). It should be roughly square-ish, not a tall body rectangle. + * ``face_bbox`` matches ``person_bbox`` (same speaker), ``second_face_bbox`` matches ``second_person_bbox``. + * Set face bbox to null ONLY if no face is visible at all (back of head, occluded, off-frame). +- chart_bbox / second_chart_bbox: slide, chart, graph, or large on-screen graphic. If two charts are visible, ``chart_bbox`` is the LEFT chart and ``second_chart_bbox`` is the RIGHT chart. +- The two bboxes of the same kind must not overlap meaningfully; they should partition the source frame into distinct regions. + +Layout selection (pick exactly one): +- zoom_call_center: ONE person, tight webcam / video-call headshot filling much of the frame. person_bbox + face_bbox set; others null. +- sit_center: ONE person, interview / seated framing, or when unsure. person_bbox + face_bbox set; others null. +- split_chart_person: ONE chart + ONE person in distinct regions (webinar / explainer). person_bbox + face_bbox + chart_bbox set; second_* null. +- split_two_persons: TWO visible speakers (interview two-up, podcast panel). person_bbox + face_bbox AND second_person_bbox + second_face_bbox set; chart bboxes null. +- split_two_charts: TWO charts / slides side-by-side. chart_bbox AND second_chart_bbox set; person/face bboxes null. + +When in doubt prefer ``sit_center``. Never output more than two of {person, chart} items in total. +No markdown. JSON only.""" + +ACTIVE_SPEAKER_VISION_PROMPT = """You are analyzing a single frame from a two-person talking video. + +Return ONLY a JSON object: +{ + "speaker": "left" | "right" | "both" | "unclear", + "reason": "short rationale" +} + +Rules: +- "left" means the LEFT visible person appears to be the one speaking in this exact frame. +- "right" means the RIGHT visible person appears to be the one speaking in this exact frame. +- Use visible cues only: open mouth mid-word, facial expression while talking, hand gesture timing, body engagement. +- If both appear to be talking at once, return "both". +- If it is impossible to tell from this frame, return "unclear". 
+- No markdown. JSON only.""" + + +def _openai_message_text(content: object) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + + +def _json_object_from_vision_response(raw: object) -> dict[str, Any]: + if isinstance(raw, dict): + return raw + if isinstance(raw, list): + for item in raw: + if isinstance(item, dict): + return item + raise TypeError(f"Expected vision JSON object, got {type(raw).__name__}") + + +def _clips_fingerprint(clips_path: Path) -> str: + if not clips_path.is_file(): + return "" + return hashlib.sha256(clips_path.read_bytes()).hexdigest() + + +def layout_cache_valid( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + vision_model: str, + segmentation_provider: str = "off", + segmentation_model: str = "meta/sam-2-video", +) -> bool: + meta_path = work_dir / LAYOUT_VISION_META + data_path = work_dir / LAYOUT_VISION_JSON + if not meta_path.is_file() or not data_path.is_file(): + return False + try: + meta: dict[str, Any] = json.loads(meta_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return False + return ( + meta.get("layout_vision_cache_version") == LAYOUT_VISION_CACHE_VERSION + and + meta.get("transcript_sha256") == transcript_fp + and meta.get("clips_sha256") == clips_fp + and meta.get("gemini_vision_model") == vision_model + and meta.get("segmentation_provider", "off") == segmentation_provider + and meta.get("segmentation_model", "meta/sam-2-video") == segmentation_model + and ( + current_llm_provider() is None + or ( + current_llm_provider() == "google" + and meta.get("llm_backend") in (None, "google") + ) + or meta.get("llm_backend") == current_llm_provider() + ) + ) + + +def load_layout_cache(work_dir: Path) -> dict[str, dict[str, Any]] | None: + p = work_dir / LAYOUT_VISION_JSON + if not p.is_file(): + return None + try: + data = json.loads(p.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return None + clips = data.get("clips") + return clips if isinstance(clips, dict) else None + + +def write_layout_cache( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + vision_model: str, + clips_payload: dict[str, dict[str, Any]], + segmentation_provider: str = "off", + segmentation_model: str = "meta/sam-2-video", +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + meta = { + "layout_vision_cache_version": LAYOUT_VISION_CACHE_VERSION, + "transcript_sha256": transcript_fp, + "clips_sha256": clips_fp, + "gemini_vision_model": vision_model, + "llm_backend": current_llm_provider() or "google", + "segmentation_provider": segmentation_provider, + "segmentation_model": segmentation_model, + } + (work_dir / LAYOUT_VISION_META).write_text( + json.dumps(meta, indent=2) + "\n", encoding="utf-8" + ) + (work_dir / LAYOUT_VISION_JSON).write_text( + json.dumps({"clips": clips_payload}, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + logger.info("Wrote %s and %s", LAYOUT_VISION_META, LAYOUT_VISION_JSON) + + +def _png_dims(path: Path) -> tuple[int, int] | None: + try: + with path.open("rb") as f: + head = f.read(24) + if head[:8] != b"\x89PNG\r\n\x1a\n": + return None + width, height = struct.unpack(">II", head[16:24]) + return int(width), int(height) + except Exception: + return None + + +def _jpeg_dims(path: Path) -> tuple[int, int] | None: + try: + 
with path.open("rb") as f: + if f.read(2) != b"\xff\xd8": + return None + sof_markers = { + 0xC0, + 0xC1, + 0xC2, + 0xC3, + 0xC5, + 0xC6, + 0xC7, + 0xC9, + 0xCA, + 0xCB, + 0xCD, + 0xCE, + 0xCF, + } + while True: + marker_start = f.read(1) + if not marker_start: + return None + if marker_start != b"\xff": + continue + marker = f.read(1) + while marker == b"\xff": + marker = f.read(1) + if not marker: + return None + marker_byte = marker[0] + if marker_byte in (0xD8, 0xD9, 0x01) or 0xD0 <= marker_byte <= 0xD7: + continue + seg_len_bytes = f.read(2) + if len(seg_len_bytes) != 2: + return None + seg_len = struct.unpack(">H", seg_len_bytes)[0] + if seg_len < 2: + return None + if marker_byte in sof_markers: + frame_header = f.read(5) + if len(frame_header) != 5: + return None + _, height, width = struct.unpack(">BHH", frame_header) + return int(width), int(height) + f.seek(seg_len - 2, 1) + except Exception: + return None + + +def _keyframe_dimensions(keyframe_path: str) -> tuple[int, int] | None: + path = Path(keyframe_path) + try: + from PIL import Image # type: ignore + + with Image.open(path) as img: + width, height = img.size + return int(width), int(height) + except Exception: + pass + + png_dims = _png_dims(path) + if png_dims is not None: + return png_dims + return _jpeg_dims(path) + + +def _normalize_bbox_payload( + raw: dict[str, Any], image_size: tuple[int, int] | None +) -> dict[str, Any]: + if image_size is None: + return dict(raw) + + width, height = image_size + normalized = dict(raw) + x_values = [ + float(normalized[key]) + for key in ("x1", "x2") + if isinstance(normalized.get(key), (int, float)) + ] + y_values = [ + float(normalized[key]) + for key in ("y1", "y2") + if isinstance(normalized.get(key), (int, float)) + ] + + if not x_values and not y_values: + return normalized + + use_thousand_grid = False + if any(v > 1.0 for v in x_values + y_values): + max_coord = max(x_values + y_values) + fits_image_pixels = ( + all(v <= float(width) for v in x_values) + and all(v <= float(height) for v in y_values) + ) + if max_coord <= 1000.0 and not fits_image_pixels: + use_thousand_grid = True + + x_scale = 1000.0 if use_thousand_grid else float(width) + y_scale = 1000.0 if use_thousand_grid else float(height) + + axis_scales = { + "x1": x_scale, + "x2": x_scale, + "y1": y_scale, + "y2": y_scale, + } + for key, axis_scale in axis_scales.items(): + value = normalized.get(key) + if not isinstance(value, (int, float)): + continue + coord = float(value) + if coord > 1.0 and axis_scale > 0.0: + coord = coord / axis_scale + normalized[key] = max(0.0, min(coord, 1.0)) + return normalized + + +def _parse_bbox( + raw: object, *, image_size: tuple[int, int] | None = None +) -> BoundingBox | None: + if not raw or not isinstance(raw, dict): + return None + try: + return BoundingBox.model_validate(_normalize_bbox_payload(raw, image_size)) + except Exception: + return None + + +def _instruction_from_gemini_json( + scene_id: str, + data: dict[str, Any], + *, + image_size: tuple[int, int] | None = None, +) -> LayoutInstruction: + """Translate Gemini's JSON into a validated :class:`LayoutInstruction`. + + Falls back to ``sit_center`` whenever the LLM returns something the + contract doesn't support, so a bad vision call can never crash the + pipeline. Also downgrades "two-item" layouts when the second bbox is + missing -- e.g. ``split_two_persons`` with only one person_bbox drops + to ``sit_center`` rather than rendering a silently-broken split. 
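+
+    Illustrative downgrade (payload abbreviated): ``{"layout":
+    "split_chart_person", "person_bbox": {...}, "chart_bbox": null}`` renders as
+    ``sit_center`` because the chart bbox that split requires is missing.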
+ """ + + layout_str = str(data.get("layout", "sit_center")).strip() + try: + kind = LayoutKind(layout_str) + except ValueError: + kind = LayoutKind.SIT_CENTER + + pb = _parse_bbox(data.get("person_bbox"), image_size=image_size) + fb = _parse_bbox(data.get("face_bbox"), image_size=image_size) + cb = _parse_bbox(data.get("chart_bbox"), image_size=image_size) + p2 = _parse_bbox(data.get("second_person_bbox"), image_size=image_size) + f2 = _parse_bbox(data.get("second_face_bbox"), image_size=image_size) + c2 = _parse_bbox(data.get("second_chart_bbox"), image_size=image_size) + reason = str(data.get("reason", ""))[:400] + + # Downgrade any split that is missing its required bboxes, so we never + # emit a split layout that will render as garbage. + if kind == LayoutKind.SPLIT_CHART_PERSON and (pb is None or cb is None): + kind = LayoutKind.SIT_CENTER if pb is not None else LayoutKind.SIT_CENTER + if kind == LayoutKind.SPLIT_TWO_PERSONS and (pb is None or p2 is None): + kind = LayoutKind.SIT_CENTER + if kind == LayoutKind.SPLIT_TWO_CHARTS and (cb is None or c2 is None): + kind = LayoutKind.SIT_CENTER + + regions = SceneRegions( + scene_id=scene_id, person_bbox=pb, chart_bbox=cb, raw_reason=reason + ) + classification = SceneClassification( + scene_id=scene_id, layout=kind, confidence=1.0, reason=reason + ) + instr = layout_instruction_from_regions( + regions, classification, clip_id=scene_id + ) + + updates: dict[str, Any] = {} + + # CENTERING FIX: the single-person 9:16 crop is driven by ``person_x_norm``. + # A ``person_bbox`` that spans head + torso + arms is fine for framing + # *extent* but its center_x can drift far from the actual face when the + # subject is in profile or asymmetric (one arm up, mug on the table, etc). + # Prefer the tight ``face_bbox`` center when the model gave us one so the + # face lands in the visual center of the vertical crop instead of the + # torso doing. + face_center = _face_center_x(fb, pb) + if face_center is not None: + updates["person_x_norm"] = face_center + + if kind == LayoutKind.SPLIT_CHART_PERSON and pb is not None and cb is not None: + render_person = _render_safe_split_person_region(pb, fb) + updates["split_chart_region"] = cb + updates["split_person_region"] = render_person + updates["top_band_ratio"] = _split_chart_person_top_band_ratio(cb, render_person) + elif kind == LayoutKind.SPLIT_TWO_PERSONS and pb is not None and p2 is not None: + # Order by x-center so ``split_person_region`` is always the LEFT speaker. + left, right = sorted((pb, p2), key=lambda b: b.center_x) + updates["split_person_region"] = left + updates["split_second_person_region"] = right + elif kind == LayoutKind.SPLIT_TWO_CHARTS and cb is not None and c2 is not None: + left, right = sorted((cb, c2), key=lambda b: b.center_x) + updates["split_chart_region"] = left + updates["split_second_chart_region"] = right + + if updates: + instr = instr.model_copy(update=updates) + return instr + + +def _face_center_x( + face: BoundingBox | None, person: BoundingBox | None +) -> float | None: + """Pick a horizontal center to aim the 9:16 crop at. + + Priority: + 1. ``face`` bbox center when it looks reasonable (narrow, plausibly + inside the matching person bbox). + 2. No override (caller keeps the person-bbox center, or the default 0.5 + when neither was provided). + + We sanity-check the face box because Gemini sometimes echoes the full + person bbox into ``face_bbox``. 
If the face bbox is as wide as the + person bbox, it gives us nothing new; fall back to the person center + rather than pretending we have a tighter signal. + """ + if face is None: + return None + face_w = max(0.0, face.x2 - face.x1) + if face_w <= 0.0: + return None + # A real face in a 16:9 frame is rarely wider than ~35% of frame width, + # even for tight webcam framing. A face "bbox" that's wider than that + # almost certainly includes torso and is no better than person_bbox. + if face_w > 0.40: + return None + # If we have a person bbox too, require the face center to sit inside it + # β€” otherwise the model got confused and matched the wrong subject. + if person is not None: + if not (person.x1 - 0.02 <= face.center_x <= person.x2 + 0.02): + return None + return float(face.center_x) + + +def _render_safe_split_person_region( + person: BoundingBox, + face: BoundingBox | None, +) -> BoundingBox: + """Bias split speaker crops toward head-and-shoulders instead of torso.""" + + if face is None or _face_center_x(face, person) is None: + return person + + face_h = max(0.0, face.y2 - face.y1) + if face_h <= 0.0: + return person + + target_h = min( + person.y2 - person.y1, + max(_SPLIT_FACE_REGION_MIN_HEIGHT, face_h * _SPLIT_FACE_REGION_HEIGHT_MULT), + ) + top = max(0.0, min(person.y1, face.y1 - face_h * _SPLIT_FACE_TOP_PAD_MULT)) + bottom = min(person.y2, top + target_h) + if bottom - top < target_h: + top = max(0.0, bottom - target_h) + if bottom - top <= face_h: + return person + + return person.model_copy(update={"y1": top, "y2": bottom}) + + +def _split_chart_person_top_band_ratio( + chart: BoundingBox, + person: BoundingBox, +) -> float: + """Allocate top/bottom band height from the chart/person aspect needs.""" + + seam = (chart.x2 + person.x1) / 2.0 + seam = max(_MIN_SPLIT_STRIP_FRAC, min(1.0 - _MIN_SPLIT_STRIP_FRAC, seam)) + chart_w = max(1e-6, seam) + person_w = max(1e-6, 1.0 - seam) + chart_need = max(1e-6, (chart.y2 - chart.y1) / chart_w) + person_need = max(1e-6, (person.y2 - person.y1) / person_w) + ratio = chart_need / (chart_need + person_need) + return round(max(_SPLIT_TOP_RATIO_MIN, min(_SPLIT_TOP_RATIO_MAX, ratio)), 3) + + +def _person_center_x_from_data( + data: dict[str, Any], image_size: tuple[int, int] | None = None +) -> float | None: + person_bbox = _parse_bbox(data.get("person_bbox"), image_size=image_size) + face_bbox = _parse_bbox(data.get("face_bbox"), image_size=image_size) + face_center = _face_center_x(face_bbox, person_bbox) + if face_center is not None: + return face_center + if person_bbox is not None: + return float(person_bbox.center_x) + return None + + +def _tracking_sample_times(duration_sec: float) -> list[float]: + seen: set[float] = set() + out: list[float] = [] + for fraction in TRACKING_SAMPLE_FRACTIONS: + t_sec = max(0.0, min(duration_sec, duration_sec * fraction)) + key = round(t_sec, 3) + if key in seen: + continue + seen.add(key) + out.append(t_sec) + return out + + +def _tracking_points_from_centers( + duration_sec: float, centers: list[tuple[float, float]] +) -> list[TimedCenterPoint]: + deduped: list[tuple[float, float]] = [] + for t_sec, x_norm in sorted(centers, key=lambda item: item[0]): + clamped_t = max(0.0, min(duration_sec, float(t_sec))) + clamped_x = max(0.0, min(1.0, float(x_norm))) + if deduped and abs(clamped_t - deduped[-1][0]) < 1e-6: + deduped[-1] = (clamped_t, clamped_x) + else: + deduped.append((clamped_t, clamped_x)) + + if len(deduped) < 2: + return [] + + filtered = list(deduped) + for idx in range(1, len(filtered) - 
1): + prev_x = filtered[idx - 1][1] + curr_t, curr_x = filtered[idx] + next_x = filtered[idx + 1][1] + if ( + abs(prev_x - next_x) <= TRACKING_OUTLIER_NEIGHBOR_MAX_NORM + and abs(curr_x - prev_x) >= TRACKING_OUTLIER_DELTA_NORM + and abs(curr_x - next_x) >= TRACKING_OUTLIER_DELTA_NORM + ): + filtered[idx] = (curr_t, (prev_x + next_x) / 2.0) + + smoothed = list(filtered) + for idx in range(1, len(filtered) - 1): + prev_x = filtered[idx - 1][1] + curr_t, curr_x = filtered[idx] + next_x = filtered[idx + 1][1] + median_x = sorted((prev_x, curr_x, next_x))[1] + if abs(curr_x - median_x) > TRACKING_DEADBAND_NORM: + smoothed[idx] = (curr_t, median_x) + + if len(smoothed) >= 5: + wider_smoothed = list(smoothed) + for idx in range(1, len(smoothed) - 1): + window = smoothed[max(0, idx - 2) : min(len(smoothed), idx + 3)] + median_x = sorted(x for _, x in window)[len(window) // 2] + curr_t, curr_x = smoothed[idx] + if abs(curr_x - median_x) >= TRACKING_OUTLIER_DELTA_NORM: + wider_smoothed[idx] = (curr_t, median_x) + smoothed = wider_smoothed + + filtered = list(smoothed) + for idx in range(1, len(filtered)): + prev_t, prev_x = filtered[idx - 1] + curr_t, curr_x = filtered[idx] + if abs(curr_x - prev_x) < TRACKING_DEADBAND_NORM: + filtered[idx] = (curr_t, prev_x) + + spread = max(x for _, x in filtered) - min(x for _, x in filtered) + if spread < TRACKING_MIN_SPREAD_NORM: + stable_x = sum(x for _, x in filtered) / len(filtered) + return [ + TimedCenterPoint(t_sec=0.0, x_norm=stable_x), + TimedCenterPoint(t_sec=duration_sec, x_norm=stable_x), + ] + + if filtered[0][0] > 0.0: + filtered.insert(0, (0.0, filtered[0][1])) + else: + filtered[0] = (0.0, filtered[0][1]) + + if filtered[-1][0] < duration_sec: + filtered.append((duration_sec, filtered[-1][1])) + else: + filtered[-1] = (duration_sec, filtered[-1][1]) + + return [TimedCenterPoint(t_sec=t_sec, x_norm=x_norm) for t_sec, x_norm in filtered] + + +def _tracking_is_unstable(points: list[TimedCenterPoint]) -> bool: + if len(points) < TRACKING_MIN_USABLE_POINTS: + return True + return any( + abs(points[idx].x_norm - points[idx - 1].x_norm) > TRACKING_UNSTABLE_JUMP_NORM + for idx in range(1, len(points)) + ) + + +def _interpolate_tracking_x(points: list[TimedCenterPoint], t_sec: float) -> float | None: + if not points: + return None + if t_sec <= points[0].t_sec: + return float(points[0].x_norm) + if t_sec >= points[-1].t_sec: + return float(points[-1].x_norm) + for idx in range(1, len(points)): + left = points[idx - 1] + right = points[idx] + if right.t_sec < t_sec: + continue + span = right.t_sec - left.t_sec + if span <= 1e-6: + return float(right.x_norm) + alpha = (t_sec - left.t_sec) / span + return float(left.x_norm + (right.x_norm - left.x_norm) * alpha) + return float(points[-1].x_norm) + + +def _speaker_seed_boxes( + data: dict[str, Any], image_size: tuple[int, int] | None +) -> tuple[BoundingBox, BoundingBox] | None: + first_person = _parse_bbox(data.get("person_bbox"), image_size=image_size) + first_face = _parse_bbox(data.get("face_bbox"), image_size=image_size) + second_person = _parse_bbox(data.get("second_person_bbox"), image_size=image_size) + second_face = _parse_bbox(data.get("second_face_bbox"), image_size=image_size) + left = first_face or first_person + right = second_face or second_person + if left is None or right is None: + return None + ordered = sorted((left, right), key=lambda box: box.center_x) + return ordered[0], ordered[1] + + +def _nearest_seed_side( + center_x: float, + *, + left_seed: BoundingBox, + right_seed: BoundingBox, 
+) -> str: + left_delta = abs(center_x - left_seed.center_x) + right_delta = abs(center_x - right_seed.center_x) + return "left" if left_delta <= right_delta else "right" + + +def _focus_frame_visible_speaker_centers( + data: dict[str, Any] | None, + image_size: tuple[int, int] | None, + *, + left_seed: BoundingBox, + right_seed: BoundingBox, +) -> tuple[dict[str, float], bool]: + if not data: + return {}, False + + first_person = _parse_bbox(data.get("person_bbox"), image_size=image_size) + first_face = _parse_bbox(data.get("face_bbox"), image_size=image_size) + second_person = _parse_bbox(data.get("second_person_bbox"), image_size=image_size) + second_face = _parse_bbox(data.get("second_face_bbox"), image_size=image_size) + + visible_boxes = [box for box in (first_face or first_person, second_face or second_person) if box] + if not visible_boxes: + return {}, False + + if len(visible_boxes) >= 2: + ordered = sorted(visible_boxes, key=lambda box: box.center_x) + return {"left": ordered[0].center_x, "right": ordered[1].center_x}, True + + only_box = visible_boxes[0] + side = _nearest_seed_side(only_box.center_x, left_seed=left_seed, right_seed=right_seed) + return {side: only_box.center_x}, False + + +def _two_speaker_full_width_span_norm(image_size: tuple[int, int] | None) -> float: + if image_size is None: + return 1.0 + width, height = image_size + if width <= 0 or height <= 0: + return 1.0 + target_aspect = 9 / 16 + if width / height >= target_aspect: + return min(1.0, (height * target_aspect) / width) + return 1.0 + + +def _can_fit_both_speakers( + left_x: float, + right_x: float, + *, + image_size: tuple[int, int] | None, +) -> bool: + span = abs(right_x - left_x) + allowed = _two_speaker_full_width_span_norm(image_size) * TWO_SPEAKER_BOTH_FIT_MARGIN + return span <= allowed + + +def _speaker_follow_sample_times(duration_sec: float) -> list[float]: + seen: set[float] = set() + out: list[float] = [] + dense_times: list[float] = [] + if duration_sec > 0: + steps = max(1, int(duration_sec / SPEAKER_FOLLOW_MAX_INTERVAL_SEC)) + dense_times = [ + min(duration_sec, idx * SPEAKER_FOLLOW_MAX_INTERVAL_SEC) + for idx in range(1, steps + 1) + ] + for t_sec in [0.0, *_tracking_sample_times(duration_sec), *dense_times, duration_sec]: + key = round(max(0.0, min(duration_sec, t_sec)), 3) + if key in seen: + continue + seen.add(key) + out.append(key) + return out + + +def _resolve_speaker_focus_samples( + samples: list[tuple[float, str]], + *, + default_side: str = "left", +) -> list[tuple[float, str]]: + normalized: list[tuple[float, str | None]] = [] + allowed = {"left", "right", "both"} + for t_sec, side in samples: + normalized.append((float(t_sec), side if side in allowed else None)) + + out: list[tuple[float, str]] = [] + for idx, (t_sec, side) in enumerate(normalized): + if side is not None: + out.append((t_sec, side)) + continue + + prev_side = out[-1][1] if out else None + next_side: str | None = None + for _, future_side in normalized[idx + 1 :]: + if future_side is not None: + next_side = future_side + break + + resolved_side: str + if prev_side is not None and next_side is not None: + resolved_side = prev_side if prev_side == next_side else "both" + else: + resolved_side = prev_side or next_side or default_side + out.append((t_sec, resolved_side)) + return out + + +def _tracking_points_from_focus_states( + duration_sec: float, + framings: list[tuple[float, float, float]], +) -> list[TimedCenterPoint]: + deduped: list[tuple[float, float, float]] = [] + for t_sec, x_norm, zoom in 
sorted(framings, key=lambda item: item[0]): + clamped_t = max(0.0, min(duration_sec, float(t_sec))) + clamped_x = max(0.0, min(1.0, float(x_norm))) + clamped_zoom = max(1.0, min(4.0, float(zoom))) + if deduped and abs(clamped_t - deduped[-1][0]) < 1e-6: + deduped[-1] = (clamped_t, clamped_x, clamped_zoom) + else: + deduped.append((clamped_t, clamped_x, clamped_zoom)) + + if len(deduped) < 2: + return [] + + if deduped[0][0] > 0.0: + deduped.insert(0, (0.0, deduped[0][1], deduped[0][2])) + else: + deduped[0] = (0.0, deduped[0][1], deduped[0][2]) + + if deduped[-1][0] < duration_sec: + deduped.append((duration_sec, deduped[-1][1], deduped[-1][2])) + else: + deduped[-1] = (duration_sec, deduped[-1][1], deduped[-1][2]) + + expanded: list[tuple[float, float, float]] = [deduped[0]] + for t_sec, x_norm, zoom in deduped[1:]: + prev_t, prev_x, prev_zoom = expanded[-1] + switch_changed = ( + abs(x_norm - prev_x) > TRACKING_DEADBAND_NORM + or abs(zoom - prev_zoom) > 0.05 + ) + if switch_changed: + hold_t = max(prev_t, min(t_sec, t_sec - FOCUS_SWITCH_LEAD_SEC)) + if hold_t - prev_t > 1e-6: + expanded.append((hold_t, prev_x, prev_zoom)) + if abs(t_sec - expanded[-1][0]) < 1e-6: + expanded[-1] = (t_sec, x_norm, zoom) + else: + expanded.append((t_sec, x_norm, zoom)) + + return [ + TimedCenterPoint(t_sec=t_sec, x_norm=x_norm, zoom=zoom) + for t_sec, x_norm, zoom in expanded + ] + + +def _nearest_non_both_focus_side( + resolved_focus: list[tuple[float, str]], + start_idx: int, + *, + step: int, +) -> str | None: + idx = start_idx + while 0 <= idx < len(resolved_focus): + side = resolved_focus[idx][1] + if side in ("left", "right"): + return side + idx += step + return None + + +def _extract_frame_at_time(source_path: Path, time_sec: float, output_path: Path) -> Path: + output_path.parent.mkdir(parents=True, exist_ok=True) + subprocess.run( + [ + "ffmpeg", + "-y", + "-loglevel", + "error", + "-ss", + f"{time_sec:.3f}", + "-i", + str(source_path), + "-frames:v", + "1", + "-q:v", + "2", + str(output_path), + ], + check=True, + capture_output=True, + ) + return output_path + + +def _probe_video_fps(source_path: Path) -> float: + result = subprocess.run( + [ + "ffprobe", + "-v", + "error", + "-select_streams", + "v:0", + "-show_entries", + "stream=r_frame_rate", + "-of", + "default=noprint_wrappers=1:nokey=1", + str(source_path), + ], + check=False, + capture_output=True, + text=True, + ) + rate = (result.stdout or "").strip() + if "/" in rate: + num, den = rate.split("/", 1) + try: + return max(1.0, float(num) / max(float(den), 1.0)) + except ValueError: + return 30.0 + try: + return max(1.0, float(rate)) + except ValueError: + return 30.0 + + +def _segmentation_center_x_from_url(mask_url: str) -> float | None: + try: + import httpx + from PIL import Image # type: ignore + except ImportError: + return None + + response = httpx.get(mask_url, timeout=120.0) + response.raise_for_status() + with Image.open(BytesIO(response.content)) as image: + image = image.convert("L") + width, height = image.size + pixels = image.load() + xs: list[int] = [] + for y in range(height): + for x in range(width): + if pixels[x, y] > 16: + xs.append(x) + if not xs or width <= 0: + return None + return float(sum(xs) / len(xs) / width) + + +def _segmentation_mask_urls(output: object) -> list[str]: + def _coerce_urls(items: Iterable[object]) -> list[str]: + urls: list[str] = [] + for item in items: + if item is None: + continue + if isinstance(item, (str, Path)): + text = str(item).strip() + else: + url = getattr(item, "url", None) + 
text = str(url).strip() if isinstance(url, str) else str(item).strip() + if text: + urls.append(text) + return urls + + if isinstance(output, dict): + for key in ("black_white_masks", "masks", "output"): + value = output.get(key) + if isinstance(value, (str, bytes, bytearray)) or value is None: + continue + try: + urls = _coerce_urls(value) + except TypeError: + continue + if urls: + return urls + return [] + if isinstance(output, (str, bytes, bytearray)) or output is None: + return [] + try: + return _coerce_urls(output) + except TypeError: + return [] + + +def _infer_person_tracking_with_segmentation( + scene: Scene, + *, + source_video: Path, + segmentation_model: str, + initial_data: dict[str, Any] | None = None, + initial_image_size: tuple[int, int] | None = None, + seed_bbox: BoundingBox | None = None, + object_id: str = "speaker", +) -> tuple[list[TimedCenterPoint], dict[str, Any] | None]: + token = (os.environ.get("REPLICATE_API_TOKEN") or "").strip() + if not token: + raise RuntimeError("REPLICATE_API_TOKEN is not set") + if initial_image_size is None: + raise RuntimeError("Segmentation fallback requires the keyframe dimensions") + if seed_bbox is None and initial_data is None: + raise RuntimeError("Segmentation fallback requires an initial vision bbox") + + if seed_bbox is None: + face_bbox = _parse_bbox(initial_data.get("face_bbox"), image_size=initial_image_size) + person_bbox = _parse_bbox(initial_data.get("person_bbox"), image_size=initial_image_size) + seed_bbox = face_bbox or person_bbox + if seed_bbox is None: + raise RuntimeError("No seed bbox available for segmentation fallback") + + try: + import replicate + except ImportError as exc: + raise RuntimeError("replicate package is not installed") from exc + + width, height = initial_image_size + fps = _probe_video_fps(source_video) + midpoint_frame = max(0, int(round((scene.duration / 2.0) * fps))) + output_frame_interval = max(1, int(round(max(1.0, scene.duration * fps) / 10.0))) + click_x = int(round(seed_bbox.center_x * width)) + click_y = int(round(seed_bbox.center_y * height)) + prompt_frames = [0] + if midpoint_frame > 0: + prompt_frames.append(midpoint_frame) + prompt_coordinates = ",".join(f"[{click_x},{click_y}]" for _ in prompt_frames) + prompt_labels = ",".join("1" for _ in prompt_frames) + prompt_frame_str = ",".join(str(frame_idx) for frame_idx in prompt_frames) + prompt_object_ids = ",".join(object_id for _ in prompt_frames) + run_input = { + "input_video": None, + "click_coordinates": prompt_coordinates, + "click_labels": prompt_labels, + "click_frames": prompt_frame_str, + "click_object_ids": prompt_object_ids, + "mask_type": "binary", + "annotation_type": "mask", + "output_video": False, + "output_format": "png", + "output_frame_interval": output_frame_interval, + } + + with source_video.open("rb") as handle: + client = replicate.Client(api_token=token) + run_input["input_video"] = handle + try: + output = client.run(segmentation_model, input=run_input) + resolved_model = segmentation_model + except Exception as exc: + if ":" in segmentation_model or "404" not in str(exc): + raise + handle.seek(0) + output = client.run(REPLICATE_SAM2_VIDEO_PINNED, input=run_input) + resolved_model = REPLICATE_SAM2_VIDEO_PINNED + + urls = _segmentation_mask_urls(output) + if not urls: + raise RuntimeError("Segmentation fallback returned no masks") + + centers: list[tuple[float, float]] = [] + for idx, mask_url in enumerate(urls): + center_x = _segmentation_center_x_from_url(mask_url) + if center_x is None: + continue + 
rel_time = min(scene.duration, (idx * output_frame_interval) / fps) + centers.append((rel_time, center_x)) + + points = _tracking_points_from_centers(scene.duration, centers) + detail = { + "provider": "replicate", + "model": resolved_model, + "seed_point_px": [click_x, click_y], + "seed_frame": midpoint_frame, + "prompt_frames": prompt_frames, + "output_frame_interval": output_frame_interval, + "mask_count": len(urls), + } + return points, detail + + +def _infer_two_speaker_focus_tracking_with_segmentation( + scene: Scene, + *, + source_video: Path, + tracking_dir: Path, + model_name: str, + segmentation_model: str, + initial_data: dict[str, Any], + initial_image_size: tuple[int, int] | None, +) -> tuple[list[TimedCenterPoint], dict[str, Any] | None]: + seeds = _speaker_seed_boxes(initial_data, initial_image_size) + if seeds is None: + raise RuntimeError("Two-speaker SAM follow requires both speaker bboxes") + + left_seed, right_seed = seeds + left_points, left_detail = _infer_person_tracking_with_segmentation( + scene, + source_video=source_video, + segmentation_model=segmentation_model, + initial_data=initial_data, + initial_image_size=initial_image_size, + seed_bbox=left_seed, + object_id="left_speaker", + ) + right_points, right_detail = _infer_person_tracking_with_segmentation( + scene, + source_video=source_video, + segmentation_model=segmentation_model, + initial_data=initial_data, + initial_image_size=initial_image_size, + seed_bbox=right_seed, + object_id="right_speaker", + ) + if not left_points or not right_points: + raise RuntimeError("Two-speaker SAM follow did not return both speaker tracks") + + focus_dir = tracking_dir / scene.scene_id / "speaker_focus" + focus_samples: list[dict[str, Any]] = [] + focus_choices: list[tuple[float, str]] = [] + for rel_time in _speaker_follow_sample_times(max(0.0, scene.duration)): + abs_time = scene.start_time + rel_time + frame_path = focus_dir / f"{scene.scene_id}_{int(round(rel_time * 1000)):06d}.jpg" + visible_centers: dict[str, float] = {} + both_visible = False + try: + _extract_frame_at_time(source_video, abs_time, frame_path) + frame_image_size = _keyframe_dimensions(str(frame_path)) + layout_data: dict[str, Any] | None = None + layout_error: str | None = None + try: + layout_data = _call_gemini_vision(str(frame_path), model_name) + visible_centers, both_visible = _focus_frame_visible_speaker_centers( + layout_data, + frame_image_size, + left_seed=left_seed, + right_seed=right_seed, + ) + except Exception as exc: + layout_error = str(exc) + + data = _call_active_speaker_vision(str(frame_path), model_name) + speaker = str(data.get("speaker", "unclear")).strip().lower() + if speaker not in ("left", "right", "both", "unclear"): + speaker = "unclear" + if speaker == "unclear" and len(visible_centers) == 1 and not both_visible: + speaker = next(iter(visible_centers)) + focus_choices.append((rel_time, speaker)) + sample = { + "time_sec": rel_time, + "frame_path": str(frame_path), + "speaker": speaker, + "raw": data, + "visible_centers": visible_centers, + "both_visible": both_visible, + } + if layout_data is not None: + sample["layout_raw"] = layout_data + if layout_error: + sample["layout_error"] = layout_error + focus_samples.append(sample) + except Exception as exc: + focus_choices.append((rel_time, "unclear")) + focus_samples.append( + { + "time_sec": rel_time, + "frame_path": str(frame_path), + "speaker": "unclear", + "visible_centers": visible_centers, + "both_visible": both_visible, + "error": str(exc), + } + ) + + resolved_focus = 
_resolve_speaker_focus_samples(focus_choices, default_side="left") + framings: list[tuple[float, float, float]] = [] + for idx, (rel_time, speaker) in enumerate(resolved_focus): + sample = focus_samples[idx] if idx < len(focus_samples) else {} + sample_visible_centers = sample.get("visible_centers", {}) + frame_left_x = ( + float(sample_visible_centers["left"]) + if isinstance(sample_visible_centers, dict) and "left" in sample_visible_centers + else None + ) + frame_right_x = ( + float(sample_visible_centers["right"]) + if isinstance(sample_visible_centers, dict) and "right" in sample_visible_centers + else None + ) + both_visible = bool(sample.get("both_visible")) + + left_x = frame_left_x if frame_left_x is not None else _interpolate_tracking_x(left_points, rel_time) + right_x = ( + frame_right_x if frame_right_x is not None else _interpolate_tracking_x(right_points, rel_time) + ) + if left_x is None: + left_x = left_seed.center_x + if right_x is None: + right_x = right_seed.center_x + + prev_side = _nearest_non_both_focus_side(resolved_focus, idx - 1, step=-1) + next_side = _nearest_non_both_focus_side(resolved_focus, idx + 1, step=1) + should_widen = False + if both_visible and _can_fit_both_speakers(left_x, right_x, image_size=initial_image_size): + if speaker == "both": + should_widen = True + elif ( + prev_side is not None + and next_side is not None + and prev_side != next_side + ): + should_widen = True + + if should_widen: + x_norm = (left_x + right_x) / 2.0 + zoom = TWO_SPEAKER_BOTH_ZOOM + elif speaker == "left": + x_norm = left_x + zoom = TWO_SPEAKER_ACTIVE_ZOOM + elif speaker == "right": + x_norm = right_x + zoom = TWO_SPEAKER_ACTIVE_ZOOM + else: + fallback_side = prev_side or next_side or "left" + x_norm = left_x if fallback_side == "left" else right_x + zoom = TWO_SPEAKER_WIDE_ACTIVE_ZOOM + framings.append((rel_time, x_norm, zoom)) + + points = _tracking_points_from_focus_states(scene.duration, framings) + detail = { + "mode": "two_speaker_follow", + "left_segmentation": left_detail, + "right_segmentation": right_detail, + "focus_samples": focus_samples, + "resolved_focus": [ + {"time_sec": rel_time, "speaker": speaker} for rel_time, speaker in resolved_focus + ], + "framing_samples": [ + {"time_sec": rel_time, "x_norm": x_norm, "zoom": zoom} + for rel_time, x_norm, zoom in framings + ], + } + return points, detail + + +def _infer_person_tracking( + scene: Scene, + *, + source_video: Path, + tracking_dir: Path, + model_name: str, + initial_data: dict[str, Any] | None = None, + initial_image_size: tuple[int, int] | None = None, +) -> tuple[list[TimedCenterPoint], list[dict[str, Any]]]: + duration_sec = max(0.0, scene.duration) + if duration_sec <= 0.0: + return [], [] + + midpoint_rel = duration_sec / 2.0 + centers: list[tuple[float, float]] = [] + samples: list[dict[str, Any]] = [] + + if initial_data is not None: + center_x = _person_center_x_from_data(initial_data, image_size=initial_image_size) + samples.append( + { + "sample_kind": "midpoint_keyframe", + "time_sec": midpoint_rel, + "frame_path": scene.keyframe_path, + "center_x_norm": center_x, + "raw": initial_data, + } + ) + if center_x is not None: + centers.append((midpoint_rel, center_x)) + + scene_tracking_dir = tracking_dir / scene.scene_id + for rel_time in _tracking_sample_times(duration_sec): + if abs(rel_time - midpoint_rel) < 1e-3: + continue + abs_time = scene.start_time + rel_time + frame_path = scene_tracking_dir / f"{scene.scene_id}_{int(round(rel_time * 1000)):06d}.jpg" + try: + 
_extract_frame_at_time(source_video, abs_time, frame_path) + data = _call_gemini_vision(str(frame_path), model_name) + image_size = _keyframe_dimensions(str(frame_path)) + center_x = _person_center_x_from_data(data, image_size=image_size) + samples.append( + { + "sample_kind": "tracking_frame", + "time_sec": rel_time, + "frame_path": str(frame_path), + "center_x_norm": center_x, + "raw": data, + } + ) + if center_x is not None: + centers.append((rel_time, center_x)) + except Exception as e: + logger.warning( + "Speaker tracking sample failed for %s at %.2fs: %s", + scene.scene_id, + rel_time, + e, + ) + samples.append( + { + "sample_kind": "tracking_frame", + "time_sec": rel_time, + "frame_path": str(frame_path), + "error": str(e), + } + ) + + return _tracking_points_from_centers(duration_sec, centers), samples + + +def _call_vision_json(keyframe_path: str, model_name: str, prompt: str) -> dict[str, Any]: + path = Path(keyframe_path) + data = path.read_bytes() + mime = "image/jpeg" if path.suffix.lower() in (".jpg", ".jpeg") else "image/png" + provider = resolve_llm_provider() + resolved_model = model_name_for_provider(model_name, provider) + + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=resolved_model, + contents=[ + types.Part.from_text(text=prompt), + types.Part.from_bytes(data=data, mime_type=mime), + ], + config=gemini_generate_config( + temperature=0.2, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini vision returned empty response") + return _json_object_from_vision_response(json.loads(response.text)) + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + data_url = f"data:{mime};base64,{base64.b64encode(data).decode('ascii')}" + response = client.chat.completions.create( + model=resolved_model, + messages=[ + {"role": "system", "content": prompt}, + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this keyframe and return only JSON."}, + {"type": "image_url", "image_url": {"url": data_url}}, + ], + }, + ], + temperature=0.2, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter vision returned empty response") + return _json_object_from_vision_response(json.loads(text)) + + +def _call_gemini_vision(keyframe_path: str, model_name: str) -> dict[str, Any]: + return _call_vision_json(keyframe_path, model_name, GEMINI_LAYOUT_VISION_PROMPT) + + +def _call_active_speaker_vision(frame_path: str, model_name: str) -> dict[str, Any]: + return _call_vision_json(frame_path, model_name, ACTIVE_SPEAKER_VISION_PROMPT) + + +def infer_layout_instructions( + scenes: list[Scene], + *, + gemini_vision_model: str, + source_video: Path, + tracking_dir: Path, + source_videos_by_scene: dict[str, Path] | None = None, + segmentation_provider: str = "off", + segmentation_model: str = "meta/sam-2-video", +) -> tuple[dict[str, LayoutInstruction], dict[str, dict[str, Any]]]: + """Return ``(clip_id -> LayoutInstruction, clip_id -> raw_gemini_json)``.""" + + out: dict[str, LayoutInstruction] = {} + raw_by_clip: dict[str, dict[str, Any]] = {} + model_name = gemini_vision_model.strip() + + for s in scenes: + sid = s.scene_id + if not s.keyframe_path: + logger.warning("No keyframe for %s; using sit_center.", sid) + out[sid] = LayoutInstruction(clip_id=sid, 
layout=LayoutKind.SIT_CENTER) + raw_by_clip[sid] = {"error": "no keyframe", "layout": "sit_center"} + continue + try: + data = _call_gemini_vision(s.keyframe_path, model_name) + image_size = _keyframe_dimensions(s.keyframe_path) + instr = _instruction_from_gemini_json( + sid, + data, + image_size=image_size, + ) + raw_data = dict(data) + speaker_follow_applied = False + tracking_source = ( + source_videos_by_scene.get(sid, source_video) + if source_videos_by_scene + else source_video + ) + if instr.layout == LayoutKind.SPLIT_TWO_PERSONS and segmentation_provider == "replicate": + try: + focus_points, focus_detail = _infer_two_speaker_focus_tracking_with_segmentation( + s, + source_video=tracking_source, + tracking_dir=tracking_dir, + model_name=model_name, + segmentation_model=segmentation_model, + initial_data=data, + initial_image_size=image_size, + ) + if focus_detail: + raw_data["speaker_follow_tracking"] = focus_detail + if focus_points: + instr = LayoutInstruction( + clip_id=sid, + layout=LayoutKind.SIT_CENTER, + zoom=focus_points[0].zoom or 1.0, + person_x_norm=focus_points[0].x_norm, + person_tracking=focus_points, + ) + speaker_follow_applied = True + except Exception as exc: + raw_data["speaker_follow_tracking"] = {"error": str(exc)} + if instr.layout in (LayoutKind.SIT_CENTER, LayoutKind.ZOOM_CALL_CENTER): + if speaker_follow_applied: + raw_by_clip[sid] = raw_data + out[sid] = instr + continue + tracking_points: list[TimedCenterPoint] = [] + tracking_samples: list[dict[str, Any]] = [] + attempted_segmentation = False + + if segmentation_provider == "replicate": + attempted_segmentation = True + try: + sam_points, sam_detail = _infer_person_tracking_with_segmentation( + s, + source_video=tracking_source, + segmentation_model=segmentation_model, + initial_data=data, + initial_image_size=image_size, + ) + if sam_points: + tracking_points = sam_points + if sam_detail: + raw_data["segmentation_tracking"] = sam_detail + except Exception as exc: + raw_data["segmentation_tracking"] = {"error": str(exc)} + if not tracking_points: + tracking_points, tracking_samples = _infer_person_tracking( + s, + source_video=tracking_source, + tracking_dir=tracking_dir, + model_name=model_name, + initial_data=data, + initial_image_size=image_size, + ) + if ( + segmentation_provider == "replicate" + and _tracking_is_unstable(tracking_points) + and not attempted_segmentation + ): + try: + sam_points, sam_detail = _infer_person_tracking_with_segmentation( + s, + source_video=tracking_source, + segmentation_model=segmentation_model, + initial_data=data, + initial_image_size=image_size, + ) + if sam_points: + tracking_points = sam_points + if sam_detail: + raw_data["segmentation_tracking"] = sam_detail + except Exception as exc: + raw_data.setdefault("segmentation_tracking", {"error": str(exc)}) + if tracking_points: + instr = instr.model_copy(update={"person_tracking": tracking_points}) + if tracking_samples: + raw_data["person_tracking_samples"] = tracking_samples + raw_by_clip[sid] = raw_data + out[sid] = instr + except Exception as e: + logger.warning("Gemini vision failed for %s: %s β€” defaulting sit_center", sid, e) + out[sid] = LayoutInstruction(clip_id=sid, layout=LayoutKind.SIT_CENTER) + raw_by_clip[sid] = {"error": str(e), "layout": "sit_center"} + + return out, raw_by_clip + + +def _apply_layout_hint_fallbacks( + instructions: dict[str, LayoutInstruction], + raw_by_clip: dict[str, dict[str, Any]], + layout_hints_by_clip: dict[str, LayoutKind], +) -> None: + for clip_id, hint in 
layout_hints_by_clip.items(): + instr = instructions.get(clip_id) + raw = raw_by_clip.get(clip_id) + if instr is None or raw is None or "error" not in raw: + continue + if instr.layout != LayoutKind.SIT_CENTER: + continue + instructions[clip_id] = instr.model_copy(update={"layout": hint}) + updated_raw = dict(raw) + updated_raw["layout"] = hint.value + updated_raw["layout_hint_fallback"] = hint.value + raw_by_clip[clip_id] = updated_raw + + +def resolved_vision_model(config: PipelineConfig) -> str: + if config.gemini_vision_model: + return config.gemini_vision_model.strip() + if GEMINI_VISION_MODEL: + return GEMINI_VISION_MODEL + return (config.gemini_model or GEMINI_MODEL).strip() + + +def run_layout_vision_stage( + work_dir: Path, + scenes: list[Scene], + *, + source_video: Path, + source_videos_by_scene: dict[str, Path] | None = None, + transcript_fp: str, + clips_path: Path, + config: PipelineConfig, +) -> dict[str, LayoutInstruction]: + """Load cache or call Gemini vision for each keyframe; persist JSON artifacts.""" + from humeo.clip_selector import load_clips + + clips_fp = _clips_fingerprint(clips_path) + vm = resolved_vision_model(config) + layout_hints_by_clip = { + clip.clip_id: hint + for clip in load_clips(clips_path) + if (hint := (clip.layout_hint or clip.layout)) is not None + } + + if ( + not config.force_layout_vision + and layout_cache_valid( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + vision_model=vm, + segmentation_provider=config.segmentation_provider, + segmentation_model=config.segmentation_model, + ) + ): + cached = load_layout_cache(work_dir) + if cached: + logger.info("Layout vision cache hit; skipping Gemini vision calls.") + return { + k: LayoutInstruction.model_validate(v["instruction"]) + for k, v in cached.items() + if isinstance(v, dict) and "instruction" in v + } + + instructions, raw_by_clip = infer_layout_instructions( + scenes, + gemini_vision_model=vm, + source_video=source_video, + tracking_dir=work_dir / "layout_tracking", + source_videos_by_scene=source_videos_by_scene, + segmentation_provider=config.segmentation_provider, + segmentation_model=config.segmentation_model, + ) + _apply_layout_hint_fallbacks(instructions, raw_by_clip, layout_hints_by_clip) + + payload: dict[str, dict[str, Any]] = {} + for sid, instr in instructions.items(): + payload[sid] = { + "instruction": json.loads(instr.model_dump_json()), + "raw": raw_by_clip.get(sid, {}), + } + write_layout_cache( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + vision_model=vm, + clips_payload=payload, + segmentation_provider=config.segmentation_provider, + segmentation_model=config.segmentation_model, + ) + return instructions diff --git a/src/humeo/pipeline.py b/src/humeo/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..e390087d9af1f129d1000860f89bebbd1f51e087 --- /dev/null +++ b/src/humeo/pipeline.py @@ -0,0 +1,797 @@ +"""End-to-end product pipeline.""" + +import dataclasses +import json +import logging +import re +from pathlib import Path + +from humeo_core.primitives.ingest import extract_keyframes +from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RatingFeedback, RenderTheme, Scene + +from humeo import interactive, session_state +from humeo.clip_assembly import apply_render_spans, assemble_clip, write_clip_plan +from humeo.clip_selection_cache import cache_valid, load_meta, transcript_fingerprint, write_artifacts +from humeo.clip_selector import ( + clip_quality_priority_score, + load_clips, + 
renumber_clips_dense, + save_clips, + select_clips, +) +from humeo.config import MAX_CLIP_DURATION_SEC, MIN_CLIP_DURATION_SEC, PipelineConfig +from humeo.content_pruning import run_content_pruning_stage, snap_render_windows_to_sentence_boundaries +from humeo.cutter import generate_ass +from humeo.hook_detector import run_hook_detection_stage +from humeo.hook_library import resolve_hook_library_path +from humeo.ingest import ( + download_video, + extract_audio, + stage_local_video, + transcript_cache_valid, + transcribe_whisperx, +) +from humeo.layout_vision import run_layout_vision_stage +from humeo.render_qa import qa_record_flags, run_render_qa +from humeo.render_window import clip_for_render +from humeo.reframe_ffmpeg import reframe_clip_ffmpeg +from humeo.transcript_align import clip_subtitle_words, group_words_to_cue_chunks +from humeo.video_cache import ( + extract_youtube_video_id, + ingest_complete, + normalize_local_source_path, + read_youtube_info_json, + resolve_work_directory, + upsert_manifest_from_info, +) + +logger = logging.getLogger(__name__) + +_WEAK_HOOK_START_WORDS = { + "actually", + "basically", + "honestly", + "look", + "listen", + "okay", + "ok", + "right", + "so", + "well", + "yeah", +} +_WEAK_HOOK_START_PHRASES = {"i mean", "kind of", "sort of", "you know"} +_STRONG_HOOK_LATEST_START_SEC = 6.0 +_FINAL_QUALITY_THRESHOLD = 0.68 +_NATIVE_HIGHLIGHT_CHART_DOMINANCE_Y2 = 0.68 +_NATIVE_HIGHLIGHT_MIN_PERSON_WIDTH = 0.42 +_NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 = 0.12 +_NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM = 1.20 +_PRESENTATION_REFERENCE_RE = re.compile( + r"\b(" + r"as you can see|you can see|what you can see|look at|take a look|shown here|" + r"shown on|on the screen|on this slide|this chart|the chart|this graph|" + r"the graph|this slide|this matrix|the matrix|red line|yellow line|" + r"blue line|green line|top there|bottom there|x-axis|y-axis" + r")\b", + flags=re.IGNORECASE, +) + + +def _rerun_config(config: PipelineConfig, steering_notes: list[str]) -> PipelineConfig: + return dataclasses.replace( + config, + steering_notes=list(steering_notes), + force_clip_selection=True, + overwrite_outputs=True, + ) + + +def _build_steering_from_feedback(feedback: RatingFeedback) -> str: + parts: list[str] = [] + if "wrong_moments" in feedback.issues: + parts.append("Previous selection picked the wrong moments. Reselect with different candidates.") + if "bad_cuts" in feedback.issues: + parts.append( + "Clip boundaries were bad. Prefer clips starting on clean sentence beginnings and ending on completed thoughts." + ) + if "boring" in feedback.issues: + parts.append("Previous selection lacked energy. Bias strongly toward high-emotion, high-hook moments.") + if "confusing" in feedback.issues: + parts.append("Previous clips needed too much context. Pick moments that make sense standalone.") + if "wrong_layout" in feedback.issues: + logger.warning("Received wrong_layout feedback, but layout overrides are not available until Gate 2 ships.") + if "length_off" in feedback.issues: + parts.append("Clip durations felt off. 
Respect the duration bounds strictly.") + if "other" in feedback.issues and feedback.free_text: + parts.append(feedback.free_text) + return " ".join(parts).strip() + + +def _ensure_work_dir(config: PipelineConfig) -> None: + """Resolve ``config.work_dir`` when unset (per-video cache) or ensure it exists.""" + if config.work_dir is not None: + return + config.work_dir = resolve_work_directory( + youtube_url=config.youtube_url, + explicit_work_dir=None, + use_video_cache=config.use_video_cache, + cache_root=config.cache_root, + ) + + +def _filter_render_valid_clips(clips: list, *, stage_label: str) -> list: + """Drop clips whose actual render window violates the duration contract.""" + valid: list = [] + dropped = 0 + for clip in clips: + render_clip = clip_for_render(clip) + render_duration = render_clip.duration_sec + if MIN_CLIP_DURATION_SEC <= render_duration <= MAX_CLIP_DURATION_SEC: + valid.append(clip) + continue + dropped += 1 + logger.warning( + "%s: dropping clip %s because render-window duration %.1fs is outside [%ds, %ds] " + "(trim_start=%.1fs trim_end=%.1fs).", + stage_label, + clip.clip_id, + render_duration, + MIN_CLIP_DURATION_SEC, + MAX_CLIP_DURATION_SEC, + clip.trim_start_sec, + clip.trim_end_sec, + ) + if dropped: + logger.warning("%s: dropped %d invalid render-window clip(s).", stage_label, dropped) + return valid + + +def _hook_window_text(clip, transcript: dict) -> str: + if clip.hook_start_sec is None or clip.hook_end_sec is None: + return "" + abs_start = clip.start_time_sec + clip.hook_start_sec + abs_end = clip.start_time_sec + clip.hook_end_sec + parts: list[str] = [] + for seg in transcript.get("segments", []) or []: + start = float(seg.get("start", 0.0)) + end = float(seg.get("end", start)) + if end <= abs_start or start >= abs_end: + continue + text = str(seg.get("text", "")).strip() + if text: + parts.append(text) + return " ".join(parts).strip() + + +def _filter_weak_hook_clips(clips: list, transcript: dict, *, min_kept: int) -> list: + if len(clips) <= min_kept: + return clips + kept: list = [] + dropped: list[str] = [] + for clip in clips: + hook_start = clip.hook_start_sec + if ( + hook_start is not None + and hook_start > _STRONG_HOOK_LATEST_START_SEC + and len(clips) - len(dropped) > min_kept + ): + dropped.append( + f"{clip.clip_id} (hook starts at {hook_start:.1f}s; target <= {_STRONG_HOOK_LATEST_START_SEC:.1f}s)" + ) + continue + hook_text = _hook_window_text(clip, transcript).lower() + first_words = [word.strip(".,!?;:'\"()[]{}") for word in hook_text.split()] + first_words = [word for word in first_words if word] + first_word = first_words[0] if first_words else "" + first_phrase = " ".join(first_words[:2]) + if ( + first_word in _WEAK_HOOK_START_WORDS or first_phrase in _WEAK_HOOK_START_PHRASES + ) and len(clips) - len(dropped) > min_kept: + weak_text = first_phrase if first_phrase in _WEAK_HOOK_START_PHRASES else first_word + dropped.append(f"{clip.clip_id} (weak opener: {weak_text})") + continue + kept.append(clip) + if dropped: + logger.info("Dropped %d weak-hook clip(s): %s", len(dropped), ", ".join(dropped)) + return kept + + +def _caption_chunk_penalty(clip, transcript: dict, *, render_theme) -> float: + words = clip_subtitle_words(transcript, clip).words + if not words: + return 0.08 + + if str(render_theme) == "native_highlight": + cue_words = 6 + cue_sec = 2.4 + prefer_break_on_punctuation = True + min_words_before_break = 4 + elif str(render_theme) == "reference_lower_third": + cue_words = 10 + cue_sec = 2.8 + prefer_break_on_punctuation 
= True + min_words_before_break = 5 + else: + cue_words = 10 + cue_sec = 2.8 + prefer_break_on_punctuation = False + min_words_before_break = 1 + + cue_chunks = group_words_to_cue_chunks( + words, + max_words_per_cue=cue_words, + max_cue_sec=cue_sec, + prefer_break_on_punctuation=prefer_break_on_punctuation, + min_words_before_break=min_words_before_break, + ) + penalty = 0.0 + for chunk in cue_chunks: + duration = chunk[-1].end_time - chunk[0].start_time + if len(chunk) == 1 and len(cue_chunks) > 1: + penalty += 0.04 + if len(chunk) >= cue_words and duration < 0.65: + penalty += 0.04 + if duration > cue_sec + 0.35: + penalty += 0.03 + return min(0.18, penalty) + + +def _filter_low_quality_clips(clips: list, transcript: dict, *, min_kept: int, render_theme) -> list: + if len(clips) <= min_kept: + return renumber_clips_dense(clips) + + ranked: list[tuple[float, object, float]] = [] + for clip in clips: + render_clip = clip_for_render(clip) + caption_penalty = _caption_chunk_penalty(render_clip, transcript, render_theme=render_theme) + score = clip_quality_priority_score(clip) - caption_penalty + ranked.append((score, clip, caption_penalty)) + + ranked.sort(key=lambda item: item[0], reverse=True) + kept = [clip for score, clip, _ in ranked if score >= _FINAL_QUALITY_THRESHOLD] + if len(kept) < min_kept: + kept = [clip for _score, clip, _penalty in ranked[:min_kept]] + + dropped = [ + f"{clip.clip_id} (score={score:.2f}, caption_penalty={caption_penalty:.2f})" + for score, clip, caption_penalty in ranked + if clip not in kept + ] + if dropped: + logger.info( + "Dropped %d low-quality clip(s) after pruning: %s", + len(dropped), + ", ".join(dropped), + ) + return renumber_clips_dense(kept) + + +def _clip_references_presentation(clip) -> bool: + text_parts = [ + getattr(clip, "viral_hook", ""), + getattr(clip, "transcript", ""), + getattr(clip, "suggested_overlay_title", ""), + getattr(clip, "topic", ""), + ] + text = " ".join(str(part or "") for part in text_parts) + return bool(_PRESENTATION_REFERENCE_RE.search(text)) + + +def _normalize_layout_for_render( + instruction: LayoutInstruction, + *, + render_theme: RenderTheme, + clip=None, +) -> LayoutInstruction: + if render_theme != RenderTheme.NATIVE_HIGHLIGHT: + return instruction + if instruction.layout != LayoutKind.SPLIT_CHART_PERSON: + return instruction + chart = instruction.split_chart_region + person = instruction.split_person_region + if chart is None or person is None: + return instruction + chart_dominates = chart.y2 >= _NATIVE_HIGHLIGHT_CHART_DOMINANCE_Y2 + person_too_small = person.width <= _NATIVE_HIGHLIGHT_MIN_PERSON_WIDTH + # Keep Bryan's newer head-and-shoulders presenter crops in split mode even + # when the speaker strip is narrow; the older fallback-to-center rule was + # written for lower-anchored full-body crops that rendered badly here. 
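+    # "Top-anchored" below means person.y1 falls within roughly the top 12%
+    # of the frame (_NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 = 0.12), the
+    # usual signature of a head-and-shoulders crop rather than a full-body one.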
+ person_is_top_anchored = person.y1 <= _NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 + if not (chart_dominates and person_too_small and not person_is_top_anchored): + return instruction + if clip is not None and _clip_references_presentation(clip): + return instruction + return instruction.model_copy( + update={ + "layout": LayoutKind.SIT_CENTER, + "zoom": max(float(instruction.zoom), _NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM), + "split_chart_region": None, + "split_person_region": None, + "split_second_chart_region": None, + "split_second_person_region": None, + "chart_x_norm": 0.0, + "top_band_ratio": 0.5, + } + ) + + +def _load_layout_raw_by_clip(work_dir: Path) -> dict[str, dict]: + path = work_dir / "layout_vision.json" + if not path.is_file(): + return {} + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 - optional QA metadata + logger.warning("Could not read layout raw metadata for QA: %s", exc) + return {} + clips = payload.get("clips", {}) + if not isinstance(clips, dict): + return {} + out: dict[str, dict] = {} + for clip_id, item in clips.items(): + if isinstance(item, dict) and isinstance(item.get("raw"), dict): + out[str(clip_id)] = item["raw"] + return out + + +def _normalize_rerender_clip_id(raw: str) -> str: + text = str(raw).strip() + match = re.search(r"(\d+)$", text) + if match: + return f"{int(match.group(1)):03d}" + return text + + +def _warned_clip_ids_from_qa(output_dir: Path) -> set[str]: + manifest_path = output_dir / "render_qa" / "qa_manifest.json" + if not manifest_path.is_file(): + return set() + try: + payload = json.loads(manifest_path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 - stale QA should not block renders + logger.warning("Could not read QA manifest for warned-only rerender: %s", exc) + return set() + + warned: set[str] = set() + for record in payload.get("shorts", []): + if not isinstance(record, dict): + continue + clip_id = record.get("clip_id") + if clip_id and qa_record_flags(record): + warned.add(_normalize_rerender_clip_id(str(clip_id))) + return warned + + +def _load_layout_instruction_cache(work_dir: Path) -> dict[str, LayoutInstruction]: + path = work_dir / "layout_vision.json" + if not path.is_file(): + return {} + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 - cache fallback + logger.warning("Could not read cached layout instructions: %s", exc) + return {} + clips = payload.get("clips", {}) + if not isinstance(clips, dict): + return {} + out: dict[str, LayoutInstruction] = {} + for clip_id, item in clips.items(): + if not isinstance(item, dict) or "instruction" not in item: + continue + try: + out[str(clip_id)] = LayoutInstruction.model_validate(item["instruction"]) + except Exception as exc: # noqa: BLE001 + logger.warning("Ignoring invalid cached layout for clip %s: %s", clip_id, exc) + return out + + +def run_pipeline(config: PipelineConfig) -> list[Path]: + """ + Execute the full podcast-to-shorts pipeline. + + Args: + config: Pipeline configuration. + + Returns: + List of paths to the final short-form MP4 files. 
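+
+    Minimal programmatic sketch (illustrative only; the supported entrypoint is
+    the ``humeo`` CLI, and ``PipelineConfig`` may require more fields than the
+    two shown here):
+
+        config = PipelineConfig(
+            youtube_url="https://www.youtube.com/watch?v=VIDEO_ID",
+            output_dir=Path("./output"),
+        )
+        shorts = run_pipeline(config)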
+ """ + logger.info("=" * 60) + logger.info("HUMEO PIPELINE START") + logger.info("Source: %s", config.youtube_url) + logger.info("Output: %s", config.output_dir) + logger.info("=" * 60) + + _ensure_work_dir(config) + assert config.work_dir is not None + + state = None + if config.interactive: + state = session_state.load_state(config.work_dir, config.youtube_url) + if config.steering_notes: + if list(config.steering_notes) != state.steering_notes: + state.steering_notes = list(config.steering_notes) + session_state.save_state(config.work_dir, state) + elif state.steering_notes: + config = dataclasses.replace( + config, + steering_notes=list(state.steering_notes), + force_clip_selection=True, + overwrite_outputs=True, + ) + logger.info( + "Loaded %d steering note(s) from session state for this source.", + len(state.steering_notes), + ) + + # ------------------------------------------------------------------ + # Stage 1: Ingest + # ------------------------------------------------------------------ + logger.info("--- STAGE 1: INGESTION ---") + + source_video = config.work_dir / "source.mp4" + transcript_path = config.work_dir / "transcript.json" + local_source_path = normalize_local_source_path(config.youtube_url) + reuse_ingest = ingest_complete(config.work_dir, config.youtube_url) + + if reuse_ingest: + logger.info("Cached ingest found for this source (reusing source + transcript).") + elif local_source_path is not None: + source_video = stage_local_video(local_source_path, config.work_dir) + elif source_video.exists(): + logger.info("Source video already downloaded, skipping download.") + else: + source_video = download_video(config.youtube_url, config.work_dir) + + transcript_reusable = transcript_cache_valid(config.work_dir) + if reuse_ingest and transcript_reusable: + logger.info("Transcript already exists, loading.") + with open(transcript_path, "r", encoding="utf-8") as f: + transcript = json.load(f) + elif transcript_reusable and local_source_path is None: + logger.info("Transcript already exists, loading.") + with open(transcript_path, "r", encoding="utf-8") as f: + transcript = json.load(f) + else: + if transcript_path.exists(): + logger.info("Transcript cache mismatch for current transcription settings; regenerating.") + audio_path = extract_audio(source_video, config.work_dir) + transcript = transcribe_whisperx(audio_path, config.work_dir) + + if local_source_path is None: + vid = extract_youtube_video_id(config.youtube_url) + info = read_youtube_info_json(config.work_dir) + if not info and vid: + info = {"id": vid, "webpage_url": config.youtube_url} + if info: + upsert_manifest_from_info( + work_dir=config.work_dir, + youtube_url=config.youtube_url, + info=info, + cache_root=config.cache_root, + ) + + # ------------------------------------------------------------------ + # Stage 2: Clip Selection + # ------------------------------------------------------------------ + logger.info("--- STAGE 2: CLIP SELECTION ---") + + clips_path = config.work_dir / "clips.json" + fp = transcript_fingerprint(transcript) + meta = load_meta(config.work_dir) + cache_hit = ( + clips_path.is_file() + and not config.force_clip_selection + and meta is not None + and cache_valid(meta, fp, config) + ) + + if cache_hit: + clips = load_clips(clips_path) + logger.info("Clip selection cache hit (transcript + provider/model unchanged); skipping LLM.") + else: + clips, raw = select_clips( + transcript, + gemini_model=config.gemini_model, + hook_library_path=resolve_hook_library_path(config), + 
candidate_count=config.clip_selection_candidate_count, + quality_threshold=config.clip_selection_quality_threshold, + min_kept=config.clip_selection_min_kept, + max_kept=config.clip_selection_max_kept, + steering_notes=config.steering_notes, + ) + save_clips(clips, clips_path) + write_artifacts( + config.work_dir, + transcript=transcript, + config=config, + raw_response=raw, + ) + + logger.info("Selected %d clips:", len(clips)) + for clip in clips: + logger.info( + " [%s] %.1fs-%.1fs (%.1fs) score=%.2f - %s", + clip.clip_id, + clip.start_time_sec, + clip.end_time_sec, + clip.duration_sec, + clip.virality_score, + clip.topic, + ) + + # ------------------------------------------------------------------ + # Stage 2.25: Hook Detection + # ------------------------------------------------------------------ + # The clip selector is unreliable at localising the hook sentence and + # tends to return the 0.0-3.0s placeholder verbatim, which would disable + # start-trim in Stage 2.5. This stage asks Gemini to localise the real + # hook per clip so Stage 2.5 can clamp against a real window. + logger.info("--- STAGE 2.25: HOOK DETECTION (enabled=%s) ---", config.detect_hooks) + clips = run_hook_detection_stage( + config.work_dir, + clips, + transcript, + transcript_fp=fp, + config=config, + ) + clips = _filter_weak_hook_clips( + clips, + transcript, + min_kept=config.clip_selection_min_kept, + ) + + # ------------------------------------------------------------------ + # Stage 2.5: Content Pruning (HIVE-style inner-clip tightening) + # ------------------------------------------------------------------ + # Tightens each candidate window by writing trim_start_sec / trim_end_sec + # on the Clip models. keyframe extraction and layout vision below both + # consume ``clip_for_render(clip)`` so they automatically operate on the + # pruned window without further changes. 
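+    # Illustrative example (numbers made up): a 120.0s-180.0s candidate with
+    # trim_start_sec=4.0 and trim_end_sec=2.0 should render as roughly
+    # 124.0s-178.0s, assuming the trims shave time off each end of the window;
+    # clip_for_render owns that math, so treat these numbers as a sketch only.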
+ logger.info("--- STAGE 2.5: CONTENT PRUNING (level=%s) ---", config.prune_level) + clips = run_content_pruning_stage( + config.work_dir, + clips, + transcript, + transcript_fp=fp, + config=config, + ) + clips = snap_render_windows_to_sentence_boundaries(clips, transcript) + clips = _filter_render_valid_clips(clips, stage_label="Stage 2.5 guardrail") + clips = _filter_low_quality_clips( + clips, + transcript, + min_kept=config.clip_selection_min_kept, + render_theme=config.render_theme, + ) + + rerender_target_ids = { + _normalize_rerender_clip_id(clip_id) + for clip_id in config.rerender_clip_ids + } + if config.rerender_warned_only: + rerender_target_ids.update(_warned_clip_ids_from_qa(config.output_dir)) + if rerender_target_ids: + before_count = len(clips) + clips = [clip for clip in clips if clip.clip_id in rerender_target_ids] + missing = sorted(rerender_target_ids - {clip.clip_id for clip in clips}) + logger.info( + "Rerender target filter: keeping %d / %d clip(s): %s", + len(clips), + before_count, + ", ".join(clip.clip_id for clip in clips) or "(none)", + ) + if missing: + logger.warning("Requested rerender clip id(s) not found: %s", ", ".join(missing)) + if not clips: + logger.warning("No clips matched rerender target filter; nothing to render.") + return [] + + # ------------------------------------------------------------------ + # Stage 2.75: Hard-cut assembly + # ------------------------------------------------------------------ + logger.info("--- STAGE 2.75: CLIP ASSEMBLY ---") + clips = apply_render_spans(clips, transcript) + assembled_dir = config.work_dir / "assembled" + assembled_by_id = { + clip.clip_id: assemble_clip(source_video, clip, transcript, assembled_dir) + for clip in clips + } + clips = [assembled_by_id[clip.clip_id].clip for clip in clips] + assembled_clips_path = write_clip_plan(config.work_dir / "assembled_clips.json", clips) + + if config.interactive and state is not None: + result = interactive.approve_clips(clips) + if result.action == "quit": + logger.info("Aborted by user at Gate 1.") + return [] + if result.action == "refine": + state.iteration += 1 + if result.steering_note: + state.steering_notes.append(result.steering_note) + state.last_selected_ids = None + session_state.save_state(config.work_dir, state) + if state.iteration >= config.max_iterations: + logger.warning("Iteration cap hit. 
Proceeding with current clips.") + else: + return run_pipeline(_rerun_config(config, state.steering_notes)) + elif result.action == "proceed": + selected_ids = list(result.selected_ids or []) + state.last_selected_ids = selected_ids + session_state.save_state(config.work_dir, state) + clip_by_id = {clip.clip_id: clip for clip in clips} + clips = [clip_by_id[clip_id] for clip_id in selected_ids] + elif result.action == "accept_all": + state.last_selected_ids = [clip.clip_id for clip in clips] + session_state.save_state(config.work_dir, state) + + # ------------------------------------------------------------------ + # Stage 3: Clip layouts + # ------------------------------------------------------------------ + logger.info("--- STAGE 3: CLIP LAYOUTS ---") + + keyframes_dir = config.work_dir / "keyframes" + clip_scenes: list[Scene] = [] + source_videos_by_scene: dict[str, Path] = {} + for clip in clips: + assembled = assembled_by_id[clip.clip_id] + rw = clip_for_render(clip) + clip_scenes.append( + Scene(scene_id=clip.clip_id, start_time=rw.start_time_sec, end_time=rw.end_time_sec) + ) + source_videos_by_scene[clip.clip_id] = assembled.source_path + + layout_instructions: dict[str, LayoutInstruction] = {} + if rerender_target_ids: + cached_layouts = _load_layout_instruction_cache(config.work_dir) + if all(clip.clip_id in cached_layouts for clip in clips): + layout_instructions = { + clip.clip_id: cached_layouts[clip.clip_id] + for clip in clips + } + logger.info( + "Using cached layout instructions for rerender target(s): %s", + ", ".join(layout_instructions), + ) + + if not layout_instructions: + extracted_scenes: list[Scene] = [] + for scene in clip_scenes: + extracted_scenes.extend( + extract_keyframes( + str(source_videos_by_scene[scene.scene_id]), + [scene], + str(keyframes_dir / scene.scene_id), + ) + ) + clip_scenes = extracted_scenes + layout_instructions = run_layout_vision_stage( + config.work_dir, + clip_scenes, + source_video=source_video, + source_videos_by_scene=source_videos_by_scene, + transcript_fp=fp, + clips_path=assembled_clips_path, + config=config, + ) + + # ------------------------------------------------------------------ + # Stage 4: Render + # ------------------------------------------------------------------ + logger.info("--- STAGE 4: RENDER ---") + + final_outputs: list[Path] = [] + render_clips_by_id: dict[str, Clip] = {} + render_transcripts_by_id: dict[str, dict] = {} + render_layouts_by_id: dict[str, LayoutInstruction] = {} + render_sources_by_id: dict[str, Path] = {} + subtitles_dir = config.work_dir / "subtitles" + subtitles_dir.mkdir(parents=True, exist_ok=True) + + for clip in clips: + assembled = assembled_by_id[clip.clip_id] + instr = layout_instructions.get(clip.clip_id) + if instr is None: + hint = clip.layout_hint or LayoutKind.SIT_CENTER + instr = LayoutInstruction(clip_id=clip.clip_id, layout=hint) + instr = _normalize_layout_for_render(instr, render_theme=config.render_theme, clip=clip) + clip.layout = instr.layout + rclip = clip_for_render(clip) + render_clips_by_id[clip.clip_id] = rclip + render_transcripts_by_id[clip.clip_id] = assembled.transcript + render_layouts_by_id[clip.clip_id] = instr + render_sources_by_id[clip.clip_id] = assembled.source_path + subtitle_path = None + if config.burn_subtitles: + # ASS (not SRT) so the caption file's PlayResY matches the output + # resolution and libass' font/margin scaling is 1:1. 
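+            # The PlayRes passed below matches the 1080x1920 (9:16) render
+            # target, so the configured font size and vertical margin land in
+            # final output pixels instead of being rescaled by libass.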
+ subtitle_path = generate_ass( + rclip, + assembled.transcript, + subtitles_dir, + max_words_per_cue=config.subtitle_max_words_per_cue, + max_cue_sec=config.subtitle_max_cue_sec, + play_res_x=1080, + play_res_y=1920, + font_size=config.subtitle_font_size, + margin_v=config.subtitle_margin_v, + render_theme=config.render_theme, + native_highlight_lead_sec=config.subtitle_highlight_lead_sec, + native_highlight_min_dwell_sec=config.subtitle_highlight_min_dwell_sec, + repair_word_timings=config.repair_subtitle_word_timings, + ) + else: + logger.info("Clip %s: subtitle burn disabled for this run.", clip.clip_id) + final_path = config.output_dir / f"short_{clip.clip_id}.mp4" + should_overwrite_clip = config.overwrite_outputs or clip.clip_id in rerender_target_ids + if final_path.exists() and not should_overwrite_clip: + logger.info("Clip %s already rendered, skipping.", clip.clip_id) + final_outputs.append(final_path) + continue + if final_path.exists() and should_overwrite_clip: + logger.info("Clip %s exists; overwriting for this render pass.", clip.clip_id) + + # Font size and margin are already baked into the ASS file at + # PlayResY=1920, so the compile primitive does not need to override + # them -- but it still does, harmlessly, for single-source overrides. + reframe_clip_ffmpeg( + input_path=assembled.source_path, + output_path=final_path, + clip=rclip, + layout_instruction=instr, + subtitle_path=subtitle_path, + subtitle_font_size=config.subtitle_font_size, + subtitle_margin_v=config.subtitle_margin_v, + title_text=clip.suggested_overlay_title, + render_theme=config.render_theme, + ) + final_outputs.append(final_path) + + if config.render_qa and final_outputs: + logger.info("--- STAGE 4.5: RENDER QA ---") + try: + run_render_qa( + output_dir=config.output_dir, + final_outputs=final_outputs, + render_clips_by_id=render_clips_by_id, + transcripts_by_id=render_transcripts_by_id, + layouts_by_id=render_layouts_by_id, + assembled_sources_by_id=render_sources_by_id, + raw_layouts_by_id=_load_layout_raw_by_clip(config.work_dir), + reference_video=config.qa_reference_video, + debug_overlay=config.qa_debug_overlay, + ) + except Exception as exc: # noqa: BLE001 - QA must not fail delivery + logger.warning("Render QA failed, leaving rendered shorts intact: %s", exc) + + # ------------------------------------------------------------------ + # Done + # ------------------------------------------------------------------ + logger.info("=" * 60) + logger.info("PIPELINE COMPLETE - %d shorts generated:", len(final_outputs)) + for p in final_outputs: + logger.info(" -> %s", p) + logger.info("=" * 60) + + if config.interactive and final_outputs and state is not None: + feedback = interactive.rate_output(final_outputs) + state.last_rating = feedback + session_state.save_state(config.work_dir, state) + if feedback.rating == 3: + logger.info("Rated Great. Shipped.") + return final_outputs + + steering = _build_steering_from_feedback(feedback) + if not steering: + logger.warning("Interactive feedback recorded, but it is not actionable until a later gate ships.") + return final_outputs + + state.iteration += 1 + state.steering_notes.append(steering) + session_state.save_state(config.work_dir, state) + if state.iteration >= config.max_iterations: + logger.warning("Iteration cap hit. 
Source may not have a strong short.") + return final_outputs + return run_pipeline(_rerun_config(config, state.steering_notes)) + + return final_outputs diff --git a/src/humeo/prompt_loader.py b/src/humeo/prompt_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..391b2d40bcef4131aa7de7a9de492716bfac124f --- /dev/null +++ b/src/humeo/prompt_loader.py @@ -0,0 +1,66 @@ +"""Load Jinja2 prompt templates (editable; override dir via HUMEO_PROMPTS_DIR).""" + +from __future__ import annotations + +import os +from pathlib import Path + +import jinja2 + + +def _prompt_loader() -> jinja2.BaseLoader: + override = (os.environ.get("HUMEO_PROMPTS_DIR") or "").strip() + if override: + return jinja2.FileSystemLoader(str(Path(override).expanduser())) + return jinja2.PackageLoader("humeo", "prompts") + + +def clip_selection_prompts( + *, + transcript_text: str, + min_dur: float, + max_dur: float, + count: int, + steering_notes: list[str] | None = None, + hook_examples: str = "", +) -> tuple[str, str]: + """Return ``(system_instruction, user_message)`` for Gemini clip selection.""" + env = jinja2.Environment(loader=_prompt_loader(), autoescape=False, trim_blocks=True) + ctx = { + "min_dur": min_dur, + "max_dur": max_dur, + "count": count, + "transcript_text": transcript_text, + "steering_notes": steering_notes or [], + "hook_examples": hook_examples, + } + system = env.get_template("clip_selection_system.jinja2").render(**ctx) + user = env.get_template("clip_selection_user.jinja2").render(**ctx) + return system, user + + +def hook_detection_system_prompt(*, hook_examples: str = "") -> str: + """Return the system prompt for Stage 2.25 hook detection. + + The user message is built in :mod:`humeo.hook_detector` because the + segment listing is dynamic per-clip. + """ + env = jinja2.Environment(loader=_prompt_loader(), autoescape=False, trim_blocks=True) + return env.get_template("hook_detection_system.jinja2").render(hook_examples=hook_examples) + + +def content_pruning_system_prompt( + *, + min_dur: float, + max_dur: float, + level: str, +) -> str: + """Return the system prompt for Stage 2.5 content pruning. + + The user message is built in ``humeo.content_pruning`` from the list of + candidate clips (clip-relative segment lines) since it is not static text. + """ + env = jinja2.Environment(loader=_prompt_loader(), autoescape=False, trim_blocks=True) + return env.get_template("content_pruning_system.jinja2").render( + min_dur=min_dur, max_dur=max_dur, level=level + ) diff --git a/src/humeo/prompts/clip_selection_system.jinja2 b/src/humeo/prompts/clip_selection_system.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..0065815dd6cfe4b574b0eadaae4f7bfb128541be --- /dev/null +++ b/src/humeo/prompts/clip_selection_system.jinja2 @@ -0,0 +1,197 @@ +## Role +{% if steering_notes %} +Additional instructions from previous iterations (honor these): +{% for note in steering_notes %} +- {{ note }} +{% endfor %} + +{% endif %} + +You are a professional short-form video editor with a deep understanding of +long-form podcast and interview structure. Your job is to watch a full +transcript and isolate the moments that function as self-contained +vertical-video clips on TikTok, YouTube Shorts, and Instagram Reels. + +You do not write new content. You identify the best moments that are already +there, score them honestly, and return strict JSON. + +## Input + +- A time-aligned podcast transcript. Each line is `[start_sec - end_sec] text`. 
+  Timestamps are on the source timeline and are word-accurate.
+- Candidate clip count: **exactly {{ count }}** (a candidate pool). A
+  downstream ranker keeps only the top clips that clear a quality bar, so
+  your job is to populate a large enough pool for it to rank. Do NOT
+  self-censor by omitting weaker moments - instead return them with a
+  lower `virality_score` and `needs_review: true`. The ranker drops weak
+  candidates automatically. Only return fewer than {{ count }} if the
+  transcript truly does not contain {{ count }} distinguishable moments.
+- Target clip duration: {{ min_dur }}s to {{ max_dur }}s (hard bounds).
+{% if hook_examples %}
+- Retrieved viral hook examples from the team's hook library. Use these as pattern guidance for the opening, title phrasing, and overall framing style:
+
+{{ hook_examples }}
+{% endif %}
+
+## Output
+
+Return a single JSON object of the form:
+
+{
+  "clips": [
+    {
+      "clip_id": "001",
+      "topic": "Brief topic label",
+      "start_time_sec": 123.0,
+      "end_time_sec": 165.5,
+      "viral_hook": "The attention-grabbing opening line or idea",
+      "virality_score": 0.699,
+      "transcript": "Full verbatim text of this segment for subtitle generation",
+      "suggested_overlay_title": "Short punchy title for overlay (max 5 words)",
+      "hook_start_sec": 0.0,
+      "hook_end_sec": 3.0,
+      "trim_start_sec": 0.0,
+      "trim_end_sec": 0.0,
+      "shorts_title": "Platform-ready title for the short",
+      "description": "1-2 sentence description for upload",
+      "hashtags": ["topic", "news"],
+      "layout_hint": "zoom_call_center",
+      "needs_review": false,
+      "review_reason": "",
+      "reasoning": "Why this moment scores strongly on the three text axes.",
+      "score_breakdown": {
+        "message_wow": 0.84,
+        "hook_emotion": 0.63,
+        "catchy": 0.57
+      }
+    }
+  ]
+}
+
+Explanation of each field:
+
+- `clip_id`: zero-padded 3-digit id, sorted by `virality_score` descending.
+- `topic`: human-readable label (<= 6 words).
+- `start_time_sec` / `end_time_sec`: inclusive source-timeline boundaries of
+  the clip. Must sit on or very near word-boundary timestamps present in the
+  transcript.
+- `viral_hook`: the exact hook line (verbatim substring of the transcript)
+  that earns the first 3 seconds of attention.
+- `virality_score`: float in [0, 1]. This is derived from `score_breakdown`
+  and must equal:
+  `0.4 * message_wow + 0.35 * hook_emotion + 0.25 * catchy`.
+- `transcript`: exact verbatim text from the segment lines covering the
+  window. No paraphrasing. No added punctuation.
+- `suggested_overlay_title`, `shorts_title`, `description`, `hashtags`:
+  platform metadata. Hashtags are short tokens without `#`.
+  `suggested_overlay_title` must be 4-8 words, headline-cased when natural,
+  and should feel specific, provocative, and pattern-rich rather than generic.
+- `hook_start_sec` / `hook_end_sec`: clip-relative seconds (0 = clip start).
+  Must satisfy `0 <= hook_start < hook_end <= (end_time_sec - start_time_sec)`.
+- `trim_start_sec` / `trim_end_sec`: default 0. A downstream pruning pass will
+  tighten these; you may leave both at 0 unless the weak lead-in / trailing
+  filler is obvious.
+- `layout_hint`: one of `zoom_call_center`, `sit_center`,
+  `split_chart_person`, or null. Use `split_chart_person` whenever the
+  segment involves an on-screen chart, slide, or graphic with the host
+  visible beside it. Use `sit_center` for a plain talking-head with no
+  side-by-side slide. Use `zoom_call_center` for a tight webcam or call grid. 
+ Prefer setting this from the transcript topic (productivity data, debt + chart, AI jobs slide, etc.), not null, when slides are clearly part of the + clip. +- `needs_review` / `review_reason`: flag segments that need human review. +- `reasoning`: short audit trail explaining why the moment scores the way it + does on `message_wow`, `hook_emotion`, and `catchy`. Keep it to 1-3 + sentences. +- `score_breakdown`: required mapping with exactly three keys: + `message_wow`, `hook_emotion`, and `catchy`. Each value must be a float in + [0, 1]. Do not emit any extra keys. Do not emit visual axes such as + `hook_visual` or `human_vibe`. + +## Text-Axis Scoring + +Score honestly. Most moments should land between 0.4 and 0.6 on each axis. +Reserve 0.8 or above only for moments that are genuinely exceptional in that +dimension. A response where every clip scores 0.85+ is wrong -- rescore with a +more critical eye. + +- `message_wow` (0.0-1.0): strength of the central idea. High when the clip + contains a genuinely strong message, insight, reframe, prediction, or + takeaway that makes the viewer think "that's interesting." +- `hook_emotion` (0.0-1.0): emotional punch or stakes. High when the opening + or the overall moment creates urgency, surprise, tension, awe, laughter, + fear, or personal stakes that make the viewer feel something. +- `catchy` (0.0-1.0): memorability and quotability. High when the clip has a + sticky phrase, metaphor, analogy, framing, or sentence a viewer would want + to repeat or post. + +Use these axes as the scoring rubric. You may still use concrete signals like +predictions, hard numbers, metaphors, or strong anecdotes to judge the axes, +but do not output rule-point maps. Output only the three-axis breakdown. + +### Hard disqualifiers +- Requires prior context the viewer does not have. +- Cuts in the middle of a sentence or in the middle of a thought. +- Both speakers are on filler ("yeah, totally, right, exactly"). +- No clear hook in the first 3 seconds of the window. +- The first real hook lands later than 10 seconds into the candidate. + +## Hook Selection (first 3 seconds) + +Every clip must open on a line that grabs attention immediately. + +- **Engagement.** The first sentence contains a claim, a number, a named + person, or a striking image. Not a throat-clear, not a question setup, not + a filler phrase. +- **Clarity.** The opening does not depend on earlier context. A cold viewer + can follow it without knowing who was talking or what was just discussed. +- **Self-contained premise.** If introducing a character, setting, or + premise is required, do it in the hook itself. + +Record the exact hook window in `hook_start_sec` / `hook_end_sec` +(clip-relative seconds). The default 0.0 - 3.0 window is a fallback, not the +goal - prefer the real boundary of the hook sentence. + +## Ending Selection (last moments of the clip) + +Prefer ending on one of: + +- **Suspense.** An unresolved question, a challenge, or a "and then..." + beat. Viewers should feel they would benefit from watching more. +- **Complete mini-arc.** A setup-payoff pair that closes cleanly inside the + window. A satisfying button. +- **Neutral but on-topic.** If a clip has no strong ending, a clean + sentence-end that stays on the highlight's topic is acceptable. + +Avoid endings that: + +- Drift into a new, unrelated arc. +- Cut mid-sentence or mid-argument. +- Rely on content that sits outside your chosen window. + +## Requirements + +1. 
Return **exactly {{ count }}** clips (unless the transcript genuinely + contains fewer distinguishable moments), ranked by `virality_score` + descending. Populate the pool - the downstream ranker handles filtering. + Flag any clip that is meaningfully weaker than the rest of the pool with + `needs_review: true` and a one-sentence `review_reason`; keep it in the + list. Never drop a candidate just because it is weaker than another - the + ranker needs the full pool to make a quality-vs-quantity trade-off. +2. Each clip duration `end_time_sec - start_time_sec` must be between + {{ min_dur }} and {{ max_dur }} seconds inclusive. +3. `start_time_sec` and `end_time_sec` must match word-level timestamps that + appear in the provided transcript. Do not invent timestamps. +4. `transcript` must be the exact verbatim text from the source. No + paraphrasing, no summarisation, no added punctuation. +5. Clips must not overlap on the source timeline. +6. Do not cut inside a sentence or inside an argument. Prefer + sentence-boundary or breath-boundary cuts. +7. `virality_score` must equal + `0.4 * message_wow + 0.35 * hook_emotion + 0.25 * catchy`. +8. `score_breakdown` must contain exactly `message_wow`, `hook_emotion`, and + `catchy`. Do not include any other keys. +9. Return ONLY the JSON object. No markdown, no prose, no trailing text. +10. If the first 10 seconds do not contain a real hook, do not return the clip. +11. Avoid generic titles like "Big Opportunity", "Important Lesson", "Why This Matters". + Prefer titles whose wording clearly echoes the strongest hook pattern in the clip. diff --git a/src/humeo/prompts/clip_selection_user.jinja2 b/src/humeo/prompts/clip_selection_user.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..ab6265a66f20fe09c5d63bb1203d992a37530928 --- /dev/null +++ b/src/humeo/prompts/clip_selection_user.jinja2 @@ -0,0 +1,3 @@ +Analyze this podcast transcript and identify the top viral clips: + +{{ transcript_text }} diff --git a/src/humeo/prompts/content_pruning_system.jinja2 b/src/humeo/prompts/content_pruning_system.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..b717340002e26538e1cb629cc74bf95c2c067964 --- /dev/null +++ b/src/humeo/prompts/content_pruning_system.jinja2 @@ -0,0 +1,119 @@ +## Role + +You are a precision short-form video editor with a deep understanding of +narrative pacing. The clip selection stage has already chosen +attention-worthy windows from a longer podcast. Your job for each candidate +clip is to remove redundant lead-in and trailing filler so the final cut is +tighter and more gripping β€” while keeping the highlight intact. + +You do not re-order, you do not cut in the middle of the clip. You only +decide how many seconds to drop from the START and how many from the END. +Think of it as moving the in-point forward and the out-point backward to +land on the highest-value sub-window. + +## Input + +- A list of candidate clips. For every clip you receive: + - `clip_id`: stable identifier. + - `duration_sec`: total length of the candidate window. + - `topic`: short label from clip selection. + - `hook_window_sec` (optional): `[start, end]` in clip-relative seconds. + This window is the protected highlight and must stay fully inside the + final cut. + - Segment lines: `[rel_start_sec - rel_end_sec] text` where all times are + clip-relative (0 = start of the candidate window). +- Aggressiveness for this run: **{{ level }}**. 
+ +Interpret `level` as a budget for how much of the clip you may drop in +total (start + end combined): + +- `conservative`: aim for 0-10% total trim. Remove only obvious dead air, + throat-clears, stutters, false starts, or "um, so, uh, basically..." + ramble. +- `balanced`: aim for 5-20% total trim. Also remove slow setup, + self-correction, and minor tangents that do not advance the hook or + payoff. +- `aggressive`: aim for 15-35% total trim. Also remove any sentence that + does not directly advance the core claim, hook, or punchline. Never + sacrifice coherence. Never cut mid-idea. + +## Output + +Return a single JSON object of the form: + +{ + "decisions": [ + { + "clip_id": "001", + "trim_start_sec": 0.0, + "trim_end_sec": 0.0, + "reason": "Short justification, one sentence.", + "thought": "What you considered keeping vs dropping and why." + } + ] +} + +Explanation of each field: + +- `clip_id`: must match an input clip exactly. Return ONE decision per + input clip, in input order. +- `trim_start_sec`: clip-relative seconds to drop from the start. 0 = no + change. +- `trim_end_sec`: clip-relative seconds to drop from the end. 0 = no + change. +- `reason`: one-sentence justification suitable for logging. Name the + specific filler you removed (e.g. "Dropped 1.2s of 'yeah so basically' + before the hook"). +- `thought`: optional longer reasoning about what you weighed. This is + your chain-of-thought; keep it to 1-3 sentences. + +## Opening Selection (what to keep at the start) + +After the trim, the first sentence of the clip must: + +- **Grab attention immediately.** Start on a claim, a number, a named + person, or a striking image. Not a throat-clear, not a filler phrase, not + a "so anyway, the thing is..." ramp. +- **Stand alone.** A cold viewer can follow it without the context that sat + before the original in-point. If the opening depends on prior context, + trim forward to the first line that stands on its own. +- **Introduce the premise inside the clip.** If a character, setting, or + concept is required, it should be introduced in the new opening line β€” + not assumed. + +## Ending Selection (what to keep at the end) + +After the trim, the last moments of the clip must: + +- **Stay on-topic with the highlight.** Do not end on a sentence that + starts a new, unrelated arc. +- **Prefer suspense or a clean button.** An unresolved question, a + challenge, or a satisfying setup-payoff close. +- **Neutral-but-on-topic endings are acceptable.** If nothing better exists, + a clean sentence-end that does not drift off-topic is fine. +- **Never cut mid-sentence.** Prefer sentence or breath boundaries present + in the segment timestamps. + +## Requirements + +1. Return ONE decision for EVERY input clip, in the same order as the + input. Never skip a clip; if no trimming is warranted, return 0.0 / 0.0 + with a short `reason`. +2. `trim_start_sec` and `trim_end_sec` must be >= 0.0. +3. The final duration `duration_sec - trim_start_sec - trim_end_sec` must + stay between {{ min_dur }} and {{ max_dur }} seconds. If the candidate + clip is already near {{ min_dur }}s, return small or zero trims. +4. If `hook_window_sec = [hs, he]` is provided, the hook MUST remain fully + inside the final cut. Concretely: + - `trim_start_sec <= hs` + - `duration_sec - trim_end_sec >= he` +5. Trim points must land on (or very close to) a segment timestamp + provided in the input. Do not invent times that fall in the middle of a + word. +6. Do not cut inside a sentence. Do not cut inside an argument. +7. 
Total trim (`trim_start_sec + trim_end_sec`) must respect the budget + implied by `level`. The pipeline will clamp overruns, but staying inside + the budget yields better downstream results. +8. Be cautious. Over-trimming destroys coherence, and the first and last + seconds disproportionately shape the viewer's experience. +9. Return ONLY the JSON object. No markdown, no prose, no trailing text. diff --git a/src/humeo/prompts/hook_detection_system.jinja2 b/src/humeo/prompts/hook_detection_system.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..695b1489c528d87d87804d109185814a23b0d5bc --- /dev/null +++ b/src/humeo/prompts/hook_detection_system.jinja2 @@ -0,0 +1,91 @@ +## Role + +You are a short-form video editor who has watched thousands of podcast and +interview clips perform on TikTok, YouTube Shorts, and Instagram Reels. For +each candidate clip you know exactly which sentence inside it is the "hook" β€” +the first line that, on its own, earns the next 3 seconds of attention. + +Your job is precise: for each clip, return the clip-relative seconds range +of the single sentence that functions as the hook. + +## Input + +- A list of candidate clips. For every clip you receive: + - `clip_id`: stable identifier. + - `duration_sec`: total length of the candidate window. + - `topic`: short label from clip selection. + - `viral_hook_text`: the sentence the selector guessed as the hook. It may + be right, it may be wrong, it may be a placeholder β€” do not trust it + blindly; verify against the segment timing. + - Segment lines: `[rel_start_sec - rel_end_sec] text` where all times are + clip-relative (0 = start of the candidate window). +{% if hook_examples %} +- Retrieved hook patterns from the local hook library. Use these as style guidance + for what a strong opening sounds like, but never invent transcript text: + +{{ hook_examples }} +{% endif %} + +## Output + +Return a single JSON object of the form: + +{ + "hooks": [ + { + "clip_id": "001", + "hook_start_sec": 4.2, + "hook_end_sec": 7.8, + "hook_text": "The exact sentence that functions as the hook.", + "reason": "One-sentence justification: why this sentence, and why not the previous/next one." + } + ] +} + +Explanation of each field: + +- `clip_id`: must match an input clip exactly. Return ONE entry per input + clip, in input order. +- `hook_start_sec` / `hook_end_sec`: clip-relative seconds (0 = clip start). + Both must satisfy `0 <= hook_start_sec < hook_end_sec <= duration_sec`. +- `hook_text`: the verbatim substring of the transcript corresponding to the + hook sentence (or best-matching segment). Used for auditing. +- `reason`: why this is the hook. One sentence. + +## What counts as a hook + +A hook is the first **sentence** inside the clip that a cold viewer β€” someone +who has not heard the previous minute of podcast β€” would find compelling on +its own, without setup. + +Prefer sentences that open with: + +- **A claim.** "Prediction markets are the purest form of risk." +- **A hard number.** "This market could explode to five trillion dollars." +- **A named person or institution.** "Cathie Wood thinks Trumponomics resembles Reaganomics." +- **A striking image or metaphor.** "Active investing has been left for dead." +- **A direct question to the viewer.** "What if everyone's wrong about passive investing?" + +Avoid sentences that start with: + +- Filler: "Yeah", "So", "Right", "Well", "I mean". +- Pronoun references to unstated antecedents: "That's why they did it." 
+- Mid-thought conjunctions: "And because of that, ...". +- Acknowledgment of the host: "And Nick, as we were discussing..." + +## Requirements + +1. Return ONE hook for EVERY input clip, in the same order as the input. +2. The hook must land on sentence or phrase boundaries actually present in + the segment timestamps β€” do not invent times that fall mid-word. +3. Hook duration (`hook_end_sec - hook_start_sec`) must be between **1.5s and + 7.0s**. A longer hook is almost always a whole paragraph you mis-labelled. +4. Prefer a hook that starts within the first 15 seconds of the clip. If the + real hook is later than 15s, it means the clip has a long weak lead-in β€” + still return the real hook; the downstream pruning stage will use it to + trim lead-in. +5. NEVER return the literal placeholder window `0.0 - 3.0` unless it is + genuinely the correct hook window (i.e. the clip opens on a compelling + sentence that ends around the 3s mark). If the first sentence is weak, + find the real hook later in the clip. +6. Return ONLY the JSON object. No markdown, no prose, no trailing text. diff --git a/src/humeo/reframe_ffmpeg.py b/src/humeo/reframe_ffmpeg.py new file mode 100644 index 0000000000000000000000000000000000000000..8153194fe24d5997253879dcf2b44e0796cef6b0 --- /dev/null +++ b/src/humeo/reframe_ffmpeg.py @@ -0,0 +1,74 @@ +"""Thin adapter from the product pipeline to the reusable render primitive.""" + +from __future__ import annotations + +import logging +from pathlib import Path + +from humeo_core.primitives import compile as compile_mod +from humeo_core.schemas import ( + Clip, + LayoutInstruction, + LayoutKind, + RenderRequest, + RenderTheme, +) + +logger = logging.getLogger(__name__) + + +def layout_for_clip( + clip: Clip, + default_layout: LayoutKind = LayoutKind.SIT_CENTER, + zoom: float = 1.0, +) -> LayoutInstruction: + """Build the layout instruction for a clip using the shared schema.""" + layout = clip.layout or default_layout + return LayoutInstruction(clip_id=clip.clip_id, layout=layout, zoom=zoom) + + +def reframe_clip_ffmpeg( + input_path: Path | str, + output_path: Path | str, + clip: Clip, + *, + zoom: float = 1.0, + layout_instruction: LayoutInstruction | None = None, + subtitle_path: Path | str | None = None, + subtitle_font_size: int = 48, + subtitle_margin_v: int = 160, + title_text: str = "", + render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT, + dry_run: bool = False, +) -> RenderRequest: + """Render a single clip to 9:16 via one ffmpeg call. + + If ``layout_instruction`` is set (e.g. from Gemini vision), it is used in full + including ``person_x_norm``, ``chart_x_norm``, and optional split bbox fields. + Otherwise defaults are derived from ``clip.layout`` via ``layout_for_clip``. 
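+
+    A minimal call sketch (paths and the ``clip`` / ``instr`` variables are
+    illustrative placeholders, not values produced by this module)::
+
+        reframe_clip_ffmpeg(
+            input_path="work/assembled/001.mp4",
+            output_path="output/short_001.mp4",
+            clip=clip,
+            layout_instruction=instr,
+            subtitle_path="work/subtitles/001.ass",
+            render_theme=RenderTheme.NATIVE_HIGHLIGHT,
+        )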
+ """ + + instr = layout_instruction if layout_instruction is not None else layout_for_clip(clip, zoom=zoom) + req = RenderRequest( + source_path=str(input_path), + clip=clip, + layout=instr, + output_path=str(output_path), + subtitle_path=str(subtitle_path) if subtitle_path else None, + subtitle_font_size=subtitle_font_size, + subtitle_margin_v=subtitle_margin_v, + title_text=title_text, + render_theme=render_theme, + mode="dry_run" if dry_run else "normal", + ) + result = compile_mod.render_clip(req) + if not result.success and not dry_run: + raise RuntimeError(f"ffmpeg failed for clip {clip.clip_id}: {result.error}") + logger.info( + "reframe_clip_ffmpeg: clip=%s layout=%s output=%s success=%s", + clip.clip_id, + instr.layout.value, + output_path, + result.success, + ) + return req diff --git a/src/humeo/render_qa.py b/src/humeo/render_qa.py new file mode 100644 index 0000000000000000000000000000000000000000..de5b33b2fd6c4f693d2e16aca5c7d16b968735b2 --- /dev/null +++ b/src/humeo/render_qa.py @@ -0,0 +1,955 @@ +"""Best-effort render QA artifacts for finished shorts.""" + +from __future__ import annotations + +import argparse +import json +import logging +import math +import re +import shutil +import subprocess +from pathlib import Path +from typing import Any + +import numpy as np +from PIL import Image, ImageDraw + +from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, TranscriptWord + +from humeo.transcript_align import clip_subtitle_words + +logger = logging.getLogger(__name__) + +_CONTACT_COLUMNS = 8 +_CONTACT_ROWS = 5 +_CONTACT_THUMB_W = 270 +_DEBUG_FPS = 10 +_PIXEL_QA_SAMPLES = 8 +_PIXEL_QA_W = 360 +_PIXEL_QA_CAPTION_MIN_Y_RATIO = 0.40 + + +def _clamp(value: float, lo: float = 0.0, hi: float = 1.0) -> float: + return max(lo, min(hi, value)) + + +def _ensure_ffmpeg() -> str: + exe = shutil.which("ffmpeg") + if not exe: + raise RuntimeError("ffmpeg not found on PATH") + return exe + + +def _ensure_ffprobe() -> str: + exe = shutil.which("ffprobe") + if not exe: + raise RuntimeError("ffprobe not found on PATH") + return exe + + +def _run(cmd: list[str]) -> None: + subprocess.run(cmd, check=True, capture_output=True) + + +def _probe_duration(path: Path) -> float | None: + try: + out = subprocess.run( + [ + _ensure_ffprobe(), + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=nokey=1:noprint_wrappers=1", + str(path), + ], + check=True, + capture_output=True, + text=True, + ) + return float((out.stdout or "").strip()) + except Exception: + return None + + +def _probe_size(path: Path) -> tuple[int, int] | None: + try: + out = subprocess.run( + [ + _ensure_ffprobe(), + "-v", + "error", + "-select_streams", + "v:0", + "-show_entries", + "stream=width,height", + "-of", + "csv=p=0", + str(path), + ], + check=True, + capture_output=True, + text=True, + ) + width, height = (out.stdout or "").strip().split(",") + return int(width), int(height) + except Exception: + return None + + +def create_contact_sheet( + video_path: Path, + output_path: Path, + *, + columns: int = _CONTACT_COLUMNS, + rows: int = _CONTACT_ROWS, + thumb_width: int = _CONTACT_THUMB_W, +) -> Path: + """Create an evenly sampled contact sheet for one rendered short.""" + + output_path.parent.mkdir(parents=True, exist_ok=True) + duration = _probe_duration(video_path) or 40.0 + frame_count = max(1, columns * rows) + sample_fps = max(0.1, min(4.0, frame_count / max(duration, 1.0))) + vf = ( + f"fps={sample_fps:.6f}," + f"scale={thumb_width}:-1," + f"tile={columns}x{rows}:padding=2:margin=0" + 
) + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-i", + str(video_path), + "-vf", + vf, + "-frames:v", + "1", + str(output_path), + ] + ) + return output_path + + +def create_ab_compare( + reference_video: Path, + output_video: Path, + compare_path: Path, + *, + fps: float = 4.0, + columns: int = _CONTACT_COLUMNS, + rows: int = _CONTACT_ROWS, + thumb_width: int = _CONTACT_THUMB_W, + output_seek_sec: float = 0.0, +) -> Path: + """Stack reference and output contact sheets into one compare image.""" + + compare_path.parent.mkdir(parents=True, exist_ok=True) + ref_sheet = compare_path.with_name(compare_path.stem + "_reference.jpg") + out_sheet = compare_path.with_name(compare_path.stem + "_output.jpg") + tile = f"tile={columns}x{rows}:padding=2:margin=0" + common_vf = f"fps={fps:.6f},scale={thumb_width}:-1,{tile}" + + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-i", + str(reference_video), + "-vf", + common_vf, + "-frames:v", + "1", + str(ref_sheet), + ] + ) + output_cmd = [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + ] + if output_seek_sec > 0.0: + output_cmd.extend(["-ss", f"{output_seek_sec:.3f}"]) + output_cmd.extend( + [ + "-i", + str(output_video), + "-vf", + common_vf, + "-frames:v", + "1", + str(out_sheet), + ] + ) + _run(output_cmd) + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-i", + str(ref_sheet), + "-i", + str(out_sheet), + "-filter_complex", + "[0:v][1:v]vstack=inputs=2", + "-frames:v", + "1", + str(compare_path), + ] + ) + return compare_path + + +def _even(value: int) -> int: + return max(2, value - (value % 2)) + + +def _base_crop_size(src_w: int, src_h: int, target_aspect: float) -> tuple[int, int]: + if src_w / src_h >= target_aspect: + base_ch = src_h + base_cw = int(round(base_ch * target_aspect)) + else: + base_cw = src_w + base_ch = int(round(base_cw / target_aspect)) + return _even(base_cw), _even(base_ch) + + +def _crop_size(src_w: int, src_h: int, zoom: float) -> tuple[int, int]: + base_cw, base_ch = _base_crop_size(src_w, src_h, 9 / 16) + zoom = max(1.0, float(zoom)) + return _even(int(round(base_cw / zoom))), _even(int(round(base_ch / zoom))) + + +def _center_expr(layout: LayoutInstruction, src_w: int) -> str: + points = sorted(layout.person_tracking, key=lambda p: p.t_sec) + if not points: + return f"{_clamp(layout.person_x_norm) * src_w:.3f}" + + expr = f"{_clamp(points[-1].x_norm) * src_w:.3f}" + for idx in range(len(points) - 2, -1, -1): + threshold = (float(points[idx].t_sec) + float(points[idx + 1].t_sec)) / 2.0 + value = _clamp(points[idx].x_norm) * src_w + expr = f"if(lt(t\\,{threshold:.3f})\\,{value:.3f}\\,{expr})" + return expr + + +def _raw_bbox_filter( + raw_layout: dict[str, Any], + key: str, + *, + src_w: int, + src_h: int, + color: str, +) -> str | None: + box = raw_layout.get(key) + if not isinstance(box, dict): + return None + try: + x1 = float(box["x1"]) + y1 = float(box["y1"]) + x2 = float(box["x2"]) + y2 = float(box["y2"]) + except (KeyError, TypeError, ValueError): + return None + if max(abs(x1), abs(y1), abs(x2), abs(y2)) <= 1.5: + x1, x2 = x1 * src_w, x2 * src_w + y1, y2 = y1 * src_h, y2 * src_h + x = max(0, min(src_w - 2, int(round(x1)))) + y = max(0, min(src_h - 2, int(round(y1)))) + w = max(2, min(src_w - x, int(round(x2 - x1)))) + h = max(2, min(src_h - y, int(round(y2 - y1)))) + return f"drawbox=x={x}:y={y}:w={w}:h={h}:color={color}:t=4" + + +def create_crop_debug_overlay( + source_video: Path, + output_path: Path, + *, + clip: Clip, + layout: 
LayoutInstruction, + raw_layout: dict[str, Any] | None = None, +) -> Path: + """Create a low-res source preview with crop, speaker center, and bbox overlays.""" + + output_path.parent.mkdir(parents=True, exist_ok=True) + src_w, src_h = _probe_size(source_video) or (1920, 1080) + zoom = ( + max(layout.zoom, 1.25) + if layout.layout == LayoutKind.ZOOM_CALL_CENTER + else max(layout.zoom, 1.0) + ) + cw, ch = _crop_size(src_w, src_h, zoom) + center_y = 0.5 if layout.layout == LayoutKind.ZOOM_CALL_CENTER else 0.48 + y = _even(max(0, min(src_h - ch, int(round(center_y * src_h - ch / 2))))) + center = _center_expr(layout, src_w) + max_x = max(0, src_w - cw) + crop_x = f"floor(max(0\\,min({max_x}\\,({center})-{cw}/2))/2)*2" + + filters = [ + f"fps={_DEBUG_FPS}", + f"drawbox=x={crop_x}:y={y}:w={cw}:h={ch}:color=0x00FF00@0.85:t=6", + f"drawbox=x=({center})-3:y=0:w=6:h=ih:color=0xA855F7@0.45:t=fill", + ] + raw_layout = raw_layout or {} + for key, color in ( + ("person_bbox", "0x38BDF8@0.85"), + ("face_bbox", "0xFACC15@0.9"), + ("second_person_bbox", "0xFB923C@0.85"), + ("second_face_bbox", "0xF97316@0.9"), + ): + bbox_filter = _raw_bbox_filter(raw_layout, key, src_w=src_w, src_h=src_h, color=color) + if bbox_filter: + filters.append(bbox_filter) + filters.append("scale=540:-2") + + duration = max(0.1, clip.duration_sec) + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-t", + f"{duration:.3f}", + "-i", + str(source_video), + "-vf", + ",".join(filters), + "-an", + "-c:v", + "libx264", + "-preset", + "ultrafast", + "-crf", + "26", + "-movflags", + "+faststart", + str(output_path), + ] + ) + return output_path + + +def _word_timing_metrics(words: list[TranscriptWord]) -> dict[str, Any]: + invalid = 0 + very_short = 0 + very_long = 0 + overlaps = 0 + max_gap = 0.0 + prev_end: float | None = None + for word in words: + start = float(word.start_time) + end = float(word.end_time) + duration = end - start + if not (math.isfinite(start) and math.isfinite(end)) or duration <= 0.0: + invalid += 1 + if 0.0 < duration < 0.055: + very_short += 1 + if duration > 1.65: + very_long += 1 + if prev_end is not None: + if start < prev_end - 0.06: + overlaps += 1 + max_gap = max(max_gap, start - prev_end) + prev_end = end + count = len(words) + return { + "word_count": count, + "invalid_count": invalid, + "very_short_count": very_short, + "very_long_count": very_long, + "overlap_count": overlaps, + "max_gap_sec": round(max_gap, 3), + } + + +def _tracking_metrics(layout: LayoutInstruction) -> dict[str, Any]: + points = sorted(layout.person_tracking, key=lambda p: p.t_sec) + jumps = [ + abs(float(points[idx].x_norm) - float(points[idx - 1].x_norm)) + for idx in range(1, len(points)) + ] + edge_count = sum(1 for p in points if p.x_norm < 0.16 or p.x_norm > 0.84) + return { + "tracking_sample_count": len(points), + "max_tracking_jump_norm": round(max(jumps) if jumps else 0.0, 4), + "edge_sample_count": edge_count, + } + + +def _bbox_from_mask(mask: np.ndarray) -> tuple[int, int, int, int] | None: + ys, xs = np.where(mask) + if len(xs) == 0 or len(ys) == 0: + return None + return int(xs.min()), int(ys.min()), int(xs.max()) + 1, int(ys.max()) + 1 + + +def _expand_bbox( + bbox: tuple[int, int, int, int], + *, + width: int, + height: int, + pad_x: int, + pad_y: int, +) -> tuple[int, int, int, int]: + x1, y1, x2, y2 = bbox + return ( + max(0, x1 - pad_x), + max(0, y1 - pad_y), + min(width, x2 + pad_x), + min(height, y2 + pad_y), + ) + + +def _bbox_area(bbox: tuple[int, int, int, int] | None) -> int: + if bbox is 
None: + return 0 + x1, y1, x2, y2 = bbox + return max(0, x2 - x1) * max(0, y2 - y1) + + +def _bbox_intersection_area( + first: tuple[int, int, int, int] | None, + second: tuple[int, int, int, int] | None, +) -> int: + if first is None or second is None: + return 0 + ax1, ay1, ax2, ay2 = first + bx1, by1, bx2, by2 = second + return _bbox_area((max(ax1, bx1), max(ay1, by1), min(ax2, bx2), min(ay2, by2))) + + +def _sample_final_frames( + video_path: Path, + frames_dir: Path, + *, + sample_count: int = _PIXEL_QA_SAMPLES, + width: int = _PIXEL_QA_W, +) -> list[tuple[float, Path]]: + duration = _probe_duration(video_path) or 0.0 + if duration <= 0.0: + return [] + frames_dir.mkdir(parents=True, exist_ok=True) + samples: list[tuple[float, Path]] = [] + for idx in range(max(1, sample_count)): + time_sec = duration * float(idx + 1) / float(sample_count + 1) + frame_path = frames_dir / f"frame_{idx + 1:03d}.jpg" + try: + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-ss", + f"{time_sec:.3f}", + "-i", + str(video_path), + "-frames:v", + "1", + "-vf", + f"scale={width}:-2", + str(frame_path), + ] + ) + except Exception as exc: # noqa: BLE001 - keep QA warning-based + logger.warning( + "Pixel QA frame sample failed for %s at %.2fs: %s", + video_path, + time_sec, + exc, + ) + continue + if frame_path.is_file(): + samples.append((time_sec, frame_path)) + return samples + + +def _caption_masks(arr: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + rgb = arr.astype(np.int16) + r = rgb[:, :, 0] + g = rgb[:, :, 1] + b = rgb[:, :, 2] + purple = ( + (r >= 85) + & (r <= 190) + & (g >= 35) + & (g <= 155) + & (b >= 145) + & ((b - r) >= 32) + & ((r - g) >= 8) + ) + white = (r >= 205) & (g >= 205) & (b >= 205) + return purple, white + + +def _frame_pixel_record(frame_path: Path, *, time_sec: float) -> dict[str, Any]: + image = Image.open(frame_path).convert("RGB") + arr = np.asarray(image) + height, width = arr.shape[:2] + brightness = float(arr.mean() / 255.0) + contrast = float(arr.std() / 255.0) + blank = brightness < 0.035 or contrast < 0.025 + + purple, white = _caption_masks(arr) + y_grid = np.arange(height)[:, None] + x_grid = np.arange(width)[None, :] + caption_region = y_grid >= int(round(height * _PIXEL_QA_CAPTION_MIN_Y_RATIO)) + purple = purple & caption_region + purple_bbox = _bbox_from_mask(purple) + caption_bbox = None + if purple_bbox is not None: + expanded = _expand_bbox( + purple_bbox, + width=width, + height=height, + pad_x=max(36, width // 8), + pad_y=max(14, height // 34), + ) + ex1, ey1, ex2, ey2 = expanded + nearby_white = ( + white + & (x_grid >= ex1) + & (x_grid <= ex2) + & (y_grid >= ey1) + & (y_grid <= ey2) + ) + caption_bbox = _bbox_from_mask(purple | nearby_white) + if caption_bbox is not None: + caption_bbox = _expand_bbox( + caption_bbox, + width=width, + height=height, + pad_x=4, + pad_y=4, + ) + + face_safe_zone = ( + int(round(width * 0.10)), + int(round(height * 0.06)), + int(round(width * 0.90)), + int(round(height * 0.52)), + ) + caption_area = _bbox_area(caption_bbox) + overlap_area = _bbox_intersection_area(caption_bbox, face_safe_zone) + overlap_ratio = overlap_area / max(1, caption_area) + edge_hit = False + edge_bbox = purple_bbox or caption_bbox + if edge_bbox is not None: + x1, y1, x2, y2 = edge_bbox + edge_margin_x = max(2, int(round(width * 0.015))) + edge_margin_y = max(2, int(round(height * 0.01))) + edge_hit = ( + x1 <= edge_margin_x + or x2 >= width - edge_margin_x + or y2 >= height - edge_margin_y + ) + + flags: list[str] = [] + if blank: + 
flags.append("blank_or_flat_frame") + if edge_hit: + flags.append("caption_edge_clip_check") + if caption_bbox is not None and overlap_ratio >= 0.18: + flags.append("caption_face_safe_zone_check") + + return { + "time_sec": round(time_sec, 3), + "frame_path": str(frame_path), + "brightness": round(brightness, 4), + "contrast": round(contrast, 4), + "caption_bbox": list(caption_bbox) if caption_bbox is not None else None, + "purple_bbox": list(purple_bbox) if purple_bbox is not None else None, + "face_safe_zone": list(face_safe_zone), + "caption_face_safe_zone_overlap": round(overlap_ratio, 4), + "flags": flags, + } + + +def _draw_bbox( + draw: ImageDraw.ImageDraw, + bbox: list[int] | tuple[int, int, int, int] | None, + *, + color: str, + width: int = 3, +) -> None: + if not bbox: + return + draw.rectangle(tuple(int(v) for v in bbox), outline=color, width=width) + + +def _write_pixel_qa_sheet(records: list[dict[str, Any]], output_path: Path) -> Path | None: + if not records: + return None + frames: list[Image.Image] = [] + for record in records: + frame_path = Path(str(record.get("frame_path", ""))) + if not frame_path.is_file(): + continue + img = Image.open(frame_path).convert("RGB") + draw = ImageDraw.Draw(img) + has_warning = bool(record.get("flags")) + _draw_bbox(draw, record.get("face_safe_zone"), color="#22c55e", width=2) + _draw_bbox(draw, record.get("caption_bbox"), color="#ef4444" if has_warning else "#a855f7") + label = f"{record.get('time_sec', 0):.1f}s" + if has_warning: + label += " " + ",".join(str(flag) for flag in record.get("flags", [])) + draw.rectangle((0, 0, img.width, 24), fill=(0, 0, 0)) + draw.text((6, 5), label, fill=(255, 255, 255)) + frames.append(img) + if not frames: + return None + + columns = min(4, len(frames)) + rows = int(math.ceil(len(frames) / columns)) + tile_w = max(frame.width for frame in frames) + tile_h = max(frame.height for frame in frames) + sheet = Image.new("RGB", (columns * tile_w, rows * tile_h), (12, 12, 12)) + for idx, frame in enumerate(frames): + x = (idx % columns) * tile_w + y = (idx // columns) * tile_h + sheet.paste(frame, (x, y)) + output_path.parent.mkdir(parents=True, exist_ok=True) + sheet.save(output_path, quality=92) + return output_path + + +def analyze_rendered_pixels(video_path: Path, qa_dir: Path, *, clip_id: str) -> dict[str, Any]: + """Sample rendered frames and run simple pixel-level QA checks.""" + + frames_dir = qa_dir / f"short_{clip_id}_pixel_frames" + records: list[dict[str, Any]] = [] + sheet: Path | None = None + try: + samples = _sample_final_frames(video_path, frames_dir) + for time_sec, frame_path in samples: + records.append(_frame_pixel_record(frame_path, time_sec=time_sec)) + sheet = _write_pixel_qa_sheet(records, qa_dir / f"short_{clip_id}_pixel_qa.jpg") + finally: + shutil.rmtree(frames_dir, ignore_errors=True) + + sample_count = len(records) + caption_seen = sum(1 for record in records if record.get("caption_bbox") is not None) + blank_count = sum(1 for record in records if "blank_or_flat_frame" in record.get("flags", [])) + edge_hits = sum(1 for record in records if "caption_edge_clip_check" in record.get("flags", [])) + safe_zone_hits = sum( + 1 for record in records if "caption_face_safe_zone_check" in record.get("flags", []) + ) + min_contrast = min((float(record.get("contrast", 0.0)) for record in records), default=0.0) + mean_brightness = ( + sum(float(record.get("brightness", 0.0)) for record in records) / sample_count + if sample_count + else 0.0 + ) + + score = 1.0 + if sample_count == 0: + score 
= 0.0 + else: + missing_ratio = max(0.0, (max(2, sample_count // 4) - caption_seen) / max(1, sample_count)) + score -= (blank_count / sample_count) * 0.55 + score -= (edge_hits / sample_count) * 0.28 + score -= (safe_zone_hits / sample_count) * 0.35 + score -= missing_ratio * 0.20 + score = _clamp(score) + + flags: list[str] = [] + if sample_count == 0: + flags.append("pixel_qa_no_samples") + if blank_count: + flags.append("blank_or_flat_frame") + if edge_hits: + flags.append("caption_edge_clip_check") + if safe_zone_hits: + flags.append("caption_face_safe_zone_check") + if sample_count and caption_seen < max(2, sample_count // 4): + flags.append("caption_pixels_sparse_check") + + return { + "pixel_score": round(score, 3), + "flags": flags, + "sample_count": sample_count, + "caption_seen_frames": caption_seen, + "blank_frame_count": blank_count, + "caption_edge_hit_count": edge_hits, + "caption_face_safe_zone_hit_count": safe_zone_hits, + "mean_brightness": round(mean_brightness, 4), + "min_contrast": round(min_contrast, 4), + "annotated_sheet": str(sheet) if sheet is not None else None, + "frames": [ + { + "time_sec": record["time_sec"], + "caption_bbox": record["caption_bbox"], + "flags": record["flags"], + } + for record in records + ], + } + + +def score_short( + output_video: Path, + *, + clip: Clip, + transcript: dict, + layout: LayoutInstruction, +) -> dict[str, Any]: + """Return lightweight, deterministic QA scores for one rendered short.""" + + words = clip_subtitle_words(transcript, clip).words + word_metrics = _word_timing_metrics(words) + tracking = _tracking_metrics(layout) + width_height = _probe_size(output_video) + duration = _probe_duration(output_video) + + word_count = max(1, int(word_metrics["word_count"])) + caption_score = 1.0 + caption_score -= (word_metrics["invalid_count"] / word_count) * 0.55 + caption_score -= (word_metrics["very_short_count"] / word_count) * 0.22 + caption_score -= (word_metrics["very_long_count"] / word_count) * 0.20 + caption_score -= (word_metrics["overlap_count"] / word_count) * 0.28 + if word_metrics["word_count"] == 0: + caption_score = 0.25 + caption_score = _clamp(caption_score) + + sample_count = max(1, int(tracking["tracking_sample_count"])) + max_jump = float(tracking["max_tracking_jump_norm"]) + speaker_score = 1.0 + speaker_score -= (int(tracking["edge_sample_count"]) / sample_count) * 0.35 + speaker_score -= max(0.0, max_jump - 0.18) * 1.4 + if layout.layout not in (LayoutKind.SIT_CENTER, LayoutKind.ZOOM_CALL_CENTER): + speaker_score = max(0.82, speaker_score) + speaker_score = _clamp(speaker_score) + + crop_jump_score = _clamp(1.0 - max(0.0, max_jump - 0.12) * 2.1) + video_score = 1.0 + if width_height != (1080, 1920): + video_score -= 0.18 + if duration is None or duration <= 0.0: + video_score -= 0.35 + video_score = _clamp(video_score) + + overall = ( + caption_score * 0.35 + + speaker_score * 0.30 + + crop_jump_score * 0.20 + + video_score * 0.15 + ) + flags: list[str] = [] + if caption_score < 0.82: + flags.append("caption_timing_check") + if speaker_score < 0.82: + flags.append("speaker_centering_check") + if crop_jump_score < 0.82: + flags.append("crop_jump_check") + if video_score < 0.9: + flags.append("video_probe_check") + + return { + "overall_score": round(overall, 3), + "caption_score": round(caption_score, 3), + "speaker_centering_score": round(speaker_score, 3), + "crop_jump_score": round(crop_jump_score, 3), + "video_score": round(video_score, 3), + "flags": flags, + "video": { + "duration_sec": round(duration, 3) 
if duration is not None else None, + "size": list(width_height) if width_height else None, + }, + "word_timing": word_metrics, + "tracking": tracking, + } + + +def _clip_id_from_output(path: Path) -> str: + match = re.search(r"short_([^\\/]+?)\.mp4$", path.name, flags=re.IGNORECASE) + return match.group(1) if match else path.stem + + +def qa_record_flags(record: dict[str, Any]) -> list[str]: + flags: list[str] = [] + score = record.get("score") + if isinstance(score, dict): + flags.extend(str(flag) for flag in score.get("flags", []) if str(flag)) + pixel_qa = record.get("pixel_qa") + if isinstance(pixel_qa, dict): + flags.extend(str(flag) for flag in pixel_qa.get("flags", []) if str(flag)) + if record.get("errors"): + flags.append("qa_error") + return list(dict.fromkeys(flags)) + + +def qa_summary_lines(manifest_path: Path) -> list[str]: + if not manifest_path.is_file(): + return [] + try: + payload = json.loads(manifest_path.read_text(encoding="utf-8")) + except Exception: + return [] + records = payload.get("shorts", []) + if not isinstance(records, list): + return [] + lines: list[str] = [] + for record in records: + if not isinstance(record, dict): + continue + clip_id = str(record.get("clip_id", "")).strip() + if not clip_id: + continue + flags = qa_record_flags(record) + status = "WARN " + ", ".join(flags) if flags else "OK" + lines.append(f"short_{clip_id} {status}") + return lines + + +def run_render_qa( + *, + output_dir: Path, + final_outputs: list[Path], + render_clips_by_id: dict[str, Clip], + transcripts_by_id: dict[str, dict], + layouts_by_id: dict[str, LayoutInstruction], + assembled_sources_by_id: dict[str, Path], + raw_layouts_by_id: dict[str, dict[str, Any]] | None = None, + reference_video: Path | None = None, + debug_overlay: bool = True, +) -> Path: + """Create QA artifacts for all rendered shorts and return the manifest path.""" + + qa_dir = output_dir / "render_qa" + qa_dir.mkdir(parents=True, exist_ok=True) + raw_layouts_by_id = raw_layouts_by_id or {} + manifest_path = qa_dir / "qa_manifest.json" + records_by_id: dict[str, dict[str, Any]] = {} + if manifest_path.is_file(): + try: + existing = json.loads(manifest_path.read_text(encoding="utf-8")) + for item in existing.get("shorts", []): + if isinstance(item, dict) and item.get("clip_id"): + records_by_id[str(item["clip_id"])] = item + except Exception as exc: # noqa: BLE001 - stale QA should not block updates + logger.warning("Ignoring stale QA manifest at %s: %s", manifest_path, exc) + + for video_path in final_outputs: + clip_id = _clip_id_from_output(video_path) + clip = render_clips_by_id.get(clip_id) + transcript = transcripts_by_id.get(clip_id) + layout = layouts_by_id.get(clip_id) + record: dict[str, Any] = { + "clip_id": clip_id, + "output": str(video_path), + "artifacts": {}, + "errors": [], + } + + try: + sheet = create_contact_sheet(video_path, qa_dir / f"short_{clip_id}_contact.jpg") + record["artifacts"]["contact_sheet"] = str(sheet) + except Exception as exc: # noqa: BLE001 - QA must not fail the render + record["errors"].append(f"contact_sheet: {exc}") + logger.warning("Render QA contact sheet failed for %s: %s", clip_id, exc) + + if reference_video is not None and reference_video.is_file(): + try: + compare = create_ab_compare( + reference_video, + video_path, + qa_dir / f"short_{clip_id}_ab_compare.jpg", + ) + record["artifacts"]["ab_compare"] = str(compare) + except Exception as exc: # noqa: BLE001 + record["errors"].append(f"ab_compare: {exc}") + logger.warning("Render QA A/B compare failed for 
%s: %s", clip_id, exc) + + if debug_overlay and clip is not None and layout is not None: + source = assembled_sources_by_id.get(clip_id) + if source is not None and source.is_file(): + try: + debug = create_crop_debug_overlay( + source, + qa_dir / f"short_{clip_id}_crop_debug.mp4", + clip=clip, + layout=layout, + raw_layout=raw_layouts_by_id.get(clip_id), + ) + record["artifacts"]["crop_debug_overlay"] = str(debug) + except Exception as exc: # noqa: BLE001 + record["errors"].append(f"crop_debug_overlay: {exc}") + logger.warning("Render QA crop debug failed for %s: %s", clip_id, exc) + + try: + pixel_qa = analyze_rendered_pixels(video_path, qa_dir, clip_id=clip_id) + record["pixel_qa"] = pixel_qa + if pixel_qa.get("annotated_sheet"): + record["artifacts"]["pixel_qa_sheet"] = pixel_qa["annotated_sheet"] + except Exception as exc: # noqa: BLE001 + record["errors"].append(f"pixel_qa: {exc}") + logger.warning("Render QA pixel checks failed for %s: %s", clip_id, exc) + pixel_qa = None + + if clip is not None and transcript is not None and layout is not None: + score = score_short( + video_path, + clip=clip, + transcript=transcript, + layout=layout, + ) + if isinstance(pixel_qa, dict): + pixel_score = float(pixel_qa.get("pixel_score", 0.0)) + score["pixel_score"] = round(pixel_score, 3) + merged_flags = list( + dict.fromkeys(score.get("flags", []) + pixel_qa.get("flags", [])) + ) + score["flags"] = merged_flags + score["overall_score"] = round( + _clamp(float(score["overall_score"]) * 0.80 + pixel_score * 0.20), + 3, + ) + record["score"] = score + else: + record["errors"].append("score: missing clip, transcript, or layout") + + records_by_id[clip_id] = record + + manifest: dict[str, Any] = { + "shorts": [records_by_id[key] for key in sorted(records_by_id)] + } + manifest_path.write_text( + json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + logger.info("Render QA manifest written: %s", manifest_path) + logger.info("Render QA summary:") + for line in qa_summary_lines(manifest_path): + logger.info(" %s", line) + return manifest_path + + +def _main() -> None: + parser = argparse.ArgumentParser(description="Create a reference/output A/B contact sheet.") + parser.add_argument("--reference", type=Path, required=True, help="Reference video path.") + parser.add_argument( + "--output-video", + type=Path, + required=True, + help="Rendered output video path.", + ) + parser.add_argument("--out", type=Path, required=True, help="Compare image output path.") + parser.add_argument("--fps", type=float, default=4.0, help="Contact-sheet sample FPS.") + args = parser.parse_args() + create_ab_compare(args.reference, args.output_video, args.out, fps=args.fps) + print(args.out) + + +if __name__ == "__main__": + _main() diff --git a/src/humeo/render_window.py b/src/humeo/render_window.py new file mode 100644 index 0000000000000000000000000000000000000000..461cd06fb243dc4d621e743a4f1dff1cbb239993 --- /dev/null +++ b/src/humeo/render_window.py @@ -0,0 +1,48 @@ +"""Map LLM clip timing (segment + trim + hook) to one ffmpeg source window. + +``humeo_core.primitives.compile`` already cuts with ``-ss`` / ``-t`` from ``Clip``; +this module is the single place that turns trim/hook fields into concrete bounds. +""" + +from __future__ import annotations + +from humeo_core.schemas import Clip + + +def effective_export_bounds(clip: Clip) -> tuple[float, float]: + """Return ``(start_sec, end_sec)`` on the source timeline for the exported short. + + 1. 
**Trim** narrows ``[start_time_sec, end_time_sec]``.
+    2. ``render_spans``, when present, override the contiguous trimmed window.
+    3. Hook fields remain metadata and do not change the export window.
+    """
+    if clip.render_spans:
+        return clip.render_spans[0].start_time_sec, clip.render_spans[-1].end_time_sec
+
+    s0 = clip.start_time_sec
+    s1 = clip.end_time_sec
+
+    t_lo = s0 + clip.trim_start_sec
+    t_hi = s1 - clip.trim_end_sec
+    if t_hi <= t_lo:
+        t_lo, t_hi = s0, s1
+
+    return t_lo, t_hi
+
+
+def clip_for_render(clip: Clip) -> Clip:
+    """Copy with ``start``/``end`` set to the actual cut; trim/hook cleared."""
+    t0, t1 = effective_export_bounds(clip)
+    return clip.model_copy(
+        update={
+            "start_time_sec": t0,
+            "end_time_sec": t1,
+            "trim_start_sec": 0.0,
+            "trim_end_sec": 0.0,
+            "hook_start_sec": None,
+            "hook_end_sec": None,
+        }
+    )
diff --git a/src/humeo/session_state.py b/src/humeo/session_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6abeb0d4cbac5f90d2481316e0bdd91c3a2bd92
--- /dev/null
+++ b/src/humeo/session_state.py
@@ -0,0 +1,67 @@
+"""Persist and load interactive session state."""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+from humeo.video_cache import extract_youtube_video_id, normalize_local_source_path
+from humeo_core.schemas import SessionState
+
+logger = logging.getLogger(__name__)
+
+SESSION_STATE_FILENAME = "session_state.json"
+
+
+def source_key_for_url(youtube_url: str) -> str:
+    video_id = extract_youtube_video_id(youtube_url)
+    if video_id:
+        return f"youtube:{video_id}"
+    local_path = normalize_local_source_path(youtube_url)
+    if local_path is not None:
+        return f"local:{local_path}"
+    return f"url:{youtube_url}"
+
+
+def fresh_state(youtube_url: str) -> SessionState:
+    return SessionState(source_key=source_key_for_url(youtube_url))
+
+
+def load_state(work_dir: Path, youtube_url: str) -> SessionState:
+    """Load session state, resetting on corruption or source mismatch."""
+    path = work_dir / SESSION_STATE_FILENAME
+    state = fresh_state(youtube_url)
+    if not path.is_file():
+        return state
+
+    try:
+        with open(path, encoding="utf-8") as f:
+            payload = json.load(f)
+        loaded = SessionState.model_validate(payload)
+    except Exception as exc:
+        logger.warning("Session state at %s is invalid. Starting fresh. Error: %s", path, exc)
+        return state
+
+    if loaded.source_key and loaded.source_key != state.source_key:
+        logger.warning(
+            "Session state at %s belongs to %s, not %s. 
Starting fresh.", + path, + loaded.source_key, + state.source_key, + ) + return state + + if not loaded.source_key: + loaded.source_key = state.source_key + return loaded + + +def save_state(work_dir: Path, state: SessionState) -> Path: + """Persist session state to the work dir.""" + work_dir.mkdir(parents=True, exist_ok=True) + path = work_dir / SESSION_STATE_FILENAME + with open(path, "w", encoding="utf-8") as f: + f.write(state.model_dump_json(indent=2)) + f.write("\n") + return path diff --git a/src/humeo/transcript_align.py b/src/humeo/transcript_align.py new file mode 100644 index 0000000000000000000000000000000000000000..da1cb690a684c60277fb9207ee2b76b9f77fbc8b --- /dev/null +++ b/src/humeo/transcript_align.py @@ -0,0 +1,290 @@ +"""Map source-timeline ASR words to per-clip subtitle timings (t=0 at clip in-point).""" + +from __future__ import annotations + +from humeo_core.schemas import Clip, ClipSubtitleWords, RenderTheme, TranscriptWord + +# Whisper / WhisperX / OpenAI-normalized segment shapes +_MAX_WORDS_PER_CUE = 8 +_MAX_CUE_SEC = 4.0 +_PUNCTUATION_BREAK_CHARS = (".", "?", "!", ";", ":") +_SENTENCE_RESTART_WORDS = frozenset( + { + "And", + "But", + "Did", + "Now", + "So", + "That", + "Then", + "This", + "Those", + "What", + "When", + "Where", + "Why", + } +) + + +def _iter_words_from_segments(transcript: dict) -> list[TranscriptWord]: + out: list[TranscriptWord] = [] + for seg in transcript.get("segments", []) or []: + words = seg.get("words") or [] + if words: + for raw in words: + w = str(raw.get("word", "")).strip() + if not w: + continue + out.append( + TranscriptWord( + word=w, + start_time=float(raw["start"]), + end_time=float(raw["end"]), + ) + ) + continue + # Segment-level only (no word list): treat whole segment as one token + text = str(seg.get("text", "")).strip() + if text: + out.append( + TranscriptWord( + word=text, + start_time=float(seg.get("start", 0.0)), + end_time=float(seg.get("end", 0.0)), + ) + ) + return out + + +def clip_subtitle_words(transcript: dict, clip: Clip) -> ClipSubtitleWords: + """Words overlapping ``clip`` with times shifted to start at 0 (clip-local).""" + clip_start = clip.start_time_sec + clip_end = clip.end_time_sec + words = _iter_words_from_segments(transcript) + local: list[TranscriptWord] = [] + for w in words: + if w.end_time <= clip_start or w.start_time >= clip_end: + continue + t0 = max(w.start_time, clip_start) - clip_start + t1 = min(w.end_time, clip_end) - clip_start + if t1 <= t0: + continue + local.append(TranscriptWord(word=w.word, start_time=t0, end_time=t1)) + + if local: + return ClipSubtitleWords(words=local) + + return ClipSubtitleWords(words=_fallback_even_words(clip)) + + +def _fallback_even_words(clip: Clip) -> list[TranscriptWord]: + """Even split over clip duration when no word timestamps exist.""" + text = (clip.transcript or "").strip() + if not text: + return [] + parts = text.split() + if not parts: + return [] + d = clip.duration_sec + step = d / len(parts) + out: list[TranscriptWord] = [] + for i, p in enumerate(parts): + out.append( + TranscriptWord( + word=p, + start_time=i * step, + end_time=(i + 1) * step if i < len(parts) - 1 else d, + ) + ) + return out + + +def _looks_like_sentence_restart(prev_word: str, next_word: str) -> bool: + prev = prev_word.rstrip("\"')]}") + nxt = next_word.lstrip("\"'([{") + if not prev or not nxt: + return False + if nxt in _SENTENCE_RESTART_WORDS: + return True + return any(ch.isdigit() for ch in prev) and nxt[0].isupper() + + +def clip_words_to_srt_lines( + words: 
list[TranscriptWord], + *, + max_words_per_cue: int = _MAX_WORDS_PER_CUE, + max_cue_sec: float = _MAX_CUE_SEC, + prefer_break_on_punctuation: bool = False, + min_words_before_break: int = 1, +) -> list[tuple[float, float, str]]: + """Group words into SRT cues: max N words and max duration per cue.""" + chunks = group_words_to_cue_chunks( + words, + max_words_per_cue=max_words_per_cue, + max_cue_sec=max_cue_sec, + prefer_break_on_punctuation=prefer_break_on_punctuation, + min_words_before_break=min_words_before_break, + ) + return [ + (chunk[0].start_time, chunk[-1].end_time, " ".join(w.word for w in chunk)) + for chunk in chunks + ] + + +def group_words_to_cue_chunks( + words: list[TranscriptWord], + *, + max_words_per_cue: int = _MAX_WORDS_PER_CUE, + max_cue_sec: float = _MAX_CUE_SEC, + prefer_break_on_punctuation: bool = False, + min_words_before_break: int = 1, +) -> list[list[TranscriptWord]]: + """Group words into timed cue chunks while preserving per-word timings.""" + if not words: + return [] + max_words_per_cue = max(1, int(max_words_per_cue)) + max_cue_sec = max(0.2, float(max_cue_sec)) + min_words_before_break = max(1, int(min_words_before_break)) + chunks_out: list[list[TranscriptWord]] = [] + i = 0 + n = len(words) + while i < n: + chunk: list[TranscriptWord] = [words[i]] + t0 = words[i].start_time + end_t = words[i].end_time + j = i + 1 + while j < n: + w = words[j] + if len(chunk) >= max_words_per_cue: + break + if w.start_time - t0 > max_cue_sec: + break + if ( + prefer_break_on_punctuation + and (len(chunk) >= 2 or end_t - t0 >= 0.45) + and _looks_like_sentence_restart(chunk[-1].word, w.word) + ): + break + chunk.append(w) + end_t = w.end_time + j += 1 + if ( + prefer_break_on_punctuation + and len(chunk) >= min_words_before_break + and chunk[-1].word.rstrip("\"')]}").endswith(_PUNCTUATION_BREAK_CHARS) + ): + break + chunks_out.append(chunk) + i = j + return chunks_out + + +def format_srt(lines: list[tuple[float, float, str]]) -> str: + blocks: list[str] = [] + for idx, (start, end, text) in enumerate(lines, start=1): + blocks.append( + f"{idx}\n{_fmt_time(start)} --> {_fmt_time(end)}\n{text}\n" + ) + return "\n".join(blocks) + + +def _fmt_time(seconds: float) -> str: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + millis = int(round((seconds % 1) * 1000)) + if millis >= 1000: + millis = 999 + return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" + + +# --------------------------------------------------------------------------- +# ASS / SubStation Alpha output (the format libass natively renders) +# --------------------------------------------------------------------------- + + +def _fmt_ass_time(seconds: float) -> str: + """ASS time format: ``H:MM:SS.cs`` (centiseconds).""" + seconds = max(0.0, seconds) + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = seconds % 60 + whole = int(secs) + cs = int(round((secs - whole) * 100)) + if cs >= 100: + cs = 99 + return f"{hours:d}:{minutes:02d}:{whole:02d}.{cs:02d}" + + +def _escape_ass_text(text: str) -> str: + """Escape characters that are significant to the ASS dialogue parser.""" + return ( + text.replace("\\", r"\\") + .replace("{", r"\{") + .replace("}", r"\}") + .replace("\n", r"\N") + ) + + +def format_ass( + lines: list[tuple[float, float, str]], + *, + play_res_x: int, + play_res_y: int, + font_size: int, + margin_v: int, + margin_h: int = 60, + font_name: str = "Arial", + render_theme: RenderTheme = RenderTheme.LEGACY, +) -> str: + 
"""Render captions as an ASS script whose PlayRes matches the output video. + + Why this exists: libass' font/margin scaling multiplies every pixel-ish + value by ``video_height / PlayResY``. The default ``PlayResY=288`` blew + ``FontSize=48`` up to ~320 output pixels and pushed ``MarginV`` to the + middle of the frame. Pinning ``PlayResY`` to the actual output height + makes that scale factor exactly 1.0, so ``font_size`` and ``margin_v`` + below are honest output pixel values. + """ + + if render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + style_line = ( + f"Style: Default,{font_name},{font_size},&H00FFFFFF,&H000000FF," + "&H00000000,&H00000000,-1,0,0,0,100,100,-1,0,1,3,0,2," + f"{margin_h},{margin_h},{margin_v},0\n" + ) + else: + style_line = ( + f"Style: Default,{font_name},{font_size},&H00FFFFFF,&H000000FF," + f"&H00000000,&H70000000,-1,0,0,0,100,100,0,0,4,0,0,2," + f"{margin_h},{margin_h},{margin_v},0\n" + ) + + header = ( + "[Script Info]\n" + "ScriptType: v4.00+\n" + f"PlayResX: {play_res_x}\n" + f"PlayResY: {play_res_y}\n" + "WrapStyle: 0\n" + "ScaledBorderAndShadow: yes\n" + "YCbCr Matrix: None\n" + "\n" + "[V4+ Styles]\n" + "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, " + "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, " + "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, " + "Alignment, MarginL, MarginR, MarginV, Encoding\n" + + style_line + + "\n" + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n" + ) + + events = [] + for start, end, text in lines: + events.append( + f"Dialogue: 0,{_fmt_ass_time(start)},{_fmt_ass_time(end)},Default,," + f"0,0,0,,{_escape_ass_text(text)}" + ) + return header + "\n".join(events) + ("\n" if events else "") diff --git a/src/humeo/video_cache.py b/src/humeo/video_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..7de242b349afb0410c740805d5381dbb32a9dfb1 --- /dev/null +++ b/src/humeo/video_cache.py @@ -0,0 +1,231 @@ +"""Video ingest cache: YouTube id β†’ work directory + manifest on disk.""" + +from __future__ import annotations + +import hashlib +import json +import logging +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from pydantic import BaseModel, Field + +from humeo.env import default_humeo_cache_root + +logger = logging.getLogger(__name__) + +# Typical watch / short / embed URLs (11-char id). 
+_YOUTUBE_ID_RE = re.compile( + r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/)([a-zA-Z0-9_-]{11})" +) + +MANIFEST_VERSION = 1 +MANIFEST_NAME = "video_cache_manifest.json" +LOCAL_SOURCE_INFO_NAME = "source.local.json" + + +class VideoCacheEntry(BaseModel): + """One row in the global cache manifest (machine-checkable, Pydantic-only).""" + + video_id: str + url: str = "" + title: str = "" + channel: str = "" + work_dir: str + source_mp4: str + transcript_json: str + downloaded_at: str = "" # ISO 8601 UTC when ingest completed + + +class VideoCacheManifest(BaseModel): + version: int = MANIFEST_VERSION + entries: dict[str, VideoCacheEntry] = Field(default_factory=dict) + + +def extract_youtube_video_id(url: str) -> str | None: + """Return the 11-character video id, or None if not a recognized YouTube URL.""" + m = _YOUTUBE_ID_RE.search(url) + return m.group(1) if m else None + + +def looks_like_local_source(source: str) -> bool: + """Return True when ``source`` should be treated as a local file path.""" + if extract_youtube_video_id(source): + return False + return "://" not in source + + +def normalize_local_source_path(source: str) -> Path | None: + """Return an absolute local path for ``source`` when it is file-like.""" + if not looks_like_local_source(source): + return None + return Path(source).expanduser().resolve(strict=False) + + +def local_source_cache_key(source: str) -> str | None: + """Return a stable cache key for a local source path.""" + path = normalize_local_source_path(source) + if path is None: + return None + stem = re.sub(r"[^a-zA-Z0-9]+", "-", path.stem).strip("-").lower() or "video" + digest = hashlib.sha256(str(path).encode("utf-8")).hexdigest()[:16] + return f"{stem}-{digest}" + + +def _local_source_info_path(work_dir: Path) -> Path: + return work_dir / LOCAL_SOURCE_INFO_NAME + + +def read_local_source_info(work_dir: Path) -> dict[str, str]: + """Read ``source.local.json`` when present.""" + path = _local_source_info_path(work_dir) + if not path.is_file(): + return {} + with open(path, encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict): + return {} + return {str(k): str(v) for k, v in data.items()} + + +def write_local_source_info(work_dir: Path, source_path: Path) -> Path: + """Persist the original local source path used for ``source.mp4``.""" + work_dir.mkdir(parents=True, exist_ok=True) + path = _local_source_info_path(work_dir) + payload = {"local_source_path": str(Path(source_path).expanduser().resolve(strict=False))} + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2) + f.write("\n") + return path + + +def local_source_matches(work_dir: Path, source: str) -> bool: + """Return True when ``work_dir`` already contains the same local source.""" + path = normalize_local_source_path(source) + if path is None: + return False + info = read_local_source_info(work_dir) + return info.get("local_source_path") == str(path) + + +def manifest_path(cache_root: Path | None = None) -> Path: + root = cache_root if cache_root is not None else default_humeo_cache_root() + root.mkdir(parents=True, exist_ok=True) + return root / MANIFEST_NAME + + +def load_manifest(cache_root: Path | None = None) -> VideoCacheManifest: + path = manifest_path(cache_root) + if not path.exists(): + return VideoCacheManifest() + with open(path, encoding="utf-8") as f: + data: Any = json.load(f) + return VideoCacheManifest.model_validate(data) + + +def save_manifest(manifest: VideoCacheManifest, cache_root: Path | None = 
None) -> Path: + path = manifest_path(cache_root) + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + f.write(manifest.model_dump_json(indent=2)) + return path + + +def resolve_work_directory( + *, + youtube_url: str, + explicit_work_dir: Path | None, + use_video_cache: bool, + cache_root: Path | None, +) -> Path: + """Pick the directory for ``source.mp4``, ``transcript.json``, ``clips.json``, etc. + + - If ``explicit_work_dir`` is set (CLI ``--work-dir``), use it. + - Else if video cache is disabled, use ``.humeo_work``. + - Else if the source is a local file path, use ``<cache_root>/local/<local_key>/``. + - Else if the source has no YouTube id, use ``.humeo_work``. + - Else use ``<cache_root>/videos/<video_id>/`` (creates parents as needed). + """ + if explicit_work_dir is not None: + p = Path(explicit_work_dir).resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + if not use_video_cache: + p = Path(".humeo_work").resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + local_key = local_source_cache_key(youtube_url) + if local_key: + root = cache_root if cache_root is not None else default_humeo_cache_root() + p = (root / "local" / local_key).resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + vid = extract_youtube_video_id(youtube_url) + if not vid: + p = Path(".humeo_work").resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + root = cache_root if cache_root is not None else default_humeo_cache_root() + p = (root / "videos" / vid).resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + +def ingest_complete(work_dir: Path, source: str | None = None) -> bool: + """Return True if both video and transcript exist and match the current source.""" + complete = (work_dir / "source.mp4").is_file() and (work_dir / "transcript.json").is_file() + if not complete: + return False + if source is None: + return True + local_path = normalize_local_source_path(source) + if local_path is None: + return True + return local_source_matches(work_dir, source) + + +def read_youtube_info_json(work_dir: Path) -> dict[str, Any]: + """Read ``source.info.json`` written by yt-dlp ``--write-info-json``.""" + p = work_dir / "source.info.json" + if not p.is_file(): + return {} + with open(p, encoding="utf-8") as f: + return json.load(f) + + +def upsert_manifest_from_info( + *, + work_dir: Path, + youtube_url: str, + info: dict[str, Any], + cache_root: Path | None = None, +) -> None: + """Merge or add a manifest entry after successful ingest.""" + vid = (info.get("id") or extract_youtube_video_id(youtube_url) or "").strip() + if not vid: + logger.debug("No video id for manifest; skipping.") + return + + now = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + wd = work_dir.resolve() + entry = VideoCacheEntry( + video_id=vid, + url=str(info.get("webpage_url") or youtube_url), + title=str(info.get("title") or ""), + channel=str(info.get("channel") or info.get("uploader") or ""), + work_dir=str(wd), + source_mp4=str((wd / "source.mp4").resolve()), + transcript_json=str((wd / "transcript.json").resolve()), + downloaded_at=now, + ) + + manifest = load_manifest(cache_root) + manifest.entries[vid] = entry + path = save_manifest(manifest, cache_root) + logger.info("Updated video cache manifest: %s", path)
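The export-window rule in `src/humeo/render_window.py` above is easiest to verify with concrete numbers. Below is a minimal standalone sketch of the same precedence (`render_spans` first, then trim, then untrimmed fallback); `MiniClip` is a simplified stand-in for illustration, not the real `humeo_core.schemas.Clip`, which has more fields.

```python
from dataclasses import dataclass, field


@dataclass
class Span:
    start_time_sec: float
    end_time_sec: float


@dataclass
class MiniClip:
    # Simplified stand-in; field names mirror the diff's Clip schema.
    start_time_sec: float
    end_time_sec: float
    trim_start_sec: float = 0.0
    trim_end_sec: float = 0.0
    render_spans: list[Span] = field(default_factory=list)


def export_bounds(clip: MiniClip) -> tuple[float, float]:
    # render_spans, when present, win outright.
    if clip.render_spans:
        return clip.render_spans[0].start_time_sec, clip.render_spans[-1].end_time_sec
    lo = clip.start_time_sec + clip.trim_start_sec
    hi = clip.end_time_sec - clip.trim_end_sec
    # A trim that inverts the window falls back to the untrimmed segment.
    if hi <= lo:
        return clip.start_time_sec, clip.end_time_sec
    return lo, hi


print(export_bounds(MiniClip(100.0, 160.0, trim_start_sec=3.0, trim_end_sec=5.0)))    # (103.0, 155.0)
print(export_bounds(MiniClip(100.0, 160.0, trim_start_sec=40.0, trim_end_sec=40.0)))  # (100.0, 160.0)
```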
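`src/humeo/transcript_align.py` does two jobs in sequence: shift source-timeline word timings so the clip in-point becomes t=0, then pack words into cues of at most 8 words and 4 seconds. A rough sketch of both steps on plain tuples, ignoring the module's Pydantic types and its punctuation/sentence-restart heuristics:

```python
Word = tuple[str, float, float]  # (text, start_sec, end_sec) on the source timeline


def clip_local_words(words: list[Word], clip_start: float, clip_end: float) -> list[Word]:
    """Keep words overlapping the clip and shift them so the clip starts at t=0."""
    out: list[Word] = []
    for text, s, e in words:
        if e <= clip_start or s >= clip_end:
            continue
        out.append((text, max(s, clip_start) - clip_start, min(e, clip_end) - clip_start))
    return out


def group_cues(words: list[Word], max_words: int = 8, max_sec: float = 4.0) -> list[list[Word]]:
    """Greedy packing: start a new cue when the word or duration budget is exceeded."""
    cues: list[list[Word]] = []
    for w in words:
        # Duration is measured from the first word's start in the current cue.
        if cues and len(cues[-1]) < max_words and w[1] - cues[-1][0][1] <= max_sec:
            cues[-1].append(w)
        else:
            cues.append([w])
    return cues


src = [("Welcome", 120.2, 120.6), ("back", 120.6, 120.9), ("everyone", 121.0, 121.5)]
local = clip_local_words(src, clip_start=120.0, clip_end=150.0)
print(group_cues(local))  # one cue with all three words, times shifted by 120 s
```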
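The scaling argument in the `format_ass` docstring can be checked directly: libass multiplies pixel-style values by `output_height / PlayResY`. A few lines of arithmetic, assuming a 1080x1920 vertical render (the output size is an assumption here, not stated in this diff):

```python
def libass_scaled(px_value: float, play_res_y: int, output_height: int) -> float:
    # libass scales pixel-ish style values by output_height / PlayResY.
    return px_value * output_height / play_res_y


# Default PlayResY=288 inflates FontSize=48 to ~320 output pixels on a 1920-tall frame.
print(libass_scaled(48, play_res_y=288, output_height=1920))   # 320.0
# Pinning PlayResY to the output height makes the factor exactly 1.0.
print(libass_scaled(48, play_res_y=1920, output_height=1920))  # 48.0
```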
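`src/humeo/video_cache.py` keys local sources by a slugified filename stem plus a truncated SHA-256 of the absolute path, and YouTube sources by their 11-character id. A standalone sketch of that derivation; unlike the real `looks_like_local_source`, this simplified helper does not reject other `://` URLs, and the filenames are illustrative:

```python
import hashlib
import re
from pathlib import Path

# Same watch/short/embed patterns as the diff's _YOUTUBE_ID_RE (11-char ids).
_YT = re.compile(
    r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/)([a-zA-Z0-9_-]{11})"
)


def cache_key(source: str) -> str:
    """YouTube id when recognized, else slugified stem plus 16 hex chars of SHA-256(path)."""
    m = _YT.search(source)
    if m:
        return m.group(1)
    path = Path(source).expanduser().resolve(strict=False)
    stem = re.sub(r"[^a-zA-Z0-9]+", "-", path.stem).strip("-").lower() or "video"
    return f"{stem}-{hashlib.sha256(str(path).encode('utf-8')).hexdigest()[:16]}"


print(cache_key("https://youtu.be/dQw4w9WgXcQ"))  # dQw4w9WgXcQ
print(cache_key("My Podcast EP12.mp4"))           # my-podcast-ep12-<16 hex chars>
```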
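A usage example of the work-directory resolution defined above, assuming the repo is installed so `humeo.video_cache` is importable; the cache root and video id are illustrative, and the resolved directory is created as a side effect:

```python
from pathlib import Path

from humeo.video_cache import extract_youtube_video_id, resolve_work_directory

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
print(extract_youtube_video_id(url))  # dQw4w9WgXcQ

# With the cache enabled and no --work-dir, a YouTube source lands in
# <cache_root>/videos/<video_id>/.
work_dir = resolve_work_directory(
    youtube_url=url,
    explicit_work_dir=None,
    use_video_cache=True,
    cache_root=Path("/tmp/humeo-cache"),
)
print(work_dir)  # /tmp/humeo-cache/videos/dQw4w9WgXcQ
```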