diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..076f58049936f42603b4e1a070dc965eb4316b09 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,15 @@ +.git +.env +.env.* +!.env.example +.venv +__pycache__ +.pytest_cache +.humeo_* +.tmp_review_frames +.tmp_review_frames_ticketc +output +output* +*.log +*.zip +*.pyc diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..d1eaaae6f4b71a996fba468c461ffbf599f74deb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf filter=lfs diff=lfs merge=lfs -text diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..a78fc156804e6d7261a277a4dcfc2e829a7a7a53 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.12-slim-bookworm + +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PORT=7860 + +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y ffmpeg && \ + rm -rf /var/lib/apt/lists/* + +COPY . /app + +RUN pip install --upgrade pip && \ + pip install ./humeo-core && \ + pip install . + +EXPOSE 7860 + +CMD ["python", "app.py"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f2d768392bb52fd2079bb0a642954ba1bbe8688a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 NotABot + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 6838c02488f486801cce092812db9b806d4eab2d..ce8910f6ecc271fdc01677157ae28439a9c09bfa 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,199 @@ ---- -title: Clipforge -emoji: πŸ† -colorFrom: blue -colorTo: gray -sdk: docker -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +--- +title: ClipForge +sdk: docker +app_port: 7860 +--- + +# ClipForge + +Current default preset: + +- `native_highlight` captions +- OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages +- Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available +- ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set + +Long podcast or interview β†’ vertical 9:16 shorts. 
Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render. + +**Architecture (static HTML, GitHub Pages):** +[https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html) + +## Hugging Face Space + +This repo includes a Hugging Face Docker Space entrypoint in `app.py` with the ClipForge upload/link UI. + +- Paste a YouTube/video URL or upload one local video file +- Watch live pipeline progress in the ClipForge UI +- Preview and download rendered `short_*.mp4` clips from the UI +- Regenerate from the same source with a steering prompt + +Required Space secrets: + +- `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY` +- `OPENAI_API_KEY` or `ELEVENLABS_API_KEY` + +If `HUMEO_TRANSCRIBE_PROVIDER` is not set, the Space uses ElevenLabs when +`ELEVENLABS_API_KEY` exists, otherwise OpenAI Whisper. + +## Repo layout + +| Path | Role | +|------|------| +| `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters | +| `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server | + +## Pipeline (actual order) + +```text +YouTube URL + β†’ ingest (source.mp4, transcript.json) + β†’ clip selection (Gemini β†’ clips.json) + β†’ hook detection (Gemini β†’ hooks.json) + β†’ content pruning (Gemini β†’ prune.json) + β†’ keyframes + layout vision (Gemini vision β†’ layout_vision.json) + β†’ ASS subtitles + humeo-core ffmpeg render β†’ short_.mp4 +``` + +Details: **`docs/PIPELINE.md`**. + +## Five layouts + +A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**). + +## Requirements + +- **Python** β‰₯ 3.10 +- **`uv`** β€” install: [astral.sh/uv](https://docs.astral.sh/uv/) +- **`ffmpeg`** β€” on `PATH` for extract/render +- **API keys** β€” see **`docs/ENVIRONMENT.md`** + - `GOOGLE_API_KEY` or `GEMINI_API_KEY` β€” preferred for Gemini stages + - `OPENROUTER_API_KEY` β€” supported fallback for those same Gemini-like stages when Google keys are unavailable + - `OPENAI_API_KEY` β€” if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`) + +Copy **`.env.example`** β†’ **`.env`** (never commit `.env`). + +## Install + +```bash +uv venv +uv sync +``` + +Optional local WhisperX (heavy; Windows often uses OpenAI API instead): + +```bash +uv sync --extra whisper +``` + +## Run + +```bash +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" +humeo --long-to-shorts "C:\path\to\video.mp4" +``` + +Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**). + +## CLI guide (all flags) + +Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`. + +### Required + +| Flag | Meaning | +|------|---------| +| `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). | + +### Paths and cache behavior + +| Flag | Meaning | +|------|---------| +| `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). | +| `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). | +| `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. | +| `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). 
| +| `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. | + +### Model selection and stage forcing + +| Flag | Meaning | +|------|---------| +| `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). | +| `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). | +| `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. | +| `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. | +| `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. | +| `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. | +| `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). | + +### Pruning and subtitles + +| Flag | Meaning | +|------|---------| +| `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). | +| `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). | +| `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). | +| `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). | +| `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). | + +### Logging + +| Flag | Meaning | +|------|---------| +| `--verbose`, `-v` | Enable debug logging. | + +### Common command recipes + +```bash +# Basic run +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" + +# Local MP4 +humeo --long-to-shorts "C:\path\to\video.mp4" + +# Full fresh run for debugging / prompt tuning +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose + +# Re-run only clip selection after prompt edits +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection + +# Keep intermediates in a fixed local folder +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work + +# Compare different prune levels on same source +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive +``` + +## Documentation + +| Doc | Purpose | +|-----|---------| +| **`docs/README.md`** | Index of all files under `docs/` | +| **`docs/STUDY_ORDER.md`** | Read order for onboarding | +| **`docs/PIPELINE.md`** | Stages, caches, JSON contracts | +| **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout | +| **`docs/SHARING.md`** | How to share logs/docs/video without bloating git | +| **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example | +| **`docs/full_run_output.txt`** | Example full run log (text) | +| **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping Β§9 | +| **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap | +| **`docs/TODO.md`** | Backlog | +| **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) | +| **`docs/SOLUTIONS.md`** | Design rationale | +| **`TERMINOLOGY.md`** | Glossary | + +## Tests + +```bash +uv sync --extra dev +uv run pytest +``` + +## Sharing outputs + +`output/`, `*.mp4`, and `keyframes/` are **gitignored**. 
Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**. + +## License + +See **`LICENSE`** (root) and **`humeo-core/LICENSE`**. diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..cc25374683b045a589bdfd92ac0cf53a6cde8647 --- /dev/null +++ b/app.py @@ -0,0 +1,808 @@ +from __future__ import annotations + +import html +import json +import logging +import os +import queue +import re +import shutil +import subprocess +import sys +import tempfile +import threading +import time +import traceback +import uuid +from dataclasses import dataclass, field +from pathlib import Path +from typing import Annotated + + +def _bootstrap_local_paths() -> None: + repo_root = Path(__file__).resolve().parent + for candidate in (repo_root / "src", repo_root / "humeo-core" / "src"): + candidate_str = str(candidate) + if candidate.is_dir() and candidate_str not in sys.path: + sys.path.insert(0, candidate_str) + + +_bootstrap_local_paths() +if not (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip(): + os.environ["HUMEO_TRANSCRIBE_PROVIDER"] = ( + "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai" + ) + +from fastapi import FastAPI, File, Form, HTTPException, UploadFile +from fastapi.responses import FileResponse, HTMLResponse, JSONResponse + +from humeo.config import PipelineConfig +from humeo.pipeline import run_pipeline + + +APP_TITLE = "ClipForge" +LOG_FORMAT = "%(asctime)s | %(levelname)-7s | %(name)s | %(message)s" +MAX_LOG_LINES = 700 +LLM_KEY_NAMES = ("GOOGLE_API_KEY", "GEMINI_API_KEY", "OPENROUTER_API_KEY") + + +class QueueLogHandler(logging.Handler): + def __init__(self, sink: queue.Queue[str]): + super().__init__() + self._sink = sink + + def emit(self, record: logging.LogRecord) -> None: + try: + self._sink.put_nowait(self.format(record)) + except Exception: + pass + + +@dataclass +class ClipFile: + name: str + url: str + duration: str + + +@dataclass +class Job: + id: str + run_root: Path + output_dir: Path + work_dir: Path + source: str + source_path: Path | None = None + steering_note: str | None = None + status: str = "Queued" + nav_status: str = "Processing..." 
+ error: str | None = None + done: bool = False + created_at: float = field(default_factory=time.time) + logs: list[str] = field(default_factory=list) + clips: dict[str, ClipFile] = field(default_factory=dict) + steps: list[dict[str, object]] = field( + default_factory=lambda: [ + {"name": "Uploading video", "pct": 100, "state": "done"}, + {"name": "Generating transcript", "pct": 5, "state": "active"}, + {"name": "Choosing short clips", "pct": 0, "state": "pending"}, + {"name": "Producing clips", "pct": 0, "state": "pending"}, + {"name": "Adding subtitles & light edits", "pct": 0, "state": "pending"}, + ] + ) + + +JOBS: dict[str, Job] = {} +JOBS_LOCK = threading.Lock() + + +def _append_log(job: Job, line: str) -> None: + job.logs.append(line) + if len(job.logs) > MAX_LOG_LINES: + job.logs = job.logs[-MAX_LOG_LINES:] + + +def _set_step(job: Job, idx: int, pct: int, state: str = "active") -> None: + for step_idx, step in enumerate(job.steps): + if step_idx < idx: + step["pct"] = 100 + step["state"] = "done" + elif step_idx == idx: + step["pct"] = max(int(step.get("pct", 0)), min(100, pct)) + step["state"] = state + elif step.get("state") != "done": + step["state"] = "pending" + + +def _update_stage_from_log(job: Job, line: str) -> None: + if "STAGE 1: INGESTION" in line: + job.status = "Generating transcript" + _set_step(job, 1, 15) + elif "Transcribing" in line: + job.status = "Generating transcript" + _set_step(job, 1, 45) + elif "Transcript already exists" in line or "Transcription complete" in line: + _set_step(job, 1, 90) + elif "STAGE 2: CLIP SELECTION" in line: + job.status = "Choosing short clips" + _set_step(job, 2, 20) + elif "STAGE 2.25: HOOK DETECTION" in line: + job.status = "Finding hooks" + _set_step(job, 2, 55) + elif "STAGE 2.5: CONTENT PRUNING" in line: + job.status = "Tightening clip windows" + _set_step(job, 2, 78) + elif "STAGE 2.75: CLIP ASSEMBLY" in line: + job.status = "Assembling clips" + _set_step(job, 3, 18) + elif "STAGE 3: CLIP LAYOUTS" in line: + job.status = "Choosing layout" + _set_step(job, 3, 38) + elif "STAGE 4: RENDER" in line: + job.status = "Producing clips" + _set_step(job, 3, 62) + elif "reframe_clip_ffmpeg" in line: + _set_step(job, 4, min(90, 20 + len(job.clips) * 12)) + elif "RENDER QA" in line or "Render QA summary" in line: + job.status = "Checking clips" + _set_step(job, 4, 82) + elif "PIPELINE COMPLETE" in line: + job.status = "Complete" + job.nav_status = "Done" + for step in job.steps: + step["pct"] = 100 + step["state"] = "done" + + +def _install_log_handler(message_queue: queue.Queue[str]) -> tuple[logging.Handler, int, dict[str, int]]: + handler = QueueLogHandler(message_queue) + handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt="%H:%M:%S")) + + root_logger = logging.getLogger() + previous_level = root_logger.level + root_logger.addHandler(handler) + root_logger.setLevel(logging.INFO) + + previous_logger_levels: dict[str, int] = {} + for logger_name in ("urllib3", "httpx", "httpcore"): + logger = logging.getLogger(logger_name) + previous_logger_levels[logger_name] = logger.level + logger.setLevel(logging.WARNING) + + return handler, previous_level, previous_logger_levels + + +def _remove_log_handler( + handler: logging.Handler, + previous_root_level: int, + previous_logger_levels: dict[str, int], +) -> None: + root_logger = logging.getLogger() + root_logger.removeHandler(handler) + root_logger.setLevel(previous_root_level) + for logger_name, level in previous_logger_levels.items(): + logging.getLogger(logger_name).setLevel(level) + 
+ +def _duration_label(path: Path) -> str: + try: + result = subprocess.run( + [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + str(path), + ], + check=True, + capture_output=True, + text=True, + timeout=15, + ) + total = max(0, int(round(float(result.stdout.strip())))) + except Exception: + total = 0 + return f"{total // 60}:{total % 60:02d}" if total else "0:00" + + +def _publish_files(job: Job) -> None: + for path in sorted(job.output_dir.glob("short_*.mp4")): + if path.name not in job.clips and path.is_file(): + job.clips[path.name] = ClipFile( + name=path.name, + url=f"/api/jobs/{job.id}/files/{path.name}", + duration=_duration_label(path), + ) + + +def _validate_credentials() -> None: + if not any((os.environ.get(name) or "").strip() for name in LLM_KEY_NAMES): + raise HTTPException( + status_code=400, + detail="Missing LLM secret. Set GOOGLE_API_KEY, GEMINI_API_KEY, or OPENROUTER_API_KEY in the Space secrets.", + ) + + provider = (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip().lower() + if provider in {"", "auto"}: + provider = "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai" + if provider == "elevenlabs" and not (os.environ.get("ELEVENLABS_API_KEY") or "").strip(): + raise HTTPException(status_code=400, detail="Missing ELEVENLABS_API_KEY Space secret.") + if provider in {"openai", "api"} and not (os.environ.get("OPENAI_API_KEY") or "").strip(): + raise HTTPException(status_code=400, detail="Missing OPENAI_API_KEY Space secret.") + + +def _safe_url(value: str | None) -> str | None: + value = (value or "").strip() + if not value: + return None + if not re.match(r"^https?://", value, flags=re.I): + raise HTTPException(status_code=400, detail="Paste a valid http(s) video URL.") + return value + + +def _snapshot(job: Job) -> dict[str, object]: + return { + "id": job.id, + "status": job.status, + "nav_status": job.nav_status, + "done": job.done, + "error": job.error, + "logs": "\n".join(job.logs[-MAX_LOG_LINES:]), + "steps": job.steps, + "clips": [clip.__dict__ for clip in job.clips.values()], + } + + +def _run_job(job_id: str) -> None: + with JOBS_LOCK: + job = JOBS[job_id] + message_queue: queue.Queue[str] = queue.Queue() + handler, previous_root_level, previous_logger_levels = _install_log_handler(message_queue) + + def drain_queue() -> None: + with JOBS_LOCK: + local_job = JOBS[job_id] + while True: + try: + line = message_queue.get_nowait() + except queue.Empty: + break + _append_log(local_job, line) + _update_stage_from_log(local_job, line) + _publish_files(local_job) + + try: + with JOBS_LOCK: + _append_log(job, f"Prepared source: {job.source}") + _append_log(job, f"Run id: {job.id}") + _set_step(job, 1, 8) + + config = PipelineConfig( + source=job.source, + youtube_url=job.source, + output_dir=job.output_dir, + work_dir=job.work_dir, + use_video_cache=False, + clean_run=True, + interactive=False, + prune_level="balanced", + overwrite_outputs=True, + steering_notes=[job.steering_note] if job.steering_note else [], + ) + + worker_error: str | None = None + outputs: list[Path] = [] + + def pipeline_worker() -> None: + nonlocal outputs, worker_error + try: + outputs = run_pipeline(config) + except Exception as exc: + worker_error = str(exc) + for line in traceback.format_exc().splitlines(): + if line.strip(): + message_queue.put_nowait(line) + + thread = threading.Thread(target=pipeline_worker, daemon=True) + thread.start() + while thread.is_alive(): + 
drain_queue() + time.sleep(0.35) + drain_queue() + + with JOBS_LOCK: + local_job = JOBS[job_id] + for output in outputs: + if Path(output).exists(): + local_job.clips[Path(output).name] = ClipFile( + name=Path(output).name, + url=f"/api/jobs/{job_id}/files/{Path(output).name}", + duration=_duration_label(Path(output)), + ) + if worker_error: + local_job.error = worker_error + local_job.status = f"Failed: {worker_error}" + local_job.nav_status = "Failed" + else: + local_job.status = "Complete" if local_job.clips else "Complete - no clips generated" + local_job.nav_status = "Done" + for step in local_job.steps: + step["pct"] = 100 + step["state"] = "done" + local_job.done = True + finally: + _remove_log_handler(handler, previous_root_level, previous_logger_levels) + + +async def _stage_upload(uploaded_file: UploadFile, run_root: Path) -> Path: + suffix = Path(uploaded_file.filename or "input.mp4").suffix or ".mp4" + staged_path = run_root / f"input{suffix}" + with staged_path.open("wb") as handle: + while chunk := await uploaded_file.read(1024 * 1024): + handle.write(chunk) + return staged_path + + +app = FastAPI(title=APP_TITLE) + + +@app.get("/", response_class=HTMLResponse) +def index() -> str: + return INDEX_HTML + + +@app.post("/api/jobs") +async def create_job( + video_url: Annotated[str | None, Form()] = None, + regen_prompt: Annotated[str | None, Form()] = None, + source_job_id: Annotated[str | None, Form()] = None, + file: Annotated[UploadFile | None, File()] = None, +) -> JSONResponse: + _validate_credentials() + job_id = uuid.uuid4().hex[:12] + run_root = Path(tempfile.mkdtemp(prefix=f"clipforge-{job_id}-")) + work_dir = run_root / "work" + output_dir = run_root / "output" + work_dir.mkdir(parents=True, exist_ok=True) + output_dir.mkdir(parents=True, exist_ok=True) + + source_path: Path | None = None + source = _safe_url(video_url) + source_job_id = (source_job_id or "").strip() + if source_job_id: + with JOBS_LOCK: + previous = JOBS.get(source_job_id) + if previous is None: + raise HTTPException(status_code=404, detail="Previous job not found for regeneration.") + if previous.source_path and previous.source_path.exists(): + source_path = run_root / previous.source_path.name + shutil.copy2(previous.source_path, source_path) + source = str(source_path) + else: + source = previous.source + elif file is not None: + source_path = await _stage_upload(file, run_root) + source = str(source_path) + + if not source: + raise HTTPException(status_code=400, detail="Upload a video file or paste a video URL first.") + + job = Job( + id=job_id, + run_root=run_root, + output_dir=output_dir, + work_dir=work_dir, + source=source, + source_path=source_path, + steering_note=(regen_prompt or "").strip() or None, + ) + with JOBS_LOCK: + JOBS[job_id] = job + + threading.Thread(target=_run_job, args=(job_id,), daemon=True).start() + return JSONResponse(_snapshot(job)) + + +@app.get("/api/jobs/{job_id}") +def get_job(job_id: str) -> JSONResponse: + with JOBS_LOCK: + job = JOBS.get(job_id) + if job is None: + raise HTTPException(status_code=404, detail="Job not found.") + _publish_files(job) + return JSONResponse(_snapshot(job)) + + +@app.get("/api/jobs/{job_id}/files/{filename}") +def get_job_file(job_id: str, filename: str) -> FileResponse: + with JOBS_LOCK: + job = JOBS.get(job_id) + if job is None: + raise HTTPException(status_code=404, detail="Job not found.") + path = (job.output_dir / Path(filename).name).resolve(strict=False) + if job.output_dir.resolve(strict=False) not in path.parents or not 
path.is_file(): + raise HTTPException(status_code=404, detail="File not found.") + return FileResponse(path, media_type="video/mp4", filename=path.name) + + +@app.get("/health") +def health() -> dict[str, str]: + return {"ok": "true"} + + +INDEX_HTML = r""" + + + + +ClipForge - Video to Clips + + + + + + +
+<!-- Embedded single-page UI markup (tags lost in this listing). Recoverable text:
+     hero "AI Video Editor" / "Convert your long video to short clips for social media" /
+     "Paste a link or upload a file - we handle the rest"; a link-or-upload form
+     ("Click to browse or drag & drop", "MP4, MOV, AVI - up to your Space limit");
+     a progress view ("Working on it", "Your clips are being crafted",
+     "Sit back - long videos can take a little while") with five steps
+     (Uploading video, Generating transcript, Choosing short clips, Producing clips,
+     Adding subtitles & light edits); a "Tips while you wait" list (hook trimming,
+     per-clip layout choice, word-by-word subtitles, regeneration); and a
+     "Produce a different set" regenerate form ("Describe what you're looking for and
+     we'll re-cut your video") with preset chips: Key insights, Funny moments,
+     Emotional, High energy. -->
+ + + +""" + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860"))) diff --git a/humeo-core/.gitignore b/humeo-core/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..5cdfd8b15451f9536bacee169a7b07d76b8f934b --- /dev/null +++ b/humeo-core/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.pyc +*.pyo +*.egg-info/ +.pytest_cache/ +build/ +dist/ +.venv/ +.env diff --git a/humeo-core/LICENSE b/humeo-core/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f2d768392bb52fd2079bb0a642954ba1bbe8688a --- /dev/null +++ b/humeo-core/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 NotABot + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/humeo-core/README.md b/humeo-core/README.md new file mode 100644 index 0000000000000000000000000000000000000000..22ea8a681ddc2eb2b5c7d0077f4d867a5e140e5e --- /dev/null +++ b/humeo-core/README.md @@ -0,0 +1,165 @@ +# humeo-core + +**Reusable-rocket MCP server for long-video β†’ 9:16 shorts.** + +First-principles design, from the HIVE paper + Bryan's rocket analogy: +we don't build doors and windows (general subject-tracker UI, retraining +models). We build the **container** (schemas), **landing gear** (deterministic +local extraction), and **five thrusters** (the five 9:16 layouts this video +format actually uses). Everything else is pluggable. 
+ +## The rocket, in one picture + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Control panel (MCP tools) β”‚ <- any MCP client + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ strict JSON + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β–Ό β–Ό β–Ό β–Ό β–Ό + ingest classify_scenes select_clips plan_layout render_clip +(scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile, + keyframes + classifier) heuristic + pure filter dry-run safe) + transcript) LLM-ready) math) + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ LayoutKind β”‚ + β”‚ ──────────────── β”‚ + β”‚ zoom_call_center β”‚ + β”‚ sit_center β”‚ + β”‚ split_chart_personβ”‚ + β”‚ split_two_persons β”‚ + β”‚ split_two_charts β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +Only the classifier and clip-selector have optional LLM hooks; everything +else is deterministic, local, and cheap. + +## Why five layouts? (the "max 2 items" rule) + +The hard constraint for this format: **a short shows at most two on-screen +items** β€” where an "item" is a `person` (a human speaker) or a `chart` +(slide, graph, data visual, screenshare). That gives exactly five recipes: + +1. **`zoom_call_center`** β€” 1 person, tight zoom-call / webcam framing. +2. **`sit_center`** β€” 1 person, interview / seated framing. +3. **`split_chart_person`** β€” 1 chart + 1 person, stacked vertically + (default: **even 50/50** top/bottom, chart on top). +4. **`split_two_persons`** β€” 2 speakers, stacked vertically. +5. **`split_two_charts`** β€” 2 charts, stacked vertically. + +Because the geometry is bounded, we do NOT need a general subject-tracker +ML model or a drag-to-highlight UI. We need five small, correct pieces of +crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py` +is. + +See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms +used across these docs (subject, crop, band, seam, bbox, layout, etc.). + +## Install + +```bash +uv venv +uv sync +``` + +External requirements: `ffmpeg` and `ffprobe` on PATH. + +`scenedetect` requires OpenCV. Install `opencv-python-headless` or +`opencv-python` alongside `scenedetect`. + +## Use it as an MCP server + +```bash +humeo-core # stdio transport (primary console script) +# humeo-mcp # same entrypoint β€” kept so existing MCP configs keep working +``` + +Example Cursor/Claude Desktop config: + +```json +{ + "mcpServers": { + "humeo": { "command": "humeo-core" } + } +} +``` + +Tools exposed: + +| Tool | Purpose | +| --------------------------------- | --------------------------------------------------------------------------- | +| `list_layouts` | Enumerate the 5 supported layouts. | +| `ingest` | Scene detection + keyframe extraction (+ optional transcript). | +| `classify_scenes` | Pixel-heuristic per-scene layout classification. | +| `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). | +| `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. | +| `select_clips` | Heuristic clip picker over a word-level transcript. 
| +| `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. | +| `build_render_cmd` | Build the ffmpeg command (no execution) β€” review before spend. | +| `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. | + +Resource: `humeo://layouts` (JSON listing of the 5 layouts). + +### Three interchangeable region detectors + +All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used: + +``` +classify.py (pixel variance, no ML) +face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg +vision.py (multimodal LLM + OCR bboxes) +``` + +## JSON contracts (non-negotiable) + +All tools take and return Pydantic-validated JSON. The contracts live in +[`src/humeo_core/schemas.py`](src/humeo_core/schemas.py): + +- `Scene` `{scene_id, start_time, end_time, keyframe_path?}` +- `TranscriptWord` `{word, start_time, end_time}` +- `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}` +- `SceneClassification` `{scene_id, layout, confidence, reason}` +- `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized) +- `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}` +- `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}` +- `ClipPlan` `{source_path, clips[]}` +- `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}` +- `RenderRequest` / `RenderResult` + +## First-principles decisions (what we intentionally did NOT build) + +- **No giant subject-tracker ML.** The video format has 5 fixed layouts + (with a hard "max 2 items" rule); pixel-level tracking is not needed. +- **No drag-and-highlight UI.** An MCP tool is a better "UI" for an + agent-first workflow. If a human wants to override, they pass a + `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` / + `zoom`. +- **No end-to-end videoβ†’video model.** The HIVE paper's core insight is + that decomposed orchestration beats monolithic generation. We reify + that insight as six small composable tools. + +## Extending the pilot + +- Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`. +- Plug a real reasoning model into `select_clips_with_llm(text_fn)`. +- Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)` + to get per-scene bboxes + OCR text, then feed the results back through + `classify_scenes_with_vision`. This is the scene-change β†’ v3 images β†’ + LLM+OCR β†’ bbox path; see `../docs/SOLUTIONS.md Β§4` for rationale. +- All enforce strict JSON outputs, so bad model output can't corrupt + downstream stages. + +## Testing + +```bash +python -m pytest +``` + +See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale. 
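
To make the "strict JSON" contract concrete, here is a minimal sketch (not repo code) of validating an agent-supplied payload against the shipped schemas. It mirrors the first `LayoutInstruction` example in `docs/MCP_USAGE.md` and assumes the Pydantic v2 API implied by the `pydantic>=2.0` dependency; whether omitted fields such as `BoundingBox.label` have defaults is an assumption here, so check `schemas.py` for the authoritative field list.

```python
# Minimal sketch: reject malformed layout JSON before it can reach ffmpeg.
from pydantic import ValidationError

from humeo_core import LayoutInstruction

payload = {
    "clip_id": "001",
    "layout": "split_chart_person",
    "split_chart_region": {"x1": 0.00, "y1": 0.10, "x2": 0.52, "y2": 0.95},
    "split_person_region": {"x1": 0.55, "y1": 0.05, "x2": 1.00, "y2": 1.00},
    "top_band_ratio": 0.5,
}

try:
    instruction = LayoutInstruction.model_validate(payload)
except ValidationError as exc:
    # A confused model's output fails here, not halfway through a render.
    print(exc)
```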
+ +## License + +MIT diff --git a/humeo-core/docs/ARCHITECTURE.md b/humeo-core/docs/ARCHITECTURE.md new file mode 100644 index 0000000000000000000000000000000000000000..79aa008279dbe8abe25831a2bfba7a12508e0976 --- /dev/null +++ b/humeo-core/docs/ARCHITECTURE.md @@ -0,0 +1,128 @@ +# Architecture β€” Reusable Rocket + +> *"We don't need to build the door or windows β€” just a container with landing +> gear and thrusters that move in different directions."* +> β€” Bryan + +That analogy maps exactly onto this MCP: + +| Rocket part | Codebase | Purpose | +| --------------- | ---------------------------------------------------------------- | ----------------------------------------------------------------------- | +| Container | `src/humeo_core/schemas.py` | Strict JSON contracts every stage reads/writes. | +| Landing gear | `src/humeo_core/primitives/ingest.py` | Deterministic local extraction (scenes, keyframes, transcript). | +| Thrusters (Γ—5) | `src/humeo_core/primitives/layouts.py` | Five fixed 9:16 crop/compose recipes (max 2 on-screen items). | +| Pilot | `primitives/classify.py` + `primitives/select_clips.py` | Heuristic + LLM-ready decision makers. | +| Compiler | `src/humeo_core/primitives/compile.py` | Deterministic ffmpeg assembly. | +| Control panel | `src/humeo_core/server.py` | MCP tools exposing every primitive. | +| Control surface | `src/humeo_core/server.py` | MCP tool surface for agents and clients. | + +## First-principles reasoning + +The HIVE paper's core insight is that good short-video editing requires +**staged reasoning with strict intermediate artifacts**, not a single +giant model call. Three consequences flow from that: + +1. **Extraction must be local and deterministic.** No model call should + ever touch raw video bytes. `ingest.py` runs ffprobe + PySceneDetect + + ffmpeg + (optional) faster-whisper. Everything it emits is JSON or + a file path. + +2. **Reasoning must be decomposed into narrow sub-tasks.** Classifying a + scene's layout is a completely different task from selecting a viral + clip. Each has its own schema, its own prompt, its own validation. + This is why `primitives/` is five files instead of one. + +3. **Every model call must emit schema-validated JSON.** Free-form model + output is not allowed to enter the pipeline. `classify_scenes_with_llm` + and `select_clips_with_llm` both `model_validate(...)` the raw output + before returning; parse failures degrade gracefully to `SIT_CENTER` + + low confidence, not crashes. + +## Why only five layouts? + +The hard rule for this format: **a short shows at most two on-screen +items**, where an "item" is a `person` or a `chart`. That gives exactly +five recipes β€” all implemented as pure functions from +`LayoutInstruction` to an ffmpeg filtergraph string in `layouts.py`: + +| Layout | Items | Recipe | +| ---------------------- | --------------- | --------------------------------------------- | +| `zoom_call_center` | 1 person | tight centered 9:16 crop (zoom β‰₯ 1.25). | +| `sit_center` | 1 person | wider centered 9:16 crop. | +| `split_chart_person` | 1 chart + person| source partitioned L/R by bboxes, stacked. | +| `split_two_persons` | 2 persons | L/R speakers, stacked top/bottom. | +| `split_two_charts` | 2 charts | L/R charts, stacked top/bottom. | + +A general subject-tracker ML model is orders of magnitude more expensive +and less reliable than five hand-written crop recipes. 
If a new geometry +ever shows up in future source videos, adding a sixth thruster is +strictly additive: write a new `plan_*` function, add it to `_DISPATCH`, +add an enum variant. No existing code has to change. + +## 9:16 layout math + +Source is assumed 16:9 (1920Γ—1080 by default, but probed per-clip). +Target is 1080Γ—1920. For each layout: + +### `zoom_call_center` and `sit_center` + +Standard centered aspect-ratio crop to 9:16, then scale to 1080Γ—1920: + +``` +crop=cw:ch:x:y,scale=1080:1920:flags=lanczos,setsar=1[vout] +``` + +`cw`, `ch` are the largest 9:16 window that fits in the source, divided +by `zoom`. `x`, `y` center the window on `person_x_norm` / 0.5. +Dimensions are rounded to even values so libx264 is happy. The window is +clamped inside the source so a high `person_x_norm` never crops outside. + +### Split layouts (`split_chart_person`, `split_two_persons`, `split_two_charts`) + +All three splits share one recipe β€” only the items differ: + +1. **Horizontal partition.** The source is cut at a single vertical seam + so the two source strips are **complementary** (no overlap, no gap). + When both bboxes are set (Gemini vision), the seam is the midpoint + between `left.x2` and `right.x1`. Otherwise the seam defaults to + either an even 50/50 (two-of-a-kind splits) or a 2/3 | 1/3 split + (legacy `split_chart_person` fallback). +2. **Vertical crop.** Each strip's vertical extent comes from the + corresponding bbox when provided, so each item **fills** its output + band instead of being lost in full-height source context. +3. **Cover-scale to the band.** Each strip is scaled with + `force_original_aspect_ratio=increase` + center-cropped to the band + dimensions. Bands are always fully painted; no letterbox bars. +4. **Stack.** Two branches produced by `split=2` are `vstack`-ed into + the final 1080Γ—1920. + +**Band heights** are controlled by `LayoutInstruction.top_band_ratio`, +which defaults to **0.5** (even 50/50 β€” the symmetric look Bryan asked +for after the uneven Cathy Wood shorts). Legacy 60/40 is still reachable +by setting `top_band_ratio=0.6`. + +**Stack order** (for `split_chart_person`) is controlled by +`focus_stack_order`: chart-on-top (default) or person-on-top. + +## Extensibility story + +- **Smarter classifier:** implement `LLMVisionFn` with any multimodal + model and pass it to `classify_scenes_with_llm`. The fallback heuristic + stays available for offline runs and tests. +- **Smarter clip selector:** same pattern, `LLMTextFn` β†’ `select_clips_with_llm`. +- **New layout:** add a `plan_*` planner, register in `_DISPATCH`, add a + `LayoutKind` variant. Tests in `test_layouts.py` automatically iterate + over all `LayoutKind`s, so the dispatch coverage test will catch a + missing registration immediately. + +## What we intentionally did NOT build + +- Drag-and-highlight subject-selector UI. +- A general ML subject-tracker. +- A monolithic video-in-video-out model. +- Any network calls in the core library. The MCP server is stdio-only; + the CLI runs fully offline. + +This keeps the rocket **reusable**: the same primitives power the MCP +server, the CLI, a Python library, and (soon) a web UI if that's ever +warranted. diff --git a/humeo-core/docs/MCP_USAGE.md b/humeo-core/docs/MCP_USAGE.md new file mode 100644 index 0000000000000000000000000000000000000000..36ce6f56e901596a260b7656c1c0168b0f01e7ee --- /dev/null +++ b/humeo-core/docs/MCP_USAGE.md @@ -0,0 +1,100 @@ +# Using humeo-core from an MCP client + +The installed console command is **`humeo-core`**. 
For backward compatibility, +**`humeo-mcp`** is also registered (same entrypoint); either works in +`"command": ...` if both are on `PATH` from the same install. + +## 1. Add to your client + +`claude_desktop_config.json` or `.cursor/mcp.json`: + +```json +{ + "mcpServers": { + "humeo": { + "command": "humeo-core" + } + } +} +``` + +## 2. A typical agent plan + +``` +β†’ humeo.list_layouts() + # discover the 5 layouts (max 2 on-screen items per short) + +β†’ humeo.ingest(source_path="/abs/long.mp4", work_dir="/abs/work", with_transcript=true) + # IngestResult: scenes[], keyframes, transcript_words[] + +β†’ humeo.classify_scenes(scenes=) + # SceneClassification[] β€” one layout per scene + +β†’ humeo.select_clips( + source_path=..., transcript_words=..., duration_sec=..., + target_count=5, min_sec=30, max_sec=60 + ) + # ClipPlan β€” top non-overlapping clips + +# For each clip, pick the layout of the scene its midpoint falls in, +# build a LayoutInstruction, and: + +β†’ humeo.build_render_cmd(request={...}) + # dry-run: returns the exact ffmpeg argv, no execution + +β†’ humeo.render_clip(request={..., "mode": "normal"}) + # actually renders the 9:16 MP4 +``` + +## 3. Strict JSON all the way + +Every request/response is validated against the schemas in +[`schemas.py`](../src/humeo_core/schemas.py). Invalid input is rejected +*before* ffmpeg is touched, so a confused agent can't accidentally +rm-rf your disk or burn GPU hours. + +## 4. Override knobs + +`LayoutInstruction` accepts: + +- `zoom`, `person_x_norm`, `chart_x_norm` β€” single-subject knobs. +- `split_chart_region`, `split_person_region`, + `split_second_chart_region`, `split_second_person_region` β€” + normalized bboxes that drive split-layout cropping. +- `top_band_ratio` β€” fraction of output height used by the top band + (default 0.5 = even 50/50, the symmetric look). +- `focus_stack_order` β€” for `split_chart_person`, chart-on-top vs + person-on-top. + +Example: chart + person with a precise bbox crop and an even split. + +```json +{ + "clip_id": "001", + "layout": "split_chart_person", + "split_chart_region": {"x1": 0.00, "y1": 0.10, "x2": 0.52, "y2": 0.95}, + "split_person_region": {"x1": 0.55, "y1": 0.05, "x2": 1.00, "y2": 1.00}, + "top_band_ratio": 0.5, + "focus_stack_order": "chart_then_person" +} +``` + +Example: two-speaker interview. + +```json +{ + "clip_id": "002", + "layout": "split_two_persons", + "split_person_region": {"x1": 0.02, "y1": 0.05, "x2": 0.48, "y2": 0.95}, + "split_second_person_region": {"x1": 0.52, "y1": 0.05, "x2": 0.98, "y2": 0.95} +} +``` + +## 5. When to stay in dry-run + +- You want to show an approval UI before spending CPU. +- You want to diff the planned ffmpeg commands against a previous run. +- You're building tests. + +`mode="dry_run"` is always safe, never writes output, and returns the +exact argv list. 
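
As a rough illustration of how the single-subject knobs above (`zoom`, `person_x_norm`) become a crop window, here is a minimal sketch of the centered 9:16 math described in `docs/ARCHITECTURE.md`; the helper name is hypothetical and this is not the actual `primitives/layouts.py` code.

```python
def crop_window(src_w: int, src_h: int, zoom: float = 1.0, person_x_norm: float = 0.5):
    """Largest 9:16 window that fits the source, tightened by zoom, centered on the subject."""
    ch = src_h / zoom
    cw = ch * 9 / 16
    if cw > src_w:                      # unusually narrow source: width-limited instead
        cw = src_w / zoom
        ch = cw * 16 / 9
    x = min(max(person_x_norm * src_w - cw / 2, 0), src_w - cw)   # clamp inside the frame
    y = min(max(0.5 * src_h - ch / 2, 0), src_h - ch)
    even = lambda v: int(round(v / 2)) * 2                        # libx264 wants even dims
    return even(cw), even(ch), int(x), int(y)

# crop_window(1920, 1080, zoom=1.25, person_x_norm=0.83) -> (486, 864, 1350, 108),
# i.e. the cw:ch:x:y fed into crop=...,scale=1080:1920:flags=lanczos,setsar=1
```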
diff --git a/humeo-core/examples/render_request.json b/humeo-core/examples/render_request.json new file mode 100644 index 0000000000000000000000000000000000000000..038f8b4531736b7ba707d27680bfea84bd630965 --- /dev/null +++ b/humeo-core/examples/render_request.json @@ -0,0 +1,23 @@ +{ + "source_path": "/absolute/path/to/long.mp4", + "clip": { + "clip_id": "001", + "topic": "Prediction Market Explosion", + "start_time_sec": 289.0, + "end_time_sec": 331.5, + "viral_hook": "Prediction markets could explode to $5 trillion.", + "virality_score": 0.94, + "transcript": "Full text for subtitle generation...", + "suggested_overlay_title": "$5T Prediction Markets" + }, + "layout": { + "clip_id": "001", + "layout": "split_chart_person", + "zoom": 1.0, + "person_x_norm": 0.83, + "chart_x_norm": 0.0 + }, + "output_path": "/absolute/path/to/out/clip_001.mp4", + "title_text": "$5T Prediction Markets", + "mode": "dry_run" +} diff --git a/humeo-core/pyproject.toml b/humeo-core/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..73f627478b946920f687d4293ac752470841934b --- /dev/null +++ b/humeo-core/pyproject.toml @@ -0,0 +1,46 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "humeo-core" +version = "0.1.0" +description = "Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints)." +readme = "README.md" +requires-python = ">=3.10" +license = { text = "MIT" } +authors = [{ name = "Humeo" }] +keywords = ["mcp", "video", "shorts", "ffmpeg", "editing", "humeo", "hive"] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "mcp[cli]>=1.2.0", + "pydantic>=2.0", + "scenedetect>=0.6", +] + +[project.optional-dependencies] +transcribe = ["faster-whisper>=1.0"] +download = ["yt-dlp>=2024.0"] +face = ["mediapipe>=0.10", "opencv-python>=4.8"] +vision = ["Pillow>=10.0"] +dev = ["pytest>=7", "pytest-asyncio>=0.23", "Pillow>=10.0"] + +[project.scripts] +humeo-core = "humeo_core.server:main" +# Backward-compatible entry point (same module); existing MCP configs may still call `humeo-mcp`. +humeo-mcp = "humeo_core.server:main" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +humeo_core = ["assets/fonts/*"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-ra -q" diff --git a/humeo-core/src/humeo_core.egg-info/PKG-INFO b/humeo-core/src/humeo_core.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..78b029cc75dce7925036f828838fcb487ea0d071 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/PKG-INFO @@ -0,0 +1,197 @@ +Metadata-Version: 2.4 +Name: humeo-core +Version: 0.1.0 +Summary: Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints). 
+Author: Humeo +License: MIT +Keywords: mcp,video,shorts,ffmpeg,editing,humeo,hive +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Requires-Python: >=3.10 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: mcp[cli]>=1.2.0 +Requires-Dist: pydantic>=2.0 +Requires-Dist: scenedetect>=0.6 +Provides-Extra: transcribe +Requires-Dist: faster-whisper>=1.0; extra == "transcribe" +Provides-Extra: download +Requires-Dist: yt-dlp>=2024.0; extra == "download" +Provides-Extra: face +Requires-Dist: mediapipe>=0.10; extra == "face" +Requires-Dist: opencv-python>=4.8; extra == "face" +Provides-Extra: vision +Requires-Dist: Pillow>=10.0; extra == "vision" +Provides-Extra: dev +Requires-Dist: pytest>=7; extra == "dev" +Requires-Dist: pytest-asyncio>=0.23; extra == "dev" +Requires-Dist: Pillow>=10.0; extra == "dev" +Dynamic: license-file + +# humeo-core + +**Reusable-rocket MCP server for long-video β†’ 9:16 shorts.** + +First-principles design, from the HIVE paper + Bryan's rocket analogy: +we don't build doors and windows (general subject-tracker UI, retraining +models). We build the **container** (schemas), **landing gear** (deterministic +local extraction), and **five thrusters** (the five 9:16 layouts this video +format actually uses). Everything else is pluggable. + +## The rocket, in one picture + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Control panel (MCP tools) β”‚ <- any MCP client + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ strict JSON + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β–Ό β–Ό β–Ό β–Ό β–Ό + ingest classify_scenes select_clips plan_layout render_clip +(scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile, + keyframes + classifier) heuristic + pure filter dry-run safe) + transcript) LLM-ready) math) + β”‚ + β–Ό + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ LayoutKind β”‚ + β”‚ ──────────────── β”‚ + β”‚ zoom_call_center β”‚ + β”‚ sit_center β”‚ + β”‚ split_chart_personβ”‚ + β”‚ split_two_persons β”‚ + β”‚ split_two_charts β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +Only the classifier and clip-selector have optional LLM hooks; everything +else is deterministic, local, and cheap. + +## Why five layouts? (the "max 2 items" rule) + +The hard constraint for this format: **a short shows at most two on-screen +items** β€” where an "item" is a `person` (a human speaker) or a `chart` +(slide, graph, data visual, screenshare). That gives exactly five recipes: + +1. **`zoom_call_center`** β€” 1 person, tight zoom-call / webcam framing. +2. **`sit_center`** β€” 1 person, interview / seated framing. +3. **`split_chart_person`** β€” 1 chart + 1 person, stacked vertically + (default: **even 50/50** top/bottom, chart on top). +4. **`split_two_persons`** β€” 2 speakers, stacked vertically. +5. **`split_two_charts`** β€” 2 charts, stacked vertically. + +Because the geometry is bounded, we do NOT need a general subject-tracker +ML model or a drag-to-highlight UI. 
We need five small, correct pieces of +crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py` +is. + +See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms +used across these docs (subject, crop, band, seam, bbox, layout, etc.). + +## Install + +```bash +uv venv +uv sync +``` + +External requirements: `ffmpeg` and `ffprobe` on PATH. + +`scenedetect` requires OpenCV. Install `opencv-python-headless` or +`opencv-python` alongside `scenedetect`. + +## Use it as an MCP server + +```bash +humeo-core # stdio transport (primary console script) +# humeo-mcp # same entrypoint β€” kept so existing MCP configs keep working +``` + +Example Cursor/Claude Desktop config: + +```json +{ + "mcpServers": { + "humeo": { "command": "humeo-core" } + } +} +``` + +Tools exposed: + +| Tool | Purpose | +| --------------------------------- | --------------------------------------------------------------------------- | +| `list_layouts` | Enumerate the 5 supported layouts. | +| `ingest` | Scene detection + keyframe extraction (+ optional transcript). | +| `classify_scenes` | Pixel-heuristic per-scene layout classification. | +| `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). | +| `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. | +| `select_clips` | Heuristic clip picker over a word-level transcript. | +| `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. | +| `build_render_cmd` | Build the ffmpeg command (no execution) β€” review before spend. | +| `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. | + +Resource: `humeo://layouts` (JSON listing of the 5 layouts). + +### Three interchangeable region detectors + +All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used: + +``` +classify.py (pixel variance, no ML) +face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg +vision.py (multimodal LLM + OCR bboxes) +``` + +## JSON contracts (non-negotiable) + +All tools take and return Pydantic-validated JSON. The contracts live in +[`src/humeo_core/schemas.py`](src/humeo_core/schemas.py): + +- `Scene` `{scene_id, start_time, end_time, keyframe_path?}` +- `TranscriptWord` `{word, start_time, end_time}` +- `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}` +- `SceneClassification` `{scene_id, layout, confidence, reason}` +- `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized) +- `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}` +- `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}` +- `ClipPlan` `{source_path, clips[]}` +- `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}` +- `RenderRequest` / `RenderResult` + +## First-principles decisions (what we intentionally did NOT build) + +- **No giant subject-tracker ML.** The video format has 5 fixed layouts + (with a hard "max 2 items" rule); pixel-level tracking is not needed. +- **No drag-and-highlight UI.** An MCP tool is a better "UI" for an + agent-first workflow. 
If a human wants to override, they pass a + `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` / + `zoom`. +- **No end-to-end videoβ†’video model.** The HIVE paper's core insight is + that decomposed orchestration beats monolithic generation. We reify + that insight as six small composable tools. + +## Extending the pilot + +- Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`. +- Plug a real reasoning model into `select_clips_with_llm(text_fn)`. +- Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)` + to get per-scene bboxes + OCR text, then feed the results back through + `classify_scenes_with_vision`. This is the scene-change β†’ v3 images β†’ + LLM+OCR β†’ bbox path; see `../docs/SOLUTIONS.md Β§4` for rationale. +- All enforce strict JSON outputs, so bad model output can't corrupt + downstream stages. + +## Testing + +```bash +python -m pytest +``` + +See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale. + +## License + +MIT diff --git a/humeo-core/src/humeo_core.egg-info/SOURCES.txt b/humeo-core/src/humeo_core.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f73f8e3bdbf2d2ae0b44262a48caa424c7ac007 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/SOURCES.txt @@ -0,0 +1,33 @@ +LICENSE +README.md +pyproject.toml +src/humeo_core/__init__.py +src/humeo_core/schemas.py +src/humeo_core/server.py +src/humeo_core.egg-info/PKG-INFO +src/humeo_core.egg-info/SOURCES.txt +src/humeo_core.egg-info/dependency_links.txt +src/humeo_core.egg-info/entry_points.txt +src/humeo_core.egg-info/requires.txt +src/humeo_core.egg-info/top_level.txt +src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf +src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt +src/humeo_core/assets/fonts/SourceSans3-OFL.txt +src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf +src/humeo_core/primitives/__init__.py +src/humeo_core/primitives/classify.py +src/humeo_core/primitives/compile.py +src/humeo_core/primitives/face_detect.py +src/humeo_core/primitives/ingest.py +src/humeo_core/primitives/layouts.py +src/humeo_core/primitives/select_clips.py +src/humeo_core/primitives/vision.py +tests/test_classify.py +tests/test_compile.py +tests/test_face_detect.py +tests/test_layout_bbox.py +tests/test_layouts.py +tests/test_schemas.py +tests/test_select_clips.py +tests/test_server_tools.py +tests/test_vision.py \ No newline at end of file diff --git a/humeo-core/src/humeo_core.egg-info/dependency_links.txt b/humeo-core/src/humeo_core.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/humeo-core/src/humeo_core.egg-info/entry_points.txt b/humeo-core/src/humeo_core.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0e9df37e2aa1fbdad3098d34e629fdfc2a2044b --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/entry_points.txt @@ -0,0 +1,3 @@ +[console_scripts] +humeo-core = humeo_core.server:main +humeo-mcp = humeo_core.server:main diff --git a/humeo-core/src/humeo_core.egg-info/requires.txt b/humeo-core/src/humeo_core.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..5660c1df773875982439273d51d35bc37d1d26b1 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/requires.txt @@ -0,0 +1,21 @@ +mcp[cli]>=1.2.0 +pydantic>=2.0 +scenedetect>=0.6 + 
+[dev] +pytest>=7 +pytest-asyncio>=0.23 +Pillow>=10.0 + +[download] +yt-dlp>=2024.0 + +[face] +mediapipe>=0.10 +opencv-python>=4.8 + +[transcribe] +faster-whisper>=1.0 + +[vision] +Pillow>=10.0 diff --git a/humeo-core/src/humeo_core.egg-info/top_level.txt b/humeo-core/src/humeo_core.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..2faab947486877e418834efb9cf6b4c9cdaa21b2 --- /dev/null +++ b/humeo-core/src/humeo_core.egg-info/top_level.txt @@ -0,0 +1 @@ +humeo_core diff --git a/humeo-core/src/humeo_core/__init__.py b/humeo-core/src/humeo_core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fde979e33456746792eace3d5e494d8cb90fc515 --- /dev/null +++ b/humeo-core/src/humeo_core/__init__.py @@ -0,0 +1,49 @@ +"""humeo-core: reusable-rocket MCP primitives for long-video-to-shorts editing. + +First-principles design (rocket analogy): + Container -> schemas.py (strict JSON contracts) + Landing gear -> primitives/ingest.py, primitives/compile.py (deterministic local) + Thrusters -> primitives/layouts.py (5 fixed 9:16 layouts, max 2 items) + Pilot -> primitives/classify.py, primitives/select_clips.py (heuristic, LLM-ready) + Control panel -> server.py (FastMCP tools that expose all primitives) +""" + +from .schemas import ( + BoundingBox, + Clip, + ClipPlan, + ClipRenderSpan, + ClipSubtitleWords, + FocusStackOrder, + IngestResult, + LayoutInstruction, + LayoutKind, + RenderRequest, + RenderResult, + RenderTheme, + Scene, + SceneClassification, + SceneRegions, + TranscriptWord, +) + +__all__ = [ + "BoundingBox", + "Clip", + "ClipPlan", + "ClipRenderSpan", + "ClipSubtitleWords", + "FocusStackOrder", + "IngestResult", + "LayoutInstruction", + "LayoutKind", + "RenderRequest", + "RenderResult", + "RenderTheme", + "Scene", + "SceneClassification", + "SceneRegions", + "TranscriptWord", +] + +__version__ = "0.1.0" diff --git a/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf new file mode 100644 index 0000000000000000000000000000000000000000..bd81aef67d88acd3bf4b88d7c6ff86900b2e9ce3 Binary files /dev/null and b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf differ diff --git a/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ab04e2a11702d69277b4525bb7fb767ed9c045c --- /dev/null +++ b/humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt @@ -0,0 +1,93 @@ +Copyright 2020 The League Spartan Project Authors (https://github.com/theleagueof/league-spartan) + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +https://scripts.sil.org/OFL + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. 
The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. 
IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt new file mode 100644 index 0000000000000000000000000000000000000000..50ee76cf00fbfe42fb7c74a9b95c9508dec5bb8f --- /dev/null +++ b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt @@ -0,0 +1,93 @@ +Copyright 2010-2020 Adobe (http://www.adobe.com/), with Reserved Font Name 'Source'. All Rights Reserved. Source is a trademark of Adobe in the United States and/or other countries. + +This Font Software is licensed under the SIL Open Font License, Version 1.1. + +This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL + + +----------------------------------------------------------- +SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 +----------------------------------------------------------- + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. + +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting -- in part or in whole -- any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. 
These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf new file mode 100644 index 0000000000000000000000000000000000000000..6c16581182d07be08abe83026aa8af97e857fcb5 --- /dev/null +++ b/humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e3ab05ccd7cb94907c31005bb5bec1d5432f0b096a2b782976e217a540eb6c +size 395372 diff --git a/humeo-core/src/humeo_core/primitives/__init__.py b/humeo-core/src/humeo_core/primitives/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f9c96d207211f13dc058b5844885e4b0dee8b9c8 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/__init__.py @@ -0,0 +1 @@ +"""Primitives: deterministic, composable building blocks of the rocket.""" diff --git a/humeo-core/src/humeo_core/primitives/classify.py b/humeo-core/src/humeo_core/primitives/classify.py new file mode 100644 index 0000000000000000000000000000000000000000..2cf9274101c3a548149f62c06072daf2d70ed813 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/classify.py @@ -0,0 +1,232 @@ +"""Scene classifier: assigns one of the 5 layouts to each scene. + +Two backends share the same contract: + +* ``classify_scenes_heuristic`` β€” no model call. Uses keyframe pixel analysis + (edge density + color variance + face-rectangle heuristic-free approach) + to guess which of the 5 layouts fits best. Fully offline, deterministic. + Note: the heuristic only picks between ``SIT_CENTER`` / ``ZOOM_CALL_CENTER`` / + ``SPLIT_CHART_PERSON``; the two-of-a-kind splits (``SPLIT_TWO_PERSONS`` / + ``SPLIT_TWO_CHARTS``) are only selectable by the vision-LLM backend. 
+* ``classify_scenes_with_llm`` β€” pluggable LLM hook. Takes a callable + ``(image_path, prompt) -> str`` so the caller (MCP client or test) can + wire up whatever multimodal model they want. Enforces strict JSON output. + +Even without a model, the heuristic is good enough for many real inputs and +keeps the whole pipeline runnable with zero external dependencies. +""" + +from __future__ import annotations + +import json +import os +import struct +from typing import Callable, Iterable + +from ..schemas import LayoutKind, Scene, SceneClassification + + +# --------------------------------------------------------------------------- +# Tiny PNG/JPEG reader β†’ down-sampled grayscale column profile +# --------------------------------------------------------------------------- +# We intentionally avoid a hard dependency on Pillow. If Pillow is available +# we use it; otherwise we fall back to reading just PNG dimensions, which is +# enough for a coarse column-variance heuristic on any pre-decoded frame. + + +def _load_grayscale(path: str) -> tuple[list[list[int]], int, int] | None: + try: + from PIL import Image # type: ignore + + img = Image.open(path).convert("L") + w, h = img.size + # Down-sample to at most 128 cols x 72 rows for cheap analysis. + tw = min(128, w) + th = min(72, h) + img = img.resize((tw, th)) + px = list(img.getdata()) + grid = [px[i * tw : (i + 1) * tw] for i in range(th)] + return grid, tw, th + except Exception: + return None + + +def _png_dims(path: str) -> tuple[int, int] | None: + try: + with open(path, "rb") as f: + head = f.read(24) + if head[:8] != b"\x89PNG\r\n\x1a\n": + return None + w, h = struct.unpack(">II", head[16:24]) + return int(w), int(h) + except Exception: + return None + + +def _column_profile(grid: list[list[int]]) -> list[float]: + if not grid: + return [] + h = len(grid) + w = len(grid[0]) + out: list[float] = [] + for x in range(w): + s = 0 + for y in range(h): + s += grid[y][x] + out.append(s / h) + return out + + +def _variance(values: Iterable[float]) -> float: + vs = list(values) + if not vs: + return 0.0 + m = sum(vs) / len(vs) + return sum((v - m) ** 2 for v in vs) / len(vs) + + +# --------------------------------------------------------------------------- +# Heuristic classifier +# --------------------------------------------------------------------------- + + +def _classify_one_heuristic(keyframe_path: str | None) -> SceneClassification: + if not keyframe_path or not os.path.exists(keyframe_path): + return SceneClassification( + scene_id="?", + layout=LayoutKind.SIT_CENTER, + confidence=0.3, + reason="no keyframe available β€” defaulting to SIT_CENTER", + ) + + gs = _load_grayscale(keyframe_path) + if gs is None: + # Can't read pixels: still return a safe default with low confidence. + return SceneClassification( + scene_id="?", + layout=LayoutKind.SIT_CENTER, + confidence=0.25, + reason="PIL unavailable or image unreadable β€” defaulting to SIT_CENTER", + ) + + grid, w, h = gs + cols = _column_profile(grid) + + def _split_contrast(left: list[float], right: list[float]) -> float: + lm = sum(left) / max(1, len(left)) + rm = sum(right) / max(1, len(right)) + lv = _variance(left) + rv = _variance(right) + between = (lm - rm) ** 2 + within = (lv + rv) / 2.0 + 1e-6 + return between / within + + # Left/right halves β€” good for symmetric two-up scenes. + mid = max(1, w // 2) + split_halves = _split_contrast(cols[:mid], cols[mid:]) + + # Left 2/3 vs right 1/3 β€” matches explainer slides (chart + talking head). 
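+    # (Illustration, assuming the 128-column down-sample from _load_grayscale:
+    #  t = 42, so the left strip is cols[:84] and the right strip cols[84:].
+    #  A bright slide beside a darker speaker gives a large between-mean /
+    #  within-variance ratio in _split_contrast.)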
+ t = max(1, w // 3) + left_two_thirds = cols[: 2 * t] + right_one_third = cols[2 * t :] + split_thirds = _split_contrast(left_two_thirds, right_one_third) + + split_score = max(split_halves, split_thirds) + # Overall column variance: low variance β†’ flat composition (zoom call). + overall_var = _variance(cols) + + # Threshold tuned on Ark-style 2/3 chart + 1/3 speaker; "thirds" score catches + # layouts where half-vs-half contrast was too weak (e.g. clip 005 vs 004). + if split_score > 20.0: + return SceneClassification( + scene_id="?", + layout=LayoutKind.SPLIT_CHART_PERSON, + confidence=min(0.95, 0.5 + split_score / 200.0), + reason=( + f"chart/person contrast (halves={split_halves:.1f}, " + f"thirds={split_thirds:.1f} β†’ max={split_score:.1f})" + ), + ) + if overall_var < 100.0: + return SceneClassification( + scene_id="?", + layout=LayoutKind.ZOOM_CALL_CENTER, + confidence=0.7, + reason=f"low column variance ({overall_var:.1f}) β€” flat centered framing", + ) + return SceneClassification( + scene_id="?", + layout=LayoutKind.SIT_CENTER, + confidence=0.6, + reason=f"moderate composition (score={split_score:.1f}, var={overall_var:.1f})", + ) + + +def classify_scenes_heuristic(scenes: list[Scene]) -> list[SceneClassification]: + out: list[SceneClassification] = [] + for s in scenes: + r = _classify_one_heuristic(s.keyframe_path) + out.append(r.model_copy(update={"scene_id": s.scene_id})) + return out + + +# --------------------------------------------------------------------------- +# LLM-backed classifier (caller provides the model hook) +# --------------------------------------------------------------------------- + + +LLMVisionFn = Callable[[str, str], str] +"""Signature: (image_path, prompt) -> raw model string (expected JSON).""" + + +CLASSIFIER_PROMPT = """You are a scene layout classifier for a short-video editor. +Return ONLY a JSON object of the form: + {"layout": "", + "confidence": <0..1 float>, + "reason": "<=15 words"} + +Layout definitions: +- zoom_call_center: one person on a video call (webcam grid / talking head tight crop), subject centered. +- sit_center: one person sitting in frame, subject centered, wider framing than a zoom call. +- split_chart_person: an explainer scene with a chart/graphic on the LEFT (~2/3 of frame) and a person on the RIGHT (~1/3). + +Pick the single best match. No prose, no markdown, JSON only. 
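+
+Example of a valid response (values are illustrative):
+  {"layout": "split_chart_person", "confidence": 0.82, "reason": "chart fills left two-thirds, speaker right"}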
+""" + + +def classify_scenes_with_llm( + scenes: list[Scene], vision_fn: LLMVisionFn +) -> list[SceneClassification]: + out: list[SceneClassification] = [] + for s in scenes: + if not s.keyframe_path: + out.append( + SceneClassification( + scene_id=s.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.2, + reason="no keyframe", + ) + ) + continue + raw = vision_fn(s.keyframe_path, CLASSIFIER_PROMPT) + try: + data = json.loads(raw) + out.append( + SceneClassification( + scene_id=s.scene_id, + layout=LayoutKind(data["layout"]), + confidence=float(data.get("confidence", 0.5)), + reason=str(data.get("reason", ""))[:200], + ) + ) + except Exception as e: + out.append( + SceneClassification( + scene_id=s.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.25, + reason=f"LLM parse error: {e!r}", + ) + ) + return out diff --git a/humeo-core/src/humeo_core/primitives/compile.py b/humeo-core/src/humeo_core/primitives/compile.py new file mode 100644 index 0000000000000000000000000000000000000000..7b7bbfbecb46f4c8a93be706cde3af3bab76c60a --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/compile.py @@ -0,0 +1,602 @@ +"""Compiler: assemble a final 9:16 clip from source + clip + layout instruction. + +Builds the ffmpeg invocation, optionally runs it. Keeping ``dry_run`` as a +first-class mode means the MCP server can return the exact command without +executing β€” ideal for an agent that wants to review before spending CPU. + +Rendering order is fixed and intentional: + +1. **Cut + crop/compose.** ``plan_layout`` produces the base filtergraph + that takes the source, applies the layout-specific crops, and emits a + labelled ``[vout]`` at the exact output resolution (e.g. 1080x1920). +2. **Overlay title** (``drawtext``) β€” skipped for split layouts because + the source itself already has a slide/chart title and an extra overlay + just obscures content. +3. **Subtitles.** ``subtitles`` filter runs **last** so text is drawn over + the finished composition, not the source. ``original_size`` is pinned + to the output resolution so libass coordinate math (MarginV, FontSize) + is in *output pixels*, not libass's default PlayResY=288 β€” which was + the bug behind the "subtitles blocked / floating in the middle" look. +4. **Mux** with the source audio stream (``0:a:0``). +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile +from pathlib import Path + +from ..schemas import RenderRequest, RenderResult, RenderTheme, SPLIT_LAYOUTS +from .layouts import plan_layout + + +def _ensure_ffmpeg() -> str: + exe = shutil.which("ffmpeg") + if not exe: + raise RuntimeError("ffmpeg not found on PATH") + return exe + + +def _ensure_windows_fontconfig() -> dict[str, str]: + """Return subprocess env with a minimal fontconfig setup on Windows. + + Some Windows FFmpeg builds ship libass + fontconfig but do not bundle a + default fontconfig config, which makes subtitle rendering fail with: + + ``Fontconfig error: Cannot load default config file: No such file: (null)`` + + We generate a tiny config that points fontconfig at ``C:/Windows/Fonts`` and + a writable cache dir under ``%LOCALAPPDATA%/humeo``. Non-Windows platforms + pass through the existing environment unchanged. 
+ """ + env = os.environ.copy() + if os.name != "nt": + return env + if env.get("FONTCONFIG_FILE"): + return env + + local_appdata = Path( + env.get("LOCALAPPDATA", str(Path(tempfile.gettempdir()) / "humeo-local")) + ) + cfg_dir = local_appdata / "humeo" / "fontconfig" + cache_dir = local_appdata / "humeo" / "fontconfig-cache" + cfg_dir.mkdir(parents=True, exist_ok=True) + cache_dir.mkdir(parents=True, exist_ok=True) + + cfg_file = cfg_dir / "fonts.conf" + windows_fonts = Path(env.get("WINDIR", r"C:\Windows")) / "Fonts" + if not cfg_file.exists(): + cfg_file.write_text( + "\n".join( + [ + '', + "", + f" {windows_fonts.as_posix()}", + f" {cache_dir.as_posix()}", + "", + "", + ] + ), + encoding="utf-8", + ) + + env["FONTCONFIG_PATH"] = str(cfg_dir) + env["FONTCONFIG_FILE"] = str(cfg_file) + return env + + +def _escape_drawtext(text: str) -> str: + # drawtext quoting is brittle across ffmpeg builds. Keep it simple: + # collapse whitespace, drop apostrophes, and escape the characters + # that are still significant to the filter parser. + safe = " ".join(text.split()).replace("'", "") + return safe.replace("\\", "\\\\").replace(":", "\\:") + + +# --------------------------------------------------------------------------- +# Title overlay planning +# --------------------------------------------------------------------------- +# +# ffmpeg ``drawtext`` does not wrap text by itself; whatever you hand it is +# emitted as a single line. With a fixed 72px font and no width budget, the +# "Prediction Markets vs Derivatives" title on a 1080px canvas would spill +# past both edges and show up clipped (the user reported exactly this bug). +# +# The helpers below plan a title layout BEFORE it hits drawtext: +# +# 1. Short titles (fit at 72px single line): emit the existing single +# ``drawtext`` call unchanged so golden tests and previously-calibrated +# visuals stay byte-for-byte identical. +# 2. Long titles: split at the best word boundary into two balanced lines and +# emit two stacked ``drawtext`` filters at a slightly smaller font +# (60px / 52px / 44px, auto-shrinking until both lines fit). +# 3. Single-word titles that still overflow: shrink the single line until it +# fits, then hard-truncate with an ellipsis as a last resort. +# +# The character-width estimate is deliberately conservative (0.55 * fontsize) +# so mixed-case prose with wide letters like W/M still clears the margin. +# Calibrated visually against Arial Bold on 1080p output. + +_TITLE_PRIMARY_SIZE = 72 # Current "hero" title size; preserved for short titles. +_TITLE_MIN_SIZE = 44 # Readability floor at 1080x1920 output. +_TITLE_MARGIN_PX = 60 # Horizontal safe-area on each side. +_TITLE_Y_TOP = 80 # Pixel offset of the top title baseline (matches pre-P2 look). +_TITLE_CHAR_WIDTH_RATIO = 0.55 +_TITLE_LINE_SPACING_RATIO = 1.3 + +# Keep the overlay font explicit. Without a ``font=`` directive, drawtext +# falls back to fontconfig's "Sans", which resolves to a serif (Times New +# Roman) on default Windows installs β€” the "ugly serif title" bug reported +# against v1. Arial matches the ASS subtitle ``Fontname`` below so the +# title and captions read as a single typographic family. Keep this in +# sync with the ``Fontname=Arial`` in the subtitle filter if it ever +# changes. 
+_TITLE_FONT_NAME = "Arial" +_REFERENCE_TITLE_FONT_NAME = "League Spartan" +_REFERENCE_CAPTION_FONT_NAME = "Source Sans 3" +_REFERENCE_TITLE_BAR_X = 28 +_REFERENCE_TITLE_BAR_Y = 32 +_REFERENCE_TITLE_BAR_W = 1024 +_REFERENCE_TITLE_BAR_H = 148 +_REFERENCE_TITLE_TEXT_X = 72 +_REFERENCE_TITLE_TEXT_Y = 54 +_REFERENCE_TITLE_SIZE = 64 +_REFERENCE_CAPTION_BAR_X = 0 +_REFERENCE_CAPTION_BAR_W = 1080 +_REFERENCE_CAPTION_BAR_H = 120 +_REFERENCE_CAPTION_TEXT_MARGIN_L = 92 +_REFERENCE_CAPTION_TEXT_MARGIN_R = 92 + + +def _fonts_dir() -> Path: + return Path(__file__).resolve().parents[1] / "assets" / "fonts" + + +def _bundled_font_path(filename: str) -> Path | None: + path = _fonts_dir() / filename + return path if path.is_file() else None + + +def _title_char_px(size_px: int) -> float: + return size_px * _TITLE_CHAR_WIDTH_RATIO + + +def _title_fits(text: str, size_px: int, usable_w: int) -> bool: + return int(len(text) * _title_char_px(size_px)) <= usable_w + + +def _wrap_title_two_lines(text: str) -> tuple[str, str]: + """Split ``text`` at the word boundary that most balances the two halves. + + Returns ``(line1, line2)``. If ``text`` has fewer than two words, returns + ``(text, "")`` and the caller should fall back to single-line shrinking. + """ + words = text.split() + if len(words) < 2: + return text, "" + best_idx = 1 + best_delta = 10**9 + for i in range(1, len(words)): + left = " ".join(words[:i]) + right = " ".join(words[i:]) + delta = abs(len(left) - len(right)) + if delta < best_delta: + best_delta = delta + best_idx = i + return " ".join(words[:best_idx]), " ".join(words[best_idx:]) + + +def _drawtext_font_arg() -> str: + """Return a drawtext font selector that is stable on the current platform.""" + if os.name == "nt": + arial = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts" / "arial.ttf" + if arial.is_file(): + return f"fontfile='{_escape_filter_path(str(arial))}'" + return f"font={_TITLE_FONT_NAME}" + + +def _reference_title_font_arg() -> str: + bundled = _bundled_font_path("LeagueSpartan-Bold-static.ttf") or _bundled_font_path( + "LeagueSpartan-Bold.ttf" + ) + if bundled is not None: + return f"fontfile='{_escape_filter_path(str(bundled))}'" + return f"font={_REFERENCE_TITLE_FONT_NAME}" + + +def _drawtext_single(text: str, size: int, y: int) -> str: + esc = _escape_drawtext(text) + return ( + f"drawtext=text='{esc}':" + "expansion=none:" + f"{_drawtext_font_arg()}:" + f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:" + f"x=(w-text_w)/2:y={y}" + ) + + +def _drawtext_two(line1: str, line2: str, size: int, y_top: int) -> str: + """Two drawtext filters chained by comma β€” one ffmpeg filter chain, two lines.""" + esc1 = _escape_drawtext(line1) + esc2 = _escape_drawtext(line2) + y_bottom = y_top + int(round(size * _TITLE_LINE_SPACING_RATIO)) + return ( + f"drawtext=text='{esc1}':" + "expansion=none:" + f"{_drawtext_font_arg()}:" + f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:" + f"x=(w-text_w)/2:y={y_top}," + f"drawtext=text='{esc2}':" + "expansion=none:" + f"{_drawtext_font_arg()}:" + f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:" + f"x=(w-text_w)/2:y={y_bottom}" + ) + + +def plan_title_drawtext(title_text: str, out_w: int = 1080) -> str | None: + """Return the ``drawtext`` filter fragment for ``title_text`` or None to skip. + + The returned string is intended to be spliced into the main filtergraph + between the ``[v_prepad]`` and ``[vout]`` labels by + :func:`build_ffmpeg_cmd`. It does NOT include those labels itself. 
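+
+    Worked example: "Prediction Markets vs Derivatives" is 33 characters; at
+    72px the width estimate (0.55 * 72 ~= 39.6px per character) is ~1306px
+    against a 960px usable width on a 1080-wide canvas, so the title wraps at
+    the most balanced word boundary into "Prediction Markets" /
+    "vs Derivatives" and is emitted as two stacked ``drawtext`` calls at 60px.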
+ + Backward compatibility: when the title fits on one line at the original + 72px size, the output is identical to the pre-P2 single-``drawtext`` + form (same x/y/fontsize/borderw), so golden ffmpeg tests stay green. + """ + text = " ".join((title_text or "").split()) + if not text: + return None + usable_w = max(1, out_w - 2 * _TITLE_MARGIN_PX) + + if _title_fits(text, _TITLE_PRIMARY_SIZE, usable_w): + return _drawtext_single(text, _TITLE_PRIMARY_SIZE, _TITLE_Y_TOP) + + line1, line2 = _wrap_title_two_lines(text) + if line2: + for size in (60, 52, _TITLE_MIN_SIZE): + if _title_fits(line1, size, usable_w) and _title_fits(line2, size, usable_w): + return _drawtext_two(line1, line2, size, _TITLE_Y_TOP) + + for size in (64, 56, 52, _TITLE_MIN_SIZE): + if _title_fits(text, size, usable_w): + return _drawtext_single(text, size, _TITLE_Y_TOP) + + max_chars = max(4, int(usable_w / _title_char_px(_TITLE_MIN_SIZE))) + truncated = text[: max_chars - 1].rstrip() + "..." + return _drawtext_single(truncated, _TITLE_MIN_SIZE, _TITLE_Y_TOP) + + +def _reference_title_fragment(title_text: str, out_w: int = 1080) -> str: + bar_w = min(_REFERENCE_TITLE_BAR_W, max(320, out_w - 2 * _REFERENCE_TITLE_BAR_X)) + accent_w = 16 + title = " ".join((title_text or "").split()) + usable_w = max(220, bar_w - (_REFERENCE_TITLE_TEXT_X - _REFERENCE_TITLE_BAR_X) - 30) + text_filters: list[str] = [] + if title: + if _title_fits(title, _REFERENCE_TITLE_SIZE, usable_w): + esc = _escape_drawtext(title) + text_filters.append( + f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:" + f"fontcolor=white:fontsize={_REFERENCE_TITLE_SIZE}:" + "borderw=1.2:bordercolor=0x101010@0.18:" + f"x={_REFERENCE_TITLE_TEXT_X}:" + f"y={_REFERENCE_TITLE_TEXT_Y}" + ) + else: + line1, line2 = _wrap_title_two_lines(title) + two_line_size = 54 + while ( + line2 + and two_line_size > 42 + and not ( + _title_fits(line1, two_line_size, usable_w) + and _title_fits(line2, two_line_size, usable_w) + ) + ): + two_line_size -= 2 + if line2 and _title_fits(line1, two_line_size, usable_w) and _title_fits(line2, two_line_size, usable_w): + y_top = 36 + y_bottom = y_top + int(round(two_line_size * 1.08)) + for line, y in ((line1, y_top), (line2, y_bottom)): + esc = _escape_drawtext(line) + text_filters.append( + f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:" + f"fontcolor=white:fontsize={two_line_size}:" + "borderw=1.2:bordercolor=0x101010@0.18:" + f"x={_REFERENCE_TITLE_TEXT_X}:y={y}" + ) + else: + size = _REFERENCE_TITLE_SIZE + while title and not _title_fits(title, size, usable_w) and size > 38: + size -= 2 + if title and not _title_fits(title, size, usable_w): + max_chars = max(8, int(usable_w / _title_char_px(size))) + title = title[: max_chars - 1].rstrip() + "..." 
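+                    # (With out_w=1080 this keeps roughly the first 44
+                    #  characters plus an ellipsis, per the 0.55 * fontsize
+                    #  width estimate.)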
+ esc = _escape_drawtext(title) + text_filters.append( + f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:" + f"fontcolor=white:fontsize={size}:" + "borderw=1.2:bordercolor=0x101010@0.18:" + f"x={_REFERENCE_TITLE_TEXT_X}:" + f"y={_REFERENCE_TITLE_TEXT_Y}" + ) + text_filter = f",{','.join(text_filters)}" if text_filters else "" + return ( + f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:" + f"w={bar_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x1F1F1F@0.84:t=fill," + f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:" + f"w={accent_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x2A2453@0.98:t=fill" + f"{text_filter}" + ) + + +def _reference_caption_bar_fragment( + *, + out_w: int = 1080, + out_h: int = 1920, + margin_v: int = 166, + font_size: int = 38, +) -> str: + bar_w = min(_REFERENCE_CAPTION_BAR_W, max(320, out_w - 2 * _REFERENCE_CAPTION_BAR_X)) + bar_h = max(_REFERENCE_CAPTION_BAR_H, int(round(font_size * 2.05))) + bar_y = max( + _REFERENCE_TITLE_BAR_Y + _REFERENCE_TITLE_BAR_H + 36, + out_h - max(40, margin_v) - bar_h, + ) + return ( + f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:" + f"w={bar_w}:h={bar_h}:color=0x6570E6@1.0:t=fill," + f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:" + f"w={bar_w}:h=3:color=0xE4E7FF@0.14:t=fill" + ) + + +def _escape_filter_path(path: str) -> str: + return path.replace("\\", "/").replace(":", "\\:").replace("'", "\\'") + + +def _has_audio_stream(media_path: str) -> bool: + probe = shutil.which("ffprobe") + if not probe: + return False + out = subprocess.run( + [ + probe, + "-v", + "error", + "-select_streams", + "a:0", + "-show_entries", + "stream=codec_type", + "-of", + "csv=p=0", + media_path, + ], + check=False, + capture_output=True, + text=True, + ) + return out.returncode == 0 and "audio" in (out.stdout or "").lower() + + +def build_ffmpeg_cmd( + req: RenderRequest, + *, + src_w: int = 1920, + src_h: int = 1080, + include_audio: bool = True, +) -> list[str]: + exe = _ensure_ffmpeg() if req.mode != "dry_run" else "ffmpeg" + + plan = plan_layout( + req.layout, out_w=req.width, out_h=req.height, src_w=src_w, src_h=src_h + ) + fg = plan.filtergraph + + if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + chrome_parts = [ + _reference_title_fragment(req.title_text, out_w=req.width), + _reference_caption_bar_fragment( + out_w=req.width, + out_h=req.height, + margin_v=min(req.subtitle_margin_v, 136), + font_size=max(req.subtitle_font_size, 124), + ) + if req.subtitle_path + else "", + ] + fg = fg.replace( + "[vout]", + f"[v_prepad];[v_prepad]{','.join(part for part in chrome_parts if part)}[vout]", + ) + elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT: + # The native-highlight theme mirrors the reference short in + # videoplayback (12): no separate top title card, just centered + # floating captions with per-word highlight timing. + pass + else: + # Skip the drawtext title overlay on split layouts: the top band already + # shows a slide/chart with its own baked-in title, so adding an overlay + # on top of that is pure noise (and was stacking over the chart title + # in the SPLIT_CHART_PERSON Cathy Wood shorts). + title_allowed = req.layout.layout not in SPLIT_LAYOUTS + if req.title_text and title_allowed: + # ``plan_title_drawtext`` returns a full filter fragment (possibly + # two chained ``drawtext`` calls) that fits within the output width. 
+ # For short titles it is byte-identical to the pre-P2 single-line + # form, keeping existing golden tests green while fixing the + # "Prediction Markets vs Derivatives" edge-clip report. + title_fragment = plan_title_drawtext(req.title_text, out_w=req.width) + if title_fragment: + fg = fg.replace( + "[vout]", + f"[v_prepad];[v_prepad]{title_fragment}[vout]", + ) + + if req.subtitle_path: + subtitle_esc = _escape_filter_path(req.subtitle_path) + fonts_dir = _fonts_dir() + fontsdir_arg = ( + f":fontsdir='{_escape_filter_path(str(fonts_dir))}'" if fonts_dir.is_dir() else "" + ) + # ``original_size`` pins libass's PlayResY to the actual output so + # ``FontSize`` and ``MarginV`` are interpreted in output pixels. Without + # this, libass defaults to PlayResY=288 and then upscales to the real + # canvas (1920) -- blowing font sizes and pushing subtitles to the + # middle of the frame. ``WrapStyle=0`` enables smart word wrap so long + # lines break into readable stacks instead of running off-screen. + if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + force_style = ( + f"Fontname={_REFERENCE_CAPTION_FONT_NAME}," + f"FontSize={max(req.subtitle_font_size, 124)},Alignment=2," + f"MarginV={min(req.subtitle_margin_v, 136)}," + "MarginL=56,MarginR=56," + "WrapStyle=0,BorderStyle=1,Outline=2,Shadow=0," + "BackColour=&H00000000&,PrimaryColour=&H00FFFFFF&," + "Bold=1,Italic=0,Spacing=-1" + ) + subtitle_filter = ( + "[v_sub_in];" + f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:" + f"original_size={req.width}x{req.height}:" + f"force_style='{force_style}'[vout]" + ) + elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT: + subtitle_filter = ( + "[v_sub_in];" + f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:" + f"original_size={req.width}x{req.height}[vout]" + ) + else: + force_style = ( + f"Fontname=Arial," + f"FontSize={req.subtitle_font_size},Alignment=2," + f"MarginV={req.subtitle_margin_v},MarginL=60,MarginR=60," + "WrapStyle=0,BorderStyle=4," + "BackColour=&H70000000&,PrimaryColour=&H00FFFFFF&," + "Outline=0,Shadow=0,Bold=1" + ) + subtitle_filter = ( + "[v_sub_in];" + f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:" + f"original_size={req.width}x{req.height}:" + f"force_style='{force_style}'[vout]" + ) + fg = fg.replace("[vout]", subtitle_filter) + + start = req.clip.start_time_sec + dur = max(0.1, req.clip.duration_sec) + + Path(Path(req.output_path).parent).mkdir(parents=True, exist_ok=True) + + cmd: list[str] = [ + exe, + "-y", + "-ss", + f"{start:.3f}", + "-t", + f"{dur:.3f}", + "-i", + req.source_path, + "-filter_complex", + fg, + "-map", + "[vout]", + "-c:v", + "libx264", + "-preset", + "veryfast", + "-crf", + "20", + ] + + if include_audio: + cmd.extend(["-map", "0:a:0", "-c:a", "aac", "-b:a", "160k"]) + + cmd.extend(["-movflags", "+faststart", req.output_path]) + return cmd + + +def probe_source_size(source_path: str) -> tuple[int, int]: + exe = shutil.which("ffprobe") + if not exe: + return 1920, 1080 + out = subprocess.run( + [ + exe, + "-v", + "error", + "-select_streams", + "v:0", + "-show_entries", + "stream=width,height", + "-of", + "csv=p=0", + source_path, + ], + check=False, + capture_output=True, + text=True, + ) + try: + w, h = out.stdout.strip().split(",") + return int(w), int(h) + except Exception: + return 1920, 1080 + + +def render_clip(req: RenderRequest) -> RenderResult: + try: + src_w, src_h = probe_source_size(req.source_path) if req.mode != "dry_run" else (1920, 1080) + except Exception: + src_w, src_h = 1920, 1080 + + include_audio = True + if 
req.mode != "dry_run": + include_audio = _has_audio_stream(req.source_path) + if not include_audio: + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=[], + success=False, + error="Source media has no detectable audio stream (a:0).", + ) + + cmd = build_ffmpeg_cmd(req, src_w=src_w, src_h=src_h, include_audio=include_audio) + + if req.mode == "dry_run": + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=True, + ) + try: + subprocess.run(cmd, check=True, capture_output=True, env=_ensure_windows_fontconfig()) + if include_audio and not _has_audio_stream(req.output_path): + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=False, + error="Rendered output is missing audio stream (a:0).", + ) + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=True, + ) + except subprocess.CalledProcessError as e: + return RenderResult( + clip_id=req.clip.clip_id, + output_path=req.output_path, + ffmpeg_cmd=cmd, + success=False, + error=e.stderr.decode("utf-8", errors="replace")[-4000:] if e.stderr else str(e), + ) diff --git a/humeo-core/src/humeo_core/primitives/face_detect.py b/humeo-core/src/humeo_core/primitives/face_detect.py new file mode 100644 index 0000000000000000000000000000000000000000..8bdbbe0f5d3b0b5850de1b0da15a1547a0a94432 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/face_detect.py @@ -0,0 +1,135 @@ +"""Local face-detection primitive β€” the MediaPipe path as another ``SceneRegions`` producer. + +Three detection backends share the *same output schema* (``SceneRegions``): + +* ``primitives/classify.py`` β€” pixel variance heuristic, no model. +* ``primitives/face_detect.py`` β€” MediaPipe face rectangle (this file). +* ``primitives/vision.py`` β€” multimodal LLM + OCR bboxes. + +Because all three emit ``SceneRegions``, the layout planner in +``primitives/vision.py`` (``classify_from_regions`` + ``layout_instruction_from_regions``) +works on all of them unchanged. That is the whole point of the primitive +boundary β€” the *detector* is swappable, the *renderer* is fixed. + +MediaPipe is imported lazily so it remains an optional extra. +""" + +from __future__ import annotations + +import logging +from typing import Callable + +from ..schemas import BoundingBox, Scene, SceneRegions + +logger = logging.getLogger(__name__) + + +# A bbox loader for any future cloud face API. Takes a keyframe path, +# returns a normalized face bbox or ``None``. Same shape as the MediaPipe +# wrapper below, which lets tests pass a stub and skip MediaPipe. +FaceBBoxFn = Callable[[str], BoundingBox | None] + + +def detect_face_regions( + scenes: list[Scene], + face_fn: FaceBBoxFn | None = None, + chart_split_threshold: float = 0.65, +) -> list[SceneRegions]: + """Populate ``SceneRegions.person_bbox`` (+ ``chart_bbox``) from a face detector. + + The face bbox is treated as the *person bbox*. If the face sits in the + right ``(1 - chart_split_threshold)`` of the frame, a *chart bbox* is + synthesised over the left region β€” mirroring the original + ``reframe.py`` split heuristic. + + Args: + scenes: scenes with ``keyframe_path`` populated. + face_fn: pluggable face detector. Defaults to MediaPipe (lazy + import) if not supplied. Pass a stub in tests. + chart_split_threshold: face x-center above this normalized value + triggers a synthetic chart bbox on the left. 
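+
+    Example (illustrative numbers): a face bbox spanning x in [0.72, 0.88]
+    has an x-center of 0.80 >= 0.65, so the scene gets a synthetic chart bbox
+    covering x in [0.0, 0.65] at full frame height alongside the face bbox.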
+ """ + + if face_fn is None: + face_fn = _mediapipe_face_bbox + + out: list[SceneRegions] = [] + for s in scenes: + if not s.keyframe_path: + out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available")) + continue + try: + face = face_fn(s.keyframe_path) + except Exception as e: # one bad scene should not kill the batch + logger.warning("face detector failed on %s: %r", s.keyframe_path, e) + out.append(SceneRegions(scene_id=s.scene_id, raw_reason=f"face detector error: {e!r}")) + continue + + if face is None: + out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no face detected")) + continue + + chart = None + if face.center_x >= chart_split_threshold: + # Face pushed right β†’ assume a chart occupies the left region. + chart = BoundingBox( + x1=0.0, + y1=0.0, + x2=min(chart_split_threshold, face.x1), + y2=1.0, + label="chart_inferred", + confidence=max(0.0, face.center_x - chart_split_threshold + 0.5), + ) + + out.append( + SceneRegions( + scene_id=s.scene_id, + person_bbox=face, + chart_bbox=chart, + raw_reason="face detected" + (" + synthetic chart bbox" if chart else ""), + ) + ) + + return out + + +def _mediapipe_face_bbox(keyframe_path: str) -> BoundingBox | None: + """Return the largest-confidence face as a ``BoundingBox``, or ``None``. + + Imports MediaPipe + OpenCV lazily so they remain optional dependencies + (install ``humeo-core[face]``). + """ + + try: + import cv2 # type: ignore + import mediapipe as mp # type: ignore + except ImportError as e: + raise RuntimeError( + "MediaPipe face detection requires `pip install humeo-core[face]`" + ) from e + + img = cv2.imread(keyframe_path) + if img is None: + return None + rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + with mp.solutions.face_detection.FaceDetection( + model_selection=1, min_detection_confidence=0.5 + ) as detector: + results = detector.process(rgb) + if not results.detections: + return None + best = max(results.detections, key=lambda d: d.score[0]) + box = best.location_data.relative_bounding_box + x1 = max(0.0, min(1.0, float(box.xmin))) + y1 = max(0.0, min(1.0, float(box.ymin))) + x2 = max(x1 + 1e-6, min(1.0, x1 + float(box.width))) + y2 = max(y1 + 1e-6, min(1.0, y1 + float(box.height))) + return BoundingBox( + x1=x1, + y1=y1, + x2=x2, + y2=y2, + label="face", + confidence=float(best.score[0]), + ) diff --git a/humeo-core/src/humeo_core/primitives/ingest.py b/humeo-core/src/humeo_core/primitives/ingest.py new file mode 100644 index 0000000000000000000000000000000000000000..6781af3e1529cafffcc1b7425483d06083890958 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/ingest.py @@ -0,0 +1,187 @@ +"""Landing gear: deterministic, local extraction. + +Everything here can run without a GPU, without an API key, and without the +internet (once inputs are present). This follows the HIVE guide's rule +"extraction stays local; LLMs only reason". 
+ +Functions: + probe_duration β€” ffprobe wrapper + detect_scenes β€” PySceneDetect (ContentDetector) + extract_keyframes β€” ffmpeg snapshot at each scene midpoint + transcribe_audio β€” faster-whisper (optional dependency) + ingest β€” one-shot convenience runner that returns IngestResult +""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +from pathlib import Path + +from ..schemas import IngestResult, Scene, TranscriptWord + + +class IngestError(RuntimeError): + pass + + +def _require(binary: str) -> str: + path = shutil.which(binary) + if not path: + raise IngestError( + f"Required binary not on PATH: {binary!r}. Install it or add the path." + ) + return path + + +def probe_duration(source_path: str) -> float: + ffprobe = _require("ffprobe") + out = subprocess.run( + [ + ffprobe, + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "json", + source_path, + ], + check=True, + capture_output=True, + text=True, + ) + data = json.loads(out.stdout) + return float(data["format"]["duration"]) + + +def detect_scenes( + source_path: str, threshold: float = 27.0, min_scene_sec: float = 1.0 +) -> list[Scene]: + """Use PySceneDetect's ContentDetector to split the video into scenes.""" + + try: + from scenedetect import detect, ContentDetector # type: ignore + except ModuleNotFoundError as e: + # scenedetect depends on OpenCV; surface the real missing module. + missing = getattr(e, "name", "") or str(e) + hint = "pip install 'scenedetect[opencv]'" if "cv2" in missing else "pip install scenedetect" + raise IngestError( + f"Scene detection unavailable (missing module: {missing}). Install with: {hint}" + ) from e + + result = detect( + source_path, + ContentDetector(threshold=threshold, min_scene_len=int(min_scene_sec * 24)), + ) + scenes: list[Scene] = [] + for i, (start, end) in enumerate(result): + scenes.append( + Scene( + scene_id=f"s{i:04d}", + start_time=float(start.get_seconds()), + end_time=float(end.get_seconds()), + ) + ) + # Guard: if PySceneDetect returns empty (e.g. a single long shot), + # fall back to one scene spanning the whole video. + if not scenes: + duration = probe_duration(source_path) + scenes.append(Scene(scene_id="s0000", start_time=0.0, end_time=duration)) + return scenes + + +def extract_keyframes( + source_path: str, scenes: list[Scene], out_dir: str +) -> list[Scene]: + """Extract one JPG per scene at its midpoint. Mutates nothing; returns copies.""" + + ffmpeg = _require("ffmpeg") + Path(out_dir).mkdir(parents=True, exist_ok=True) + updated: list[Scene] = [] + for s in scenes: + mid = s.start_time + (s.end_time - s.start_time) / 2.0 + out_path = os.path.join(out_dir, f"{s.scene_id}.jpg") + subprocess.run( + [ + ffmpeg, + "-y", + "-loglevel", + "error", + "-ss", + f"{mid:.3f}", + "-i", + source_path, + "-frames:v", + "1", + "-q:v", + "3", + out_path, + ], + check=True, + ) + updated.append(s.model_copy(update={"keyframe_path": out_path})) + return updated + + +def transcribe_audio( + source_path: str, model_name: str = "base", language: str | None = None +) -> list[TranscriptWord]: + """Word-level transcript via faster-whisper. Optional dependency.""" + + try: + from faster_whisper import WhisperModel # type: ignore + except ImportError as e: + raise IngestError( + "faster-whisper is not installed. 
pip install faster-whisper" + ) from e + + model = WhisperModel(model_name, device="auto", compute_type="auto") + segments, _info = model.transcribe(source_path, word_timestamps=True, language=language) + words: list[TranscriptWord] = [] + for seg in segments: + for w in getattr(seg, "words", []) or []: + if w.word is None: + continue + words.append( + TranscriptWord( + word=str(w.word).strip(), + start_time=float(w.start or 0.0), + end_time=float(w.end or 0.0), + ) + ) + return words + + +def ingest( + source_path: str, + work_dir: str, + *, + with_transcript: bool = False, + whisper_model: str = "base", +) -> IngestResult: + """Run all extraction stages and return a single ``IngestResult``.""" + + if not os.path.exists(source_path): + raise IngestError(f"source_path does not exist: {source_path}") + + Path(work_dir).mkdir(parents=True, exist_ok=True) + keyframes_dir = os.path.join(work_dir, "keyframes") + + duration = probe_duration(source_path) + scenes = detect_scenes(source_path) + scenes = extract_keyframes(source_path, scenes, keyframes_dir) + + words: list[TranscriptWord] = [] + if with_transcript: + words = transcribe_audio(source_path, model_name=whisper_model) + + return IngestResult( + source_path=os.path.abspath(source_path), + duration_sec=duration, + scenes=scenes, + transcript_words=words, + keyframes_dir=keyframes_dir, + ) diff --git a/humeo-core/src/humeo_core/primitives/layouts.py b/humeo-core/src/humeo_core/primitives/layouts.py new file mode 100644 index 0000000000000000000000000000000000000000..46c247b2f4c5079803663525534535e315b6aee6 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/layouts.py @@ -0,0 +1,707 @@ +"""The 9:16 layout thrusters β€” deterministic crop + compose math. + +First principles: this video format has a hard constraint of **at most two +on-screen items** per short (see :class:`humeo_core.schemas.LayoutKind`). That +gives exactly five recipes: + +* 1 person alone, tight β†’ ``ZOOM_CALL_CENTER`` +* 1 person alone, wider β†’ ``SIT_CENTER`` +* 1 chart + 1 person β†’ ``SPLIT_CHART_PERSON`` +* 2 persons β†’ ``SPLIT_TWO_PERSONS`` +* 2 charts β†’ ``SPLIT_TWO_CHARTS`` + +Each planner returns a pure ``ffmpeg -filter_complex`` fragment ending in +``[vout]``. The compiler (``compile.py``) glues the fragment to the cut + +audio + subtitle chain. Because every planner is a pure function that +returns a string, the whole layout system is unit-testable without ever +invoking ffmpeg. + +Split layouts share one contract: + +* Output: 9:16 frame split into a **top band** and **bottom band**. + Band heights are driven by :attr:`LayoutInstruction.top_band_ratio`. + Default is ``0.5`` (even 50/50), matching the user-requested symmetric look. +* Source strips for the two items are **complementary** β€” they partition + the source width at a single seam so the two items never overlap and + together cover the full frame width. +* Each strip is scaled to fill its output band using the "cover" + convention (``force_original_aspect_ratio=increase`` + center crop), so + the band is fully painted (no letterbox bars, no stretch). +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from ..schemas import ( + BoundingBox, + FocusStackOrder, + LayoutInstruction, + LayoutKind, + TimedCenterPoint, +) + + +# Source geometry assumption. Most podcast sources are 1920x1080; we still +# normalize everything by the actual source size so changing this is safe. 
+DEFAULT_SRC_W = 1920 +DEFAULT_SRC_H = 1080 +TRACKING_BLEND_SEC = 0.30 + + +@dataclass(frozen=True) +class FilterPlan: + """Result of planning a layout. + + ``filtergraph`` is the body of ``-filter_complex`` and ends with + ``[vout]`` as the final labelled stream. + """ + + filtergraph: str + out_label: str = "vout" + + +# --------------------------------------------------------------------------- +# Tiny pixel helpers +# --------------------------------------------------------------------------- + + +def _clamp01(v: float) -> float: + return max(0.0, min(1.0, v)) + + +def _even(v: int) -> int: + """Floor ``v`` to an even integer (ffmpeg ``crop``/``scale`` need even dims).""" + return v - (v % 2) + + +def _bbox_to_crop_pixels( + box: BoundingBox, src_w: int, src_h: int +) -> tuple[int, int, int, int]: + """Normalized bbox β†’ ``(cw, ch, x, y)`` with even dimensions for ffmpeg.""" + x1 = int(round(_clamp01(box.x1) * float(src_w))) + y1 = int(round(_clamp01(box.y1) * float(src_h))) + x2 = int(round(_clamp01(box.x2) * float(src_w))) + y2 = int(round(_clamp01(box.y2) * float(src_h))) + x1 = max(0, min(src_w - 2, x1)) + y1 = max(0, min(src_h - 2, y1)) + x2 = max(x1 + 2, min(src_w, x2)) + y2 = max(y1 + 2, min(src_h, y2)) + cw = _even(x2 - x1) + ch = _even(y2 - y1) + return max(2, cw), max(2, ch), _even(x1), _even(y1) + + +def _base_crop_size( + src_w: int, + src_h: int, + target_aspect: float, +) -> tuple[int, int]: + if src_w / src_h >= target_aspect: + base_ch = src_h + base_cw = int(round(base_ch * target_aspect)) + else: + base_cw = src_w + base_ch = int(round(base_cw / target_aspect)) + return _even(max(2, base_cw)), _even(max(2, base_ch)) + + +def _crop_box( + src_w: int, + src_h: int, + target_aspect: float, + zoom: float, + center_x_norm: float, + center_y_norm: float = 0.5, +) -> tuple[int, int, int, int]: + """Return ``(cw, ch, x, y)`` crop values for a centered aspect-ratio crop. + + ``zoom > 1`` means tighter crop (smaller window around the center). The + function always keeps the crop window fully inside the source frame. 
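+
+    Worked example (1920x1080 source, target_aspect=9/16, zoom=1.2,
+    center_x_norm=0.5): the base 9:16 window is 608x1080, zoom shrinks it to
+    506x900, and the returned crop is ``(506, 900, 706, 90)``.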
+ """ + + zoom = max(1.0, zoom) + base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect) + + cw = _even(max(2, int(round(base_cw / zoom)))) + ch = _even(max(2, int(round(base_ch / zoom)))) + + cx = int(round(_clamp01(center_x_norm) * src_w)) + cy = int(round(_clamp01(center_y_norm) * src_h)) + x = _even(max(0, min(src_w - cw, cx - cw // 2))) + y = _even(max(0, min(src_h - ch, cy - ch // 2))) + return cw, ch, x, y + + +def _center_crop_to_9x16( + src_w: int, src_h: int, zoom: float, person_x_norm: float +) -> tuple[int, int, int, int]: + return _crop_box(src_w, src_h, 9 / 16, zoom, person_x_norm, 0.5) + + +def _crop_x_from_center(src_w: int, cw: int, center_x_norm: float) -> int: + """Return an even, in-bounds crop x for a normalized horizontal center.""" + cx = int(round(_clamp01(center_x_norm) * src_w)) + return _even(max(0, min(src_w - cw, cx - cw // 2))) + + +def _tracked_value_expr( + values: list[tuple[float, float]], + *, + clamp_min: float | None = None, + clamp_max: float | None = None, + round_even: bool = False, +) -> str: + if not values: + raise ValueError("values must not be empty") + + expr = f"{float(values[-1][0]):.3f}" + for idx in range(len(values) - 2, -1, -1): + v0, t0 = float(values[idx][0]), float(values[idx][1]) + v1, t1 = float(values[idx + 1][0]), float(values[idx + 1][1]) + if t1 <= t0: + expr = f"if(lt(t\\,{t1:.3f})\\,{v0:.3f}\\,{expr})" + continue + + switch_t = (t0 + t1) / 2.0 + blend_half = TRACKING_BLEND_SEC / 2.0 + blend_start = max(t0, switch_t - blend_half) + blend_end = min(t1, switch_t + blend_half) + + if blend_end <= blend_start: + expr = f"if(lt(t\\,{switch_t:.3f})\\,{v0:.3f}\\,{expr})" + continue + + blend_expr = ( + f"{v0:.3f}+({v1 - v0:.3f})*(t-{blend_start:.3f})/({blend_end - blend_start:.3f})" + ) + expr = ( + f"if(lt(t\\,{blend_start:.3f})\\,{v0:.3f}\\," + f"if(lt(t\\,{blend_end:.3f})\\,{blend_expr}\\,{expr}))" + ) + + if clamp_min is not None: + expr = f"max({clamp_min:.3f}\\,{expr})" + if clamp_max is not None: + expr = f"min({clamp_max:.3f}\\,{expr})" + if round_even: + expr = f"floor(({expr})/2)*2" + return expr + + +def _tracked_crop_x_expr( + *, + src_w: int, + crop_w: int, + tracking: list[TimedCenterPoint], +) -> str: + """Return an ffmpeg expression for a time-varying crop x position. + + We mostly hold each framing until the midpoint between adjacent samples, + then blend over a short window. That keeps edited talk footage from + drifting for seconds after a cut while still avoiding a one-frame jump + in the crop position. 
+ """ + if not tracking: + raise ValueError("tracking must not be empty") + + center_points = [ + (_clamp01(point.x_norm) * src_w, float(point.t_sec)) + for point in tracking + ] + center_expr = _tracked_value_expr( + center_points, + clamp_min=0.0, + clamp_max=float(src_w), + ) + max_x = max(0, src_w - crop_w) + return f"floor(max(0\\,min({max_x}\\,({center_expr})-{crop_w}/2))/2)*2" + + +def _tracked_crop_exprs( + *, + src_w: int, + src_h: int, + target_aspect: float, + default_zoom: float, + center_y_norm: float, + tracking: list[TimedCenterPoint], +) -> tuple[str, str, str, str]: + if not tracking: + raise ValueError("tracking must not be empty") + + base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect) + width_points: list[tuple[float, float]] = [] + height_points: list[tuple[float, float]] = [] + center_points: list[tuple[float, float]] = [] + for point in tracking: + zoom = max(1.0, float(point.zoom if point.zoom is not None else default_zoom)) + width_points.append((float(_even(max(2, int(round(base_cw / zoom))))), float(point.t_sec))) + height_points.append((float(_even(max(2, int(round(base_ch / zoom))))), float(point.t_sec))) + center_points.append((_clamp01(point.x_norm) * src_w, float(point.t_sec))) + + w_expr = _tracked_value_expr( + width_points, + clamp_min=2.0, + clamp_max=float(base_cw), + round_even=True, + ) + h_expr = _tracked_value_expr( + height_points, + clamp_min=2.0, + clamp_max=float(base_ch), + round_even=True, + ) + center_expr = _tracked_value_expr( + center_points, + clamp_min=0.0, + clamp_max=float(src_w), + ) + center_y_px = _clamp01(center_y_norm) * src_h + x_expr = f"floor(max(0\\,min({src_w}-out_w\\,({center_expr})-out_w/2))/2)*2" + y_expr = f"floor(max(0\\,min({src_h}-out_h\\,{center_y_px:.3f}-out_h/2))/2)*2" + return w_expr, h_expr, x_expr, y_expr + + +# --------------------------------------------------------------------------- +# Split helpers β€” shared by all three split layouts +# --------------------------------------------------------------------------- + + +# Minimum source-strip width for a split, as a fraction of source width. +# Prevents a chart/person bbox that hugs one edge from starving the other. +_MIN_SPLIT_STRIP_FRAC = 0.2 +_CHART_STRIP_VERTICAL_PAD_FRAC = 0.12 + + +@dataclass(frozen=True) +class _SplitStrip: + """A source-frame crop rectangle destined for one output band.""" + + cw: int + ch: int + x: int + y: int + + def filter_crop(self, input_label: str, out_w: int, band_h: int, out_label: str) -> str: + """Return ``[input]crop=...,scale=...,crop=...,setsar=1[out_label]``. + + Uses the "cover" convention: scale so the band is fully painted, then + center-crop any overflow. Bands always get filled β€” no letterbox bars. + """ + return ( + f"[{input_label}]crop={self.cw}:{self.ch}:{self.x}:{self.y}," + f"scale={out_w}:{band_h}:force_original_aspect_ratio=increase," + f"crop={out_w}:{band_h},setsar=1[{out_label}]" + ) + + +def _bbox_strip( + box: BoundingBox | None, + *, + src_w: int, + src_h: int, + x_start: int, + x_end: int, +) -> _SplitStrip: + """Build a source crop for one band. + + Horizontal range is fixed by ``[x_start, x_end)`` (from the seam math so + strips partition the source width). Vertical range comes from ``box`` + when available β€” that's what makes the chart **fill** the output band + instead of being squashed inside full-height source context. 
+ """ + x = _even(max(0, min(src_w - 2, x_start))) + cw = _even(max(2, min(src_w - x, x_end - x))) + + if box is not None: + y1 = int(round(_clamp01(box.y1) * float(src_h))) + y2 = int(round(_clamp01(box.y2) * float(src_h))) + y = _even(max(0, min(src_h - 2, y1))) + ch = _even(max(2, min(src_h - y, y2 - y))) + else: + y = 0 + ch = _even(src_h) + + return _SplitStrip(cw=cw, ch=ch, x=x, y=y) + + +def _chart_strip_with_vertical_pad( + strip: _SplitStrip, + *, + src_h: int, + pad_frac: float = _CHART_STRIP_VERTICAL_PAD_FRAC, +) -> _SplitStrip: + """Relax chart crops vertically so cover-scaling trims fewer chart edges.""" + + pad = _even(max(0, int(round(strip.ch * max(0.0, pad_frac))))) + if pad <= 0: + return strip + + top = max(0, strip.y - pad) + bottom = min(src_h, strip.y + strip.ch + pad) + ch = _even(max(2, bottom - top)) + if ch <= strip.ch: + return strip + y = _even(max(0, min(src_h - ch, top))) + return _SplitStrip(cw=strip.cw, ch=ch, x=strip.x, y=y) + + +def _compute_seam( + *, + left_box: BoundingBox | None, + right_box: BoundingBox | None, + src_w: int, + src_h: int, + default_fraction: float = 0.5, +) -> int: + """Return an even x-coordinate that partitions the source into two strips. + + When both bboxes are known, the seam is the midpoint of the gap/overlap + between ``left_box.x2`` and ``right_box.x1``. Falls back to + ``default_fraction * src_w`` (0.5 = even) otherwise. The seam is clamped + so neither strip is thinner than :data:`_MIN_SPLIT_STRIP_FRAC` of source. + """ + if left_box is not None and right_box is not None: + _, _, left_x, _ = _bbox_to_crop_pixels(left_box, src_w, src_h) + left_cw, _, _, _ = _bbox_to_crop_pixels(left_box, src_w, src_h) + _, _, right_x, _ = _bbox_to_crop_pixels(right_box, src_w, src_h) + + left_right = left_x + left_cw + seam = int(round((left_right + right_x) / 2.0)) + else: + seam = int(round(default_fraction * float(src_w))) + + seam = _even(seam) + min_strip = _even(max(2, int(round(src_w * _MIN_SPLIT_STRIP_FRAC)))) + if min_strip * 2 >= src_w: + min_strip = _even(max(2, src_w // 4)) + return max(min_strip, min(src_w - min_strip, seam)) + + +def _band_heights(out_h: int, top_ratio: float) -> tuple[int, int]: + """Return ``(top_h, bot_h)`` even band heights that sum to ``out_h``.""" + top_h = _even(int(round(out_h * top_ratio))) + top_h = max(2, min(out_h - 2, top_h)) + bot_h = out_h - top_h + return top_h, bot_h + + +def _stack_filtergraph( + *, + top_strip: _SplitStrip, + bot_strip: _SplitStrip, + out_w: int, + top_h: int, + bot_h: int, +) -> str: + """Compose the split filter graph: ``[0:v]split=2 β†’ two crops β†’ vstack β†’ [vout]``.""" + top_fg = top_strip.filter_crop("src1", out_w, top_h, "top") + bot_fg = bot_strip.filter_crop("src2", out_w, bot_h, "bot") + return ( + f"[0:v]split=2[src1][src2];" + f"{top_fg};" + f"{bot_fg};" + f"[top][bot]vstack=inputs=2[vout]" + ) + + +# --------------------------------------------------------------------------- +# Layout: single-subject (centered) β€” 1 person +# --------------------------------------------------------------------------- + + +def plan_zoom_call_center( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """1 person, tight zoom-call framing. 
``zoom`` clamped to ``>= 1.25``.""" + zoom = max(instruction.zoom, 1.25) + cw, ch, x, y = _center_crop_to_9x16(src_w, src_h, zoom, instruction.person_x_norm) + if instruction.person_tracking: + if any(point.zoom is not None for point in instruction.person_tracking): + w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs( + src_w=src_w, + src_h=src_h, + target_aspect=9 / 16, + default_zoom=zoom, + center_y_norm=0.5, + tracking=instruction.person_tracking, + ) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={cw}:{ch}:{x_expr}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + fg = ( + f"[0:v]crop={cw}:{ch}:{x}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + return FilterPlan(filtergraph=fg) + + +def plan_sit_center( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """1 person, interview/seated framing. Vertical center biased to ``0.48`` + so faces sit slightly above the 9:16 middle instead of centered on a + subject's chest. + """ + zoom = max(instruction.zoom, 1.0) + cw, ch, x, y = _crop_box( + src_w, src_h, 9 / 16, zoom, instruction.person_x_norm, 0.48 + ) + if instruction.person_tracking: + if any(point.zoom is not None for point in instruction.person_tracking): + w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs( + src_w=src_w, + src_h=src_h, + target_aspect=9 / 16, + default_zoom=zoom, + center_y_norm=0.48, + tracking=instruction.person_tracking, + ) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking) + fg = ( + f"[0:v]setpts=PTS-STARTPTS[vsrc];" + f"[vsrc]crop={cw}:{ch}:{x_expr}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + else: + fg = ( + f"[0:v]crop={cw}:{ch}:{x}:{y}," + f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]" + ) + return FilterPlan(filtergraph=fg) + + +# --------------------------------------------------------------------------- +# Split layouts β€” 2 items stacked vertically +# --------------------------------------------------------------------------- + + +def plan_split_chart_person( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """1 chart + 1 person. + + **Horizontal partition.** Chart occupies the left source strip, person the + right strip. When both bboxes are set (Gemini vision), the seam sits at + the midpoint between ``chart.x2`` and ``person.x1`` so the strips are + complementary (no overlap, no gap). Otherwise the seam defaults to a + 2/3 | 1/3 split (chart left, person right), matching the Ark-style + explainer-slide geometry this codebase was originally written against. + + **Vertical crop.** Each strip's vertical extent comes from the + corresponding bbox when provided β€” crucial so the chart **fills** its + output band instead of being lost inside full-height source context + (plant, background, lower-third graphics, etc.). Falls back to full + source height when bboxes are unavailable. 
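+
+    Worked seam example (illustrative bboxes, 1920-wide source): a chart
+    spanning x 0.05..0.60 and a person spanning x 0.70..0.95 put the seam at
+    pixel 1248, the midpoint of the 1152..1344 gap, so the chart strip covers
+    x 0..1248 and the person strip covers x 1248..1920.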
+ + **Output bands.** Controlled by :attr:`LayoutInstruction.top_band_ratio` + (default 0.5 = even 50/50 β€” the user-requested symmetric look). Focus + stack order picks chart-on-top (default) vs person-on-top. + """ + + top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio) + + chart_box = instruction.split_chart_region + person_box = instruction.split_person_region + + if chart_box is not None and person_box is not None: + seam = _compute_seam( + left_box=chart_box, right_box=person_box, src_w=src_w, src_h=src_h + ) + chart_start = 0 + else: + # Historical default: chart = left 2/3, person = right 1/3 (the + # Ark-style explainer-slide geometry this codebase was originally + # written against). ``chart_x_norm`` trims the chart strip from its + # left edge when we have no vision bbox to do it precisely. + seam = _even(max(2, min(src_w - 2, int(round((2.0 / 3.0) * float(src_w)))))) + trim = int(round(_clamp01(instruction.chart_x_norm) * float(seam))) + chart_start = _even(max(0, min(seam - 2, trim))) + + chart_strip = _bbox_strip( + chart_box, src_w=src_w, src_h=src_h, x_start=chart_start, x_end=seam + ) + if chart_box is not None: + chart_strip = _chart_strip_with_vertical_pad(chart_strip, src_h=src_h) + person_strip = _bbox_strip( + person_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w + ) + return _emit_split( + chart_strip=chart_strip, + person_strip=person_strip, + order=instruction.focus_stack_order, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + + +def _emit_split( + *, + chart_strip: _SplitStrip, + person_strip: _SplitStrip, + order: FocusStackOrder, + out_w: int, + top_h: int, + bot_h: int, +) -> FilterPlan: + if order == FocusStackOrder.CHART_THEN_PERSON: + fg = _stack_filtergraph( + top_strip=chart_strip, + bot_strip=person_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + else: + fg = _stack_filtergraph( + top_strip=person_strip, + bot_strip=chart_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + return FilterPlan(filtergraph=fg) + + +def plan_split_two_persons( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """2 persons (interview two-up) stacked vertically. + + First person = ``split_person_region``, second person = ``split_second_person_region``. + Seam sits at the midpoint between the two bboxes when both are known; + otherwise defaults to a centered 50/50 split. + """ + top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio) + + left_box = instruction.split_person_region + right_box = instruction.split_second_person_region + + seam = _compute_seam( + left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h + ) + + left_strip = _bbox_strip( + left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam + ) + right_strip = _bbox_strip( + right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w + ) + fg = _stack_filtergraph( + top_strip=left_strip, + bot_strip=right_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + return FilterPlan(filtergraph=fg) + + +def plan_split_two_charts( + instruction: LayoutInstruction, + *, + out_w: int, + out_h: int, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """2 charts stacked vertically. + + First chart = ``split_chart_region``, second chart = ``split_second_chart_region``. + Uses the same seam/bbox-y-crop recipe as the other splits, so each chart + fills its output band instead of being surrounded by source context. 
+ """ + top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio) + + left_box = instruction.split_chart_region + right_box = instruction.split_second_chart_region + + seam = _compute_seam( + left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h + ) + + left_strip = _bbox_strip( + left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam + ) + if left_box is not None: + left_strip = _chart_strip_with_vertical_pad(left_strip, src_h=src_h) + right_strip = _bbox_strip( + right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w + ) + if right_box is not None: + right_strip = _chart_strip_with_vertical_pad(right_strip, src_h=src_h) + fg = _stack_filtergraph( + top_strip=left_strip, + bot_strip=right_strip, + out_w=out_w, + top_h=top_h, + bot_h=bot_h, + ) + return FilterPlan(filtergraph=fg) + + +_DISPATCH = { + LayoutKind.ZOOM_CALL_CENTER: plan_zoom_call_center, + LayoutKind.SIT_CENTER: plan_sit_center, + LayoutKind.SPLIT_CHART_PERSON: plan_split_chart_person, + LayoutKind.SPLIT_TWO_PERSONS: plan_split_two_persons, + LayoutKind.SPLIT_TWO_CHARTS: plan_split_two_charts, +} + + +def plan_layout( + instruction: LayoutInstruction, + *, + out_w: int = 1080, + out_h: int = 1920, + src_w: int = DEFAULT_SRC_W, + src_h: int = DEFAULT_SRC_H, +) -> FilterPlan: + """Dispatch to one of the five thrusters. + + Exhaustive over :class:`LayoutKind` β€” adding a new layout requires adding + a planner above **and** an entry in :data:`_DISPATCH`. + """ + + fn = _DISPATCH.get(instruction.layout) + if fn is None: + raise ValueError(f"Unknown layout: {instruction.layout!r}") + return fn(instruction, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h) diff --git a/humeo-core/src/humeo_core/primitives/select_clips.py b/humeo-core/src/humeo_core/primitives/select_clips.py new file mode 100644 index 0000000000000000000000000000000000000000..2fd915d4d7478df3fe4c719ee80a7649da271f8a --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/select_clips.py @@ -0,0 +1,150 @@ +"""Clip selection: pick the strongest 30-60s segments from a long source. + +Two backends, same contract: + +* ``select_clips_heuristic`` β€” greedy word-density scoring. Uses the + transcript alone; zero model calls. Good baseline when transcript exists. +* ``select_clips_with_llm`` β€” pluggable LLM hook. Caller provides a + ``(prompt_text) -> str`` function that must return strict JSON matching + the ``ClipPlan`` schema. We re-validate before returning. + +Both return a ``ClipPlan``. +""" + +from __future__ import annotations + +import json +from typing import Callable + +from ..schemas import Clip, ClipPlan, TranscriptWord + + +LLMTextFn = Callable[[str], str] + + +CLIP_SELECTOR_PROMPT_TEMPLATE = """You are a viral-clip selector for a podcast editor. +Return ONLY JSON matching this shape: + +{{ + "source_path": "{source_path}", + "clips": [ + {{ + "clip_id": "001", + "topic": "", + "start_time_sec": , + "end_time_sec": , + "viral_hook": "", + "virality_score": <0..1>, + "transcript": "", + "suggested_overlay_title": "<<=6 words>" + }} + ] +}} + +Pick {target_count} clips, each {min_sec}-{max_sec} seconds long, NO overlaps, sorted by virality_score desc. 
+ +Transcript (word, start, end): +{transcript} +""" + + +def _words_in_window( + words: list[TranscriptWord], start: float, end: float +) -> list[TranscriptWord]: + return [w for w in words if w.start_time >= start and w.end_time <= end] + + +def select_clips_heuristic( + source_path: str, + words: list[TranscriptWord], + duration_sec: float, + *, + target_count: int = 5, + min_sec: float = 30.0, + max_sec: float = 60.0, + step_sec: float = 5.0, +) -> ClipPlan: + """Greedy: slide a window, score by words/sec, take top non-overlapping picks.""" + + if duration_sec <= min_sec or not words: + # No sensible windowing possible; return one clip of the whole thing. + end = min(duration_sec, max_sec) if duration_sec > 0 else max_sec + return ClipPlan( + source_path=source_path, + clips=[ + Clip( + clip_id="001", + topic="Full source", + start_time_sec=0.0, + end_time_sec=max(end, 1.0), + viral_hook="", + virality_score=0.5, + transcript=" ".join(w.word for w in words), + suggested_overlay_title="Highlight", + ) + ], + ) + + candidates: list[tuple[float, float, float, str]] = [] + window = (min_sec + max_sec) / 2.0 + t = 0.0 + while t + window <= duration_sec: + ws = _words_in_window(words, t, t + window) + if ws: + density = len(ws) / window + text = " ".join(w.word for w in ws) + candidates.append((density, t, t + window, text)) + t += step_sec + + candidates.sort(key=lambda c: c[0], reverse=True) + picked: list[tuple[float, float, float, str]] = [] + for c in candidates: + if len(picked) >= target_count: + break + if all(c[2] <= p[1] or c[1] >= p[2] for p in picked): + picked.append(c) + picked.sort(key=lambda c: c[1]) + + clips: list[Clip] = [] + for i, (density, s, e, text) in enumerate(picked, start=1): + norm = min(1.0, density / 3.0) # ~3 words/sec is dense talking + clips.append( + Clip( + clip_id=f"{i:03d}", + topic=text.split(".")[0][:60] or f"Clip {i}", + start_time_sec=round(s, 2), + end_time_sec=round(e, 2), + viral_hook=text[:120], + virality_score=round(norm, 3), + transcript=text, + suggested_overlay_title=(text.split(".")[0][:40] or f"Clip {i}"), + ) + ) + return ClipPlan(source_path=source_path, clips=clips) + + +def select_clips_with_llm( + source_path: str, + words: list[TranscriptWord], + *, + target_count: int, + min_sec: float, + max_sec: float, + text_fn: LLMTextFn, +) -> ClipPlan: + transcript_lines = "\n".join( + f"{w.word}\t{w.start_time:.2f}\t{w.end_time:.2f}" for w in words + ) + prompt = CLIP_SELECTOR_PROMPT_TEMPLATE.format( + source_path=source_path, + target_count=target_count, + min_sec=min_sec, + max_sec=max_sec, + transcript=transcript_lines, + ) + raw = text_fn(prompt) + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + raise ValueError(f"LLM did not return JSON: {e}; raw={raw[:200]!r}") from e + return ClipPlan.model_validate(data) diff --git a/humeo-core/src/humeo_core/primitives/vision.py b/humeo-core/src/humeo_core/primitives/vision.py new file mode 100644 index 0000000000000000000000000000000000000000..938fd3951876a8f7fd1f39964bc91e14bdc61f70 --- /dev/null +++ b/humeo-core/src/humeo_core/primitives/vision.py @@ -0,0 +1,210 @@ +"""Vision-LLM + OCR primitive β€” the alt path to per-scene framing decisions. + +Design (Bryan's "big screen change -> v3 images -> LLM+OCR -> bbox" idea): + +1. Scene detection already produces one keyframe per scene (deterministic, + local, cheap). That is ``primitives/ingest.py::extract_keyframes``. +2. For each keyframe, call a pluggable vision LLM with an OCR hint. 
The + model returns normalized bboxes for the on-screen roles it cares about + (``person``, ``chart``) plus any OCR text it reads. +3. Fold those bboxes into ``LayoutInstruction`` values so the existing + layout planner (``primitives/layouts.py``) does the actual ffmpeg math. + +Why this shape: + +* **Pluggable**. Caller supplies ``LLMRegionFn``. We never hard-code a + provider. The same primitive works for Gemini, GPT-4o, internal models, + tests, or mocks. +* **Schema-validated**. Raw model output is parsed into ``SceneRegions`` + (Pydantic). Malformed output degrades to ``None`` regions rather than + crashing or corrupting downstream state. +* **Separable**. ``detect_regions_with_llm`` is one function. Mapping + regions to ``LayoutInstruction`` is another. Mapping a ``LayoutKind`` + guess from regions is a third. Each is independently testable. +""" + +from __future__ import annotations + +import json +from typing import Callable + +from ..schemas import ( + BoundingBox, + LayoutInstruction, + LayoutKind, + Scene, + SceneClassification, + SceneRegions, +) + + +LLMRegionFn = Callable[[str, str], str] +"""Signature: (keyframe_path, prompt) -> raw model string (expected JSON). + +The caller is responsible for any image encoding (base64, multipart, etc.). +The primitive only passes the path + prompt and re-validates the reply. +""" + + +REGION_PROMPT = """You are a vision+OCR system for a short-video editor. +Look at the provided keyframe and return a STRICT JSON object of this shape: + +{ + "person_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null, + "chart_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null, + "ocr_text": "", + "reason": "<= 20 words of rationale" +} + +Rules: +- All bbox coordinates are normalized to the frame (0=left/top, 1=right/bottom). +- x2 > x1, y2 > y1. +- Return null for any region that is not present (e.g. a pure talking-head + scene has no chart). +- "person_bbox" is the *speaker's* body/head region if visible. +- "chart_bbox" is any chart, graph, slide, screenshare, or diagram. +- OCR text should be the readable text on screen (titles, labels, chart + axis values). Omit subtitle captions. +- NO markdown, NO prose outside JSON. JSON only. +""" + + +# --------------------------------------------------------------------------- +# Core: detect regions per scene via pluggable LLM +# --------------------------------------------------------------------------- + + +def detect_regions_with_llm( + scenes: list[Scene], vision_fn: LLMRegionFn +) -> list[SceneRegions]: + """Call ``vision_fn`` for each scene's keyframe and return parsed regions. + + Parse failures degrade to an empty ``SceneRegions`` with ``raw_reason`` + describing the error β€” never raise β€” so a single bad scene can't take + down the whole pipeline. 
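+
+    A minimal sketch of a ``vision_fn`` stub (hypothetical; a real adapter
+    must read and encode the keyframe image itself before calling its model):
+
+        def stub_vision(keyframe_path: str, prompt: str) -> str:
+            return '{"person_bbox": null, "chart_bbox": null, "ocr_text": "", "reason": "stub"}'
+
+        regions = detect_regions_with_llm(scenes, stub_vision)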
+ """ + + out: list[SceneRegions] = [] + for s in scenes: + if not s.keyframe_path: + out.append( + SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available") + ) + continue + raw = vision_fn(s.keyframe_path, REGION_PROMPT) + out.append(_parse_region_reply(s.scene_id, raw)) + return out + + +def _parse_region_reply(scene_id: str, raw: str) -> SceneRegions: + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + return SceneRegions(scene_id=scene_id, raw_reason=f"JSON parse error: {e!r}") + + def _opt_bbox(value: object) -> BoundingBox | None: + if not value: + return None + try: + return BoundingBox.model_validate(value) + except Exception: + return None + + return SceneRegions( + scene_id=scene_id, + person_bbox=_opt_bbox(data.get("person_bbox")), + chart_bbox=_opt_bbox(data.get("chart_bbox")), + ocr_text=str(data.get("ocr_text", ""))[:4000], + raw_reason=str(data.get("reason", ""))[:400], + ) + + +# --------------------------------------------------------------------------- +# Derivation: regions -> LayoutKind / LayoutInstruction +# --------------------------------------------------------------------------- + + +# Width threshold: if the chart bbox covers this much of the frame width, it +# is wide enough to treat the scene as a split_chart_person. Tuned for the +# source videos described in the spec (chart ~2/3 of width). +_CHART_WIDTH_SPLIT_THRESHOLD = 0.45 + + +def classify_from_regions(regions: SceneRegions) -> SceneClassification: + """Pick a ``LayoutKind`` for a scene using only its ``SceneRegions``. + + Priority: + 1. If ``chart_bbox`` is present and wide, it's ``SPLIT_CHART_PERSON``. + 2. Else if ``person_bbox`` is present and tight, ``ZOOM_CALL_CENTER``. + 3. Else default to ``SIT_CENTER`` with low confidence. + + "Tight" β‰ˆ the person covers more than half the frame width (zoom-call + webcam framing). "Wide" for a chart β‰ˆ 45% of frame width or more. + """ + + if regions.chart_bbox and regions.chart_bbox.width >= _CHART_WIDTH_SPLIT_THRESHOLD: + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.SPLIT_CHART_PERSON, + confidence=float(min(1.0, 0.5 + regions.chart_bbox.width / 2.0)), + reason=f"chart bbox covers {regions.chart_bbox.width:.2f} of width", + ) + if regions.person_bbox and regions.person_bbox.width >= 0.5: + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.ZOOM_CALL_CENTER, + confidence=float(min(1.0, 0.5 + regions.person_bbox.width / 2.0)), + reason=f"person bbox wide ({regions.person_bbox.width:.2f}) β€” tight framing", + ) + if regions.person_bbox: + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.7, + reason="person present, no wide chart, wider framing", + ) + return SceneClassification( + scene_id=regions.scene_id, + layout=LayoutKind.SIT_CENTER, + confidence=0.3, + reason=regions.raw_reason or "no regions detected β€” defaulting to sit_center", + ) + + +def layout_instruction_from_regions( + regions: SceneRegions, + classification: SceneClassification, + *, + clip_id: str | None = None, + zoom: float = 1.0, +) -> LayoutInstruction: + """Build a ``LayoutInstruction`` whose knobs are populated from bboxes. + + ``person_x_norm`` uses the person bbox center when available; falls back + to 0.5 (center). ``chart_x_norm`` uses the chart bbox left edge; falls + back to 0.0. 
+ """ + + person_x = regions.person_bbox.center_x if regions.person_bbox else 0.5 + chart_x = regions.chart_bbox.x1 if regions.chart_bbox else 0.0 + return LayoutInstruction( + clip_id=clip_id or classification.scene_id, + layout=classification.layout, + zoom=zoom, + person_x_norm=person_x, + chart_x_norm=chart_x, + ) + + +def classify_scenes_with_vision_llm( + scenes: list[Scene], vision_fn: LLMRegionFn +) -> list[tuple[SceneRegions, SceneClassification]]: + """One-shot helper: keyframes -> regions -> classifications. + + Returns ``(regions, classification)`` pairs per scene so the caller can + keep both artefacts on disk (regions = deep detail, classification = + what a renderer consumes). + """ + + regions = detect_regions_with_llm(scenes, vision_fn) + return [(r, classify_from_regions(r)) for r in regions] diff --git a/humeo-core/src/humeo_core/schemas.py b/humeo-core/src/humeo_core/schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..0cee4af91664103ca45f060427ecef25e6ea4c78 --- /dev/null +++ b/humeo-core/src/humeo_core/schemas.py @@ -0,0 +1,518 @@ +"""Strict JSON contracts β€” the "container" of the rocket. + +Every primitive reads and writes these. No primitive takes or returns free-form +strings. This is the non-negotiable interface described in the HIVE paper +guide (section 7): machine-checkable intermediate artifacts at every stage. +""" + +from __future__ import annotations + +from enum import Enum +from typing import Literal + +from pydantic import BaseModel, Field, field_validator, model_serializer, model_validator + + +# --------------------------------------------------------------------------- +# Extraction artifacts +# --------------------------------------------------------------------------- + + +class Scene(BaseModel): + """A single shot/scene detected in the source video.""" + + scene_id: str + start_time: float = Field(ge=0) + end_time: float = Field(gt=0) + keyframe_path: str | None = None + + @field_validator("end_time") + @classmethod + def _end_after_start(cls, v: float, info) -> float: + start = info.data.get("start_time", 0.0) + if v <= start: + raise ValueError("end_time must be strictly greater than start_time") + return v + + @property + def duration(self) -> float: + return self.end_time - self.start_time + + +class TranscriptWord(BaseModel): + """One ASR token with times in **seconds on the source video** timeline.""" + + word: str + start_time: float = Field(ge=0) + end_time: float = Field(ge=0) + + +class ClipSubtitleWords(BaseModel): + """Words for one clip with times in **seconds relative to clip start** (t=0 at cut in-point).""" + + words: list[TranscriptWord] = Field(default_factory=list) + + +class FocusStackOrder(str, Enum): + """Vertical order for split layouts: which item occupies the top vs bottom band. + + Bands are split by :attr:`LayoutInstruction.top_band_ratio` (default 0.5 = even). + For ``SPLIT_CHART_PERSON`` this picks chart-on-top vs person-on-top. + For ``SPLIT_TWO_PERSONS`` / ``SPLIT_TWO_CHARTS`` it has no visible meaning + (both bands hold the same kind of item); the enum value is retained only + so a single stacking recipe drives all three split layouts. 
+ """ + + CHART_THEN_PERSON = "chart_then_person" + PERSON_THEN_CHART = "person_then_chart" + + +class RenderTheme(str, Enum): + """Visual treatment applied by the final renderer.""" + + LEGACY = "legacy" + REFERENCE_LOWER_THIRD = "reference_lower_third" + NATIVE_HIGHLIGHT = "native_highlight" + + +class IngestResult(BaseModel): + """Everything Stage 1 (deterministic local extraction) produces.""" + + source_path: str + duration_sec: float + scenes: list[Scene] + transcript_words: list[TranscriptWord] + keyframes_dir: str | None = None + + +# --------------------------------------------------------------------------- +# Layout system β€” the 5 "thrusters" (max 2 on-screen items per short) +# --------------------------------------------------------------------------- + + +class LayoutKind(str, Enum): + """The 9:16 layouts. A short contains **at most two** on-screen items. + + An "item" is one of ``person`` (a human speaker) or ``chart`` (slide, graph, + data visual, screenshare). Five combinations are allowed: + + - ``ZOOM_CALL_CENTER``: **1 person**, tight webcam/zoom-call framing, centered. + - ``SIT_CENTER``: **1 person**, interview/seated framing, centered. + - ``SPLIT_CHART_PERSON``: **1 chart + 1 person** β€” chart + speaker share the + source frame. Output stacks them vertically + (by default ``focus_stack_order`` = chart-on-top). + - ``SPLIT_TWO_PERSONS``: **2 persons** β€” two speakers (e.g. interview two-up). + Output stacks them vertically. + - ``SPLIT_TWO_CHARTS``: **2 charts** β€” two charts/slides side-by-side in source. + Output stacks them vertically. + + The "max 2 items" constraint is the keep-it-simple rule: every rendered short + is either one item centered, or two items stacked evenly top/bottom. + """ + + ZOOM_CALL_CENTER = "zoom_call_center" + SIT_CENTER = "sit_center" + SPLIT_CHART_PERSON = "split_chart_person" + SPLIT_TWO_PERSONS = "split_two_persons" + SPLIT_TWO_CHARTS = "split_two_charts" + + +# Layouts that stack two items vertically in the 9:16 output. +SPLIT_LAYOUTS: frozenset[LayoutKind] = frozenset( + { + LayoutKind.SPLIT_CHART_PERSON, + LayoutKind.SPLIT_TWO_PERSONS, + LayoutKind.SPLIT_TWO_CHARTS, + } +) + + +class TimedCenterPoint(BaseModel): + """Speaker x-center at a clip-relative time, used for tracked centering.""" + + t_sec: float = Field(ge=0.0) + x_norm: float = Field(ge=0.0, le=1.0) + zoom: float | None = Field( + default=None, + gt=0.0, + le=4.0, + description=( + "Optional per-sample crop zoom. When unset, the layout uses the " + "clip-level ``zoom`` value for that moment." + ), + ) + + +class ClipRenderSpan(BaseModel): + """One kept source-timeline span inside a selected clip.""" + + start_time_sec: float = Field(ge=0.0) + end_time_sec: float = Field(gt=0.0) + + @field_validator("end_time_sec") + @classmethod + def _end_after_start(cls, v: float, info) -> float: + start = info.data.get("start_time_sec", 0.0) + if v <= start: + raise ValueError("render span end_time_sec must be greater than start_time_sec") + return v + + @property + def duration_sec(self) -> float: + return self.end_time_sec - self.start_time_sec + + +class LayoutInstruction(BaseModel): + """Per-clip decision telling the compiler which layout to apply and how to crop. + + Every short is described by exactly one of these, keyed by ``clip_id``. Split + layouts additionally carry up to two normalized bounding boxes (chart/person + or two-of-a-kind) so the compiler crops source strips that **partition** the + source width without overlap or gap. 
+ """ + + clip_id: str + layout: LayoutKind + # Optional per-layout knobs. Defaults are sane for a 1920x1080 source. + zoom: float = Field(default=1.0, gt=0, le=4.0) + person_x_norm: float = Field( + default=0.5, + ge=0.0, + le=1.0, + description="Normalized x-center of the human subject in source frame (0=left, 1=right).", + ) + person_tracking: list[TimedCenterPoint] = Field( + default_factory=list, + description=( + "Optional clip-relative speaker framing samples for moving 9:16 crops. " + "Each point can shift the x-center and optionally widen/tighten the crop " + "for that moment. When empty, the compiler uses the static " + "person_x_norm/zoom settings." + ), + ) + chart_x_norm: float = Field( + default=0.0, + ge=0.0, + le=1.0, + description=( + "split_chart_person only: left-edge trim of the chart strip, as a fraction of the " + "left 2/3 pane (0 = use full chart area)." + ), + ) + focus_stack_order: FocusStackOrder = Field( + default=FocusStackOrder.CHART_THEN_PERSON, + description="For split_chart_person only: chart-on-top vs person-on-top in the 9:16 stack.", + ) + split_chart_region: BoundingBox | None = Field( + default=None, + description=( + "Optional normalized rect for the chart/slide crop (Gemini vision). " + "When set with split_person_region, the split layout uses these boxes instead of fixed 2/3|1/3." + ), + ) + split_person_region: BoundingBox | None = Field( + default=None, + description="Optional normalized rect for the speaker crop (Gemini vision).", + ) + split_second_chart_region: BoundingBox | None = Field( + default=None, + description=( + "For ``SPLIT_TWO_CHARTS`` only: second chart bbox. The first chart occupies " + "the top output band, this one occupies the bottom band." + ), + ) + split_second_person_region: BoundingBox | None = Field( + default=None, + description=( + "For ``SPLIT_TWO_PERSONS`` only: second speaker bbox. The first person " + "occupies the top output band, this one occupies the bottom band." + ), + ) + top_band_ratio: float = Field( + default=0.5, + ge=0.2, + le=0.8, + description=( + "Fraction of 9:16 output height used by the top band for split layouts. " + "0.5 = EVEN 50/50 split (default β€” the user-requested symmetric look). " + "0.6 historically matched the 'chart dominant / person small' look." + ), + ) + + + @field_validator("person_tracking") + @classmethod + def _tracking_times_non_decreasing( + cls, points: list[TimedCenterPoint] + ) -> list[TimedCenterPoint]: + last_t = -1.0 + for point in points: + if point.t_sec < last_t: + raise ValueError("person_tracking times must be non-decreasing") + last_t = point.t_sec + return points + + +class SceneClassification(BaseModel): + """Result of the classifier: which layout should a given scene use.""" + + scene_id: str + layout: LayoutKind + confidence: float = Field(ge=0.0, le=1.0) + reason: str = "" + + +# --------------------------------------------------------------------------- +# Vision bounding boxes β€” the LLM+OCR path (alt to pixel heuristics) +# --------------------------------------------------------------------------- + + +class BoundingBox(BaseModel): + """Normalized [0..1] bounding box in the source frame coordinate space. + + Normalized coords keep these outputs portable across source resolutions + and stop the model hallucinating pixel values. ``x2 > x1`` and + ``y2 > y1`` are enforced. 
+ """ + + x1: float = Field(ge=0.0, le=1.0) + y1: float = Field(ge=0.0, le=1.0) + x2: float = Field(ge=0.0, le=1.0) + y2: float = Field(ge=0.0, le=1.0) + label: str = "" + confidence: float = Field(default=1.0, ge=0.0, le=1.0) + + @field_validator("x2") + @classmethod + def _x2_after_x1(cls, v: float, info) -> float: + x1 = info.data.get("x1", 0.0) + if v <= x1: + raise ValueError("x2 must be > x1") + return v + + @field_validator("y2") + @classmethod + def _y2_after_y1(cls, v: float, info) -> float: + y1 = info.data.get("y1", 0.0) + if v <= y1: + raise ValueError("y2 must be > y1") + return v + + @property + def center_x(self) -> float: + return (self.x1 + self.x2) / 2.0 + + @property + def center_y(self) -> float: + return (self.y1 + self.y2) / 2.0 + + @property + def width(self) -> float: + return self.x2 - self.x1 + + +class SceneRegions(BaseModel): + """Vision-LLM output for a single scene keyframe. + + Flow: detect a scene change locally (cheap) -> extract one keyframe per + scene -> send that keyframe to a vision LLM with an OCR hint -> get + normalized bounding boxes for the on-screen roles (``person``, + ``chart``). Those boxes drive ``person_x_norm`` / ``chart_x_norm`` on a + ``LayoutInstruction`` without any pixel code running in Python. + """ + + scene_id: str + person_bbox: BoundingBox | None = None + chart_bbox: BoundingBox | None = None + ocr_text: str = "" + raw_reason: str = "" + + +# --------------------------------------------------------------------------- +# Clip planning +# --------------------------------------------------------------------------- + + +class Clip(BaseModel): + clip_id: str + topic: str + start_time_sec: float = Field(ge=0) + end_time_sec: float = Field(gt=0) + viral_hook: str = "" + virality_score: float = Field(default=0.0, ge=0.0, le=1.0) + transcript: str = "" + suggested_overlay_title: str = "" + layout: LayoutKind | None = None + score_breakdown: dict[str, float] | None = None + origin: Literal["text", "visual", "both"] = "text" + visual_notes: str | None = None + reasoning: str | None = None + + # Optional LLM metadata (source timeline is start_time_sec / end_time_sec). + hook_start_sec: float | None = Field( + default=None, + description="Seconds from clip in-point where the viral hook begins (0 = clip start).", + ) + hook_end_sec: float | None = Field( + default=None, + description="Seconds from clip in-point where the hook ends (exclusive upper bound).", + ) + trim_start_sec: float = Field( + default=0.0, + ge=0, + description="Seconds to remove from the start of this segment when exporting.", + ) + trim_end_sec: float = Field( + default=0.0, + ge=0, + description="Seconds to remove from the end of this segment when exporting.", + ) + render_spans: list[ClipRenderSpan] = Field( + default_factory=list, + description=( + "Optional ordered source-timeline spans to keep when exporting. " + "When present, these spans override contiguous trim_start/trim_end export." 
+ ), + ) + shorts_title: str = "" + description: str = "" + hashtags: list[str] = Field(default_factory=list) + layout_hint: LayoutKind | None = None + needs_review: bool = False + review_reason: str = "" + + @field_validator("score_breakdown") + @classmethod + def _score_breakdown_in_range( + cls, v: dict[str, float] | None + ) -> dict[str, float] | None: + if v is None: + return None + cleaned: dict[str, float] = {} + for axis, score in v.items(): + if score < 0.0: + raise ValueError(f"score_breakdown[{axis!r}] must be non-negative") + cleaned[axis] = min(score, 1.0) + return cleaned + + @model_validator(mode="after") + def _timing_consistency(self) -> "Clip": + if self.end_time_sec <= self.start_time_sec: + raise ValueError("end_time_sec must be greater than start_time_sec") + dur = self.end_time_sec - self.start_time_sec + hs, he = self.hook_start_sec, self.hook_end_sec + if (hs is None) ^ (he is None): + raise ValueError("hook_start_sec and hook_end_sec must both be set or both omitted") + if hs is not None and he is not None: + if not (0 <= hs < he <= dur): + raise ValueError( + "hook window must satisfy 0 <= hook_start_sec < hook_end_sec <= clip duration" + ) + if self.trim_start_sec + self.trim_end_sec > dur: + raise ValueError("trim_start_sec + trim_end_sec must not exceed clip duration") + last_end = None + for span in self.render_spans: + if span.start_time_sec < self.start_time_sec - 1e-6: + raise ValueError("render_spans must stay within the clip start_time_sec") + if span.end_time_sec > self.end_time_sec + 1e-6: + raise ValueError("render_spans must stay within the clip end_time_sec") + if last_end is not None and span.start_time_sec < last_end - 1e-6: + raise ValueError("render_spans must be ordered and non-overlapping") + last_end = span.end_time_sec + return self + + @model_serializer(mode="wrap") + def _serialize_without_default_extensions(self, handler): + data = handler(self) + if data.get("score_breakdown") is None: + data.pop("score_breakdown", None) + if data.get("origin") == "text": + data.pop("origin", None) + if data.get("visual_notes") is None: + data.pop("visual_notes", None) + if data.get("reasoning") is None: + data.pop("reasoning", None) + return data + + @property + def duration_sec(self) -> float: + return self.end_time_sec - self.start_time_sec + + +class ClipPlan(BaseModel): + """Output of the clip-selection stage β€” a list of clips + their layouts.""" + + source_path: str + clips: list[Clip] + + +class ApprovalResult(BaseModel): + action: Literal["proceed", "refine", "quit", "accept_all"] + selected_ids: list[str] | None = None + steering_note: str | None = None + + +class RatingFeedback(BaseModel): + rating: Literal[1, 2, 3] + issues: list[ + Literal[ + "wrong_moments", + "bad_cuts", + "boring", + "confusing", + "wrong_layout", + "length_off", + "other", + ] + ] = Field(default_factory=list) + free_text: str | None = None + + +class SessionState(BaseModel): + source_key: str = "" + iteration: int = 0 + steering_notes: list[str] = Field(default_factory=list) + last_rating: RatingFeedback | None = None + last_selected_ids: list[str] | None = None + + +# --------------------------------------------------------------------------- +# Render +# --------------------------------------------------------------------------- + + +class RenderRequest(BaseModel): + source_path: str + clip: Clip + layout: LayoutInstruction + output_path: str + width: int = 1080 + height: int = 1920 + subtitle_path: str | None = None + subtitle_font_size: int = Field( + default=48, + 
ge=10, + le=120, + description=( + "Caption font size in **output pixels** (libass is pinned to " + "``original_size=width x height`` by the compiler, so this is a " + "true pixel value, not the old PlayResY=288 unit)." + ), + ) + subtitle_margin_v: int = Field( + default=160, + ge=0, + le=800, + description="Vertical caption margin in output pixels (bottom-anchored).", + ) + title_text: str = "" + render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT + mode: Literal["normal", "dry_run"] = "normal" + + +class RenderResult(BaseModel): + clip_id: str + output_path: str + ffmpeg_cmd: list[str] + success: bool + error: str = "" diff --git a/humeo-core/src/humeo_core/server.py b/humeo-core/src/humeo_core/server.py new file mode 100644 index 0000000000000000000000000000000000000000..610b66778ad1a61f307fa67927fcb7a4bf315b7e --- /dev/null +++ b/humeo-core/src/humeo_core/server.py @@ -0,0 +1,332 @@ +"""FastMCP server β€” the control panel for the reusable rocket. + +Every primitive is exposed as a single MCP ``tool``. Each tool takes and +returns strict Pydantic-validated JSON, so an MCP client (Cursor, Claude +Desktop, etc.) can compose a full long-to-short pipeline without guessing +any interface. + +Tools: + + humeo.ingest β€” Stage 1 extraction (scenes + keyframes [+ transcript]) + humeo.classify_scenes β€” Assign one of 5 layouts to each scene (pixel heuristic) + humeo.classify_scenes_with_vision β€” Assign layouts using bboxes from a vision LLM + OCR + humeo.detect_scene_regions β€” Raw LLM bbox output per scene keyframe (OCR-assisted) + humeo.select_clips β€” Pick top clips from a transcript (heuristic) + humeo.plan_layout β€” Return the ffmpeg filtergraph for a given layout + humeo.build_render_cmd β€” Build the full ffmpeg command (dry-run safe) + humeo.render_clip β€” Build + actually run ffmpeg to produce a 9:16 clip + humeo.list_layouts β€” List the 5 available layouts (discovery) + +Resources: + + humeo://layouts β€” JSON listing of the 5 layouts + description +""" + +from __future__ import annotations + +import json +from typing import Any + +from mcp.server.fastmcp import FastMCP + +from .primitives import classify as classify_mod +from .primitives import compile as compile_mod +from .primitives import ingest as ingest_mod +from .primitives import layouts as layouts_mod +from .primitives import select_clips as select_mod +from .primitives import vision as vision_mod +from .schemas import ( + IngestResult, + LayoutInstruction, + LayoutKind, + RenderRequest, + RenderResult, + Scene, + SceneRegions, + TranscriptWord, +) + + +mcp = FastMCP( + "humeo-core", + instructions=( + "Humeo MCP: reusable primitives for turning long videos into 9:16 shorts. " + "Compose tools in this order: ingest -> classify_scenes -> select_clips -> " + "plan_layout/build_render_cmd -> render_clip. All IO is strict JSON." + ), +) + + +# --------------------------------------------------------------------------- +# Discovery +# --------------------------------------------------------------------------- + + +@mcp.tool() +def list_layouts() -> dict[str, Any]: + """Return the 5 fixed 9:16 layouts this server supports. + + Every short shows **at most two** on-screen items (person/chart), which + gives exactly five recipes. Use this to discover the set of + :class:`LayoutKind` values before classifying scenes or requesting + renders. 
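+
+    Each entry in the returned ``layouts`` list has the shape (abridged):
+
+        {"kind": "zoom_call_center", "items": ["person"], "description": "..."}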
+ """ + + return { + "layouts": [ + { + "kind": LayoutKind.ZOOM_CALL_CENTER.value, + "items": ["person"], + "description": "1 person, tight zoom-call / webcam framing, centered.", + }, + { + "kind": LayoutKind.SIT_CENTER.value, + "items": ["person"], + "description": "1 person, interview / seated framing, centered.", + }, + { + "kind": LayoutKind.SPLIT_CHART_PERSON.value, + "items": ["chart", "person"], + "description": ( + "1 chart + 1 person. Source is partitioned left/right by the chart and " + "person bboxes (falling back to a 2/3 | 1/3 split); each strip is scaled " + "to fill its output band. Bands default to an even 50/50 vertical split; " + "configurable via ``top_band_ratio`` and swappable via ``focus_stack_order``." + ), + }, + { + "kind": LayoutKind.SPLIT_TWO_PERSONS.value, + "items": ["person", "person"], + "description": ( + "2 people (interview two-up / panel). Left speaker in the top band, right " + "speaker in the bottom band; seam sits between the two person bboxes." + ), + }, + { + "kind": LayoutKind.SPLIT_TWO_CHARTS.value, + "items": ["chart", "chart"], + "description": ( + "2 charts / slides side-by-side in source. Left chart on top, right chart " + "on bottom; each is scaled to fill its band." + ), + }, + ] + } + + +@mcp.resource("humeo://layouts") +def layouts_resource() -> str: + return json.dumps(list_layouts(), indent=2) + + +# --------------------------------------------------------------------------- +# Landing gear: ingest +# --------------------------------------------------------------------------- + + +@mcp.tool() +def ingest( + source_path: str, + work_dir: str, + with_transcript: bool = False, + whisper_model: str = "base", +) -> dict[str, Any]: + """Run deterministic local extraction (scenes + keyframes, optional transcript). + + Args: + source_path: absolute path to a local video file. + work_dir: directory where keyframes/ and temp artifacts will be written. + with_transcript: if True, run faster-whisper word-level transcription. + whisper_model: whisper model name (e.g. "tiny", "base", "small"). + """ + + result: IngestResult = ingest_mod.ingest( + source_path, + work_dir, + with_transcript=with_transcript, + whisper_model=whisper_model, + ) + return result.model_dump() + + +# --------------------------------------------------------------------------- +# Pilot: classify scenes +# --------------------------------------------------------------------------- + + +@mcp.tool() +def classify_scenes(scenes: list[dict[str, Any]]) -> dict[str, Any]: + """Classify each scene into exactly one of the 5 supported layouts. + + Uses an offline pixel heuristic on each scene's keyframe. Agents that + want a smarter classifier can post-process or overwrite the result, + or call ``classify_scenes_with_vision`` with bboxes from a vision LLM. + """ + + parsed = [Scene.model_validate(s) for s in scenes] + results = classify_mod.classify_scenes_heuristic(parsed) + return {"classifications": [r.model_dump() for r in results]} + + +# --------------------------------------------------------------------------- +# Pilot (alt path): vision-LLM + OCR bbox classifier +# --------------------------------------------------------------------------- + + +@mcp.tool() +def detect_scene_regions(scenes: list[dict[str, Any]]) -> dict[str, Any]: + """Return the prompt + per-scene stubs used for LLM+OCR bbox detection. + + This tool is the *adapter* half of the vision primitive. The MCP server + itself never calls an LLM β€” the agent does. So this endpoint returns: + + 1. 
the exact ``REGION_PROMPT`` to send along with each keyframe, and + 2. a list of ``{scene_id, keyframe_path, prompt}`` jobs. + + The agent runs its own vision model for each job, then feeds the + resulting JSON back via ``classify_scenes_with_vision``. + """ + + parsed = [Scene.model_validate(s) for s in scenes] + return { + "prompt": vision_mod.REGION_PROMPT, + "jobs": [ + { + "scene_id": s.scene_id, + "keyframe_path": s.keyframe_path, + "prompt": vision_mod.REGION_PROMPT, + } + for s in parsed + ], + } + + +@mcp.tool() +def classify_scenes_with_vision(regions: list[dict[str, Any]]) -> dict[str, Any]: + """Classify scenes from already-gathered ``SceneRegions`` bbox records. + + Input is a list of ``SceneRegions`` JSON dicts (output of the agent's + vision-LLM pass). Output is a ``{classifications, layout_instructions}`` + pair β€” the layout kind per scene plus a ready-to-render + ``LayoutInstruction`` with ``person_x_norm`` / ``chart_x_norm`` already + populated from the bboxes. + """ + + parsed_regions = [SceneRegions.model_validate(r) for r in regions] + classifications = [vision_mod.classify_from_regions(r) for r in parsed_regions] + instructions = [ + vision_mod.layout_instruction_from_regions(r, c) + for r, c in zip(parsed_regions, classifications) + ] + return { + "classifications": [c.model_dump() for c in classifications], + "layout_instructions": [i.model_dump() for i in instructions], + } + + +# --------------------------------------------------------------------------- +# Pilot: select clips +# --------------------------------------------------------------------------- + + +@mcp.tool() +def select_clips( + source_path: str, + transcript_words: list[dict[str, Any]], + duration_sec: float, + target_count: int = 5, + min_sec: float = 30.0, + max_sec: float = 60.0, +) -> dict[str, Any]: + """Heuristically select top clips from a word-level transcript. + + Scoring is word-density per window. Returns a ``ClipPlan`` with up to + ``target_count`` non-overlapping clips. + """ + + words = [TranscriptWord.model_validate(w) for w in transcript_words] + plan = select_mod.select_clips_heuristic( + source_path, + words, + duration_sec, + target_count=target_count, + min_sec=min_sec, + max_sec=max_sec, + ) + return plan.model_dump() + + +# --------------------------------------------------------------------------- +# Thrusters: plan + render +# --------------------------------------------------------------------------- + + +@mcp.tool() +def plan_layout( + layout: str, + out_w: int = 1080, + out_h: int = 1920, + src_w: int = 1920, + src_h: int = 1080, + zoom: float = 1.0, + person_x_norm: float = 0.5, + chart_x_norm: float = 0.0, + clip_id: str = "preview", +) -> dict[str, Any]: + """Return the ffmpeg filter_complex fragment for one layout. + + This is the pure, deterministic function underpinning the 5 thrusters. + No rendering is performed. Useful for agents that want to preview the + filtergraph or compose it with their own ffmpeg invocation. + """ + + instr = LayoutInstruction( + clip_id=clip_id, + layout=LayoutKind(layout), + zoom=zoom, + person_x_norm=person_x_norm, + chart_x_norm=chart_x_norm, + ) + fp = layouts_mod.plan_layout(instr, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h) + return {"filtergraph": fp.filtergraph, "out_label": fp.out_label} + + +@mcp.tool() +def build_render_cmd(request: dict[str, Any]) -> dict[str, Any]: + """Build (but do NOT run) the ffmpeg command for a render request. + + ``request`` must conform to the ``RenderRequest`` schema. 
This is a + dry-run helper so an agent can review the command before executing it. + """ + + req = RenderRequest.model_validate({**request, "mode": "dry_run"}) + result = compile_mod.render_clip(req) + return result.model_dump() + + +@mcp.tool() +def render_clip(request: dict[str, Any]) -> dict[str, Any]: + """Render a single 9:16 clip with the specified layout. + + ``request`` must conform to ``RenderRequest``. If ``request.mode`` is + ``"dry_run"`` the ffmpeg command is returned without execution. + """ + + req = RenderRequest.model_validate(request) + result: RenderResult = compile_mod.render_clip(req) + return result.model_dump() + + +# --------------------------------------------------------------------------- +# Entrypoint +# --------------------------------------------------------------------------- + + +def main() -> None: + """stdio entrypoint for ``humeo-core`` console-script.""" + + mcp.run() + + +if __name__ == "__main__": + main() diff --git a/humeo-core/tests/__init__.py b/humeo-core/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/humeo-core/tests/test_classify.py b/humeo-core/tests/test_classify.py new file mode 100644 index 0000000000000000000000000000000000000000..3a6eebac4316b28571310ee7af2f218ff8690943 --- /dev/null +++ b/humeo-core/tests/test_classify.py @@ -0,0 +1,39 @@ +import json + +from humeo_core.primitives.classify import ( + classify_scenes_heuristic, + classify_scenes_with_llm, +) +from humeo_core.schemas import LayoutKind, Scene + + +def test_heuristic_no_keyframe_defaults_sit_center(): + scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path=None)] + result = classify_scenes_heuristic(scenes) + assert len(result) == 1 + assert result[0].scene_id == "s0" + assert result[0].layout == LayoutKind.SIT_CENTER + + +def test_llm_classifier_uses_callback_and_validates(): + scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")] + + def fake_vision(image_path: str, prompt: str) -> str: + return json.dumps( + {"layout": "split_chart_person", "confidence": 0.88, "reason": "chart left"} + ) + + result = classify_scenes_with_llm(scenes, fake_vision) + assert result[0].layout == LayoutKind.SPLIT_CHART_PERSON + assert result[0].confidence == 0.88 + + +def test_llm_classifier_parse_error_is_safe(): + scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")] + + def bad_vision(image_path: str, prompt: str) -> str: + return "not json" + + result = classify_scenes_with_llm(scenes, bad_vision) + assert result[0].layout == LayoutKind.SIT_CENTER + assert "parse error" in result[0].reason.lower() diff --git a/humeo-core/tests/test_compile.py b/humeo-core/tests/test_compile.py new file mode 100644 index 0000000000000000000000000000000000000000..03d39184966a484b5bccfc80c92262184cba8b6f --- /dev/null +++ b/humeo-core/tests/test_compile.py @@ -0,0 +1,329 @@ +from pathlib import Path + +from humeo_core.primitives import compile as compile_mod +from humeo_core.primitives.compile import ( + _ensure_windows_fontconfig, + build_ffmpeg_cmd, + plan_title_drawtext, +) +from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RenderRequest, RenderTheme + + +def _req(**overrides): + c = Clip(clip_id="1", topic="t", start_time_sec=10.0, end_time_sec=40.0) + li = LayoutInstruction(clip_id="1", layout=LayoutKind.SIT_CENTER) + data = dict( + source_path="/tmp/src.mp4", + clip=c, + layout=li, + 
output_path="/tmp/out.mp4", + render_theme=RenderTheme.LEGACY, + mode="dry_run", + ) + data.update(overrides) + return RenderRequest(**data) + + +def test_ffmpeg_cmd_has_ss_duration_filtergraph_output(): + cmd = build_ffmpeg_cmd(_req()) + assert "-ss" in cmd + assert "-t" in cmd + assert "-filter_complex" in cmd + # duration = 30.0 + t_idx = cmd.index("-t") + assert float(cmd[t_idx + 1]) == 30.0 + ss_idx = cmd.index("-ss") + assert float(cmd[ss_idx + 1]) == 10.0 + assert cmd[-1] == "/tmp/out.mp4" + + +def test_title_text_injects_drawtext(): + cmd = build_ffmpeg_cmd(_req(title_text="Hello: world's")) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext" in fg + # colon should be escaped + assert "Hello\\:" in fg + assert "worlds" in fg + assert "world's" not in fg + assert "expansion=none" in fg + + +def test_map_vout_and_primary_audio(): + cmd = build_ffmpeg_cmd(_req()) + assert "[vout]" in cmd + assert "0:a:0" in cmd + + +def test_subtitle_style_uses_requested_font_and_margin(): + cmd = build_ffmpeg_cmd( + _req(subtitle_path="/tmp/clip.srt", subtitle_font_size=18, subtitle_margin_v=64) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "subtitles='" in fg + assert "FontSize=18" in fg + assert "MarginV=64" in fg + # Smart word wrap so long captions break into multiple readable lines. + assert "WrapStyle=0" in fg + + +def test_subtitle_original_size_pins_libass_to_output_resolution(): + """Without original_size=W x H, libass uses PlayResY=288 and blows up fonts/margins. + + This is the root cause of the "subtitles floating in the middle of the + frame / blocked" bug the user reported. + """ + cmd = build_ffmpeg_cmd(_req(subtitle_path="/tmp/clip.srt")) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "original_size=1080x1920" in fg + + +def test_subtitles_applied_after_crop_and_title(): + """Order: crop/compose -> drawtext title -> subtitles. + + The pipeline must crop **first**, then draw text on the finished frame. + """ + cmd = build_ffmpeg_cmd( + _req(title_text="Hook", subtitle_path="/tmp/clip.srt") + ) + fg = cmd[cmd.index("-filter_complex") + 1] + crop_pos = fg.index("[0:v]crop=") + drawtext_pos = fg.index("drawtext") + subs_pos = fg.index("subtitles=") + assert crop_pos < drawtext_pos < subs_pos + + +def test_build_is_layout_specific(): + c = Clip(clip_id="1", topic="t", start_time_sec=0, end_time_sec=10) + split_req = _req( + clip=c, + layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON), + ) + cmd = build_ffmpeg_cmd(split_req) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "vstack" in fg + + +def test_title_is_suppressed_on_split_layouts(): + """Split layouts already contain a slide/chart with its own title. + + Overlaying an additional drawtext title just obscures content -- that's + what was happening in the Cathy Wood "chart overlaps subject" report. 
+ """ + for kind in ( + LayoutKind.SPLIT_CHART_PERSON, + LayoutKind.SPLIT_TWO_PERSONS, + LayoutKind.SPLIT_TWO_CHARTS, + ): + cmd = build_ffmpeg_cmd( + _req( + layout=LayoutInstruction(clip_id="1", layout=kind), + title_text="This should not render", + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext" not in fg, f"title leaked into split layout {kind}" + + +def test_title_is_drawn_on_single_subject_layouts(): + """Titles are still rendered on ZOOM_CALL_CENTER and SIT_CENTER.""" + for kind in (LayoutKind.ZOOM_CALL_CENTER, LayoutKind.SIT_CENTER): + cmd = build_ffmpeg_cmd( + _req( + layout=LayoutInstruction(clip_id="1", layout=kind), + title_text="Hook title", + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext=text='Hook title'" in fg + + +# --------------------------------------------------------------------------- +# Title wrapping / auto-shrink (P2: fixes the "Prediction Markets vs +# Derivatives" clipped-title bug reported against the Cathy Wood run). +# --------------------------------------------------------------------------- + + +def test_plan_title_short_stays_single_line_at_72px(): + """Backward compat: short titles keep the pre-P2 single-drawtext form. + + Byte-identical output for short titles is important because it keeps + previously-calibrated visual output unchanged and avoids needless cache + churn on existing renders. + """ + frag = plan_title_drawtext("Hook title", out_w=1080) + assert frag is not None + assert frag.count("drawtext=") == 1 + assert "fontsize=72" in frag + assert "y=80" in frag + assert "drawtext=text='Hook title'" in frag + + +def test_plan_title_long_wraps_to_two_lines_below_72px(): + """Long titles wrap at the best word boundary and shrink to fit. + + "Prediction Markets vs Derivatives" is 33 chars β€” it overflows a 1080px + canvas at 72px. It must wrap into "Prediction Markets" / "vs Derivatives" + (balanced halves) at a smaller font. + """ + frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080) + assert frag is not None + assert frag.count("drawtext=") == 2, "long titles must split into two drawtext calls" + assert "drawtext=text='Prediction Markets'" in frag + assert "drawtext=text='vs Derivatives'" in frag + assert "fontsize=72" not in frag, "two-line layout must use a smaller font" + # Both lines share the same shrunken fontsize. + import re + + sizes = re.findall(r"fontsize=(\d+)", frag) + assert len(sizes) == 2 and sizes[0] == sizes[1] + assert 44 <= int(sizes[0]) <= 64 + + +def test_plan_title_empty_returns_none(): + assert plan_title_drawtext("", out_w=1080) is None + assert plan_title_drawtext(" ", out_w=1080) is None + + +def test_plan_title_single_huge_word_shrinks_instead_of_wrapping(): + """A single word cannot be word-wrapped; it must shrink to fit.""" + frag = plan_title_drawtext("Supercalifragilisticexpialidocious", out_w=1080) + assert frag is not None + assert frag.count("drawtext=") == 1 # no wrap possible + assert "fontsize=72" not in frag + + +def test_title_uses_arial_font_not_default_serif(): + """Titles must render in Arial (matching the ASS subtitle font), not the + platform default which is Times New Roman on Windows. + + Regression test for the "ugly serif title on the finance short" bug. + Both the single-line and the two-line drawtext variants must carry a + ``font=Arial`` directive so fontconfig resolves to the same family as + the subtitle ``Fontname=Arial``. 
+ """ + short = plan_title_drawtext("Hook title", out_w=1080) + assert short is not None + assert "font=Arial" in short or "fontfile='" in short + + long_frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080) + assert long_frag is not None + if "font=Arial" in long_frag: + assert long_frag.count("font=Arial") == 2 + else: + assert long_frag.count("fontfile='") == 2 + + +def test_title_font_matches_subtitle_font_family(): + """Title overlay and subtitle captions must read as one typographic + family. Both routes through ``build_ffmpeg_cmd`` should carry the same + Arial reference. + """ + cmd = build_ffmpeg_cmd( + _req( + title_text="Hook title", + subtitle_path="/tmp/clip.ass", + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "font=Arial" in fg or "fontfile='" in fg + assert "Fontname=Arial" in fg + + +def test_long_title_pipes_through_build_ffmpeg_cmd(): + """End-to-end: a long title routed through the full command builder + produces a valid filtergraph with two drawtext filters and no syntax + errors ffmpeg would choke on. + """ + cmd = build_ffmpeg_cmd(_req(title_text="Prediction Markets vs Derivatives")) + fg = cmd[cmd.index("-filter_complex") + 1] + assert fg.count("drawtext=") == 2 + assert "[v_prepad]drawtext=text='Prediction Markets'" in fg + assert "[vout]" in fg + assert ";;" not in fg # no empty chain links + assert ",," not in fg # no stray commas + + +def test_reference_theme_draws_title_and_caption_bars(): + cmd = build_ffmpeg_cmd( + _req( + title_text="A Multi-Trillion Dollar Opportunity", + subtitle_path="/tmp/clip.ass", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawbox=x=28:y=32" in fg + assert "drawbox=x=0:y=" in fg + assert "Fontname=Source Sans 3" in fg + assert "Alignment=2" in fg + assert "Outline=2" in fg + + +def test_reference_theme_wraps_long_titles_inside_the_title_bar(): + cmd = build_ffmpeg_cmd( + _req( + title_text="12% Youth Unemployment? Start a Business With AI", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert fg.count("drawtext=") >= 2 + assert "..." 
not in fg + + +def test_reference_theme_draws_frosted_caption_ribbon_when_subtitles_exist(): + cmd = build_ffmpeg_cmd( + _req( + title_text="Hook title", + subtitle_path="/tmp/clip.ass", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawbox=x=0:y=" in fg + + +def test_reference_theme_allows_titles_on_split_layouts(): + cmd = build_ffmpeg_cmd( + _req( + layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON), + title_text="Hook title", + render_theme=RenderTheme.REFERENCE_LOWER_THIRD, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext=" in fg + + +def test_native_highlight_theme_skips_title_card_and_keeps_ass_styles(): + cmd = build_ffmpeg_cmd( + _req( + title_text="This title should not render", + subtitle_path="/tmp/clip.ass", + render_theme=RenderTheme.NATIVE_HIGHLIGHT, + ) + ) + fg = cmd[cmd.index("-filter_complex") + 1] + assert "drawtext" not in fg + assert "subtitles='" in fg + assert "force_style='" not in fg + + +def test_ensure_windows_fontconfig_is_noop_off_windows(): + env = _ensure_windows_fontconfig() + assert isinstance(env, dict) + + +def test_ensure_windows_fontconfig_creates_config(monkeypatch, tmp_path): + monkeypatch.setattr(compile_mod.os, "name", "nt", raising=False) + monkeypatch.delenv("FONTCONFIG_FILE", raising=False) + monkeypatch.setenv("LOCALAPPDATA", str(tmp_path / "localappdata")) + monkeypatch.setenv("WINDIR", str(tmp_path / "winroot")) + + env = _ensure_windows_fontconfig() + + cfg_file = Path(env["FONTCONFIG_FILE"]) + assert cfg_file.is_file() + text = cfg_file.read_text(encoding="utf-8") + assert (tmp_path / "winroot" / "Fonts").as_posix() in text + assert "fontconfig-cache" in text diff --git a/humeo-core/tests/test_face_detect.py b/humeo-core/tests/test_face_detect.py new file mode 100644 index 0000000000000000000000000000000000000000..d1850da11d71efaa166e9e39c67b9f2dd5df2e1d --- /dev/null +++ b/humeo-core/tests/test_face_detect.py @@ -0,0 +1,73 @@ +"""Tests for the MediaPipe-backed face detection primitive. + +Uses a stub ``face_fn`` so MediaPipe itself is not required to run the +tests β€” the primitive contract is what we care about: *given* a face +bbox, does the primitive produce the right ``SceneRegions``. 
+""" + +from humeo_core.primitives.face_detect import detect_face_regions +from humeo_core.schemas import BoundingBox, Scene + + +def _scene(i: int, kf: str | None = "/tmp/k.jpg") -> Scene: + return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf) + + +def test_no_keyframe_returns_raw_reason(): + out = detect_face_regions([_scene(0, kf=None)], face_fn=lambda _p: None) + assert out[0].person_bbox is None + assert "no keyframe" in out[0].raw_reason.lower() + + +def test_no_face_detected_returns_raw_reason(): + out = detect_face_regions([_scene(0)], face_fn=lambda _p: None) + assert out[0].person_bbox is None + assert "no face" in out[0].raw_reason.lower() + + +def test_face_centered_produces_person_only(): + centered = BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.7, label="face", confidence=0.9) + out = detect_face_regions([_scene(0)], face_fn=lambda _p: centered) + r = out[0] + assert r.person_bbox is not None + assert r.person_bbox.center_x == centered.center_x + assert r.chart_bbox is None + + +def test_face_pushed_right_synthesises_chart_bbox(): + # face center x ~ 0.86 -> above default threshold 0.65 -> chart bbox inferred + face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9, label="face", confidence=0.95) + out = detect_face_regions([_scene(0)], face_fn=lambda _p: face) + r = out[0] + assert r.person_bbox is not None + assert r.chart_bbox is not None + assert r.chart_bbox.x1 == 0.0 + assert r.chart_bbox.x2 <= 0.75 # can't overlap the face + assert r.chart_bbox.x2 <= 0.65 # bounded by threshold too + assert "synthetic chart" in r.raw_reason + + +def test_face_detector_exception_is_isolated_per_scene(): + scenes = [_scene(0), _scene(1)] + calls: list[str] = [] + + def flaky_fn(path: str) -> BoundingBox | None: + calls.append(path) + if len(calls) == 1: + raise RuntimeError("boom") + return BoundingBox(x1=0.3, y1=0.2, x2=0.7, y2=0.8) + + out = detect_face_regions(scenes, face_fn=flaky_fn) + assert out[0].person_bbox is None + assert "error" in out[0].raw_reason.lower() + assert out[1].person_bbox is not None + + +def test_custom_threshold_prevents_false_chart_split(): + face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9) + out = detect_face_regions( + [_scene(0)], + face_fn=lambda _p: face, + chart_split_threshold=0.95, + ) + assert out[0].chart_bbox is None diff --git a/humeo-core/tests/test_layout_bbox.py b/humeo-core/tests/test_layout_bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..426b35bc291ff6b9487aa5d9507a80ef24f97839 --- /dev/null +++ b/humeo-core/tests/test_layout_bbox.py @@ -0,0 +1,17 @@ +"""Split layout uses optional normalized bbox regions (Gemini vision).""" + +from humeo_core.primitives.layouts import plan_layout +from humeo_core.schemas import BoundingBox, FocusStackOrder, LayoutInstruction, LayoutKind + + +def test_split_with_bbox_regions_not_fixed_thirds(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + focus_stack_order=FocusStackOrder.CHART_THEN_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.64, y2=1.0), + split_person_region=BoundingBox(x1=0.64, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "crop=1228:1080:0:0" in fg or "crop=1224:1080:0:0" in fg + assert "vstack=inputs=2" in fg diff --git a/humeo-core/tests/test_layouts.py b/humeo-core/tests/test_layouts.py new file mode 100644 index 0000000000000000000000000000000000000000..b11dec0e5b0736cadadfbefb9fcdd54deb18fb87 --- 
/dev/null +++ b/humeo-core/tests/test_layouts.py @@ -0,0 +1,312 @@ +import re + +from humeo_core.primitives.layouts import ( + _center_crop_to_9x16, + _crop_box, + plan_layout, +) +from humeo_core.schemas import ( + BoundingBox, + FocusStackOrder, + LayoutInstruction, + LayoutKind, + TimedCenterPoint, +) + + +def test_crop_box_aspect_exact(): + cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 1.0, 0.5, 0.5) + # 9:16 inside 1920x1080 -> height-limited: ch=1080, cw ~= 608 + assert ch == 1080 + assert abs(cw / ch - 9 / 16) < 0.01 + assert 0 <= x <= 1920 - cw + assert y == 0 + + +def test_crop_box_clamps_inside_frame(): + cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 2.0, 0.99, 0.5) + assert x + cw <= 1920 + assert y + ch <= 1080 + + +def test_crop_box_zoom_tightens(): + cw_small, ch_small, _, _ = _center_crop_to_9x16(1920, 1080, 2.0, 0.5) + cw_large, ch_large, _, _ = _center_crop_to_9x16(1920, 1080, 1.0, 0.5) + assert cw_small < cw_large + assert ch_small < ch_large + + +def test_even_dimensions(): + cw, ch, x, y = _crop_box(1921, 1081, 9 / 16, 1.3, 0.4, 0.5) + assert cw % 2 == 0 and ch % 2 == 0 + assert x % 2 == 0 and y % 2 == 0 + + +def _contains(s: str, *subs: str) -> bool: + return all(sub in s for sub in subs) + + +def test_zoom_call_layout_filtergraph_shape(): + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.5, person_x_norm=0.5 + ) + plan = plan_layout(instr, out_w=1080, out_h=1920) + fg = plan.filtergraph + assert _contains(fg, "[0:v]crop=", "scale=1080:1920", "[vout]") + + +def test_sit_center_layout_filtergraph_shape(): + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER) + plan = plan_layout(instr, out_w=1080, out_h=1920) + assert "[vout]" in plan.filtergraph + assert plan.out_label == "vout" + + +def test_sit_center_tracking_uses_dynamic_crop_expression(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=0.0, x_norm=0.2), + TimedCenterPoint(t_sec=10.0, x_norm=0.8), + ], + ) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + assert "setpts=PTS-STARTPTS" in fg + assert "[vsrc]crop=" in fg + assert "if(lt(t\\,4.850)" in fg + assert "*(t-4.850)/(0.300)" in fg + + +def test_sit_center_tracking_with_zoom_uses_dynamic_crop_window_expressions(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.28), + TimedCenterPoint(t_sec=10.0, x_norm=0.8, zoom=1.0), + ], + ) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + assert "setpts=PTS-STARTPTS" in fg + assert "[vsrc]crop=" in fg + assert "out_w/2" in fg + assert "out_h/2" in fg + assert "floor((min(" in fg + + +def test_split_layout_contains_vstack(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + person_x_norm=0.83, + chart_x_norm=0.0, + ) + plan = plan_layout(instr, out_w=1080, out_h=1920) + fg = plan.filtergraph + assert _contains(fg, "split=2", "vstack=inputs=2", "[vout]") + assert "[top]" in fg and "[bot]" in fg + + +def test_split_layout_person_crop_is_right_third(): + """Chart uses left 2/3; person uses right 1/3 (non-overlapping).""" + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + # Right third: x=1280, w=640 for 1920-wide source. 
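+    # (For a 1920 px-wide source, the chart band keeps the left 2/3 = 1280 px,
+    # leaving a 640 px person strip that starts at x=1280.)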
+ assert "crop=640:1080:1280:0" in fg + + +def test_split_layout_can_swap_stack_order(): + """PERSON_THEN_CHART puts the right-strip (person) crop into the top band.""" + chart_first = plan_layout( + LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + focus_stack_order=FocusStackOrder.CHART_THEN_PERSON, + ), + out_w=1080, + out_h=1920, + ).filtergraph + person_first = plan_layout( + LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + focus_stack_order=FocusStackOrder.PERSON_THEN_CHART, + ), + out_w=1080, + out_h=1920, + ).filtergraph + + def top_crop(fg: str) -> str: + m = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg) + assert m is not None, fg + return m.group(1) + + # chart strip = left 1280px of source (2/3 split seam). + assert top_crop(chart_first) == "1280:1080:0:0" + # person strip = right 640px -> x=1280. + assert top_crop(person_first) == "640:1080:1280:0" + assert "vstack=inputs=2" in chart_first + assert "vstack=inputs=2" in person_first + + +def test_split_layout_person_clamped(): + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, person_x_norm=1.0 + ) + plan = plan_layout(instr, out_w=1080, out_h=1920) + assert "crop=" in plan.filtergraph # no OOB math crash + + +def test_plan_layout_dispatch_covers_all_kinds(): + for k in LayoutKind: + instr = LayoutInstruction(clip_id="c", layout=k) + plan = plan_layout(instr) + assert plan.out_label == "vout" + assert plan.filtergraph.endswith("[vout]") + + +def test_default_split_is_even_50_50_bands(): + """The user-requested symmetric look: top and bottom bands are equal.""" + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + # Each strip should scale to the same height (half of 1920). + heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg) + assert len(heights) == 2 + assert heights[0] == heights[1] == "960", f"expected even 960/960, got {heights}" + + +def test_top_band_ratio_honored_for_uneven_splits(): + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, top_band_ratio=0.6 + ) + fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph + heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg) + assert heights == ["1152", "768"], heights + + +def test_split_seam_is_midpoint_between_bboxes(): + """When both bboxes are provided, strips partition the source -- no overlap, no gap.""" + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.50, y2=1.0), + split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + # chart.x2 = 960px, person.x1 = 1056px -> midpoint = 1008 -> even -> 1008. + # Chart strip: x=0, cw=1008. Person strip: x=1008, cw=912. 
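+    # (912 = 1920 - 1008: the two strips partition the full source width with no overlap and no gap.)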
+ top_crop = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg).group(1) + bot_crop = re.search(r"\[src2\]crop=(\d+:\d+:\d+:\d+)", fg).group(1) + assert top_crop == "1008:1080:0:0" + assert bot_crop == "912:1080:1008:0" + + +def test_split_uses_bbox_y_for_tight_band_fill(): + """Chart bboxes anchor the crop, with a little extra height for edge safety.""" + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.1, x2=0.5, y2=0.7), + split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + # Chart bbox y: 0.1..0.7 -> y=108, ch=648, then a modest 12% pad per side. + assert "crop=1008:804:0:30" in fg + + +def test_split_chart_person_adds_vertical_pad_to_reduce_chart_side_crop(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.02, y1=0.03, x2=0.58, y2=0.7), + split_person_region=BoundingBox(x1=0.585, y1=0.0, x2=0.995, y2=0.62), + top_band_ratio=0.436, + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=640, src_h=360).filtergraph + assert "[src1]crop=372:280:0:0" in fg + + +def test_split_minimum_strip_width_enforced(): + """If chart/person bboxes are pathological (seam at edge), don't starve a strip.""" + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_CHART_PERSON, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.05, y2=1.0), + split_person_region=BoundingBox(x1=0.05, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + widths = [int(m) for m in re.findall(r"crop=(\d+):\d+:\d+:\d+", fg)] + # Min strip = 20% of 1920 = 384 px. Neither strip should be narrower. + assert all(w >= 384 for w in widths), widths + + +def test_split_two_persons_stacks_two_crops(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_TWO_PERSONS, + split_person_region=BoundingBox(x1=0.0, y1=0.05, x2=0.5, y2=0.95), + split_second_person_region=BoundingBox(x1=0.5, y1=0.05, x2=1.0, y2=0.95), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "split=2" in fg and "vstack=inputs=2" in fg + # Seam at x=960. bbox y: 0.05..0.95 -> y=54, ch=972 (even). 
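+    # (y = 0.05 * 1080 = 54 and ch = 0.90 * 1080 = 972; both values are already even, so no rounding adjustment is needed.)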
+ assert "[src1]crop=960:972:0:54" in fg + assert "[src2]crop=960:972:960:54" in fg + + +def test_split_two_charts_stacks_two_crops(): + instr = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SPLIT_TWO_CHARTS, + split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.5, y2=1.0), + split_second_chart_region=BoundingBox(x1=0.5, y1=0.0, x2=1.0, y2=1.0), + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "split=2" in fg and "vstack=inputs=2" in fg + assert "[src1]crop=960:1080:0:0" in fg + assert "[src2]crop=960:1080:960:0" in fg + + +def test_split_two_persons_without_bboxes_defaults_to_centered(): + """No bboxes -> centered 50/50 seam, full source height fallback.""" + instr = LayoutInstruction( + clip_id="c", layout=LayoutKind.SPLIT_TWO_PERSONS + ) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert "[src1]crop=960:1080:0:0" in fg + assert "[src2]crop=960:1080:960:0" in fg + + +def test_split_bands_use_cover_scale_plus_center_crop(): + """Each band is painted edge-to-edge -- no letterbox bars.""" + instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON) + fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph + assert fg.count("force_original_aspect_ratio=increase") == 2 + assert fg.count("setsar=1") == 2 + + +def test_zoom_tighter_means_smaller_crop_window(): + from humeo_core.primitives.layouts import plan_zoom_call_center + + wide = plan_zoom_call_center( + LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.0), + out_w=1080, + out_h=1920, + ) + tight = plan_zoom_call_center( + LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=2.0), + out_w=1080, + out_h=1920, + ) + # Parse crop=CW:CH:X:Y out of each filtergraph. 
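+    # (Expectation: the zoom=2.0 crop window is strictly smaller than the zoom=1.0 window in both width and height.)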
+ import re + + def crop(fg: str) -> tuple[int, int]: + m = re.search(r"crop=(\d+):(\d+):", fg) + assert m is not None + return int(m.group(1)), int(m.group(2)) + + wcw, wch = crop(wide.filtergraph) + tcw, tch = crop(tight.filtergraph) + assert tcw < wcw and tch < wch diff --git a/humeo-core/tests/test_schemas.py b/humeo-core/tests/test_schemas.py new file mode 100644 index 0000000000000000000000000000000000000000..c49470eec90402a1196d29e4eef6896f19bcb12b --- /dev/null +++ b/humeo-core/tests/test_schemas.py @@ -0,0 +1,267 @@ +import pytest +from pydantic import ValidationError + +from humeo_core.schemas import ( + ApprovalResult, + Clip, + ClipPlan, + ClipSubtitleWords, + FocusStackOrder, + LayoutInstruction, + LayoutKind, + RatingFeedback, + RenderRequest, + Scene, + SessionState, + TimedCenterPoint, + TranscriptWord, +) + + +def test_scene_requires_end_after_start(): + Scene(scene_id="s1", start_time=0.0, end_time=1.0) + with pytest.raises(ValueError): + Scene(scene_id="s1", start_time=5.0, end_time=5.0) + with pytest.raises(ValueError): + Scene(scene_id="s1", start_time=5.0, end_time=1.0) + + +def test_layout_instruction_defaults_and_bounds(): + li = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER) + assert li.zoom == 1.0 + assert 0 <= li.person_x_norm <= 1 + assert li.person_tracking == [] + assert li.focus_stack_order == FocusStackOrder.CHART_THEN_PERSON + with pytest.raises(ValueError): + LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, zoom=0.0) + with pytest.raises(ValueError): + LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, person_x_norm=2.0) + + +def test_layout_instruction_accepts_sorted_tracking_points(): + li = LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.25), + TimedCenterPoint(t_sec=5.0, x_norm=0.8, zoom=1.0), + ], + ) + assert [point.t_sec for point in li.person_tracking] == [0.0, 5.0] + assert li.person_tracking[0].zoom == pytest.approx(1.25) + + +def test_layout_instruction_rejects_unsorted_tracking_points(): + with pytest.raises(ValueError, match="person_tracking times"): + LayoutInstruction( + clip_id="c", + layout=LayoutKind.SIT_CENTER, + person_tracking=[ + TimedCenterPoint(t_sec=5.0, x_norm=0.8), + TimedCenterPoint(t_sec=1.0, x_norm=0.2), + ], + ) + + +def test_clip_duration(): + c = Clip( + clip_id="1", + topic="t", + start_time_sec=10.0, + end_time_sec=42.5, + ) + assert c.duration_sec == pytest.approx(32.5) + + +def test_clip_hook_relative_to_clip_in_point(): + c = Clip( + clip_id="1", + topic="t", + start_time_sec=100.0, + end_time_sec=130.0, + hook_start_sec=0.0, + hook_end_sec=3.0, + ) + assert c.hook_end_sec == 3.0 + + +def test_clip_hook_must_be_within_duration(): + with pytest.raises(ValueError, match="hook window"): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=10.0, + hook_start_sec=0.0, + hook_end_sec=15.0, + ) + + +def test_clip_hook_both_or_neither(): + with pytest.raises(ValueError, match="hook_start_sec and hook_end_sec"): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=10.0, + hook_start_sec=1.0, + hook_end_sec=None, + ) + + +def test_clip_trim_cannot_exceed_duration(): + with pytest.raises(ValueError, match="trim"): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=10.0, + trim_start_sec=6.0, + trim_end_sec=6.0, + ) + + +def test_clip_plan_roundtrip(): + plan = ClipPlan( + source_path="/tmp/x.mp4", + clips=[ + Clip(clip_id="1", topic="t", 
start_time_sec=0.0, end_time_sec=30.0) + ], + ) + d = plan.model_dump() + assert ClipPlan.model_validate(d) == plan + + +def test_clip_roundtrip_with_extended_fields(): + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"message_wow": 0.9, "hook_emotion": 0.7}, + origin="both", + visual_notes="Speaker leans in.", + reasoning="Strong explanation and hook.", + ) + + dumped = clip.model_dump() + + assert dumped["score_breakdown"] == {"message_wow": 0.9, "hook_emotion": 0.7} + assert dumped["origin"] == "both" + assert dumped["visual_notes"] == "Speaker leans in." + assert dumped["reasoning"] == "Strong explanation and hook." + assert Clip.model_validate(dumped) == clip + + +def test_clip_defaults_validate_and_do_not_serialize_new_fields(): + clip = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0) + + assert clip.origin == "text" + assert clip.score_breakdown is None + assert clip.visual_notes is None + assert clip.reasoning is None + + dumped = clip.model_dump() + assert "score_breakdown" not in dumped + assert "origin" not in dumped + assert "visual_notes" not in dumped + assert "reasoning" not in dumped + assert Clip.model_validate(dumped) == clip + + +def test_clip_score_breakdown_validation(): + with pytest.raises(ValidationError): + Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"hook": -0.1}, + ) + + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"hook": 1.2}, + ) + assert clip.score_breakdown == {"hook": 1.0} + + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={}, + ) + assert clip.score_breakdown == {} + + clip = Clip( + clip_id="1", + topic="t", + start_time_sec=0.0, + end_time_sec=30.0, + score_breakdown={"hook": 0.5}, + ) + assert clip.score_breakdown == {"hook": 0.5} + + +def test_clip_subtitle_words_relative_times(): + w = ClipSubtitleWords( + words=[TranscriptWord(word="hi", start_time=0.0, end_time=0.2)] + ) + assert w.words[0].start_time == 0.0 + + +def test_render_request_modes(): + c = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0) + li = LayoutInstruction(clip_id="1", layout=LayoutKind.ZOOM_CALL_CENTER) + req = RenderRequest( + source_path="/tmp/x.mp4", + clip=c, + layout=li, + output_path="/tmp/out.mp4", + ) + assert req.mode == "normal" + req2 = RenderRequest(**{**req.model_dump(), "mode": "dry_run"}) + assert req2.mode == "dry_run" + + +def test_approval_result_roundtrip(): + result = ApprovalResult( + action="proceed", + selected_ids=["001", "003"], + steering_note="prefer emotional moments", + ) + assert ApprovalResult.model_validate(result.model_dump()) == result + + +def test_approval_result_rejects_invalid_action(): + with pytest.raises(ValidationError): + ApprovalResult(action="invalid") + + +def test_rating_feedback_roundtrip(): + feedback = RatingFeedback( + rating=2, + issues=["wrong_moments", "other"], + free_text="needs more context", + ) + assert RatingFeedback.model_validate(feedback.model_dump()) == feedback + + +def test_rating_feedback_rejects_invalid_rating(): + with pytest.raises(ValidationError): + RatingFeedback(rating=4) + + +def test_session_state_roundtrip(): + state = SessionState( + source_key="youtube:PdVv_vLkUgk", + iteration=3, + steering_notes=["be punchier"], + last_rating=RatingFeedback(rating=3), + last_selected_ids=["001", "002"], + ) + assert SessionState.model_validate(state.model_dump()) 
== state diff --git a/humeo-core/tests/test_select_clips.py b/humeo-core/tests/test_select_clips.py new file mode 100644 index 0000000000000000000000000000000000000000..bfe86ea77c7835323e902ec22f6f95a03f613a80 --- /dev/null +++ b/humeo-core/tests/test_select_clips.py @@ -0,0 +1,49 @@ +from humeo_core.primitives.select_clips import select_clips_heuristic +from humeo_core.schemas import TranscriptWord + + +def _words(start: float, end: float, n: int) -> list[TranscriptWord]: + step = (end - start) / max(1, n) + return [ + TranscriptWord(word=f"w{i}", start_time=start + i * step, end_time=start + (i + 1) * step) + for i in range(n) + ] + + +def test_no_transcript_returns_single_clip(): + plan = select_clips_heuristic("/tmp/x.mp4", [], duration_sec=600.0) + assert len(plan.clips) == 1 + + +def test_prefers_dense_windows(): + # dense between 30-90, sparse elsewhere + dense = _words(30.0, 90.0, 240) # 4 words/sec + sparse_before = _words(0.0, 30.0, 6) + sparse_after = _words(90.0, 600.0, 30) + words = sparse_before + dense + sparse_after + plan = select_clips_heuristic( + "/tmp/x.mp4", words, duration_sec=600.0, target_count=1, min_sec=30, max_sec=60 + ) + assert len(plan.clips) == 1 + c = plan.clips[0] + assert 30 <= c.start_time_sec <= 90 + assert c.end_time_sec <= 120 + + +def test_no_overlap_when_multiple_picked(): + dense_a = _words(30.0, 90.0, 240) + dense_b = _words(200.0, 260.0, 240) + words = dense_a + dense_b + plan = select_clips_heuristic( + "/tmp/x.mp4", + words, + duration_sec=400.0, + target_count=3, + min_sec=30, + max_sec=60, + ) + # Should pick both dense regions without overlap. + assert len(plan.clips) >= 2 + starts_ends = sorted((c.start_time_sec, c.end_time_sec) for c in plan.clips) + for (s1, e1), (s2, e2) in zip(starts_ends, starts_ends[1:]): + assert e1 <= s2 diff --git a/humeo-core/tests/test_server_tools.py b/humeo-core/tests/test_server_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..0ec239d7b798f94abbb8e06b815325efbd40424d --- /dev/null +++ b/humeo-core/tests/test_server_tools.py @@ -0,0 +1,93 @@ +"""Exercise the MCP server tools as plain Python callables. + +FastMCP tools are registered on the server instance, but the underlying +functions are ordinary Python functions decorated with ``@mcp.tool()``. +We import the module and invoke those functions directly to verify the +end-to-end wiring (schemas validated, dispatch correct, JSON-serializable). 
+""" + +import humeo_core.server as srv +from humeo_core.schemas import LayoutKind + + +def test_list_layouts_lists_all_three(): + result = srv.list_layouts() + kinds = {layout["kind"] for layout in result["layouts"]} + assert kinds == {k.value for k in LayoutKind} + + +def test_plan_layout_tool_returns_filtergraph(): + for k in LayoutKind: + out = srv.plan_layout(layout=k.value) + assert out["out_label"] == "vout" + assert "[vout]" in out["filtergraph"] + + +def test_build_render_cmd_dry_run(): + req = { + "source_path": "/tmp/src.mp4", + "clip": { + "clip_id": "1", + "topic": "t", + "start_time_sec": 0.0, + "end_time_sec": 30.0, + }, + "layout": {"clip_id": "1", "layout": LayoutKind.SIT_CENTER.value}, + "output_path": "/tmp/out.mp4", + } + out = srv.build_render_cmd(request=req) + assert out["success"] is True + assert out["output_path"] == "/tmp/out.mp4" + assert any("-filter_complex" == part for part in out["ffmpeg_cmd"]) + + +def test_select_clips_tool_happy_path(): + words = [ + {"word": f"w{i}", "start_time": float(i), "end_time": float(i) + 0.5} + for i in range(120) + ] + plan = srv.select_clips( + source_path="/tmp/x.mp4", + transcript_words=words, + duration_sec=120.0, + target_count=2, + min_sec=30.0, + max_sec=60.0, + ) + assert plan["source_path"] == "/tmp/x.mp4" + assert 1 <= len(plan["clips"]) <= 2 + + +def test_classify_scenes_tool_no_keyframes(): + scenes = [{"scene_id": "s0", "start_time": 0.0, "end_time": 5.0}] + out = srv.classify_scenes(scenes=scenes) + assert out["classifications"][0]["scene_id"] == "s0" + assert out["classifications"][0]["layout"] in {k.value for k in LayoutKind} + + +def test_detect_scene_regions_returns_jobs_and_prompt(): + scenes = [ + {"scene_id": "s0", "start_time": 0.0, "end_time": 5.0, "keyframe_path": "/tmp/k0.jpg"}, + {"scene_id": "s1", "start_time": 5.0, "end_time": 10.0, "keyframe_path": "/tmp/k1.jpg"}, + ] + out = srv.detect_scene_regions(scenes=scenes) + assert "STRICT JSON" in out["prompt"] + assert len(out["jobs"]) == 2 + assert out["jobs"][0]["scene_id"] == "s0" + assert out["jobs"][0]["keyframe_path"] == "/tmp/k0.jpg" + + +def test_classify_scenes_with_vision_derives_instructions(): + regions = [ + { + "scene_id": "s0", + "chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 0.66, "y2": 1.0}, + "person_bbox": {"x1": 0.72, "y1": 0.1, "x2": 0.99, "y2": 0.95}, + "ocr_text": "CPI YoY", + } + ] + out = srv.classify_scenes_with_vision(regions=regions) + assert out["classifications"][0]["layout"] == LayoutKind.SPLIT_CHART_PERSON.value + instr = out["layout_instructions"][0] + assert instr["chart_x_norm"] == 0.0 + assert 0.8 < instr["person_x_norm"] < 0.9 diff --git a/humeo-core/tests/test_vision.py b/humeo-core/tests/test_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..d370e5f455fac525c6f10afc04945e506dfcfff9 --- /dev/null +++ b/humeo-core/tests/test_vision.py @@ -0,0 +1,228 @@ +"""Tests for the scene-change + vision-LLM + OCR bbox primitive. + +Covers: +* happy path: well-formed JSON -> populated ``SceneRegions``. +* bad JSON: degrade to empty regions + raw_reason, never raise. +* bad bbox: one malformed bbox does not take down the whole scene record. +* classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT. +* layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come + from the bboxes when present, defaults when not. 
+""" + +import json + +import pytest + +from humeo_core.primitives.vision import ( + _CHART_WIDTH_SPLIT_THRESHOLD, + classify_from_regions, + classify_scenes_with_vision_llm, + detect_regions_with_llm, + layout_instruction_from_regions, +) +from humeo_core.schemas import ( + BoundingBox, + LayoutKind, + Scene, + SceneClassification, + SceneRegions, +) + + +# --------------------------------------------------------------------------- +# Schema +# --------------------------------------------------------------------------- + + +def test_bounding_box_requires_x2_gt_x1(): + BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2) + with pytest.raises(ValueError): + BoundingBox(x1=0.2, y1=0.1, x2=0.1, y2=0.2) + with pytest.raises(ValueError): + BoundingBox(x1=0.1, y1=0.2, x2=0.2, y2=0.1) + + +def test_bounding_box_center_and_width(): + b = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9) + assert b.center_x == pytest.approx(0.4) + assert b.center_y == pytest.approx(0.65) + assert b.width == pytest.approx(0.4) + + +# --------------------------------------------------------------------------- +# detect_regions_with_llm +# --------------------------------------------------------------------------- + + +def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene: + return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf) + + +def test_detect_regions_happy_path(): + scenes = [_scene(0)] + + def vision_fn(_img: str, _prompt: str) -> str: + return json.dumps( + { + "person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9}, + "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8}, + "ocr_text": "Inflation YoY", + "reason": "explainer layout", + } + ) + + out = detect_regions_with_llm(scenes, vision_fn) + assert len(out) == 1 + r = out[0] + assert r.scene_id == "s0" + assert r.person_bbox and r.person_bbox.center_x > 0.8 + assert r.chart_bbox and r.chart_bbox.width > 0.6 + assert "Inflation" in r.ocr_text + + +def test_detect_regions_bad_json_is_safe(): + scenes = [_scene(0)] + + def vision_fn(*_a) -> str: + return "not json" + + out = detect_regions_with_llm(scenes, vision_fn) + assert out[0].person_bbox is None + assert out[0].chart_bbox is None + assert "parse error" in out[0].raw_reason.lower() + + +def test_detect_regions_missing_keyframe_is_safe(): + scenes = [_scene(0, kf=None)] + + def vision_fn(*_a) -> str: # pragma: no cover - should not be called + raise AssertionError("vision_fn must not be called without a keyframe") + + out = detect_regions_with_llm(scenes, vision_fn) + assert out[0].person_bbox is None + assert "no keyframe" in out[0].raw_reason.lower() + + +def test_detect_regions_bad_bbox_degrades_gracefully(): + scenes = [_scene(0)] + + def vision_fn(*_a) -> str: + return json.dumps( + { + "person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9}, + "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95}, + "ocr_text": "", + "reason": "person bbox inverted", + } + ) + + out = detect_regions_with_llm(scenes, vision_fn) + assert out[0].person_bbox is None + assert out[0].chart_bbox is not None + + +# --------------------------------------------------------------------------- +# classify_from_regions +# --------------------------------------------------------------------------- + + +def test_classify_wide_chart_is_split(): + r = SceneRegions( + scene_id="s0", + chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0), + person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95), + ) + c = classify_from_regions(r) + 
assert c.layout == LayoutKind.SPLIT_CHART_PERSON + assert c.confidence > 0.5 + + +def test_classify_narrow_chart_not_split(): + r = SceneRegions( + scene_id="s0", + chart_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4), + person_bbox=BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95), + ) + c = classify_from_regions(r) + # chart width (0.1) is below the split threshold -> not split + assert c.layout != LayoutKind.SPLIT_CHART_PERSON + + +def test_classify_wide_person_is_zoom_call(): + r = SceneRegions( + scene_id="s0", + person_bbox=BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98), + ) + c = classify_from_regions(r) + assert c.layout == LayoutKind.ZOOM_CALL_CENTER + + +def test_classify_small_person_is_sit_center(): + r = SceneRegions( + scene_id="s0", + person_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8), + ) + c = classify_from_regions(r) + assert c.layout == LayoutKind.SIT_CENTER + + +def test_classify_nothing_detected_defaults_sit_center_low_conf(): + r = SceneRegions(scene_id="s0", raw_reason="model returned null") + c = classify_from_regions(r) + assert c.layout == LayoutKind.SIT_CENTER + assert c.confidence <= 0.5 + + +def test_chart_threshold_is_exported(): + # guard against the tuning constant silently being removed + assert 0.0 < _CHART_WIDTH_SPLIT_THRESHOLD < 1.0 + + +# --------------------------------------------------------------------------- +# layout_instruction_from_regions +# --------------------------------------------------------------------------- + + +def test_layout_instruction_from_regions_split(): + r = SceneRegions( + scene_id="s0", + chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0), + person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95), + ) + c = classify_from_regions(r) + instr = layout_instruction_from_regions(r, c) + assert instr.layout == LayoutKind.SPLIT_CHART_PERSON + # person_x_norm = center of (0.72, 0.99) = 0.855 + assert instr.person_x_norm == pytest.approx(0.855, rel=1e-3) + # chart_x_norm = left edge = 0.0 + assert instr.chart_x_norm == pytest.approx(0.0) + + +def test_layout_instruction_defaults_when_no_regions(): + r = SceneRegions(scene_id="s0") + c = SceneClassification( + scene_id="s0", layout=LayoutKind.SIT_CENTER, confidence=0.3, reason="default" + ) + instr = layout_instruction_from_regions(r, c) + assert instr.person_x_norm == 0.5 + assert instr.chart_x_norm == 0.0 + + +def test_classify_scenes_with_vision_llm_returns_pairs(): + scenes = [_scene(0)] + + def vision_fn(*_a) -> str: + return json.dumps( + { + "person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95}, + "chart_bbox": None, + "ocr_text": "", + "reason": "solo subject", + } + ) + + pairs = classify_scenes_with_vision_llm(scenes, vision_fn) + assert len(pairs) == 1 + regions, classification = pairs[0] + assert regions.person_bbox is not None + assert classification.layout == LayoutKind.ZOOM_CALL_CENTER diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..1e8fbd4bd6d827fb709ea50cc604fbcfecaa228f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,56 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "humeo" +version = "0.1.0" +description = "Automated podcast-to-shorts pipeline" +readme = "README.md" +requires-python = ">=3.10" +dependencies = [ + "yt-dlp>=2024.0", + "fastapi>=0.115", + "openai>=1.0", + "google-genai>=1.0", + "httpx>=0.28", + "jinja2>=3.1", + "numpy>=1.24", + "Pillow>=10.0", + "python-dotenv>=1.0", + "replicate>=0.34.2", + 
"tqdm>=4.60", + "python-multipart>=0.0.9", + "uvicorn[standard]>=0.30", + "humeo-core", +] + +[project.optional-dependencies] +dev = [ + "pytest-asyncio>=0.23", + "ruff", + "pytest", +] +whisper = [ + "whisperx @ git+https://github.com/m-bain/whisperX.git", +] + +[tool.uv.sources] +humeo-core = { path = "humeo-core", editable = true } + +[project.scripts] +humeo = "humeo.cli:main" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +humeo = ["prompts/*.jinja2"] + +[tool.pytest.ini_options] +testpaths = ["tests", "humeo-core/tests"] +addopts = "-ra -q" + +[tool.ruff] +line-length = 100 +target-version = "py310" diff --git a/src/humeo.egg-info/PKG-INFO b/src/humeo.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..dd7705a36a7c386b5f71595ef886799545abca9b --- /dev/null +++ b/src/humeo.egg-info/PKG-INFO @@ -0,0 +1,223 @@ +Metadata-Version: 2.4 +Name: humeo +Version: 0.1.0 +Summary: Automated podcast-to-shorts pipeline +Requires-Python: >=3.10 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: yt-dlp>=2024.0 +Requires-Dist: openai>=1.0 +Requires-Dist: google-genai>=1.0 +Requires-Dist: httpx>=0.28 +Requires-Dist: jinja2>=3.1 +Requires-Dist: numpy>=1.24 +Requires-Dist: Pillow>=10.0 +Requires-Dist: python-dotenv>=1.0 +Requires-Dist: replicate>=0.34.2 +Requires-Dist: tqdm>=4.60 +Requires-Dist: humeo-core +Provides-Extra: dev +Requires-Dist: pytest-asyncio>=0.23; extra == "dev" +Requires-Dist: ruff; extra == "dev" +Requires-Dist: pytest; extra == "dev" +Provides-Extra: whisper +Requires-Dist: whisperx @ git+https://github.com/m-bain/whisperX.git ; extra == "whisper" +Dynamic: license-file + +--- +title: Humeo +sdk: docker +app_port: 7860 +--- + +# Humeo + +Current default preset: + +- `native_highlight` captions +- OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages +- Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available +- ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set + +Long podcast or interview β†’ vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render. + +**Architecture (static HTML, GitHub Pages):** +[https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html) + +## Hugging Face Space + +This repo includes a Hugging Face Docker Space entrypoint in `app.py`. + +- Upload one local MP4 +- Watch live pipeline logs and stage progress +- Download rendered `short_*.mp4` clips from the UI + +Required Space secrets: + +- `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY` +- `OPENAI_API_KEY` or `ELEVENLABS_API_KEY` + +The Docker image pins `HUMEO_TRANSCRIBE_PROVIDER=openai` for the Space demo. + +## Repo layout + +| Path | Role | +|------|------| +| `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters | +| `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server | + +## Pipeline (actual order) + +```text +YouTube URL + β†’ ingest (source.mp4, transcript.json) + β†’ clip selection (Gemini β†’ clips.json) + β†’ hook detection (Gemini β†’ hooks.json) + β†’ content pruning (Gemini β†’ prune.json) + β†’ keyframes + layout vision (Gemini vision β†’ layout_vision.json) + β†’ ASS subtitles + humeo-core ffmpeg render β†’ short_.mp4 +``` + +Details: **`docs/PIPELINE.md`**. 
+ +## Five layouts + +A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**). + +## Requirements + +- **Python** β‰₯ 3.10 +- **`uv`** β€” install: [astral.sh/uv](https://docs.astral.sh/uv/) +- **`ffmpeg`** β€” on `PATH` for extract/render +- **API keys** β€” see **`docs/ENVIRONMENT.md`** + - `GOOGLE_API_KEY` or `GEMINI_API_KEY` β€” preferred for Gemini stages + - `OPENROUTER_API_KEY` β€” supported fallback for those same Gemini-like stages when Google keys are unavailable + - `OPENAI_API_KEY` β€” if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`) + +Copy **`.env.example`** β†’ **`.env`** (never commit `.env`). + +## Install + +```bash +uv venv +uv sync +``` + +Optional local WhisperX (heavy; Windows often uses OpenAI API instead): + +```bash +uv sync --extra whisper +``` + +## Run + +```bash +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" +humeo --long-to-shorts "C:\path\to\video.mp4" +``` + +Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**). + +## CLI guide (all flags) + +Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`. + +### Required + +| Flag | Meaning | +|------|---------| +| `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). | + +### Paths and cache behavior + +| Flag | Meaning | +|------|---------| +| `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). | +| `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). | +| `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. | +| `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). | +| `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. | + +### Model selection and stage forcing + +| Flag | Meaning | +|------|---------| +| `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). | +| `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). | +| `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. | +| `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. | +| `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. | +| `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. | +| `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). | + +### Pruning and subtitles + +| Flag | Meaning | +|------|---------| +| `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). | +| `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). | +| `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). | +| `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). | +| `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). | + +### Logging + +| Flag | Meaning | +|------|---------| +| `--verbose`, `-v` | Enable debug logging. 
| + +### Common command recipes + +```bash +# Basic run +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" + +# Local MP4 +humeo --long-to-shorts "C:\path\to\video.mp4" + +# Full fresh run for debugging / prompt tuning +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose + +# Re-run only clip selection after prompt edits +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection + +# Keep intermediates in a fixed local folder +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work + +# Compare different prune levels on same source +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative +humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive +``` + +## Documentation + +| Doc | Purpose | +|-----|---------| +| **`docs/README.md`** | Index of all files under `docs/` | +| **`docs/STUDY_ORDER.md`** | Read order for onboarding | +| **`docs/PIPELINE.md`** | Stages, caches, JSON contracts | +| **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout | +| **`docs/SHARING.md`** | How to share logs/docs/video without bloating git | +| **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example | +| **`docs/full_run_output.txt`** | Example full run log (text) | +| **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping Β§9 | +| **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap | +| **`docs/TODO.md`** | Backlog | +| **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) | +| **`docs/SOLUTIONS.md`** | Design rationale | +| **`TERMINOLOGY.md`** | Glossary | + +## Tests + +```bash +uv sync --extra dev +uv run pytest +``` + +## Sharing outputs + +`output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**. + +## License + +See **`LICENSE`** (root) and **`humeo-core/LICENSE`**. 
diff --git a/src/humeo.egg-info/SOURCES.txt b/src/humeo.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f037363506a416f615b401a87df964d04bc1d9f --- /dev/null +++ b/src/humeo.egg-info/SOURCES.txt @@ -0,0 +1,58 @@ +LICENSE +README.md +pyproject.toml +src/humeo/__init__.py +src/humeo/best_of.py +src/humeo/cli.py +src/humeo/clip_assembly.py +src/humeo/clip_selection_cache.py +src/humeo/clip_selector.py +src/humeo/config.py +src/humeo/content_pruning.py +src/humeo/cutter.py +src/humeo/env.py +src/humeo/gemini_generate.py +src/humeo/hook_detector.py +src/humeo/hook_library.py +src/humeo/ingest.py +src/humeo/interactive.py +src/humeo/layout_vision.py +src/humeo/pipeline.py +src/humeo/prompt_loader.py +src/humeo/reframe_ffmpeg.py +src/humeo/render_window.py +src/humeo/session_state.py +src/humeo/transcript_align.py +src/humeo/video_cache.py +src/humeo.egg-info/PKG-INFO +src/humeo.egg-info/SOURCES.txt +src/humeo.egg-info/dependency_links.txt +src/humeo.egg-info/entry_points.txt +src/humeo.egg-info/requires.txt +src/humeo.egg-info/top_level.txt +src/humeo/prompts/clip_selection_system.jinja2 +src/humeo/prompts/clip_selection_user.jinja2 +src/humeo/prompts/content_pruning_system.jinja2 +src/humeo/prompts/hook_detection_system.jinja2 +tests/test_ass_subtitles.py +tests/test_best_of.py +tests/test_clip_assembly.py +tests/test_clip_ranking.py +tests/test_clip_selection_cache.py +tests/test_clip_selector.py +tests/test_content_pruning.py +tests/test_cutter_native_highlight.py +tests/test_gemini_generate.py +tests/test_hook_detector.py +tests/test_hook_library.py +tests/test_ingest_openai_chunks.py +tests/test_interactive.py +tests/test_layout_vision_unit.py +tests/test_pipeline_interactive.py +tests/test_pipeline_quality_gate.py +tests/test_prompt_loader.py +tests/test_reframe_ffmpeg.py +tests/test_render_window.py +tests/test_session_state.py +tests/test_transcript_align.py +tests/test_video_cache.py \ No newline at end of file diff --git a/src/humeo.egg-info/dependency_links.txt b/src/humeo.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/src/humeo.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/humeo.egg-info/entry_points.txt b/src/humeo.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..6650ec9964eaf820cd23924e97064deec871f740 --- /dev/null +++ b/src/humeo.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +humeo = humeo.cli:main diff --git a/src/humeo.egg-info/requires.txt b/src/humeo.egg-info/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a25622009493d8500d8404f29971a15fc047d43 --- /dev/null +++ b/src/humeo.egg-info/requires.txt @@ -0,0 +1,19 @@ +yt-dlp>=2024.0 +openai>=1.0 +google-genai>=1.0 +httpx>=0.28 +jinja2>=3.1 +numpy>=1.24 +Pillow>=10.0 +python-dotenv>=1.0 +replicate>=0.34.2 +tqdm>=4.60 +humeo-core + +[dev] +pytest-asyncio>=0.23 +ruff +pytest + +[whisper] +whisperx @ git+https://github.com/m-bain/whisperX.git diff --git a/src/humeo.egg-info/top_level.txt b/src/humeo.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..16b4994761aa79c4c9a2ec5bb765ce91cf61f3a2 --- /dev/null +++ b/src/humeo.egg-info/top_level.txt @@ -0,0 +1 @@ +humeo diff --git a/src/humeo/__init__.py b/src/humeo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f1e6ebddd25ec70dc23cb703e971b0276b7a833b 
--- /dev/null +++ b/src/humeo/__init__.py @@ -0,0 +1,3 @@ +"""Humeo - Automated podcast-to-shorts pipeline.""" + +__version__ = "0.1.0" diff --git a/src/humeo/best_of.py new file mode 100644 index 0000000000000000000000000000000000000000..921e8c6c692570f922fd3f3403c96f81e94a1dcc --- /dev/null +++ b/src/humeo/best_of.py @@ -0,0 +1,99 @@ +"""Curate a small review pack from a larger batch render.""" + +from __future__ import annotations + +import json +import re +import shutil +from pathlib import Path + +from humeo.clip_selector import clip_quality_priority_score +from humeo_core.schemas import Clip + +_SHORT_FILENAME_RE = re.compile(r"^short_(?P<clip_id>\d+)\.mp4$", re.IGNORECASE) + + +def _load_clip_map(work_dir: Path) -> dict[str, Clip]: + for filename in ("clips.json", "assembled_clips.json"): + path = work_dir / filename + if not path.is_file(): + continue + data = json.loads(path.read_text(encoding="utf-8")) + items = data.get("clips", data) if isinstance(data, dict) else data + return { + clip["clip_id"]: Clip.model_validate(clip) + for clip in items + if isinstance(clip, dict) and clip.get("clip_id") + } + return {} + + +def _default_work_dir_for_source(source_dir: Path, repo_root: Path) -> Path: + match = re.fullmatch(r"videoplayback_(\d+)", source_dir.name) + if match: + return repo_root / f".humeo_batch_videoplayback{match.group(1)}" + return repo_root / f".humeo_{source_dir.name}" + + +def build_best_of_review_pack( + batch_root: Path, + destination_dir: Path, + *, + per_source: int = 2, + repo_root: Path | None = None, + work_dir_map: dict[str, Path] | None = None, +) -> list[Path]: + batch_root = Path(batch_root) + destination_dir = Path(destination_dir) + repo_root = Path(repo_root) if repo_root is not None else batch_root.parent + destination_dir.mkdir(parents=True, exist_ok=True) + + copied: list[Path] = [] + manifest: list[dict[str, object]] = [] + for source_dir in sorted(path for path in batch_root.iterdir() if path.is_dir()): + work_dir = ( + work_dir_map[source_dir.name] + if work_dir_map is not None and source_dir.name in work_dir_map + else _default_work_dir_for_source(source_dir, repo_root) + ) + clip_map = _load_clip_map(work_dir) + ranked: list[tuple[float, Path, str, Clip | None]] = [] + for mp4_path in sorted(source_dir.glob("short_*.mp4")): + match = _SHORT_FILENAME_RE.match(mp4_path.name) + if not match: + continue + clip_id = match.group("clip_id") + clip = clip_map.get(clip_id) + score = clip_quality_priority_score(clip) if clip is not None else 0.0 + ranked.append((score, mp4_path, clip_id, clip)) + + ranked.sort( + key=lambda item: ( + item[0], + item[3].virality_score if item[3] is not None else 0.0, + -(item[3].duration_sec if item[3] is not None else 0.0), + ), + reverse=True, + ) + for rank, (score, mp4_path, clip_id, clip) in enumerate(ranked[: max(1, per_source)], start=1): + target_path = destination_dir / f"{source_dir.name}__pick{rank:02d}__{mp4_path.name}" + shutil.copy2(mp4_path, target_path) + copied.append(target_path) + manifest.append( + { + "source": source_dir.name, + "rank": rank, + "score": round(score, 4), + "output_path": str(target_path), + "original_path": str(mp4_path), + "clip_id": clip.clip_id if clip is not None else clip_id, + "title": clip.suggested_overlay_title if clip is not None else "", + "topic": clip.topic if clip is not None else "", + } + ) + + (destination_dir / "best_of_manifest.json").write_text( + json.dumps({"clips": manifest}, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + return
copied diff --git a/src/humeo/cli.py b/src/humeo/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..16bd83f4801cd6190b2b898120b32d5c6495ca10 --- /dev/null +++ b/src/humeo/cli.py @@ -0,0 +1,369 @@ +"""CLI entry point for the Humeo pipeline.""" + +import argparse +import logging +import os +import sys +from datetime import datetime +from pathlib import Path + +from humeo.config import PipelineConfig +from humeo.pipeline import run_pipeline + +DEFAULT_SEGMENTATION_PROVIDER = ( + (os.environ.get("HUMEO_SEGMENTATION_PROVIDER") or "").strip().lower() + or ("replicate" if (os.environ.get("REPLICATE_API_TOKEN") or "").strip() else "off") +) + + +def setup_logging(verbose: bool = False): + """Configure logging with a clean format.""" + level = logging.DEBUG if verbose else logging.INFO + logging.basicConfig( + level=level, + format="%(asctime)s | %(levelname)-7s | %(name)s | %(message)s", + datefmt="%H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + # Suppress noisy third-party loggers + logging.getLogger("urllib3").setLevel(logging.WARNING) + logging.getLogger("httpx").setLevel(logging.WARNING) + + +def build_parser() -> argparse.ArgumentParser: + """Build the argument parser.""" + parser = argparse.ArgumentParser( + prog="humeo", + description="Humeo - Automated podcast-to-shorts pipeline from YouTube or local MP4", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + humeo --long-to-shorts "https://youtube.com/watch?v=abc123" + humeo --long-to-shorts "C:\\Videos\\episode.mp4" + humeo --long-to-shorts "https://youtube.com/watch?v=abc123" --work-dir .humeo_work + humeo --long-to-shorts "https://youtube.com/watch?v=abc123" --gemini-model gemini-2.0-flash + """, + ) + + parser.add_argument( + "--long-to-shorts", + metavar="SOURCE", + required=True, + help="YouTube video URL or local MP4 path to process", + ) + + parser.add_argument( + "--output", "-o", + type=Path, + default=Path("output"), + help="Output directory for final shorts (default: ./output)", + ) + + parser.add_argument( + "--work-dir", + type=Path, + default=None, + help="Working directory for intermediate files. Default: per-video folder under the " + "cache root (see docs/ENVIRONMENT.md). Use this to force e.g. ./.humeo_work.", + ) + + parser.add_argument( + "--no-video-cache", + action="store_true", + help="Do not use per-video cache dirs; use ./.humeo_work unless --work-dir is set.", + ) + + parser.add_argument( + "--cache-root", + type=Path, + default=None, + help="Override cache root for manifests and per-video ingest (env: HUMEO_CACHE_ROOT).", + ) + + parser.add_argument( + "--gemini-model", + default=None, + help="Gemini model id for clip selection (default: GEMINI_MODEL env; see humeo.config).", + ) + + parser.add_argument( + "--render-theme", + choices=["legacy", "reference_lower_third", "native_highlight"], + default="native_highlight", + help="Visual theme for title/caption rendering (default: native_highlight).", + ) + + parser.add_argument( + "--hook-library-path", + type=Path, + default=None, + help="Zip or directory containing retrieved viral hook examples (env: HUMEO_HOOK_LIBRARY_PATH).", + ) + + parser.add_argument( + "--segmentation-provider", + choices=["off", "replicate"], + default=DEFAULT_SEGMENTATION_PROVIDER, + help=( + "Speaker-centering tracker. Defaults to HUMEO_SEGMENTATION_PROVIDER when set, " + "otherwise replicate if REPLICATE_API_TOKEN exists, else off." 
+ ), + ) + + parser.add_argument( + "--segmentation-model", + default="meta/sam-2-video", + help="Segmentation model id used by the fallback tracker (default: meta/sam-2-video).", + ) + + parser.add_argument( + "--force-clip-selection", + action="store_true", + help="Re-run clip-selection LLM even when clips.meta.json matches the transcript.", + ) + + parser.add_argument( + "--gemini-vision-model", + default=None, + help="Gemini model for per-keyframe layout + bbox (default: GEMINI_VISION_MODEL env or --gemini-model).", + ) + + parser.add_argument( + "--force-layout-vision", + action="store_true", + help="Re-run Gemini vision for layouts even when layout_vision.meta.json matches.", + ) + + parser.add_argument( + "--prune-level", + choices=["off", "conservative", "balanced", "aggressive"], + default="balanced", + help=( + "Stage 2.5 inner-clip content pruning aggressiveness. " + "'off' skips pruning entirely; 'conservative' trims <=10%%, " + "'balanced' <=20%%, 'aggressive' <=35%% of each clip " + "(always clamped to the MIN_CLIP_DURATION_SEC floor). Default: balanced." + ), + ) + + parser.add_argument( + "--force-content-pruning", + action="store_true", + help="Re-run content-pruning LLM even when prune.meta.json matches.", + ) + + parser.add_argument( + "--no-hook-detection", + action="store_true", + help=( + "Skip Stage 2.25 hook detection. The selector's hook window " + "(possibly the 0.0-3.0s placeholder) will be carried through. " + "Stage 2.5 content pruning still treats that exact placeholder " + "as 'no hook' so pruning is not disabled." + ), + ) + + parser.add_argument( + "--force-hook-detection", + action="store_true", + help="Re-run hook-detection LLM even when hooks.meta.json matches.", + ) + + parser.add_argument( + "--clean-run", + action="store_true", + help=( + "Run with a fresh work dir and no cache reuse. Implies --no-video-cache, " + "--force-clip-selection, --force-layout-vision, and overwrite existing outputs." + ), + ) + + parser.add_argument( + "--interactive", "-i", + action="store_true", + help="Pause after clip selection and after render for human approval.", + ) + + parser.add_argument( + "--subtitle-font-size", + type=int, + default=48, + help=( + "Caption font size in output pixels. libass is pinned to " + "original_size=1080x1920, so this is a true pixel value. " + "(default: 48)" + ), + ) + + parser.add_argument( + "--subtitle-margin-v", + type=int, + default=160, + help="Caption bottom margin in output pixels (default: 160)", + ) + + parser.add_argument( + "--subtitle-max-words", + type=int, + default=4, + help="Max words per subtitle cue (default: 4)", + ) + + parser.add_argument( + "--subtitle-max-cue-sec", + type=float, + default=2.2, + help="Max subtitle cue duration in seconds (default: 2.2)", + ) + + parser.add_argument( + "--caption-highlight-lead-ms", + type=float, + default=60.0, + help="Native-highlight word box lead time in milliseconds (default: 60)", + ) + + parser.add_argument( + "--caption-highlight-min-dwell-ms", + type=float, + default=160.0, + help=( + "Minimum native-highlight word box dwell in milliseconds " + "when timing allows (default: 160)" + ), + ) + + parser.add_argument( + "--no-caption-timing-repair", + action="store_true", + help="Disable conservative repair of suspicious word-level ASR timings.", + ) + + parser.add_argument( + "--no-subtitles", + action="store_true", + help="Skip burning subtitles. 
Useful when the source already has captions baked in.", + ) + + parser.add_argument( + "--no-render-qa", + action="store_true", + help="Skip automatic render QA contact sheets, scores, and debug overlays.", + ) + + parser.add_argument( + "--qa-reference-video", + type=Path, + default=None, + help="Optional reference video for automatic A/B contact-sheet comparison.", + ) + + parser.add_argument( + "--no-qa-debug-overlay", + action="store_true", + help="Skip low-res crop/debug overlay videos in render QA.", + ) + + parser.add_argument( + "--rerender-clip", + action="append", + default=[], + metavar="CLIP_ID", + help="Rerender only this clip id, e.g. 002 or short_002. Can be repeated.", + ) + + parser.add_argument( + "--rerender-warned-only", + action="store_true", + help="Rerender only clips flagged in the existing render_qa/qa_manifest.json.", + ) + + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="Enable debug logging", + ) + + return parser + + +def main(): + """CLI entry point.""" + parser = build_parser() + args = parser.parse_args() + setup_logging(args.verbose) + + use_video_cache = not args.no_video_cache + force_clip_selection = args.force_clip_selection + force_layout_vision = args.force_layout_vision + force_content_pruning = args.force_content_pruning + force_hook_detection = args.force_hook_detection + detect_hooks = not args.no_hook_detection + overwrite_outputs = False + work_dir = args.work_dir + + if args.clean_run: + use_video_cache = False + force_clip_selection = True + force_layout_vision = True + force_content_pruning = True + force_hook_detection = True + overwrite_outputs = True + if work_dir is None: + stamp = datetime.now().strftime("%Y%m%d_%H%M%S") + work_dir = Path(f".humeo_work_clean_{stamp}") + + config = PipelineConfig( + youtube_url=args.long_to_shorts, + output_dir=args.output, + work_dir=work_dir, + use_video_cache=use_video_cache, + cache_root=args.cache_root, + gemini_model=args.gemini_model, + gemini_vision_model=args.gemini_vision_model, + render_theme=args.render_theme, + hook_library_path=args.hook_library_path, + segmentation_provider=args.segmentation_provider, + segmentation_model=args.segmentation_model, + force_clip_selection=force_clip_selection, + force_layout_vision=force_layout_vision, + clean_run=args.clean_run, + overwrite_outputs=overwrite_outputs, + interactive=args.interactive, + prune_level=args.prune_level, + force_content_pruning=force_content_pruning, + detect_hooks=detect_hooks, + force_hook_detection=force_hook_detection, + subtitle_font_size=args.subtitle_font_size, + subtitle_margin_v=args.subtitle_margin_v, + subtitle_max_words_per_cue=args.subtitle_max_words, + subtitle_max_cue_sec=args.subtitle_max_cue_sec, + burn_subtitles=not args.no_subtitles, + subtitle_highlight_lead_sec=max(0.0, args.caption_highlight_lead_ms / 1000.0), + subtitle_highlight_min_dwell_sec=max( + 0.02, + args.caption_highlight_min_dwell_ms / 1000.0, + ), + repair_subtitle_word_timings=not args.no_caption_timing_repair, + render_qa=not args.no_render_qa, + qa_reference_video=args.qa_reference_video, + qa_debug_overlay=not args.no_qa_debug_overlay, + rerender_clip_ids=args.rerender_clip, + rerender_warned_only=args.rerender_warned_only, + ) + + try: + outputs = run_pipeline(config) + print(f"\nDone. 
{len(outputs)} shorts generated in: {config.output_dir}") + for p in outputs: + print(f" -> {p}") + except KeyboardInterrupt: + print("\nPipeline interrupted.") + sys.exit(1) + except Exception as e: + logging.getLogger(__name__).error("Pipeline failed: %s", e, exc_info=True) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/src/humeo/clip_assembly.py b/src/humeo/clip_assembly.py new file mode 100644 index 0000000000000000000000000000000000000000..4a5e33575be3f40e06c54e0640f35ceb41f8af2b --- /dev/null +++ b/src/humeo/clip_assembly.py @@ -0,0 +1,303 @@ +"""Hard-cut filler/silence cleanup by assembling multiple kept spans.""" + +from __future__ import annotations + +import json +import logging +import re +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from humeo_core.schemas import Clip, ClipPlan, ClipRenderSpan + +from humeo.render_window import effective_export_bounds + +logger = logging.getLogger(__name__) + +_SPAN_BREAK_MIN_GAP_SEC = 0.55 +_SPAN_EDGE_PAD_SEC = 0.05 +_SPAN_MIN_DURATION_SEC = 0.30 +_FILLER_SPAN_MIN_DURATION_SEC = 0.12 +_SEGMENT_BREAK_MIN_GAP_SEC = 0.65 +_SEGMENT_MAX_DURATION_SEC = 6.0 +_SEGMENT_MAX_WORDS = 18 +_FILLER_CUT_PAD_SEC = 0.02 +_FILLER_WORD_RE = re.compile(r"^(u+h+|u+m+|e+h+|e+r+|a+h+|h+m+|m+m+)$", re.IGNORECASE) +_FILLER_WORDS = { + "ah", + "eh", + "er", + "hmm", + "mm", + "uh", + "uhh", + "uhm", + "um", + "umm", +} + + +@dataclass(frozen=True) +class AssembledClip: + source_path: Path + clip: Clip + transcript: dict + spans: list[ClipRenderSpan] + + +def _iter_words(transcript: dict) -> list[dict]: + words: list[dict] = [] + for seg in transcript.get("segments", []) or []: + for raw in seg.get("words", []) or []: + try: + word = { + "word": str(raw.get("word", "")).strip(), + "start": float(raw["start"]), + "end": float(raw["end"]), + } + except (KeyError, TypeError, ValueError): + continue + if not word["word"] or word["end"] <= word["start"]: + continue + words.append(word) + return words + + +def _clean_word_token(text: str) -> str: + return re.sub(r"(^[^A-Za-z]+|[^A-Za-z]+$)", "", text or "").lower() + + +def _looks_like_filler_word(text: str) -> bool: + token = _clean_word_token(text) + if not token: + return False + return token in _FILLER_WORDS or bool(_FILLER_WORD_RE.fullmatch(token)) + + +def derive_render_spans(clip: Clip, transcript: dict) -> list[ClipRenderSpan]: + if clip.render_spans: + return list(clip.render_spans) + + start_sec, end_sec = effective_export_bounds(clip) + words = [ + word + for word in _iter_words(transcript) + if word["end"] > start_sec and word["start"] < end_sec + ] + if not words: + return [ClipRenderSpan(start_time_sec=start_sec, end_time_sec=end_sec)] + + spans: list[ClipRenderSpan] = [] + span_start: float | None = None + prev_end: float | None = None + resume_after = start_sec + + for word in words: + word_start = float(word["start"]) + word_end = float(word["end"]) + if _looks_like_filler_word(str(word["word"])): + if span_start is not None and prev_end is not None: + span_end = min(end_sec, max(span_start, word_start - _FILLER_CUT_PAD_SEC)) + if span_end - span_start >= _FILLER_SPAN_MIN_DURATION_SEC: + spans.append(ClipRenderSpan(start_time_sec=span_start, end_time_sec=span_end)) + span_start = None + prev_end = None + resume_after = min(end_sec, word_end + _FILLER_CUT_PAD_SEC) + continue + if span_start is None: + span_start = max(start_sec, word_start - _SPAN_EDGE_PAD_SEC, resume_after) + prev_end = word_end + continue + if prev_end is not None and 
word_start - prev_end >= _SPAN_BREAK_MIN_GAP_SEC: + span_end = min(end_sec, prev_end + _SPAN_EDGE_PAD_SEC) + if span_end - span_start >= _SPAN_MIN_DURATION_SEC: + spans.append(ClipRenderSpan(start_time_sec=span_start, end_time_sec=span_end)) + span_start = max(start_sec, word_start - _SPAN_EDGE_PAD_SEC) + prev_end = word_end + + if span_start is None or prev_end is None: + if not spans: + spans.append(ClipRenderSpan(start_time_sec=start_sec, end_time_sec=end_sec)) + return spans + + final_end = min(end_sec, prev_end + _SPAN_EDGE_PAD_SEC) + if final_end - span_start >= _SPAN_MIN_DURATION_SEC: + spans.append(ClipRenderSpan(start_time_sec=span_start, end_time_sec=final_end)) + + if not spans: + spans.append(ClipRenderSpan(start_time_sec=start_sec, end_time_sec=end_sec)) + return spans + + +def apply_render_spans(clips: list[Clip], transcript: dict) -> list[Clip]: + out: list[Clip] = [] + for clip in clips: + spans = derive_render_spans(clip, transcript) + out.append(clip.model_copy(update={"render_spans": spans})) + return out + + +def _segment_local_words(words: list[dict], *, language: str) -> dict: + segments: list[dict] = [] + chunk: list[dict] = [] + + def flush() -> None: + if not chunk: + return + segments.append( + { + "start": chunk[0]["start"], + "end": chunk[-1]["end"], + "text": " ".join(str(word["word"]) for word in chunk).strip(), + "words": list(chunk), + } + ) + chunk.clear() + + for word in words: + if chunk: + gap = float(word["start"]) - float(chunk[-1]["end"]) + dur = float(word["end"]) - float(chunk[0]["start"]) + if ( + gap >= _SEGMENT_BREAK_MIN_GAP_SEC + or dur >= _SEGMENT_MAX_DURATION_SEC + or len(chunk) >= _SEGMENT_MAX_WORDS + ): + flush() + chunk.append(word) + flush() + return {"segments": segments, "language": language} + + +def build_assembled_transcript(clip: Clip, transcript: dict) -> dict: + words = _iter_words(transcript) + local_words: list[dict] = [] + current_offset = 0.0 + for span in derive_render_spans(clip, transcript): + for word in words: + if word["end"] <= span.start_time_sec or word["start"] >= span.end_time_sec: + continue + if _looks_like_filler_word(str(word["word"])): + continue + local_words.append( + { + "word": word["word"], + "start": max(word["start"], span.start_time_sec) - span.start_time_sec + current_offset, + "end": min(word["end"], span.end_time_sec) - span.start_time_sec + current_offset, + } + ) + current_offset += span.duration_sec + language = str(transcript.get("language") or "en") + return _segment_local_words(local_words, language=language) + + +def _ffmpeg_concat_filter(spans: list[ClipRenderSpan]) -> str: + parts: list[str] = [] + for idx, span in enumerate(spans): + parts.append( + f"[0:v]trim=start={span.start_time_sec:.3f}:end={span.end_time_sec:.3f},setpts=PTS-STARTPTS[v{idx}]" + ) + parts.append( + f"[0:a]atrim=start={span.start_time_sec:.3f}:end={span.end_time_sec:.3f},asetpts=PTS-STARTPTS[a{idx}]" + ) + concat_inputs = "".join(f"[v{idx}][a{idx}]" for idx in range(len(spans))) + parts.append(f"{concat_inputs}concat=n={len(spans)}:v=1:a=1[vout][aout]") + return ";".join(parts) + + +def assemble_clip( + source_path: Path, + clip: Clip, + transcript: dict, + output_dir: Path, +) -> AssembledClip: + spans = derive_render_spans(clip, transcript) + output_dir.mkdir(parents=True, exist_ok=True) + assembled_path = output_dir / f"clip_{clip.clip_id}.mp4" + + ffmpeg = shutil.which("ffmpeg") + if not ffmpeg: + raise RuntimeError("ffmpeg not found on PATH") + + cmd = [ + ffmpeg, + "-y", + "-i", + str(source_path), + 
"-filter_complex", + _ffmpeg_concat_filter(spans), + "-map", + "[vout]", + "-map", + "[aout]", + "-c:v", + "libx264", + "-preset", + "veryfast", + "-crf", + "20", + "-c:a", + "aac", + "-b:a", + "160k", + "-movflags", + "+faststart", + str(assembled_path), + ] + subprocess.run(cmd, check=True, capture_output=True) + + assembled_transcript = build_assembled_transcript(clip, transcript) + assembled_transcript_path = output_dir / f"clip_{clip.clip_id}.transcript.json" + assembled_transcript_path.write_text( + json.dumps(assembled_transcript, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + timeline_path = output_dir / f"clip_{clip.clip_id}.timeline.json" + timeline_path.write_text( + json.dumps( + { + "clip_id": clip.clip_id, + "source_spans": [span.model_dump() for span in spans], + "assembled_duration_sec": sum(span.duration_sec for span in spans), + }, + indent=2, + ) + + "\n", + encoding="utf-8", + ) + + assembled_duration = sum(span.duration_sec for span in spans) + assembled_clip = clip.model_copy( + update={ + "start_time_sec": 0.0, + "end_time_sec": assembled_duration, + "trim_start_sec": 0.0, + "trim_end_sec": 0.0, + "hook_start_sec": None, + "hook_end_sec": None, + "render_spans": [], + } + ) + logger.info( + "Assembled clip %s into %d span(s): %.1fs -> %.1fs", + clip.clip_id, + len(spans), + clip.duration_sec, + assembled_duration, + ) + return AssembledClip( + source_path=assembled_path, + clip=assembled_clip, + transcript=assembled_transcript, + spans=spans, + ) + + +def write_clip_plan(path: Path, clips: list[Clip]) -> Path: + path.write_text( + ClipPlan(source_path="", clips=clips).model_dump_json(indent=2) + "\n", + encoding="utf-8", + ) + return path diff --git a/src/humeo/clip_selection_cache.py b/src/humeo/clip_selection_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..a84d84d5e8d3ffeb1009a73ca17c4856d3ceb246 --- /dev/null +++ b/src/humeo/clip_selection_cache.py @@ -0,0 +1,85 @@ +"""Persist Gemini clip-selection output and skip re-inference when transcript matches.""" + +from __future__ import annotations + +import hashlib +import json +import logging +from pathlib import Path +from typing import Any + +from humeo.config import GEMINI_MODEL, PipelineConfig +from humeo.env import current_llm_provider +from humeo.hook_library import hook_library_fingerprint, resolve_hook_library_path + +logger = logging.getLogger(__name__) + +# v3: includes hook-library fingerprint for retrieval-augmented prompts. 
+CURRENT_META_VERSION = 3 +META_FILENAME = "clips.meta.json" +RAW_FILENAME = "clip_selection_raw.json" + + +def transcript_fingerprint(transcript: dict) -> str: + payload = json.dumps(transcript, sort_keys=True, ensure_ascii=False) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def resolved_gemini_model(config: PipelineConfig) -> str: + return (config.gemini_model or GEMINI_MODEL).strip() + + +def load_meta(work_dir: Path) -> dict[str, Any] | None: + path = work_dir / META_FILENAME + if not path.is_file(): + return None + with open(path, encoding="utf-8") as f: + return json.load(f) + + +def cache_valid(meta: dict[str, Any], fingerprint: str, config: PipelineConfig) -> bool: + if meta.get("transcript_sha256") != fingerprint: + return False + gm = resolved_gemini_model(config) + current_provider = current_llm_provider() + meta_provider = meta.get("llm_backend") + if current_provider == "openrouter": + if meta_provider != "openrouter": + return False + elif current_provider == "google": + if meta_provider not in (None, "google"): + return False + ver = meta.get("version", 1) + if ver >= CURRENT_META_VERSION: + return ( + meta.get("gemini_model") == gm + and meta.get("hook_library_sha256", "") + == hook_library_fingerprint(resolve_hook_library_path(config)) + ) + # Legacy v1: had llm_provider + model fields + if meta.get("llm_provider") == "openai": + return False + return meta.get("gemini_model") == gm + + +def write_artifacts( + work_dir: Path, + *, + transcript: dict, + config: PipelineConfig, + raw_response: str, +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + fp = transcript_fingerprint(transcript) + meta: dict[str, Any] = { + "version": CURRENT_META_VERSION, + "transcript_sha256": fp, + "gemini_model": resolved_gemini_model(config), + "llm_backend": current_llm_provider() or "google", + "hook_library_sha256": hook_library_fingerprint(resolve_hook_library_path(config)), + } + (work_dir / RAW_FILENAME).write_text(raw_response, encoding="utf-8") + with open(work_dir / META_FILENAME, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + f.write("\n") + logger.info("Wrote %s and %s", META_FILENAME, RAW_FILENAME) diff --git a/src/humeo/clip_selector.py b/src/humeo/clip_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..47887687fcd548f8d92b1374c5298be0dc0734fd --- /dev/null +++ b/src/humeo/clip_selector.py @@ -0,0 +1,674 @@ +""" +Step 2 - Clip Selection: Gemini-only LLM for viral clip identification. + +Uses the unified ``google-genai`` SDK (``from google import genai``). 
See: +https://github.com/googleapis/python-genai +""" + +from __future__ import annotations + +import json +import logging +import re +import time +from pathlib import Path +from typing import Callable, TypeVar + +from google import genai +from openai import OpenAI + +from humeo.gemini_generate import gemini_generate_config + +from humeo_core.schemas import Clip, ClipPlan + +from humeo.config import ( + GEMINI_MODEL, + MAX_CLIP_DURATION_SEC, + MIN_CLIP_DURATION_SEC, + TEXT_AXIS_WEIGHTS, + TARGET_CLIP_COUNT, +) +from humeo.env import ( + OPENROUTER_BASE_URL, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.hook_library import ( + format_hook_examples, + retrieve_hook_examples, +) +from humeo.prompt_loader import clip_selection_prompts + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + +LLM_MAX_ATTEMPTS = 3 +LLM_RETRY_DELAY_SEC = 2.0 + +# Over-generation defaults (also exposed via PipelineConfig so callers can +# override per-run without touching code). Rationale: +# +# - Ask Gemini for a *pool* of ~12 candidates at temperature 0.7 so the model +# considers a wider slice of the transcript instead of locking onto the +# first 5 obvious ones. More candidates -> more chance the actual gold +# nugget is in the list. +# - Then rank by ``virality_score`` and keep everything >= threshold, but +# always keep at least ``min_kept`` and at most ``max_kept`` clips. This +# lets a single strong clip survive a weak transcript ("keep the best 5 +# even if no one clears the bar") AND lets an exceptionally rich +# transcript ship 7-8 strong shorts instead of artificially capping at 5. +DEFAULT_CANDIDATE_COUNT = 12 +DEFAULT_QUALITY_THRESHOLD = 0.70 +DEFAULT_MIN_KEPT = TARGET_CLIP_COUNT +DEFAULT_MAX_KEPT = 8 +# Higher than the old 0.3 so the pool is meaningfully different from +# "the same five most-obvious clips every run". Still well below 1.0 so we +# do not get word-salad IDs or timestamps. 
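+# Worked example of the threshold+floor+cap policy (scores illustrative,
+# quality penalties ignored for simplicity): a pool of 12 candidates scored
+# [0.86, 0.81, 0.74, 0.69, 0.66, ...] keeps the three clips >= 0.70 outright,
+# then backfills the next two by rank to reach DEFAULT_MIN_KEPT = 5.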
+DEFAULT_CANDIDATE_TEMPERATURE = 0.7 +_TITLE_SMALL_WORDS = { + "a", + "an", + "and", + "as", + "at", + "by", + "for", + "from", + "in", + "of", + "on", + "or", + "the", + "to", + "vs", + "with", +} +_TITLE_DROP_WORDS = { + "actually", + "entirely", + "just", + "next", + "really", + "still", + "that", + "their", + "these", + "this", + "those", + "very", + "will", + "your", +} +_TITLE_BLAND_WORDS = { + "big", + "future", + "important", + "lesson", + "matter", + "matters", + "opportunity", + "reason", + "soon", + "story", + "thing", +} +_GENERIC_TITLE_PATTERNS = ( + "big opportunity", + "future of", + "important lesson", + "start a business with ai", + "why this matters", + "what this means", +) +_TITLE_TOKEN_REPLACEMENTS = { + "ai": "AI", + "agi": "AGI", + "api": "API", + "btc": "BTC", + "ev": "EV", + "evs": "EVs", + "us": "US", +} +_POWER_TITLE_TOKENS = {"$", "%", "under", "beats", "fewer", "more", "less", "vs"} +_FILLER_OPENERS = { + "actually", + "basically", + "i", + "kind", + "look", + "listen", + "now", + "okay", + "ok", + "right", + "so", + "sort", + "well", + "yeah", + "you", +} +_FILLER_OPENING_PHRASES = { + "i mean", + "kind of", + "sort of", + "you know", +} +_PREFERRED_MAX_DURATION_SEC = 72.0 + + +def _has_valid_duration(clip: Clip) -> bool: + """Return True when the clip window satisfies the product duration contract.""" + return MIN_CLIP_DURATION_SEC <= clip.duration_sec <= MAX_CLIP_DURATION_SEC + + +def _text_composite_score(clip: Clip) -> float: + """Weighted composite from the text-axis breakdown, falling back to virality_score. + + Cache compatibility note: + - New Ticket 3 clips use the three-axis rubric (message_wow / hook_emotion / catchy). + - Older caches may still contain legacy rule-name ``score_breakdown`` maps from the + pre-Ticket-3 prompt. If none of the expected axes are present, fall back cleanly + to ``virality_score`` instead of treating the legacy shape as three missing axes. 
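+
+    Worked example with the default ``TEXT_AXIS_WEIGHTS`` (0.4 / 0.35 / 0.25):
+    a breakdown of ``{"message_wow": 0.8, "hook_emotion": 0.7, "catchy": 0.6}``
+    yields ``0.8*0.4 + 0.7*0.35 + 0.6*0.25 = 0.715``.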
+ """ + if not clip.score_breakdown: + return clip.virality_score + + present_expected_axes = [axis for axis in TEXT_AXIS_WEIGHTS if axis in clip.score_breakdown] + if not present_expected_axes: + return clip.virality_score + + total = 0.0 + missing: list[str] = [] + for axis, weight in TEXT_AXIS_WEIGHTS.items(): + value = clip.score_breakdown.get(axis) + if value is None: + missing.append(axis) + continue + total += value * weight + + if missing: + logger.warning( + "Clip %s score_breakdown missing axis(es) %s; treating as 0.0.", + clip.clip_id, + ", ".join(missing), + ) + return total + + +def _title_quality_penalty(clip: Clip) -> float: + title = _tighten_overlay_title_text(clip.suggested_overlay_title or "") + if not title: + return 0.0 + penalty = 0.0 + if _looks_generic_title(title): + penalty += 0.18 + tokens = [token for token in _normalized_title(title).split() if token] + if len(tokens) < 2 or len(tokens) > 6: + penalty += 0.05 + if not any(token in title.lower() for token in _POWER_TITLE_TOKENS) and not any( + ch.isdigit() for ch in title + ): + penalty += 0.03 + return min(0.22, penalty) + + +def _hook_quality_penalty(clip: Clip) -> float: + penalty = 0.0 + if clip.hook_start_sec is not None and clip.hook_start_sec > 5.0: + penalty += min(0.18, 0.06 + (clip.hook_start_sec - 5.0) * 0.025) + opener = " ".join((clip.viral_hook or clip.transcript or "").split()).lower() + if opener: + first_words = opener.split() + first_word = first_words[0] if first_words else "" + opening_phrase = " ".join(first_words[:2]) + if first_word in _FILLER_OPENERS: + penalty += 0.14 + if opening_phrase in _FILLER_OPENING_PHRASES: + penalty += 0.06 + if len(first_words) >= 12: + penalty += 0.03 + return min(0.24, penalty) + + +def _duration_quality_penalty(clip: Clip) -> float: + if clip.duration_sec <= _PREFERRED_MAX_DURATION_SEC: + return 0.0 + drift = clip.duration_sec - _PREFERRED_MAX_DURATION_SEC + return min(0.14, 0.03 + drift * 0.01) + + +def clip_quality_penalty(clip: Clip) -> float: + return min( + 0.42, + _title_quality_penalty(clip) + + _hook_quality_penalty(clip) + + _duration_quality_penalty(clip), + ) + + +def clip_quality_priority_score(clip: Clip) -> float: + review_penalty = 0.5 if clip.needs_review else 0.0 + composite = _text_composite_score(clip) + return composite - review_penalty - clip_quality_penalty(clip) + + +def renumber_clips_dense(clips: list[Clip]) -> list[Clip]: + renumbered: list[Clip] = [] + for idx, clip in enumerate(clips, start=1): + new_id = f"{idx:03d}" + renumbered.append(clip if clip.clip_id == new_id else clip.model_copy(update={"clip_id": new_id})) + return renumbered + + +def _openai_message_text(content: object) -> str: + """Normalize OpenAI-compatible message content into plain text.""" + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + + +def _retry_llm(name: str, fn: Callable[[], T], attempts: int = LLM_MAX_ATTEMPTS) -> T: + last: Exception | None = None + for i in range(attempts): + try: + return fn() + except Exception as e: + last = e + if i < attempts - 1: + logger.warning("%s attempt %d/%d failed: %s", name, i + 1, attempts, e) + time.sleep(LLM_RETRY_DELAY_SEC * (i + 1)) + assert last is not None + raise last + + +def _headline_case_title(text: str) -> str: + words = text.split() + if not words: + return "" 
+ out: list[str] = [] + for idx, word in enumerate(words): + if any(ch.isdigit() for ch in word) or word.startswith("$"): + out.append(word) + continue + raw = re.sub(r"^[^A-Za-z]+|[^A-Za-z]+$", "", word) + lower = raw.lower() + if lower in _TITLE_TOKEN_REPLACEMENTS: + out.append(word.replace(raw, _TITLE_TOKEN_REPLACEMENTS[lower])) + continue + if idx not in (0, len(words) - 1) and lower in _TITLE_SMALL_WORDS: + out.append(word.replace(raw, lower)) + continue + out.append(word.replace(raw, raw.capitalize())) + return " ".join(out) + + +def _normalized_title(text: str) -> str: + return re.sub(r"\s+", " ", re.sub(r"[^a-z0-9$% ]+", " ", (text or "").lower())).strip() + + +def _looks_generic_title(text: str) -> bool: + normalized = _normalized_title(text) + if not normalized: + return True + if any(pattern in normalized for pattern in _GENERIC_TITLE_PATTERNS): + return True + tokens = [token for token in normalized.split() if token] + bland_count = sum(token in _TITLE_BLAND_WORDS for token in tokens) + return bland_count >= 2 + + +def _tighten_overlay_title_text(text: str) -> str: + title = " ".join((text or "").replace("—", "-").split()).strip(" .,!?:;-") + if not title: + return "" + title = re.sub(r"\bwill cost less than\b", "under", title, flags=re.IGNORECASE) + title = re.sub(r"\bless than\b", "under", title, flags=re.IGNORECASE) + title = re.sub(r"\bmade your\b", "", title, flags=re.IGNORECASE) + title = re.sub(r"\bis still\b", "is", title, flags=re.IGNORECASE) + title = re.sub(r"\bis creating\b", "creates", title, flags=re.IGNORECASE) + title = re.sub(r"\bthere are\b", "", title, flags=re.IGNORECASE) + title = re.sub(r"\bentirely\b", "", title, flags=re.IGNORECASE) + words = title.split() + while len(words) > 6: + filtered = [word for word in words if word.lower() not in _TITLE_DROP_WORDS] + if len(filtered) == len(words): + break + words = filtered + if len(words) > 4: + words = [word for word in words if word.lower() not in {"your", "next"} or len(words) <= 4] + if len(words) > 6 and words[0].lower() in {"why", "how", "when"}: + words = words[1:] + if len(words) > 6: + words = words[:6] + return _headline_case_title(" ".join(words).strip(" .,!?:;-")) + + +def _polish_overlay_title(clip: Clip) -> str: + current = _tighten_overlay_title_text(clip.suggested_overlay_title or "") + if current and not _looks_generic_title(current): + return current + for candidate in (clip.viral_hook or "", clip.topic or ""): + polished = _tighten_overlay_title_text(candidate) + if polished and not _looks_generic_title(polished): + return polished + return current + + +def _polish_clip_metadata(clip: Clip) -> Clip: + title = _polish_overlay_title(clip) + if not title or title == clip.suggested_overlay_title: + return clip + return clip.model_copy(update={"suggested_overlay_title": title}) + + +def build_prompt( + transcript: dict, + *, + candidate_count: int = DEFAULT_CANDIDATE_COUNT, + steering_notes: list[str] | None = None, + hook_library_path: Path | None = None, +) -> tuple[str, str]: + """Return ``(system_prompt, user_message)`` for the clip-selector LLM call. + + ``candidate_count`` is the size of the candidate POOL we ask Gemini for. + A downstream ranker (``rank_and_filter_clips``) then keeps the top + clips that clear the quality threshold. Defaults preserve the previous + visible output (5 clips) when the pool is narrow.
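+
+    Typical call (sketch)::
+
+        system, user = build_prompt(transcript, candidate_count=12)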
+ """ + lines = [] + for seg in transcript.get("segments", []): + start = seg.get("start", 0) + end = seg.get("end", 0) + text = seg.get("text", "").strip() + lines.append(f"[{start:.1f}s - {end:.1f}s] {text}") + + transcript_text = "\n".join(lines) + + hook_examples = format_hook_examples( + retrieve_hook_examples( + transcript_text[:8000], + path=hook_library_path, + limit=8, + ) + ) + + system, user = clip_selection_prompts( + transcript_text=transcript_text, + min_dur=MIN_CLIP_DURATION_SEC, + max_dur=MAX_CLIP_DURATION_SEC, + count=candidate_count, + steering_notes=steering_notes, + hook_examples=hook_examples, + ) + return system, user + + +def rank_and_filter_clips( + clips: list[Clip], + *, + threshold: float = DEFAULT_QUALITY_THRESHOLD, + min_kept: int = DEFAULT_MIN_KEPT, + max_kept: int = DEFAULT_MAX_KEPT, +) -> list[Clip]: + """Rank ``clips`` by text composite (or legacy ``virality_score``) and apply + the threshold+floor+cap. + + Rules (in order, with clear precedence): + + 1. Sort descending by the text composite score when the Ticket 3 + three-axis ``score_breakdown`` is present; otherwise fall back to the + legacy ``virality_score``. + 2. Keep clips whose active score signal is ``>= threshold`` (or + ``needs_review`` cleared). Reviewed-out clips (``needs_review=True``) + are always sent to the back of the priority queue. + 3. If fewer than ``min_kept`` clips passed the threshold, fill up from + the remaining clips in rank order until we reach ``min_kept`` (or + run out of candidates). + 4. Cap the final list at ``max_kept`` entries. + 5. Renumber ``clip_id`` to ``001``, ``002``, ... so downstream artifacts + (keyframes, subtitles, output filenames) stay dense and ordered. + + This is the "threshold with a floor" policy the user asked for: quality + first, but never ship zero shorts when the transcript is weak. + """ + if not clips: + return [] + + score_signal = {id(c): _text_composite_score(c) for c in clips} + priority_signal = {id(c): clip_quality_priority_score(c) for c in clips} + + def _priority(c: Clip) -> tuple[float, float]: + return (priority_signal[id(c)], score_signal[id(c)]) + + valid: list[Clip] = [] + invalid: list[Clip] = [] + for clip in clips: + if _has_valid_duration(clip): + valid.append(clip) + else: + invalid.append(clip) + logger.warning( + "Clip %s dropped before ranking: duration %.1fs is outside [%ds, %ds] - %s", + clip.clip_id, + clip.duration_sec, + MIN_CLIP_DURATION_SEC, + MAX_CLIP_DURATION_SEC, + clip.topic, + ) + + if not valid: + logger.warning( + "Clip ranking: 0 valid candidates remain after duration filtering (dropped=%d).", + len(invalid), + ) + return [] + + ordered = sorted(valid, key=_priority, reverse=True) + + strong = [c for c in ordered if priority_signal[id(c)] >= threshold and not c.needs_review] + kept = list(strong) + + if len(kept) < min_kept: + backfill = [c for c in ordered if c not in kept] + for c in backfill: + if len(kept) >= min_kept: + break + kept.append(c) + + if len(kept) < min_kept: + logger.warning( + "Clip ranking: only %d valid candidates remain after duration filtering; " + "cannot satisfy min_kept=%d without invalid clips.", + len(kept), + min_kept, + ) + + if len(kept) > max_kept: + kept = kept[:max_kept] + + # Renumber clip_ids so consumers (filenames, layout vision, subtitles) + # always see 001..NNN in rank order regardless of what the LLM returned. 
+ renumbered = renumber_clips_dense(kept) + + dropped = len(valid) - len(kept) + len(invalid) + logger.info( + "Clip ranking: kept %d / %d candidates (threshold=%.2f, min=%d, max=%d, dropped=%d).", + len(renumbered), + len(clips), + threshold, + min_kept, + max_kept, + dropped, + ) + for c in renumbered: + logger.info( + " [%s] score=%.2f priority=%.2f penalty=%.2f %s %s", + c.clip_id, + c.virality_score, + clip_quality_priority_score(c), + clip_quality_penalty(c), + "(review)" if c.needs_review else "", + c.topic, + ) + return renumbered + + +def select_clips( + transcript: dict, + *, + gemini_model: str | None = None, + hook_library_path: Path | None = None, + candidate_count: int = DEFAULT_CANDIDATE_COUNT, + quality_threshold: float = DEFAULT_QUALITY_THRESHOLD, + min_kept: int = DEFAULT_MIN_KEPT, + max_kept: int = DEFAULT_MAX_KEPT, + temperature: float = DEFAULT_CANDIDATE_TEMPERATURE, + steering_notes: list[str] | None = None, +) -> tuple[list[Clip], str]: + """ + Call Gemini to select clips. Returns ``(clips, raw_json)`` for caching / debugging. + + The returned clip list has already been ranked + filtered by + :func:`rank_and_filter_clips`. ``raw_json`` is the untouched LLM + response so the cache artifact reflects the entire candidate pool for + audit / re-ranking without another LLM call. + + Uses ``google.genai.Client`` and ``GenerateContentConfig`` (see Google + Gen AI SDK for Python). + """ + provider = resolve_llm_provider() + model_name = model_name_for_provider((gemini_model or GEMINI_MODEL).strip(), provider) + system_prompt, user_text = build_prompt( + transcript, + candidate_count=candidate_count, + steering_notes=steering_notes, + hook_library_path=hook_library_path, + ) + + def _call() -> str: + logger.info( + "%s clip selection (model=%s, candidate_pool=%d, temp=%.2f)...", + provider, + model_name, + candidate_count, + temperature, + ) + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=model_name, + contents=user_text, + config=gemini_generate_config( + system_instruction=system_prompt, + temperature=temperature, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini returned empty response text") + return response.text + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_text}, + ], + temperature=temperature, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter returned empty response text") + return text + + raw = _retry_llm("Gemini clip selection", _call) + candidates = _parse_clips(raw) + # The ranker can only backfill from the pool Gemini returned. If Gemini + # under-delivered (e.g. returned 2 of a requested 12), the min_kept floor + # is unenforceable -- warn loudly so we do not silently ship fewer shorts + # than the caller expected. + if len(candidates) < min_kept: + logger.warning( + "Clip selection: Gemini returned only %d candidates (requested %d, floor %d). 
" + "Output will be capped at %d shorts -- check prompt or transcript length.", + len(candidates), + candidate_count, + min_kept, + len(candidates), + ) + elif len(candidates) < candidate_count: + logger.info( + "Clip selection: Gemini returned %d of %d requested candidates " + "(pool still >= floor of %d).", + len(candidates), + candidate_count, + min_kept, + ) + clips = rank_and_filter_clips( + candidates, + threshold=quality_threshold, + min_kept=min_kept, + max_kept=max_kept, + ) + return clips, raw + + +def _parse_clips(raw_json: str) -> list[Clip]: + """Parse and validate the LLM's JSON response into Clip objects.""" + data = json.loads(raw_json) + clips_data = data.get("clips", data) if isinstance(data, dict) else data + + clips: list[Clip] = [] + for item in clips_data: + payload = dict(item) + payload.pop("duration_sec", None) + clip = _polish_clip_metadata(Clip.model_validate(payload)) + + actual_dur = clip.end_time_sec - clip.start_time_sec + stated_dur = item.get("duration_sec") + if stated_dur is not None and abs(actual_dur - float(stated_dur)) > 1.0: + logger.warning( + "Clip %s: stated duration %.1fs doesn't match (%.1f-%.1f = %.1f).", + clip.clip_id, float(stated_dur), + clip.start_time_sec, clip.end_time_sec, actual_dur, + ) + clips.append(clip) + + logger.info("Parsed %d clips from LLM response", len(clips)) + return clips + + +def save_clips(clips: list[Clip], output_path: Path) -> Path: + """Persist clips to a JSON file using the shared Pydantic schema.""" + plan = ClipPlan(source_path="", clips=list(clips)) + with open(output_path, "w", encoding="utf-8") as f: + f.write(plan.model_dump_json(indent=2)) + logger.info("Saved %d clips to %s", len(clips), output_path) + return output_path + + +def load_clips(clips_path: Path) -> list[Clip]: + """Load clips from a previously saved JSON file.""" + with open(clips_path, "r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict) and "clips" in data: + return [Clip.model_validate(c) for c in data["clips"]] + return [Clip.model_validate(c) for c in data] diff --git a/src/humeo/config.py b/src/humeo/config.py new file mode 100644 index 0000000000000000000000000000000000000000..053e719db08b1f25ef1ea9a80093b68f2f44dcd3 --- /dev/null +++ b/src/humeo/config.py @@ -0,0 +1,159 @@ +"""Configuration for the product pipeline.""" + +import os +from dataclasses import dataclass, field +from pathlib import Path + +from humeo_core.schemas import RenderTheme + +from humeo.env import bootstrap_env + +bootstrap_env() + +# --------------------------------------------------------------------------- +# Video Output +# --------------------------------------------------------------------------- +TARGET_WIDTH = 1080 +TARGET_HEIGHT = 1920 +TARGET_ASPECT = 9 / 16 + +# --------------------------------------------------------------------------- +# Clip Selection +# --------------------------------------------------------------------------- +# Clip length bounds for Gemini (also referenced in prompts/clip_selection_system.jinja2). +MIN_CLIP_DURATION_SEC = 50 +MAX_CLIP_DURATION_SEC = 90 +TARGET_CLIP_COUNT = 5 +TEXT_AXIS_WEIGHTS: dict[str, float] = { + "message_wow": 0.4, + "hook_emotion": 0.35, + "catchy": 0.25, +} + +# Gemini model id (override with GEMINI_MODEL in .env or shell). See docs/ENVIRONMENT.md. +GEMINI_MODEL = (os.environ.get("GEMINI_MODEL") or "google/gemini-2.5-pro").strip() or "google/gemini-2.5-pro" +# Optional *only* when layout vision should use a different id than clip selection +# (e.g. cheaper model per keyframe). 
Empty/unset → ``resolved_vision_model`` uses +# ``GEMINI_MODEL`` / ``PipelineConfig.gemini_model`` (same multimodal stack). +GEMINI_VISION_MODEL = (os.environ.get("GEMINI_VISION_MODEL") or "").strip() or None +DEFAULT_SEGMENTATION_PROVIDER = ( + (os.environ.get("HUMEO_SEGMENTATION_PROVIDER") or "").strip().lower() + or ("replicate" if (os.environ.get("REPLICATE_API_TOKEN") or "").strip() else "off") +) + +# --------------------------------------------------------------------------- +@dataclass +class PipelineConfig: + """Runtime configuration for a single pipeline run.""" + + youtube_url: str | None = None + source: str | None = None + output_dir: Path = field(default_factory=lambda: Path("output")) + # None = auto: per-video dir under the cache root (see docs/ENVIRONMENT.md). + work_dir: Path | None = None + use_video_cache: bool = True + # None = default from env (HUMEO_CACHE_ROOT) or platform default. + cache_root: Path | None = None + + # None = use GEMINI_MODEL from env / module default (Gemini-only clip selection). + gemini_model: str | None = None + # None = GEMINI_VISION_MODEL env or same as gemini_model (per-keyframe layout + bbox). + gemini_vision_model: str | None = None + render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT + hook_library_path: Path | None = None + segmentation_provider: str = DEFAULT_SEGMENTATION_PROVIDER + segmentation_model: str = "meta/sam-2-video" + # When True, always re-run clip-selection LLM (ignore clips.meta.json match). + force_clip_selection: bool = False + # When True, always re-run Gemini vision for layouts (ignore layout_vision.meta.json). + force_layout_vision: bool = False + # When True, use an isolated work dir and force all stages to recompute. + clean_run: bool = False + # When True, render stage overwrites existing output files. + overwrite_outputs: bool = False + # When True, pause after clip selection and after render for human approval. + interactive: bool = False + # Interactive steering notes injected into the clip-selection prompt on reruns. + steering_notes: list[str] = field(default_factory=list) + # Hard cap on interactive reruns. + max_iterations: int = 5 + + # Stage 2.25 - hook detection. The clip selector is unreliable at + # localising the hook sentence and tends to echo the 0.0-3.0s placeholder + # from the prompt verbatim. This dedicated stage reads each candidate + # window and returns a real hook window per clip, which Stage 2.5 then + # uses to clamp pruning safely. When False, the clip-selection hook + # (possibly a placeholder) is carried through unchanged. + detect_hooks: bool = True + # When True, re-run the hook-detection LLM even when hooks.meta.json matches. + force_hook_detection: bool = False + + # Stage 2.5 - inner-clip content pruning (HIVE "irrelevant content pruning" + # applied at clip scale). One of: off | conservative | balanced | aggressive. + # See ``src/humeo/content_pruning.py`` for the caps and the prompt. + prune_level: str = "balanced" + # When True, re-run the pruning LLM even when prune.meta.json matches. + force_content_pruning: bool = False + + # Stage 2 - candidate over-generation. The selector now asks Gemini for a + # pool of candidates (``clip_selection_candidate_count``), scores them, + # and keeps the top ones that pass ``clip_selection_quality_threshold``. + # We always keep at least ``clip_selection_min_kept`` clips even when + # none pass the threshold, so rendering never blocks on a weak transcript. + # See ``src/humeo/clip_selector.py`` for the ranking logic.
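+    # Per-run override sketch (values illustrative):
+    #
+    #   PipelineConfig(source="...", clip_selection_candidate_count=16,
+    #                  clip_selection_quality_threshold=0.75)
+    #
+    # widens the pool while raising the bar; the floor/cap below still apply.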
+ clip_selection_candidate_count: int = 12 + clip_selection_quality_threshold: float = 0.70 + clip_selection_min_kept: int = 5 + clip_selection_max_kept: int = 8 + + # Subtitle rendering / cue shaping. + # Values are in **output pixels** for a 1080x1920 short: libass is pinned to + # the output resolution via ``original_size``, so ``FontSize`` and ``MarginV`` + # mean what they say. 48px font with a 160px bottom margin lands the caption + # in the lower third with a readable-but-not-shouting size. + subtitle_font_size: int = 38 + subtitle_margin_v: int = 166 + subtitle_max_words_per_cue: int = 10 + subtitle_max_cue_sec: float = 2.8 + burn_subtitles: bool = True + subtitle_highlight_lead_sec: float = 0.06 + subtitle_highlight_min_dwell_sec: float = 0.16 + repair_subtitle_word_timings: bool = True + + # Render QA. Best-effort: failures write warnings and do not fail a render. + render_qa: bool = True + qa_reference_video: Path | None = None + qa_debug_overlay: bool = True + rerender_clip_ids: list[str] = field(default_factory=list) + rerender_warned_only: bool = False + + def __post_init__(self): + youtube_url = (self.youtube_url or "").strip() or None + source = (self.source or "").strip() or None + + if source is None and youtube_url is None: + raise ValueError("PipelineConfig requires either source or youtube_url.") + if source is not None and youtube_url is not None and source != youtube_url: + raise ValueError("PipelineConfig source and youtube_url must match when both are set.") + if source is None: + source = youtube_url + if youtube_url is None: + youtube_url = source + + self.source = source + self.youtube_url = youtube_url + if isinstance(self.render_theme, str): + self.render_theme = RenderTheme(self.render_theme) + self.segmentation_provider = (self.segmentation_provider or "off").strip().lower() + self.output_dir = Path(self.output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + if self.cache_root is not None: + self.cache_root = Path(self.cache_root) + if self.work_dir is not None: + self.work_dir = Path(self.work_dir) + self.work_dir.mkdir(parents=True, exist_ok=True) + if self.hook_library_path is not None: + self.hook_library_path = Path(self.hook_library_path) + if self.qa_reference_video is not None: + self.qa_reference_video = Path(self.qa_reference_video) + self.rerender_clip_ids = [str(clip_id).strip() for clip_id in self.rerender_clip_ids if str(clip_id).strip()] diff --git a/src/humeo/content_pruning.py b/src/humeo/content_pruning.py new file mode 100644 index 0000000000000000000000000000000000000000..72d06c3d8628b57a61f33dec0c109f7977d8ed7e --- /dev/null +++ b/src/humeo/content_pruning.py @@ -0,0 +1,1144 @@ +"""Stage 2.5 - Content pruning inside each selected clip. + +This is the HIVE "irrelevant content pruning" sub-task, applied at the +*inner-clip* scale rather than the scene scale. After the clip selector has +chosen 5 x 50-90s windows, we ask Gemini to tighten each window by dropping +weak lead-in (throat-clears, false starts, slow setup) and weak tail content +(trailing ramble, fade-out talk). + +Design choices kept deliberately minimal: + +- **No schema changes.** The existing ``Clip.trim_start_sec`` / + ``Clip.trim_end_sec`` fields already feed ``humeo.render_window`` and + ``humeo_core.primitives.compile`` via ``-ss`` / ``-t``. Writing the pruned + in / out points into those fields tightens the exported window for free. +- **Contiguous trimming only** (V1). We move the in-point forward and the + out-point backward; we do not cut in the middle. 
That keeps subtitles and + layout vision untouched. +- **Strict clamping** after the LLM returns, so the final duration always + respects ``MIN_CLIP_DURATION_SEC`` and any declared hook window is + preserved. +- **Never fatal.** Any failure (API error, malformed JSON, missing clip_id) + degrades to no-op trims (0.0 / 0.0) for that clip. The pipeline still + produces output identical to the pre-Stage-2.5 behaviour. +""" + +from __future__ import annotations + +import hashlib +import json +import logging +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Literal, TypeVar + +from google import genai +from openai import OpenAI +from pydantic import BaseModel, Field, ValidationError + +from humeo_core.schemas import Clip + +from humeo.config import ( + GEMINI_MODEL, + MAX_CLIP_DURATION_SEC, + MIN_CLIP_DURATION_SEC, + PipelineConfig, +) +from humeo.env import ( + OPENROUTER_BASE_URL, + current_llm_provider, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.gemini_generate import gemini_generate_config +from humeo.prompt_loader import content_pruning_system_prompt + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + +PRUNE_META_VERSION = 1 +PRUNE_META_FILENAME = "prune.meta.json" +PRUNE_RAW_FILENAME = "prune_raw.json" +PRUNE_ARTIFACT_FILENAME = "prune.json" + +LLM_MAX_ATTEMPTS = 3 +LLM_RETRY_DELAY_SEC = 2.0 + +PruneLevel = Literal["off", "conservative", "balanced", "aggressive"] + +VALID_LEVELS: tuple[PruneLevel, ...] = ("off", "conservative", "balanced", "aggressive") + + +def _openai_message_text(content: object) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + +# The clip-selection prompt uses `[0.0, 3.0]` as an example / fallback hook +# window. Gemini frequently copies this placeholder verbatim instead of +# localising the real hook, which silently disables Stage 2.5 start-trims for +# every clip (the hook clamp below refuses to trim past `hook_start_sec`, so +# any `trim_start_sec > 0` returned by the prune LLM gets zeroed). +# +# Treat this exact fingerprint as "no real hook" for clamp purposes. The real +# fix is the Stage 2.25 hook detector (``humeo.hook_detector``) which +# overwrites the clip's hook fields with a localised window before pruning +# runs. This constant is the belt-and-suspenders guard for the case where +# hook detection is disabled, fails, or cache-hits stale data. +_DEFAULT_HOOK_FINGERPRINT: tuple[float, float] = (0.0, 3.0) +_DEFAULT_HOOK_EPS: float = 1e-3 + + +def _looks_like_default_hook(hook_start: float | None, hook_end: float | None) -> bool: + """True when the hook window matches the prompt's 0-3s placeholder. + + This is intentionally a narrow, exact-match check so a real hook that + happens to open at t=0 with a 3.0s window is still respected. + """ + if hook_start is None or hook_end is None: + return False + return ( + abs(hook_start - _DEFAULT_HOOK_FINGERPRINT[0]) < _DEFAULT_HOOK_EPS + and abs(hook_end - _DEFAULT_HOOK_FINGERPRINT[1]) < _DEFAULT_HOOK_EPS + ) + +# Per-level cap on the fraction of the original clip the LLM is allowed to +# trim. Even if the LLM tries to be more eager, we clamp. 
Final duration is +# additionally clamped to ``MIN_CLIP_DURATION_SEC``. +_MAX_TOTAL_TRIM_PCT: dict[PruneLevel, float] = { + "off": 0.0, + "conservative": 0.10, + "balanced": 0.20, + "aggressive": 0.35, +} + + +class _PruneDecision(BaseModel): + """Per-clip decision returned by Gemini (clip-relative seconds).""" + + clip_id: str + trim_start_sec: float = Field(default=0.0, ge=0.0) + trim_end_sec: float = Field(default=0.0, ge=0.0) + reason: str = "" + + +class _PruneResponse(BaseModel): + decisions: list[_PruneDecision] = Field(default_factory=list) + + +@dataclass +class _ClampStats: + """Diagnostics for why a returned trim got reshaped.""" + + clamped_start: bool = False + clamped_end: bool = False + hook_protected: bool = False + min_duration_protected: bool = False + max_pct_protected: bool = False + + +def _retry_llm(name: str, fn: Callable[[], T], attempts: int = LLM_MAX_ATTEMPTS) -> T: + last: Exception | None = None + for i in range(attempts): + try: + return fn() + except Exception as e: + last = e + if i < attempts - 1: + logger.warning("%s attempt %d/%d failed: %s", name, i + 1, attempts, e) + time.sleep(LLM_RETRY_DELAY_SEC * (i + 1)) + assert last is not None + raise last + + +# --------------------------------------------------------------------------- +# Clamping +# --------------------------------------------------------------------------- + + +def _clamp_decision( + clip: Clip, + trim_start: float, + trim_end: float, + *, + level: PruneLevel, +) -> tuple[float, float, _ClampStats]: + """Clamp a raw (trim_start, trim_end) pair so the resulting clip is legal. + + Guarantees: + - ``trim_start`` and ``trim_end`` are non-negative. + - Final duration (``clip.duration_sec - trim_start - trim_end``) is at + least ``MIN_CLIP_DURATION_SEC`` (or the original duration, whichever is + smaller - we never *extend* a clip that was already too short). + - Combined trim does not exceed the level's allowed fraction of the + original duration. + - If ``hook_start_sec`` / ``hook_end_sec`` are set on the clip, the hook + window stays fully inside the result. + """ + stats = _ClampStats() + duration = clip.duration_sec + + ts = max(0.0, float(trim_start)) + te = max(0.0, float(trim_end)) + if ts != trim_start: + stats.clamped_start = True + if te != trim_end: + stats.clamped_end = True + + max_pct = _MAX_TOTAL_TRIM_PCT.get(level, 0.0) + max_total_trim = duration * max_pct + if ts + te > max_total_trim: + scale = max_total_trim / max(ts + te, 1e-9) + ts = ts * scale + te = te * scale + stats.max_pct_protected = True + + # Only protect the hook when the clip carries a *real* localised hook + # window. The clip-selection LLM frequently echoes the prompt's + # 0.0-3.0s placeholder, which would otherwise lock ``trim_start`` to 0 + # for every clip and silently disable the entire pruning stage. See + # ``_looks_like_default_hook`` for the fingerprint rationale. 
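+    #
+    # Worked example (illustrative numbers): with a real hook at 12.0-16.5s
+    # inside a 70s clip, ``trim_start`` is capped at 11.75s and ``trim_end``
+    # at 70 - 16.75 = 53.25s, so the hook plus a 0.25s pad always survives.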
+ hook_is_real = ( + clip.hook_start_sec is not None + and clip.hook_end_sec is not None + and not _looks_like_default_hook(clip.hook_start_sec, clip.hook_end_sec) + ) + if hook_is_real: + hook_lo = clip.hook_start_sec # type: ignore[assignment] + hook_hi = clip.hook_end_sec # type: ignore[assignment] + if ts > max(0.0, hook_lo - 0.25): + ts = max(0.0, hook_lo - 0.25) + stats.hook_protected = True + if te > max(0.0, duration - hook_hi - 0.25): + te = max(0.0, duration - hook_hi - 0.25) + stats.hook_protected = True + + min_final = min(float(MIN_CLIP_DURATION_SEC), duration) + max_total_by_min = max(0.0, duration - min_final) + if ts + te > max_total_by_min: + overflow = ts + te - max_total_by_min + te_cut = min(te, overflow) + te -= te_cut + overflow -= te_cut + if overflow > 0: + ts = max(0.0, ts - overflow) + stats.min_duration_protected = True + + ts = max(0.0, min(ts, duration)) + te = max(0.0, min(te, duration - ts)) + return ts, te, stats + + +# Tolerance used when snapping trim boundaries to WhisperX segment edges. A +# 3s window comfortably covers "finish the current sentence" cases without +# materially deviating from what the LLM asked for. Tuned on the reported +# mid-sentence cut in clip 001 of the ``PdVv_vLkUgk`` run (6.38s trim vs a +# sentence that ended ~1.5s later). +_SEGMENT_SNAP_TOLERANCE_SEC: float = 3.0 +_BOUNDARY_GAP_SEC: float = 0.5 +_BOUNDARY_TIME_EPS_SEC: float = 0.12 +_START_BOUNDARY_WINDOW_SEC: float = 3.0 +_END_BOUNDARY_WINDOW_SEC: float = 2.0 +_TERMINAL_PUNCT: tuple[str, ...] = (".", "?", "!") +_WEAK_START_WORDS: frozenset[str] = frozenset({"and", "but", "so", "or", "then", "because"}) + + +@dataclass(frozen=True) +class _BoundaryCandidate: + """A possible snapped boundary on the source timeline.""" + + time_sec: float + clean: bool + reason: str + source: str + weak_start: bool = False + + +def _snap_trims_to_segment_boundaries( + clip: Clip, + transcript: dict, + *, + level: PruneLevel, + tolerance_sec: float = _SEGMENT_SNAP_TOLERANCE_SEC, +) -> tuple[float, float]: + """Snap an already-clamped ``(trim_start, trim_end)`` to phrase boundaries. + + WhisperX segments correspond to natural phrase / sentence groupings. + Landing cuts on segment edges eliminates the "this could be..." class of + mid-sentence truncation, even when the LLM rounds to an arbitrary + syllable. + + Direction preference: + + - ``trim_start``: prefer the nearest segment START at-or-after the + current in-point (trim a hair more to drop lead-in filler). Fallback + is the nearest segment start behind, within tolerance. + - ``trim_end``: prefer the nearest segment END at-or-after the current + out-point (let the sentence finish, keeping MORE content). Fallback + is the nearest segment end before, within tolerance. + + Safety: the snapped pair is reverted if it would violate + ``MIN_CLIP_DURATION_SEC``, exceed the level's ``max_pct`` trim cap, or + eat into a real (non-placeholder) hook window. Snapping can only + *improve* a decision, never break it. 
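+
+    Sketch of the end-trim path (illustrative numbers): if the LLM asks for a
+    6.4s end-trim that lands mid-sentence and the enclosing segment ends
+    ~1.5s past the proposed out-point, the out-point snaps forward to that
+    segment end and the applied end-trim shrinks to ~4.9s.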
+ """ + ts0 = float(clip.trim_start_sec) + te0 = float(clip.trim_end_sec) + if ts0 < 0.05 and te0 < 0.05: + return ts0, te0 + + segs = _segments_within_clip(transcript, clip) + if not segs: + return ts0, te0 + + duration = clip.duration_sec + seg_starts = [float(s["start"]) for s in segs] + seg_ends = [float(s["end"]) for s in segs] + + new_ts = ts0 + if ts0 >= 0.05: + forward = [s for s in seg_starts if s >= ts0 and (s - ts0) <= tolerance_sec] + backward = [s for s in seg_starts if s < ts0 and (ts0 - s) <= tolerance_sec] + if forward: + new_ts = min(forward) + elif backward: + new_ts = max(backward) + + new_te = te0 + if te0 >= 0.05: + out0 = duration - te0 + forward = [e for e in seg_ends if e >= out0 and (e - out0) <= tolerance_sec] + backward = [e for e in seg_ends if e < out0 and (out0 - e) <= tolerance_sec] + if forward: + new_out = min(forward) + elif backward: + new_out = max(backward) + else: + new_out = out0 + new_te = max(0.0, duration - new_out) + + new_ts = max(0.0, min(new_ts, duration)) + new_te = max(0.0, min(new_te, duration - new_ts)) + + min_final = min(float(MIN_CLIP_DURATION_SEC), duration) + if duration - new_ts - new_te < min_final - 1e-6: + return ts0, te0 + + max_pct = _MAX_TOTAL_TRIM_PCT.get(level, 0.0) + if max_pct > 0.0 and (new_ts + new_te) > duration * max_pct + 1e-6: + return ts0, te0 + + if ( + clip.hook_start_sec is not None + and clip.hook_end_sec is not None + and not _looks_like_default_hook(clip.hook_start_sec, clip.hook_end_sec) + ): + hook_lo = float(clip.hook_start_sec) + hook_hi = float(clip.hook_end_sec) + if new_ts > max(0.0, hook_lo - 0.25) + 1e-6: + return ts0, te0 + if duration - new_te < hook_hi + 0.25 - 1e-6: + return ts0, te0 + + return new_ts, new_te + + +def _flatten_transcript_words(transcript: dict) -> list[dict[str, float | str]]: + words: list[dict[str, float | str]] = [] + for seg in transcript.get("segments", []): + for word in seg.get("words", []): + if "start" not in word or "end" not in word: + continue + try: + start = float(word["start"]) + end = float(word["end"]) + except (TypeError, ValueError): + continue + words.append( + { + "word": str(word.get("word", "")), + "start": start, + "end": end, + } + ) + return words + + +def _normalized_last_char(text: str) -> str: + stripped = text.rstrip() + return stripped[-1] if stripped else "" + + +def _segment_start_hint( + segments: list[dict[str, Any]], + words: list[dict[str, float | str]], + time_sec: float, +) -> tuple[bool, str, bool]: + for idx, seg in enumerate(segments): + seg_start = float(seg.get("start", 0.0)) + if abs(seg_start - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + seg_words = seg.get("words") or [] + first_word = "" + if seg_words: + first_word = str(seg_words[0].get("word", "")).strip().lower() + weak_start = first_word in _WEAK_START_WORDS + if idx == 0: + return True, "first transcript segment", weak_start + prev_text = str(segments[idx - 1].get("text", "")).rstrip() + if _normalized_last_char(prev_text) in _TERMINAL_PUNCT: + return True, "previous segment ends with terminal punctuation", weak_start + break + + for idx, word in enumerate(words): + start = float(word["start"]) + if abs(start - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + weak_start = str(word["word"]).strip().lower() in _WEAK_START_WORDS + if idx == 0: + return True, "first transcript word", weak_start + gap_before = start - float(words[idx - 1]["end"]) + if gap_before >= _BOUNDARY_GAP_SEC: + return True, f"silence gap before boundary ({gap_before:.2f}s)", weak_start + return False, "no 
terminal punctuation or >=0.5s silence before boundary", weak_start + + return False, "no matching transcript boundary", False + + +def _segment_end_hint( + segments: list[dict[str, Any]], + words: list[dict[str, float | str]], + time_sec: float, +) -> tuple[bool, str]: + for seg in segments: + seg_end = float(seg.get("end", 0.0)) + if abs(seg_end - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + text = str(seg.get("text", "")).rstrip() + if _normalized_last_char(text) in _TERMINAL_PUNCT: + return True, "segment ends with terminal punctuation" + break + + for idx, word in enumerate(words): + end = float(word["end"]) + if abs(end - time_sec) > _BOUNDARY_TIME_EPS_SEC: + continue + if idx == len(words) - 1: + return True, "last transcript word" + gap_after = float(words[idx + 1]["start"]) - end + if gap_after >= _BOUNDARY_GAP_SEC: + return True, f"silence gap after boundary ({gap_after:.2f}s)" + return False, "no terminal punctuation or >=0.5s silence after boundary" + + return False, "no matching transcript boundary" + + +def _candidate_key(time_sec: float) -> float: + return round(time_sec, 3) + + +def _gather_start_candidates( + clip: Clip, + current_start: float, + transcript: dict, +) -> list[_BoundaryCandidate]: + low = current_start - _START_BOUNDARY_WINDOW_SEC + high = current_start + _START_BOUNDARY_WINDOW_SEC + segments = list(transcript.get("segments", [])) + words = _flatten_transcript_words(transcript) + + by_time: dict[float, _BoundaryCandidate] = {} + + def add_candidate(time_sec: float, source: str) -> None: + clean, reason, weak = _segment_start_hint(segments, words, time_sec) + candidate = _BoundaryCandidate( + time_sec=float(time_sec), + clean=clean, + reason=reason, + source=source, + weak_start=weak, + ) + key = _candidate_key(candidate.time_sec) + existing = by_time.get(key) + if existing is None: + by_time[key] = candidate + return + if candidate.clean and not existing.clean: + by_time[key] = candidate + return + if candidate.clean == existing.clean and not candidate.weak_start and existing.weak_start: + by_time[key] = candidate + + add_candidate(current_start, "current") + add_candidate(clip.start_time_sec, "raw") + + for seg in segments: + seg_start = float(seg.get("start", 0.0)) + if low <= seg_start <= high: + add_candidate(seg_start, "segment") + for word in words: + word_start = float(word["start"]) + if low <= word_start <= high: + add_candidate(word_start, "word") + + return list(by_time.values()) + + +def _gather_end_candidates( + clip: Clip, + current_end: float, + transcript: dict, +) -> list[_BoundaryCandidate]: + low = current_end - _END_BOUNDARY_WINDOW_SEC + high = current_end + _END_BOUNDARY_WINDOW_SEC + segments = list(transcript.get("segments", [])) + words = _flatten_transcript_words(transcript) + + by_time: dict[float, _BoundaryCandidate] = {} + + def add_candidate(time_sec: float, source: str) -> None: + clean, reason = _segment_end_hint(segments, words, time_sec) + candidate = _BoundaryCandidate( + time_sec=float(time_sec), + clean=clean, + reason=reason, + source=source, + ) + key = _candidate_key(candidate.time_sec) + existing = by_time.get(key) + if existing is None or (candidate.clean and not existing.clean): + by_time[key] = candidate + + add_candidate(current_end, "current") + add_candidate(clip.end_time_sec, "raw") + + for seg in segments: + seg_end = float(seg.get("end", 0.0)) + if low <= seg_end <= high: + add_candidate(seg_end, "segment") + for word in words: + word_end = float(word["end"]) + if low <= word_end <= high: + 
add_candidate(word_end, "word") + + return list(by_time.values()) + + +def _candidate_priority(current_time: float, candidate: _BoundaryCandidate) -> tuple[int, int, int, float]: + source_rank = {"current": 0, "raw": 1, "segment": 2, "word": 3}.get(candidate.source, 9) + weak_rank = 1 if candidate.weak_start else 0 + clean_rank = 0 if candidate.clean else 1 + return (clean_rank, weak_rank, source_rank, abs(candidate.time_sec - current_time)) + + +def _pair_priority( + current_start: float, + current_end: float, + start_candidate: _BoundaryCandidate, + end_candidate: _BoundaryCandidate, +) -> tuple[int, int, int, float]: + good_start = start_candidate.clean and not start_candidate.weak_start + good_end = end_candidate.clean + return ( + -(int(good_start) + int(good_end)), + 1 if start_candidate.weak_start else 0, + 0 if (good_start or good_end) else 1, + abs(start_candidate.time_sec - current_start) + abs(end_candidate.time_sec - current_end), + ) + + +def snap_render_windows_to_sentence_boundaries( + clips: list[Clip], + transcript: dict, +) -> list[Clip]: + """Snap render windows to nearby complete-thought boundaries. + + This runs after Stage 2.5 pruning and operates on the *actual* render + window (`start + trim_start`, `end - trim_end`). Unlike trim snapping, it + can undo a harmful trim or move slightly beyond the original selected + window, as long as the final duration still satisfies the hard + `[MIN_CLIP_DURATION_SEC, MAX_CLIP_DURATION_SEC]` contract. + """ + if not transcript.get("segments"): + return clips + + snapped: list[Clip] = [] + for clip in clips: + current_start = clip.start_time_sec + clip.trim_start_sec + current_end = clip.end_time_sec - clip.trim_end_sec + start_candidates = sorted( + _gather_start_candidates(clip, current_start, transcript), + key=lambda c: _candidate_priority(current_start, c), + ) + end_candidates = sorted( + _gather_end_candidates(clip, current_end, transcript), + key=lambda c: _candidate_priority(current_end, c), + ) + + current_start_candidate = next(c for c in start_candidates if c.source == "current") + current_end_candidate = next(c for c in end_candidates if c.source == "current") + current_pair = (current_start_candidate, current_end_candidate) + best_pair = current_pair + best_priority = _pair_priority( + current_start, + current_end, + current_start_candidate, + current_end_candidate, + ) + + for start_candidate in start_candidates: + for end_candidate in end_candidates: + if end_candidate.time_sec <= start_candidate.time_sec: + continue + duration = end_candidate.time_sec - start_candidate.time_sec + if duration < MIN_CLIP_DURATION_SEC or duration > MAX_CLIP_DURATION_SEC: + continue + priority = _pair_priority( + current_start, + current_end, + start_candidate, + end_candidate, + ) + if priority < best_priority: + best_pair = (start_candidate, end_candidate) + best_priority = priority + + start_candidate, end_candidate = best_pair + start_improved = best_pair[0] is not current_pair[0] + end_improved = best_pair[1] is not current_pair[1] + if start_improved or end_improved: + logger.info( + "Clip %s: render window snapped %.2f-%.2f -> %.2f-%.2f " + "(start=%s; end=%s).", + clip.clip_id, + current_start, + current_end, + start_candidate.time_sec, + end_candidate.time_sec, + start_candidate.reason, + end_candidate.reason, + ) + snapped.append( + clip.model_copy( + update={ + "start_time_sec": start_candidate.time_sec, + "end_time_sec": end_candidate.time_sec, + "trim_start_sec": 0.0, + "trim_end_sec": 0.0, + "hook_start_sec": None, + 
"hook_end_sec": None, + } + ) + ) + continue + + warnings: list[str] = [] + if not current_start_candidate.clean or current_start_candidate.weak_start: + warnings.append(f"start@{current_start:.2f}s") + if not current_end_candidate.clean: + warnings.append(f"end@{current_end:.2f}s") + if warnings: + logger.warning( + "Clip %s: no valid clean sentence boundary found for %s; leaving render window unchanged.", + clip.clip_id, + ", ".join(warnings), + ) + snapped.append(clip) + + return snapped + + +def apply_prune_decisions( + clips: list[Clip], + decisions: list[_PruneDecision], + *, + level: PruneLevel, + transcript: dict | None = None, +) -> list[Clip]: + """Return new clips with trim_start / trim_end set from LLM decisions. + + Clips whose ``clip_id`` is missing from ``decisions`` are returned with + trims of 0 / 0 (no-op). Decisions are always clamped; no exception is + raised if the model returned invalid numbers. + + When ``transcript`` is provided, each clamped trim pair is additionally + snapped to the nearest WhisperX segment boundary (see + :func:`_snap_trims_to_segment_boundaries`) so cuts never land + mid-sentence. The clamp is authoritative -- snapping only ever produces + an equally-safe boundary, never a looser one. + """ + by_id = {d.clip_id: d for d in decisions} + out: list[Clip] = [] + for clip in clips: + d = by_id.get(clip.clip_id) + if d is None or level == "off": + out.append(clip.model_copy(update={"trim_start_sec": 0.0, "trim_end_sec": 0.0})) + continue + ts, te, stats = _clamp_decision( + clip, d.trim_start_sec, d.trim_end_sec, level=level + ) + # Surface every non-trivial clamp so silent degradations (e.g. a + # fake hook nuking every trim) are visible in INFO logs, not just + # buried in ``prune_raw.json``. + requested = d.trim_start_sec + d.trim_end_sec + applied = ts + te + reshaped = ( + stats.hook_protected + or stats.min_duration_protected + or stats.max_pct_protected + or (requested > 0.0 and abs(applied - requested) > 0.05) + ) + if reshaped: + logger.info( + "Clip %s: prune decision clamped (hook=%s min=%s cap=%s) " + "requested %.2f/%.2f -> applied %.2f/%.2f", + clip.clip_id, + stats.hook_protected, + stats.min_duration_protected, + stats.max_pct_protected, + d.trim_start_sec, + d.trim_end_sec, + ts, + te, + ) + candidate = clip.model_copy(update={"trim_start_sec": ts, "trim_end_sec": te}) + if transcript is not None: + snapped_ts, snapped_te = _snap_trims_to_segment_boundaries( + candidate, transcript, level=level + ) + if abs(snapped_ts - ts) > 1e-3 or abs(snapped_te - te) > 1e-3: + logger.info( + "Clip %s: prune boundaries snapped to segment edges " + "%.2f/%.2f -> %.2f/%.2f", + clip.clip_id, + ts, + te, + snapped_ts, + snapped_te, + ) + candidate = candidate.model_copy( + update={"trim_start_sec": snapped_ts, "trim_end_sec": snapped_te} + ) + out.append(candidate) + return out + + +# --------------------------------------------------------------------------- +# Prompt construction +# --------------------------------------------------------------------------- + + +def _segments_within_clip(transcript: dict, clip: Clip) -> list[dict]: + """Return transcript segments that overlap the clip window, with times + expressed as seconds relative to the clip start. 
+ """ + s0 = clip.start_time_sec + s1 = clip.end_time_sec + lines: list[dict] = [] + for seg in transcript.get("segments", []): + start = float(seg.get("start", 0.0)) + end = float(seg.get("end", start)) + if end <= s0 or start >= s1: + continue + rel_start = max(0.0, start - s0) + rel_end = min(clip.duration_sec, end - s0) + if rel_end <= rel_start: + continue + lines.append( + { + "start": rel_start, + "end": rel_end, + "text": (seg.get("text") or "").strip(), + } + ) + return lines + + +def _build_user_message(clips: list[Clip], transcript: dict) -> str: + """Render a compact textual view of every clip for the LLM user turn.""" + blocks: list[str] = [] + for clip in clips: + seg_lines = _segments_within_clip(transcript, clip) + header = ( + f"clip_id: {clip.clip_id}\n" + f"duration_sec: {clip.duration_sec:.2f}\n" + f"topic: {clip.topic}" + ) + if clip.hook_start_sec is not None and clip.hook_end_sec is not None: + header += ( + f"\nhook_window_sec: [{clip.hook_start_sec:.2f}, {clip.hook_end_sec:.2f}]" + ) + body = "\n".join( + f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}" for seg in seg_lines + ) + if not body: + body = "(no segments overlap this clip window)" + blocks.append(f"{header}\n---\n{body}") + return "\n\n===\n\n".join(blocks) + + +# --------------------------------------------------------------------------- +# Cache +# --------------------------------------------------------------------------- + + +def _clips_fingerprint(clips: list[Clip]) -> str: + """Fingerprint the clip *windows* (not trims, so the cache ignores previous + prune results when deciding whether to re-ask the LLM). + """ + payload = json.dumps( + [ + { + "id": c.clip_id, + "s": round(c.start_time_sec, 3), + "e": round(c.end_time_sec, 3), + "hs": c.hook_start_sec, + "he": c.hook_end_sec, + } + for c in clips + ], + sort_keys=True, + ensure_ascii=False, + ) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def _resolved_gemini_model(config: PipelineConfig) -> str: + return (config.gemini_model or GEMINI_MODEL).strip() + + +def _prune_meta( + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, + level: PruneLevel, +) -> dict[str, Any]: + return { + "version": PRUNE_META_VERSION, + "transcript_sha256": transcript_fp, + "clips_sha256": clips_fp, + "gemini_model": _resolved_gemini_model(config), + "prune_level": level, + "llm_backend": current_llm_provider() or "google", + } + + +def _load_cached_clips(work_dir: Path, clips: list[Clip]) -> list[Clip] | None: + artifact = work_dir / PRUNE_ARTIFACT_FILENAME + if not artifact.is_file(): + return None + try: + with open(artifact, "r", encoding="utf-8") as f: + data = json.load(f) + cached = {item["clip_id"]: item for item in data.get("clips", [])} + except Exception as e: + logger.warning("Prune cache artifact unreadable (%s); re-running.", e) + return None + out: list[Clip] = [] + for clip in clips: + cached_c = cached.get(clip.clip_id) + if cached_c is None: + return None + out.append( + clip.model_copy( + update={ + "trim_start_sec": float(cached_c.get("trim_start_sec", 0.0)), + "trim_end_sec": float(cached_c.get("trim_end_sec", 0.0)), + } + ) + ) + return out + + +def _write_cache( + work_dir: Path, + *, + pruned: list[Clip], + meta: dict[str, Any], + raw_response: str, +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + payload = { + "clips": [ + { + "clip_id": c.clip_id, + "trim_start_sec": c.trim_start_sec, + "trim_end_sec": c.trim_end_sec, + } + for c in pruned + ] + } + (work_dir / 
PRUNE_ARTIFACT_FILENAME).write_text( + json.dumps(payload, indent=2), encoding="utf-8" + ) + (work_dir / PRUNE_RAW_FILENAME).write_text(raw_response, encoding="utf-8") + with open(work_dir / PRUNE_META_FILENAME, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + f.write("\n") + logger.info( + "Wrote %s, %s and %s", + PRUNE_META_FILENAME, + PRUNE_ARTIFACT_FILENAME, + PRUNE_RAW_FILENAME, + ) + + +def _prune_cache_valid( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, + level: PruneLevel, +) -> bool: + meta_path = work_dir / PRUNE_META_FILENAME + if not meta_path.is_file(): + return False + try: + with open(meta_path, encoding="utf-8") as f: + meta = json.load(f) + except Exception: + return False + if meta.get("version") != PRUNE_META_VERSION: + return False + if meta.get("transcript_sha256") != transcript_fp: + return False + if meta.get("clips_sha256") != clips_fp: + return False + current_provider = current_llm_provider() + meta_provider = meta.get("llm_backend") + if current_provider == "openrouter": + if meta_provider != "openrouter": + return False + elif current_provider == "google": + if meta_provider not in (None, "google"): + return False + if meta.get("gemini_model") != _resolved_gemini_model(config): + return False + if meta.get("prune_level") != level: + return False + return True + + +# --------------------------------------------------------------------------- +# Gemini call +# --------------------------------------------------------------------------- + + +def _parse_decisions(raw_json: str) -> list[_PruneDecision]: + """Parse a raw JSON response into decisions; bare arrays accepted too.""" + data = json.loads(raw_json) + if isinstance(data, dict) and "decisions" in data: + try: + return _PruneResponse.model_validate(data).decisions + except ValidationError as e: + logger.warning("Prune response failed validation: %s", e) + return [] + if isinstance(data, list): + decisions: list[_PruneDecision] = [] + for item in data: + try: + decisions.append(_PruneDecision.model_validate(item)) + except ValidationError: + continue + return decisions + return [] + + +def request_prune_decisions( + clips: list[Clip], + transcript: dict, + *, + level: PruneLevel, + gemini_model: str | None = None, +) -> tuple[list[_PruneDecision], str]: + """Call Gemini for (potentially) one decision per clip. + + Returns ``(decisions, raw_response)``. ``raw_response`` is the literal + string Gemini returned (cached to ``prune_raw.json`` for audit). On + transport or parse failure this raises; callers should catch and treat as + no-op. 
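+
+    Responses are parsed as ``_PruneResponse`` JSON (a bare array of decision
+    objects is also tolerated by ``_parse_decisions``); an illustrative
+    payload::
+
+        {"decisions": [{"clip_id": "001", "trim_start_sec": 4.2,
+                        "trim_end_sec": 1.5, "reason": "slow setup"}]}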
+ """ + if level == "off" or not clips: + return [], "{\"decisions\": []}" + + system = content_pruning_system_prompt( + min_dur=MIN_CLIP_DURATION_SEC, + max_dur=MAX_CLIP_DURATION_SEC, + level=level, + ) + user_text = _build_user_message(clips, transcript) + + provider = resolve_llm_provider() + model_name = model_name_for_provider((gemini_model or GEMINI_MODEL).strip(), provider) + + def _call() -> str: + logger.info( + "%s content pruning (model=%s, level=%s, clips=%d)...", + provider, + model_name, + level, + len(clips), + ) + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=model_name, + contents=user_text, + config=gemini_generate_config( + system_instruction=system, + temperature=0.2, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini returned empty response text for content pruning") + return response.text + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user_text}, + ], + temperature=0.2, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter returned empty response text for content pruning") + return text + + raw = _retry_llm("Gemini content pruning", _call) + decisions = _parse_decisions(raw) + return decisions, raw + + +# --------------------------------------------------------------------------- +# Public stage entrypoint (used by pipeline.run_pipeline) +# --------------------------------------------------------------------------- + + +def run_content_pruning_stage( + work_dir: Path, + clips: list[Clip], + transcript: dict, + *, + transcript_fp: str, + config: PipelineConfig, +) -> list[Clip]: + """Apply Stage 2.5 pruning to ``clips`` and return the new list. + + - When ``config.prune_level == "off"``, this is a cheap no-op: returns a + copy of the clips with trim_start/end zeroed. + - Otherwise, tries the cache first, then calls Gemini. A failing call + degrades to no-op (the pipeline is never killed by Stage 2.5). 
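+
+    Minimal sketch of the call shape (variable names are illustrative; the
+    real call site is ``pipeline.run_pipeline``)::
+
+        pruned_clips = run_content_pruning_stage(
+            work_dir,
+            clips,
+            transcript,
+            transcript_fp=transcript_sha256,
+            config=config,
+        )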
+ """ + level = _validated_level(config.prune_level) + if level == "off": + logger.info("Content pruning disabled (prune_level=off); skipping Stage 2.5.") + return [ + clip.model_copy(update={"trim_start_sec": 0.0, "trim_end_sec": 0.0}) + for clip in clips + ] + + clips_fp = _clips_fingerprint(clips) + + if not config.force_content_pruning and _prune_cache_valid( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + config=config, + level=level, + ): + cached = _load_cached_clips(work_dir, clips) + if cached is not None: + logger.info( + "Content pruning cache hit (level=%s, %d clips); skipping LLM.", + level, + len(clips), + ) + return cached + + try: + decisions, raw = request_prune_decisions( + clips, transcript, level=level, gemini_model=config.gemini_model + ) + except Exception as e: + logger.warning( + "Content pruning call failed (%s); continuing with un-pruned clips.", e + ) + return [ + clip.model_copy(update={"trim_start_sec": 0.0, "trim_end_sec": 0.0}) + for clip in clips + ] + + pruned = apply_prune_decisions( + clips, decisions, level=level, transcript=transcript + ) + _log_prune_summary(pruned, clips) + + meta = _prune_meta( + transcript_fp=transcript_fp, + clips_fp=clips_fp, + config=config, + level=level, + ) + try: + _write_cache(work_dir, pruned=pruned, meta=meta, raw_response=raw) + except Exception as e: + logger.warning("Failed to write prune cache (%s); continuing.", e) + return pruned + + +def _validated_level(level: str | None) -> PruneLevel: + lvl = (level or "balanced").strip().lower() + if lvl not in VALID_LEVELS: + logger.warning("Unknown prune_level=%r; falling back to 'balanced'.", level) + return "balanced" + return lvl # type: ignore[return-value] + + +def _log_prune_summary(pruned: list[Clip], original: list[Clip]) -> None: + total_before = sum(c.duration_sec for c in original) + total_after = sum( + max(0.0, c.duration_sec - c.trim_start_sec - c.trim_end_sec) for c in pruned + ) + removed = total_before - total_after + pct = (removed / total_before * 100.0) if total_before > 0 else 0.0 + logger.info( + "Content pruning done: removed %.1fs across %d clips (%.1f%% of total).", + removed, + len(pruned), + pct, + ) + for c in pruned: + if c.trim_start_sec > 0 or c.trim_end_sec > 0: + final = c.duration_sec - c.trim_start_sec - c.trim_end_sec + logger.info( + " [%s] trim=%.2fs/%.2fs %.1fs -> %.1fs", + c.clip_id, + c.trim_start_sec, + c.trim_end_sec, + c.duration_sec, + final, + ) diff --git a/src/humeo/cutter.py b/src/humeo/cutter.py new file mode 100644 index 0000000000000000000000000000000000000000..b203cadae774dc1bae5da380a56ff6652c8102bf --- /dev/null +++ b/src/humeo/cutter.py @@ -0,0 +1,651 @@ +"""Subtitle helpers for the product pipeline.""" + +import logging +import math +import os +import re +from pathlib import Path + +from humeo_core.schemas import Clip, RenderTheme, TranscriptWord + +from humeo.transcript_align import ( + clip_subtitle_words, + clip_words_to_srt_lines, + format_ass, + format_srt, + group_words_to_cue_chunks, +) + +logger = logging.getLogger(__name__) + +_NATIVE_HIGHLIGHT_FONT_NAME = "Arial" +_NATIVE_HIGHLIGHT_PURPLE = "&H00F65C8B" +_NATIVE_HIGHLIGHT_LEAD_SEC = 0.06 +_NATIVE_HIGHLIGHT_MIN_DWELL_SEC = 0.16 +_NATIVE_HIGHLIGHT_MIN_VALID_WORD_SEC = 0.035 +_NATIVE_HIGHLIGHT_MAX_VALID_WORD_SEC = 1.65 +_NATIVE_HIGHLIGHT_MAX_LINE_WIDTH_RATIO = 0.92 +_NATIVE_HIGHLIGHT_ROUNDING_OVERRIDE = r"\blur3.0" +_NATIVE_HIGHLIGHT_STOPWORDS = { + "a", + "all", + "an", + "and", + "are", + "as", + "at", + "be", + "but", + "by", + "for", + 
"from", + "i", + "if", + "in", + "is", + "it", + "of", + "on", + "or", + "so", + "that", + "the", + "their", + "there", + "they", + "this", + "to", + "was", + "we", + "with", + "you", + "your", + "has", + "have", + "had", + "been", + "being", +} + + +def _balance_reference_caption(text: str) -> str: + words = text.split() + if len(words) <= 5 and len(text) <= 28: + return text + best_idx = 1 + best_delta = 10**9 + for idx in range(1, len(words)): + left = " ".join(words[:idx]) + right = " ".join(words[idx:]) + line_penalty = 0 + if len(words[:idx]) < 2 or len(words[idx:]) < 2: + line_penalty += 1000 + delta = abs(len(left) - len(right)) + abs(len(words[:idx]) - len(words[idx:])) * 6 + line_penalty + if delta < best_delta: + best_delta = delta + best_idx = idx + return " ".join(words[:best_idx]) + "\n" + " ".join(words[best_idx:]) + + +def _native_line_width(font, words) -> float: + return _text_width(font, " ".join(word.word for word in words)) + + +def _native_highlight_partition_penalty(lines, font, max_line_width: float) -> float: + widths = [_native_line_width(font, line) for line in lines] + overflow = sum(max(0.0, width - max_line_width) for width in widths) + word_counts = [len(line) for line in lines] + total_words = sum(word_counts) + width_balance = (max(widths) - min(widths)) if len(widths) > 1 else 0.0 + word_balance = (max(word_counts) - min(word_counts)) if len(word_counts) > 1 else 0 + single_word_penalty = sum(260 for line in lines if len(line) == 1 and total_words > 3) + return ( + overflow * 80.0 + + len(lines) * 120.0 + + width_balance * 0.16 + + word_balance * 120.0 + + single_word_penalty + ) + + +def _candidate_native_highlight_partitions(words, max_lines: int): + n = len(words) + if n == 0: + return [] + if max_lines <= 1 or n == 1: + return [[list(words)]] + + out = [[list(words)]] + for first_break in range(1, n): + out.append([list(words[:first_break]), list(words[first_break:])]) + if max_lines >= 3 and n >= 3: + for first_break in range(1, n - 1): + for second_break in range(first_break + 1, n): + out.append( + [ + list(words[:first_break]), + list(words[first_break:second_break]), + list(words[second_break:]), + ] + ) + return out + + +def _split_native_highlight_lines(words, *, font=None, max_line_width: float | None = None): + if len(words) <= 3 and len(" ".join(word.word for word in words)) <= 22: + return [list(words)] + if len(words) < 2: + return [list(words)] + if font is not None and max_line_width is not None: + candidates = _candidate_native_highlight_partitions(words, max_lines=3) + return min( + candidates, + key=lambda lines: _native_highlight_partition_penalty( + lines, + font, + max_line_width, + ), + ) + best_idx = 1 + best_delta = 10**9 + for idx in range(1, len(words)): + left_words = words[:idx] + right_words = words[idx:] + left = " ".join(word.word for word in left_words) + right = " ".join(word.word for word in right_words) + line_penalty = 0 + if len(left_words) < 2 or len(right_words) < 2: + line_penalty += 800 + delta = abs(len(left) - len(right)) + abs(len(left_words) - len(right_words)) * 7 + line_penalty + if delta < best_delta: + best_delta = delta + best_idx = idx + return [list(words[:best_idx]), list(words[best_idx:])] + + +def _clean_native_highlight_token(text: str) -> str: + return re.sub(r"(^[^A-Za-z0-9$%#]+|[^A-Za-z0-9$%#]+$)", "", text or "") + + +def _native_highlight_span_score(words) -> float: + cleaned = [_clean_native_highlight_token(word.word) for word in words] + cleaned = [token for token in cleaned if token] + if 
not cleaned: + return -1e9 + if all(token.lower() in _NATIVE_HIGHLIGHT_STOPWORDS for token in cleaned): + return -1e9 + + score = 0.0 + for token in cleaned: + lower = token.lower() + if lower not in _NATIVE_HIGHLIGHT_STOPWORDS: + score += 2.0 + if any(ch.isdigit() for ch in token) or "$" in token or "%" in token: + score += 3.0 + if len(token) >= 6: + score += 0.8 + if token.isupper() and len(token) > 1: + score += 0.6 + if len(cleaned) == 2: + score -= 0.55 + if any(any(ch.isdigit() for ch in token) or "$" in token or "%" in token for token in cleaned): + score += 1.1 + elif cleaned[0].lower() in _NATIVE_HIGHLIGHT_STOPWORDS or cleaned[1].lower() in _NATIVE_HIGHLIGHT_STOPWORDS: + score -= 0.6 + else: + score += 0.3 + if len(" ".join(cleaned)) > 18: + score -= 0.6 + return score + + +def _should_render_native_highlight_group(words) -> bool: + cleaned = [_clean_native_highlight_token(word.word) for word in words] + cleaned = [token for token in cleaned if token] + if not cleaned: + return False + return any(token.lower() not in _NATIVE_HIGHLIGHT_STOPWORDS for token in cleaned) + + +def _native_highlight_font_path() -> Path | None: + windows_fonts = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts" + for filename in ("arialbd.ttf", "Arialbd.ttf", "ARIALBD.TTF", "arial.ttf"): + path = windows_fonts / filename + if path.is_file(): + return path + return None + + +def _text_width(font, text: str) -> float: + if not text: + return 0.0 + if hasattr(font, "getlength"): + return float(font.getlength(text)) + bbox = font.getbbox(text) + return float(bbox[2] - bbox[0]) + + +def _text_height(font) -> int: + bbox = font.getbbox("Ag") + return max(1, int(round(bbox[3] - bbox[1]))) + + +def _escape_ass_text(text: str) -> str: + return ( + text.replace("\\", r"\\") + .replace("{", r"\{") + .replace("}", r"\}") + .replace("\n", r"\N") + ) + + +def _native_highlight_overlay_text(line_words, highlight_idx: int) -> str: + parts: list[str] = [] + for word_idx, word in enumerate(line_words): + if word_idx == highlight_idx: + parts.append( + f"{{\\rHighlight{_NATIVE_HIGHLIGHT_ROUNDING_OVERRIDE}}}" + f"{_escape_ass_text(word.word)}" + "{\\rInvisible}" + ) + else: + parts.append(_escape_ass_text(word.word)) + return " ".join(parts) + + +def _word_timing_weight(word: TranscriptWord) -> float: + token = _clean_native_highlight_token(word.word) + return max(0.65, min(2.2, len(token or word.word) / 5.5)) + + +def _suspicious_native_highlight_timing( + words: list[TranscriptWord], + idx: int, + *, + clip_duration: float, +) -> bool: + word = words[idx] + start = float(word.start_time) + end = float(word.end_time) + if not (math.isfinite(start) and math.isfinite(end)): + return True + if start < -0.01 or end > clip_duration + 0.25: + return True + duration = end - start + if duration < _NATIVE_HIGHLIGHT_MIN_VALID_WORD_SEC: + return True + if duration > _NATIVE_HIGHLIGHT_MAX_VALID_WORD_SEC: + return True + if idx > 0: + prev = words[idx - 1] + if start < float(prev.start_time) - 0.03: + return True + if start < float(prev.end_time) - 0.35: + return True + if idx + 1 < len(words): + nxt = words[idx + 1] + if float(nxt.start_time) < start - 0.03: + return True + return False + + +def _repair_native_highlight_timings( + words: list[TranscriptWord], + *, + clip_duration: float, +) -> list[TranscriptWord]: + """Repair obvious ASR word timestamp glitches before per-word highlighting. 
+ + This is intentionally conservative: clean Whisper/ElevenLabs timings pass + through almost unchanged, while zero-length, reversed, huge, or badly + overlapping word timings get interpolated between neighboring reliable words. + """ + + if not words: + return [] + clip_duration = max(0.0, float(clip_duration)) + records: list[dict[str, object]] = [] + for idx, word in enumerate(words): + start = max(0.0, min(clip_duration, float(word.start_time))) + end = max(0.0, min(clip_duration, float(word.end_time))) + records.append( + { + "word": word.word, + "start": start, + "end": end, + "bad": _suspicious_native_highlight_timing( + words, + idx, + clip_duration=clip_duration, + ), + "weight": _word_timing_weight(word), + } + ) + + idx = 0 + while idx < len(records): + if not records[idx]["bad"]: + idx += 1 + continue + run_start = idx + while idx < len(records) and records[idx]["bad"]: + idx += 1 + run_end = idx - 1 + count = run_end - run_start + 1 + left_time = ( + float(records[run_start - 1]["end"]) + if run_start > 0 + else max(0.0, float(records[run_start]["start"])) + ) + right_time = ( + float(records[run_end + 1]["start"]) + if run_end + 1 < len(records) + else min(clip_duration, max(left_time, float(records[run_end]["end"]))) + ) + weight_span = sum(float(r["weight"]) for r in records[run_start : run_end + 1]) * 0.13 + min_span = max(0.11 * count, weight_span) + if right_time <= left_time + min_span: + right_time = min(clip_duration, left_time + min_span) + if right_time <= left_time: + right_time = min(clip_duration, left_time + max(0.08, 0.12 * count)) + + span = max(0.001, right_time - left_time) + weights = [float(r["weight"]) for r in records[run_start : run_end + 1]] + total_weight = max(0.001, sum(weights)) + cursor = left_time + for offset, weight in enumerate(weights): + rec = records[run_start + offset] + next_cursor = ( + right_time + if offset == count - 1 + else cursor + span * (weight / total_weight) + ) + rec["start"] = cursor + rec["end"] = max(cursor + 0.04, next_cursor) + cursor = float(rec["end"]) + + repaired: list[TranscriptWord] = [] + prev_end = 0.0 + for rec in records: + start = max(0.0, float(rec["start"])) + end = max(start + 0.02, float(rec["end"])) + if start < prev_end - 0.02: + start = prev_end + end = max(end, start + 0.04) + if clip_duration > 0.0: + end = min(clip_duration, end) + if end <= start: + start = max(0.0, min(start, clip_duration - 0.02)) + end = min(clip_duration, start + 0.04) + repaired.append(TranscriptWord(word=str(rec["word"]), start_time=start, end_time=end)) + prev_end = max(prev_end, end) + return repaired + + +def _native_highlight_word_windows( + words: list[TranscriptWord], + *, + lead_sec: float, + min_dwell_sec: float, +) -> list[tuple[float, float]]: + if not words: + return [] + lead_sec = max(0.0, float(lead_sec)) + min_dwell_sec = max(0.02, float(min_dwell_sec)) + cue_start = max(0.0, words[0].start_time - lead_sec) + cue_end = max(words[-1].end_time, words[-1].start_time + min_dwell_sec) + + starts: list[float] = [] + for idx, word in enumerate(words): + start = max(cue_start, float(word.start_time) - lead_sec) + if idx > 0: + start = max(start, starts[-1] + 0.01) + starts.append(start) + + windows: list[tuple[float, float]] = [] + for idx, word in enumerate(words): + start = starts[idx] + natural_end = max(float(word.end_time), start + min_dwell_sec) + limit = starts[idx + 1] if idx + 1 < len(starts) else cue_end + end = min(natural_end, limit) + if end <= start: + end = min(limit, start + 0.01) + 
windows.append((start, max(start + 0.01, end))) + return windows + + +def _fmt_ass_time(seconds: float) -> str: + seconds = max(0.0, seconds) + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = seconds % 60 + whole = int(secs) + cs = int(round((secs - whole) * 100)) + if cs >= 100: + cs = 99 + return f"{hours:d}:{minutes:02d}:{whole:02d}.{cs:02d}" + + +def _format_native_highlight_ass( + cue_chunks, + *, + play_res_x: int, + play_res_y: int, + font_size: int, + margin_v: int, + font_name: str, + highlight_lead_sec: float = _NATIVE_HIGHLIGHT_LEAD_SEC, + highlight_min_dwell_sec: float = _NATIVE_HIGHLIGHT_MIN_DWELL_SEC, +) -> str: + from PIL import ImageFont + + font_path = _native_highlight_font_path() + if font_path is not None: + font = ImageFont.truetype(str(font_path), size=font_size) + else: + font = ImageFont.load_default() + + line_height = max(font_size, _text_height(font) + 6) + line_gap = max(8, int(round(font_size * 0.08))) + bottom_anchor = play_res_y - margin_v + max_line_width = play_res_x * _NATIVE_HIGHLIGHT_MAX_LINE_WIDTH_RATIO + + header = ( + "[Script Info]\n" + "ScriptType: v4.00+\n" + f"PlayResX: {play_res_x}\n" + f"PlayResY: {play_res_y}\n" + "WrapStyle: 0\n" + "ScaledBorderAndShadow: yes\n" + "YCbCr Matrix: None\n" + "\n" + "[V4+ Styles]\n" + "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, " + "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, " + "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, " + "Alignment, MarginL, MarginR, MarginV, Encoding\n" + f"Style: Base,{font_name},{font_size},&H00FFFFFF,&H000000FF,&H00101010,&H00000000,-1,0,0,0,100,100,-1,0,1,4,0,8,0,0,0,0\n" + f"Style: Highlight,{font_name},{font_size},&H00FFFFFF,&H000000FF,{_NATIVE_HIGHLIGHT_PURPLE},&H00000000,-1,0,0,0,100,100,-1,0,3,4,0,8,0,0,0,0\n" + f"Style: Invisible,{font_name},{font_size},&HFF000000,&H000000FF,&HFF000000,&HFF000000,-1,0,0,0,100,100,-1,0,1,0,0,8,0,0,0,0\n" + "\n" + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n" + ) + + events: list[str] = [] + for cue_words in cue_chunks: + if not cue_words: + continue + lines = _split_native_highlight_lines( + cue_words, + font=font, + max_line_width=max_line_width, + ) + cue_windows = _native_highlight_word_windows( + cue_words, + lead_sec=highlight_lead_sec, + min_dwell_sec=highlight_min_dwell_sec, + ) + block_height = len(lines) * line_height + max(0, len(lines) - 1) * line_gap + block_top = bottom_anchor - block_height + cue_start = cue_windows[0][0] if cue_windows else cue_words[0].start_time + cue_end = cue_windows[-1][1] if cue_windows else cue_words[-1].end_time + word_offset = 0 + for line_idx, line_words in enumerate(lines): + if not line_words: + continue + line_text = " ".join(word.word for word in line_words) + line_top = block_top + line_idx * (line_height + line_gap) + line_left = (play_res_x - _text_width(font, line_text)) / 2.0 + events.append( + "Dialogue: 1," + f"{_fmt_ass_time(cue_start)},{_fmt_ass_time(cue_end)},Base,,0,0,0,," + f"{{\\an7\\pos({line_left:.1f},{line_top:.1f})}}{_escape_ass_text(line_text)}" + ) + for word_idx, word in enumerate(line_words): + cleaned = _clean_native_highlight_token(word.word) + if not cleaned: + continue + word_start, word_end = cue_windows[word_offset + word_idx] + events.append( + "Dialogue: 0," + f"{_fmt_ass_time(word_start)},{_fmt_ass_time(word_end)},Invisible,,0,0,0,," + f"{{\\an7\\pos({line_left:.1f},{line_top:.1f})}}" + f"{_native_highlight_overlay_text(line_words, 
word_idx)}" + ) + word_offset += len(line_words) + + return header + "\n".join(events) + ("\n" if events else "") + + +def generate_srt( + clip: Clip, + transcript: dict, + output_dir: Path, + *, + max_words_per_cue: int = 8, + max_cue_sec: float = 4.0, +) -> Path: + """ + Build an SRT file from word-level ASR aligned to this clip's timeline. + + ``transcript`` is the persisted ``transcript.json`` (segments with optional + per-word timestamps). Times are shifted so 0 = clip in-point. + """ + srt_path = output_dir / f"clip_{clip.clip_id}.srt" + aligned = clip_subtitle_words(transcript, clip) + lines = clip_words_to_srt_lines( + aligned.words, + max_words_per_cue=max_words_per_cue, + max_cue_sec=max_cue_sec, + ) + srt_path.write_text(format_srt(lines), encoding="utf-8") + logger.info("Generated SRT: %s (%d cues)", srt_path, len(lines)) + return srt_path + + +def generate_ass( + clip: Clip, + transcript: dict, + output_dir: Path, + *, + max_words_per_cue: int = 4, + max_cue_sec: float = 2.2, + play_res_x: int = 1080, + play_res_y: int = 1920, + font_size: int = 48, + margin_v: int = 160, + margin_h: int = 60, + font_name: str = "Arial", + render_theme: RenderTheme = RenderTheme.LEGACY, + native_highlight_lead_sec: float = _NATIVE_HIGHLIGHT_LEAD_SEC, + native_highlight_min_dwell_sec: float = _NATIVE_HIGHLIGHT_MIN_DWELL_SEC, + repair_word_timings: bool = True, +) -> Path: + """Generate an ASS caption file tuned for direct libass rendering. + + Unlike SRT β†’ libass (default PlayResY=288), an ASS file with + ``PlayResY = output_height`` means libass' scale factor is 1.0, so the + ``font_size`` / ``margin_v`` arguments below are honest output pixels. + + This is the root-cause fix for the "captions rendering in the middle of + the frame, four times too large" bug the user reported. 
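+
+    Concretely: with ``play_res_y=1920`` a ``margin_v`` of 160 is 160 real
+    output pixels from the bottom edge, whereas an SRT routed through libass
+    is laid out against the default 384x288 script resolution and then
+    rescaled onto the 1080x1920 canvas, which is where the oversized,
+    mispositioned captions came from.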
+ """ + ass_path = output_dir / f"clip_{clip.clip_id}.ass" + aligned = clip_subtitle_words(transcript, clip) + cue_words = max_words_per_cue + cue_sec = max_cue_sec + cue_font_size = font_size + cue_margin_v = margin_v + prefer_break_on_punctuation = False + min_words_before_break = 1 + if render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + cue_words = max(max_words_per_cue, 7) + cue_sec = max(max_cue_sec, 2.6) + cue_font_size = max(font_size, 52) + cue_margin_v = min(margin_v, 136) + prefer_break_on_punctuation = True + min_words_before_break = 5 + elif render_theme == RenderTheme.NATIVE_HIGHLIGHT: + cue_words = 8 + cue_sec = 2.4 + cue_font_size = max(font_size, 86) + cue_margin_v = max(margin_v, 300) + prefer_break_on_punctuation = True + min_words_before_break = 4 + + aligned_words = aligned.words + if render_theme == RenderTheme.NATIVE_HIGHLIGHT and repair_word_timings: + aligned_words = _repair_native_highlight_timings( + aligned_words, + clip_duration=clip.duration_sec, + ) + + cue_chunks = group_words_to_cue_chunks( + aligned_words, + max_words_per_cue=cue_words, + max_cue_sec=cue_sec, + prefer_break_on_punctuation=prefer_break_on_punctuation, + min_words_before_break=min_words_before_break, + ) + lines = [ + (chunk[0].start_time, chunk[-1].end_time, " ".join(word.word for word in chunk)) + for chunk in cue_chunks + ] + if render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + lines = [(start, end, _balance_reference_caption(text)) for start, end, text in lines] + ass_text = format_ass( + lines, + play_res_x=play_res_x, + play_res_y=play_res_y, + font_size=cue_font_size, + margin_v=cue_margin_v, + margin_h=margin_h, + font_name="Source Sans 3", + render_theme=render_theme, + ) + elif render_theme == RenderTheme.NATIVE_HIGHLIGHT: + ass_text = _format_native_highlight_ass( + cue_chunks, + play_res_x=play_res_x, + play_res_y=play_res_y, + font_size=cue_font_size, + margin_v=cue_margin_v, + font_name=_NATIVE_HIGHLIGHT_FONT_NAME, + highlight_lead_sec=native_highlight_lead_sec, + highlight_min_dwell_sec=native_highlight_min_dwell_sec, + ) + else: + ass_text = format_ass( + lines, + play_res_x=play_res_x, + play_res_y=play_res_y, + font_size=cue_font_size, + margin_v=cue_margin_v, + margin_h=margin_h, + font_name=font_name, + render_theme=render_theme, + ) + ass_path.write_text(ass_text, encoding="utf-8") + logger.info("Generated ASS: %s (%d cues)", ass_path, len(lines)) + return ass_path diff --git a/src/humeo/env.py b/src/humeo/env.py new file mode 100644 index 0000000000000000000000000000000000000000..63a64e3bd8b4e286d59b6fd3c948b7c31c494d81 --- /dev/null +++ b/src/humeo/env.py @@ -0,0 +1,130 @@ +"""Environment bootstrap (``.env``) and cache path helpers.""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Literal + +_BOOTSTRAPPED = False +LLMProvider = Literal["google", "openrouter"] +OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1" + + +def bootstrap_env() -> None: + """Load ``.env`` from the process cwd (non-fatal if missing). 
Safe to call twice.""" + global _BOOTSTRAPPED + if _BOOTSTRAPPED: + return + try: + from dotenv import load_dotenv + + load_dotenv() + except ImportError: + pass + _BOOTSTRAPPED = True + + +def default_humeo_cache_root() -> Path: + """Default cache root: ``~/.cache/humeo`` on Unix; ``%LOCALAPPDATA%/humeo`` on Windows.""" + override = (os.environ.get("HUMEO_CACHE_ROOT") or "").strip() + if override: + return Path(override) + if os.name == "nt": + base = Path(os.environ.get("LOCALAPPDATA", str(Path.home() / "AppData" / "Local"))) + return base / "humeo" + return Path.home() / ".cache" / "humeo" + + +def resolve_gemini_api_key() -> str: + """Return an API key for Gemini, or raise if none is configured. + + Prefer ``GOOGLE_API_KEY``; fall back to ``GEMINI_API_KEY``. Values are read from + the environment after ``bootstrap_env()`` (``.env`` in cwd). + + We require an explicit key so we do not fall back to Application Default + Credentials (e.g. ``gcloud auth application-default login``), which often + lack the Generative Language API scope and produce + ``403 ACCESS_TOKEN_SCOPE_INSUFFICIENT``. + """ + bootstrap_env() + for env_name in ("GOOGLE_API_KEY", "GEMINI_API_KEY"): + val = (os.environ.get(env_name) or "").strip() + if val: + return val + raise ValueError( + "Set GOOGLE_API_KEY or GEMINI_API_KEY for Gemini clip selection. " + "See docs/ENVIRONMENT.md. Without an API key the client may use ADC and fail " + "with insufficient scopes (403)." + ) + + +def resolve_openrouter_api_key() -> str: + """Return the OpenRouter API key, or raise if missing.""" + bootstrap_env() + val = (os.environ.get("OPENROUTER_API_KEY") or "").strip() + if val: + return val + raise ValueError( + "Set OPENROUTER_API_KEY to use OpenRouter as the backend for the Gemini stages. " + "See docs/ENVIRONMENT.md." + ) + + +def current_llm_provider() -> LLMProvider | None: + """Best-effort active backend detection from the environment. + + ``HUMEO_LLM_PROVIDER`` overrides key-based auto-detection when set. + """ + bootstrap_env() + forced = (os.environ.get("HUMEO_LLM_PROVIDER") or "auto").strip().lower() + if forced in ("google", "openrouter"): + return forced # type: ignore[return-value] + if (os.environ.get("GOOGLE_API_KEY") or "").strip(): + return "google" + if (os.environ.get("GEMINI_API_KEY") or "").strip(): + return "google" + if (os.environ.get("OPENROUTER_API_KEY") or "").strip(): + return "openrouter" + return None + + +def resolve_llm_provider() -> LLMProvider: + """Return the active backend for Gemini-like stages, or raise if none is configured.""" + provider = current_llm_provider() + if provider is not None: + if provider == "google": + resolve_gemini_api_key() + else: + resolve_openrouter_api_key() + return provider + raise ValueError( + "Set GOOGLE_API_KEY or GEMINI_API_KEY for the Google Gemini SDK, " + "or set OPENROUTER_API_KEY to route these stages through OpenRouter. " + "You can also force the backend with HUMEO_LLM_PROVIDER=google|openrouter." + ) + + +def model_name_for_provider(model_name: str, provider: LLMProvider) -> str: + """Normalize model identifiers between Google Gemini SDK and OpenRouter. + + - Google SDK expects bare Gemini ids like ``gemini-3.1-flash-lite-preview``. + - OpenRouter expects provider-qualified ids like + ``google/gemini-3.1-flash-lite-preview``. 
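+
+    For example, ``gemini-2.5-pro`` becomes ``google/gemini-2.5-pro`` when the
+    provider is ``openrouter``, and ``google/gemini-2.5-pro`` is reduced to
+    ``gemini-2.5-pro`` when the provider is ``google``.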
+ """ + name = model_name.strip() + if provider == "openrouter": + if "/" not in name and name.startswith(("gemini-", "gemma-")): + return f"google/{name}" + return name + if provider == "google" and name.startswith("google/"): + return name.split("/", 1)[1] + return name + + +def openrouter_default_headers() -> dict[str, str]: + """Headers that help identify Humeo traffic to OpenRouter.""" + return { + "HTTP-Referer": "https://github.com/frenzy2004/shortform", + "X-OpenRouter-Title": "Humeo", + } diff --git a/src/humeo/gemini_generate.py b/src/humeo/gemini_generate.py new file mode 100644 index 0000000000000000000000000000000000000000..847b07dd4efec0ed1def268449e4bd9990bc4682 --- /dev/null +++ b/src/humeo/gemini_generate.py @@ -0,0 +1,24 @@ +"""Shared ``GenerateContentConfig`` for product Gemini calls (KISS / DRY). + +Thinking knobs live here only β€” stages pass stage-specific fields +(temperature, ``response_mime_type``, ``system_instruction``, …). +""" + +from __future__ import annotations + +from typing import Any + +from google.genai import types + +_THINKING = types.ThinkingConfig( + thinking_budget=1024, + include_thoughts=True, +) + + +def gemini_generate_config(**kwargs: Any) -> types.GenerateContentConfig: + """Return config with thinking enabled; ``kwargs`` are merged as-is.""" + return types.GenerateContentConfig( + thinking_config=_THINKING, + **kwargs, + ) diff --git a/src/humeo/hook_detector.py b/src/humeo/hook_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..ff5f851f541ae1593b5903a86784599ca7b2e610 --- /dev/null +++ b/src/humeo/hook_detector.py @@ -0,0 +1,574 @@ +"""Stage 2.25 - Hook detection. + +The clip-selection LLM returns a ``hook_start_sec`` / ``hook_end_sec`` pair +per clip, but in practice it almost always echoes the ``[0.0, 3.0]`` +placeholder from the prompt instead of localising the real hook sentence. +That placeholder is toxic to Stage 2.5 pruning -- the clamp refuses to +trim past ``hook_start_sec``, so every ``trim_start_sec > 0`` the pruner +returns gets zeroed out silently. + +This module is a dedicated Stage 2.25 that runs between clip selection and +content pruning. For each clip it: + +1. Prepares a clip-relative segment listing (same format as pruning uses). +2. Asks Gemini, in one batched JSON call, to localise the hook sentence of + every clip with `hook_start_sec`, `hook_end_sec`, `hook_text`, `reason`. +3. Validates the returned window against the clip's duration + the "real + hook" heuristics, then overwrites ``clip.hook_start_sec`` / + ``clip.hook_end_sec`` on a copy of the clip. + +The stage is: + +- **Cached** (``hooks.json`` / ``hooks.meta.json`` in ``work_dir``) on + ``transcript_sha256 + clips_sha256 + gemini_model``. +- **Never fatal.** Any failure (API error, malformed JSON, clip not + returned, window that still looks like the 0.0-3.0 placeholder) falls + back to the original clip with its original hook -- pruning will then + skip hook protection via the fingerprint guard in + :func:`humeo.content_pruning._looks_like_default_hook`. + +The stage writes three artifacts to ``work_dir`` for audit: + +- ``hooks.meta.json``: cache key (version, fingerprints, model). +- ``hooks.json``: structured per-clip hook windows actually applied. +- ``hooks_raw.json``: verbatim Gemini response text (for prompt tuning). 
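+
+A returned window only replaces the selector's hook when it passes
+``_validate_hook_window``: a 0.00-3.00s answer is rejected as the placeholder
+fingerprint, and windows shorter than ~1s or longer than ~10s are rejected by
+the duration bounds, so the clip keeps its original hook metadata in those
+cases.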
+""" + +from __future__ import annotations + +import hashlib +import json +import logging +import time +from pathlib import Path +from typing import Any, Callable, TypeVar + +from google import genai +from openai import OpenAI +from pydantic import BaseModel, Field, ValidationError + +from humeo_core.schemas import Clip + +from humeo.config import GEMINI_MODEL, PipelineConfig +from humeo.content_pruning import _looks_like_default_hook, _segments_within_clip +from humeo.env import ( + OPENROUTER_BASE_URL, + current_llm_provider, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.gemini_generate import gemini_generate_config +from humeo.hook_library import ( + format_hook_examples, + hook_library_fingerprint, + resolve_hook_library_path, + retrieve_hook_examples, +) +from humeo.prompt_loader import hook_detection_system_prompt + +logger = logging.getLogger(__name__) + +T = TypeVar("T") + +HOOK_META_VERSION = 2 +HOOK_META_FILENAME = "hooks.meta.json" +HOOK_ARTIFACT_FILENAME = "hooks.json" +HOOK_RAW_FILENAME = "hooks_raw.json" + +LLM_MAX_ATTEMPTS = 3 +LLM_RETRY_DELAY_SEC = 2.0 + +# Hook window validation thresholds. The prompt asks for 1.5-7.0s windows; +# we enforce 1.0-10.0s to be lenient on rounding while still rejecting +# obvious "LLM returned the whole paragraph" mistakes. +_MIN_HOOK_DURATION_SEC = 1.0 +_MAX_HOOK_DURATION_SEC = 10.0 + + +def _openai_message_text(content: object) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + + +class _HookDecision(BaseModel): + """Per-clip hook window returned by Gemini (clip-relative seconds).""" + + clip_id: str + hook_start_sec: float = Field(ge=0.0) + hook_end_sec: float = Field(ge=0.0) + hook_text: str = "" + reason: str = "" + + +class _HookResponse(BaseModel): + hooks: list[_HookDecision] = Field(default_factory=list) + + +def _retry_llm(name: str, fn: Callable[[], T], attempts: int = LLM_MAX_ATTEMPTS) -> T: + last: Exception | None = None + for i in range(attempts): + try: + return fn() + except Exception as e: # noqa: BLE001 - rethrown below + last = e + if i < attempts - 1: + logger.warning("%s attempt %d/%d failed: %s", name, i + 1, attempts, e) + time.sleep(LLM_RETRY_DELAY_SEC * (i + 1)) + assert last is not None + raise last + + +# --------------------------------------------------------------------------- +# Prompt construction +# --------------------------------------------------------------------------- + + +def _build_user_message(clips: list[Clip], transcript: dict) -> str: + """Render clip-relative segments + selector-guessed hook text for each clip.""" + blocks: list[str] = [] + for clip in clips: + segs = _segments_within_clip(transcript, clip) + header_lines = [ + f"clip_id: {clip.clip_id}", + f"duration_sec: {clip.duration_sec:.2f}", + f"topic: {clip.topic}", + ] + if clip.viral_hook: + header_lines.append(f"viral_hook_text: {clip.viral_hook}") + if clip.hook_start_sec is not None and clip.hook_end_sec is not None: + header_lines.append( + f"selector_hook_window_sec: [{clip.hook_start_sec:.2f}, " + f"{clip.hook_end_sec:.2f}] (may be a placeholder; verify)" + ) + header = "\n".join(header_lines) + body = "\n".join( + f"[{seg['start']:.2f}s - {seg['end']:.2f}s] {seg['text']}" 
for seg in segs + ) + if not body: + body = "(no segments overlap this clip window)" + blocks.append(f"{header}\n---\n{body}") + return "\n\n===\n\n".join(blocks) + + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + + +def _validate_hook_window( + clip: Clip, hook_start: float, hook_end: float +) -> tuple[float, float] | None: + """Return a valid (hook_start, hook_end) or None if rejected. + + Rules: + - ``0 <= hook_start < hook_end <= duration_sec`` + - hook duration between ``_MIN_HOOK_DURATION_SEC`` and ``_MAX_HOOK_DURATION_SEC`` + - NOT the ``(0.0, 3.0)`` placeholder fingerprint (we'd rather keep the + selector's value untouched than re-apply the same fake hook). + """ + if hook_start < 0.0 or hook_end <= hook_start: + return None + if hook_end > clip.duration_sec + 1e-3: + # Clamp trailing rounding to duration; reject anything beyond. + if hook_end - clip.duration_sec > 0.5: + return None + hook_end = clip.duration_sec + dur = hook_end - hook_start + if dur < _MIN_HOOK_DURATION_SEC or dur > _MAX_HOOK_DURATION_SEC: + return None + if _looks_like_default_hook(hook_start, hook_end): + return None + return float(hook_start), float(hook_end) + + +# --------------------------------------------------------------------------- +# Apply decisions -> new clips +# --------------------------------------------------------------------------- + + +def apply_hook_decisions( + clips: list[Clip], + decisions: list[_HookDecision], +) -> list[Clip]: + """Return new clips whose hook fields reflect validated decisions. + + Clips without a matching valid decision are returned unchanged (their + original hook metadata, placeholder or not, is preserved). 
+ """ + by_id = {d.clip_id: d for d in decisions} + out: list[Clip] = [] + changed = 0 + rejected = 0 + for clip in clips: + d = by_id.get(clip.clip_id) + if d is None: + out.append(clip) + continue + validated = _validate_hook_window(clip, d.hook_start_sec, d.hook_end_sec) + if validated is None: + logger.info( + "Clip %s: rejected hook window [%.2f, %.2f] (failed validation); " + "keeping selector hook.", + clip.clip_id, + d.hook_start_sec, + d.hook_end_sec, + ) + rejected += 1 + out.append(clip) + continue + hs, he = validated + if ( + clip.hook_start_sec is not None + and clip.hook_end_sec is not None + and abs(clip.hook_start_sec - hs) < 1e-3 + and abs(clip.hook_end_sec - he) < 1e-3 + ): + out.append(clip) + continue + changed += 1 + logger.info( + "Clip %s: hook set to [%.2f, %.2f] (was [%s, %s]) -- %s", + clip.clip_id, + hs, + he, + f"{clip.hook_start_sec:.2f}" if clip.hook_start_sec is not None else "None", + f"{clip.hook_end_sec:.2f}" if clip.hook_end_sec is not None else "None", + d.reason[:120] if d.reason else "(no reason)", + ) + out.append( + clip.model_copy(update={"hook_start_sec": hs, "hook_end_sec": he}) + ) + logger.info( + "Hook detection: updated %d / %d clips (%d rejected, %d kept as-is).", + changed, + len(clips), + rejected, + len(clips) - changed - rejected, + ) + return out + + +# --------------------------------------------------------------------------- +# Cache +# --------------------------------------------------------------------------- + + +def _clips_fingerprint(clips: list[Clip]) -> str: + payload = json.dumps( + [ + {"id": c.clip_id, "s": round(c.start_time_sec, 3), "e": round(c.end_time_sec, 3)} + for c in clips + ], + sort_keys=True, + ensure_ascii=False, + ) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def _resolved_gemini_model(config: PipelineConfig) -> str: + return (config.gemini_model or GEMINI_MODEL).strip() + + +def _hook_meta( + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, +) -> dict[str, Any]: + return { + "version": HOOK_META_VERSION, + "transcript_sha256": transcript_fp, + "clips_sha256": clips_fp, + "gemini_model": _resolved_gemini_model(config), + "llm_backend": current_llm_provider() or "google", + "hook_library_sha256": hook_library_fingerprint(resolve_hook_library_path(config)), + } + + +def _hook_cache_valid( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + config: PipelineConfig, +) -> bool: + meta_path = work_dir / HOOK_META_FILENAME + if not meta_path.is_file(): + return False + try: + with open(meta_path, encoding="utf-8") as f: + meta = json.load(f) + except Exception: + return False + if meta.get("version") != HOOK_META_VERSION: + return False + if meta.get("transcript_sha256") != transcript_fp: + return False + if meta.get("clips_sha256") != clips_fp: + return False + current_provider = current_llm_provider() + meta_provider = meta.get("llm_backend") + if current_provider == "openrouter": + if meta_provider != "openrouter": + return False + elif current_provider == "google": + if meta_provider not in (None, "google"): + return False + if meta.get("gemini_model") != _resolved_gemini_model(config): + return False + if meta.get("hook_library_sha256", "") != hook_library_fingerprint(resolve_hook_library_path(config)): + return False + return True + + +def _load_cached_hooks( + work_dir: Path, clips: list[Clip] +) -> list[Clip] | None: + artifact = work_dir / HOOK_ARTIFACT_FILENAME + if not artifact.is_file(): + return None + try: + with open(artifact, "r", encoding="utf-8") 
as f: + data = json.load(f) + cached = {item["clip_id"]: item for item in data.get("hooks", [])} + except Exception as e: # noqa: BLE001 - surfaced as warning below + logger.warning("Hook cache artifact unreadable (%s); re-running.", e) + return None + out: list[Clip] = [] + for clip in clips: + c = cached.get(clip.clip_id) + if c is None: + out.append(clip) + continue + hs = c.get("hook_start_sec") + he = c.get("hook_end_sec") + if hs is None or he is None: + out.append(clip) + continue + out.append( + clip.model_copy( + update={"hook_start_sec": float(hs), "hook_end_sec": float(he)} + ) + ) + return out + + +def _write_cache( + work_dir: Path, + *, + clips_with_hooks: list[Clip], + decisions: list[_HookDecision], + meta: dict[str, Any], + raw_response: str, +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + reasons = {d.clip_id: d for d in decisions} + payload = { + "hooks": [ + { + "clip_id": c.clip_id, + "hook_start_sec": c.hook_start_sec, + "hook_end_sec": c.hook_end_sec, + "hook_text": (reasons.get(c.clip_id).hook_text if reasons.get(c.clip_id) else ""), + "reason": (reasons.get(c.clip_id).reason if reasons.get(c.clip_id) else ""), + } + for c in clips_with_hooks + ] + } + (work_dir / HOOK_ARTIFACT_FILENAME).write_text( + json.dumps(payload, indent=2), encoding="utf-8" + ) + (work_dir / HOOK_RAW_FILENAME).write_text(raw_response, encoding="utf-8") + with open(work_dir / HOOK_META_FILENAME, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + f.write("\n") + logger.info( + "Wrote %s, %s and %s", + HOOK_META_FILENAME, + HOOK_ARTIFACT_FILENAME, + HOOK_RAW_FILENAME, + ) + + +# --------------------------------------------------------------------------- +# Gemini call +# --------------------------------------------------------------------------- + + +def _parse_decisions(raw_json: str) -> list[_HookDecision]: + data = json.loads(raw_json) + if isinstance(data, dict) and "hooks" in data: + try: + return _HookResponse.model_validate(data).hooks + except ValidationError as e: + logger.warning("Hook response failed validation: %s", e) + return [] + if isinstance(data, list): + out: list[_HookDecision] = [] + for item in data: + try: + out.append(_HookDecision.model_validate(item)) + except ValidationError: + continue + return out + return [] + + +def request_hook_decisions( + clips: list[Clip], + transcript: dict, + *, + gemini_model: str | None = None, + hook_library_path: Path | None = None, +) -> tuple[list[_HookDecision], str]: + """Ask Gemini to localise the hook sentence for each clip. + + Returns ``(decisions, raw_response)``. ``raw_response`` is the literal + JSON text from Gemini (cached to ``hooks_raw.json`` for audit). On + transport/parse failure this raises; callers should catch and treat as + no-op. 
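+
+    The raw text is expected to parse into ``_HookResponse``; an illustrative
+    response (values invented for the example) looks like:
+
+        {"hooks": [{"clip_id": "clip_01", "hook_start_sec": 2.1,
+                    "hook_end_sec": 6.4, "hook_text": "...", "reason": "..."}]}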
+ """ + if not clips: + return [], '{"hooks": []}' + + example_query = " ".join( + filter(None, [*(clip.topic for clip in clips[:4]), *(clip.viral_hook for clip in clips[:4])]) + ) + hook_examples = format_hook_examples( + retrieve_hook_examples(example_query, path=hook_library_path, limit=8) + ) + system = hook_detection_system_prompt(hook_examples=hook_examples) + user_text = _build_user_message(clips, transcript) + + provider = resolve_llm_provider() + model_name = model_name_for_provider((gemini_model or GEMINI_MODEL).strip(), provider) + + def _call() -> str: + logger.info( + "%s hook detection (model=%s, clips=%d)...", provider, model_name, len(clips) + ) + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=model_name, + contents=user_text, + config=gemini_generate_config( + system_instruction=system, + temperature=0.2, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini returned empty response text for hook detection") + return response.text + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user_text}, + ], + temperature=0.2, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter returned empty response text for hook detection") + return text + + raw = _retry_llm("Gemini hook detection", _call) + decisions = _parse_decisions(raw) + return decisions, raw + + +# --------------------------------------------------------------------------- +# Public stage entrypoint +# --------------------------------------------------------------------------- + + +def run_hook_detection_stage( + work_dir: Path, + clips: list[Clip], + transcript: dict, + *, + transcript_fp: str, + config: PipelineConfig, +) -> list[Clip]: + """Run Stage 2.25 hook detection and return clips with localised hooks. + + - Disabled (``config.detect_hooks is False``): return clips unchanged. + - Cache hit: read ``hooks.json`` and apply cached windows. + - LLM failure: log a warning and return clips unchanged. The downstream + content pruner's fingerprint guard will treat any remaining placeholder + hooks as "no hook" so pruning still runs. + """ + if not config.detect_hooks: + logger.info("Hook detection disabled (detect_hooks=False); skipping Stage 2.25.") + return clips + if not clips: + return clips + + clips_fp = _clips_fingerprint(clips) + + if not config.force_hook_detection and _hook_cache_valid( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + config=config, + ): + cached = _load_cached_hooks(work_dir, clips) + if cached is not None: + logger.info( + "Hook detection cache hit (%d clips); skipping LLM.", len(clips) + ) + return cached + + try: + decisions, raw = request_hook_decisions( + clips, + transcript, + gemini_model=config.gemini_model, + hook_library_path=resolve_hook_library_path(config), + ) + except Exception as e: # noqa: BLE001 - pipeline must not die here + logger.warning( + "Hook detection call failed (%s); continuing with selector hooks. 
" + "Content pruning will treat any [0.0, 3.0] placeholder as 'no hook'.", + e, + ) + return clips + + updated = apply_hook_decisions(clips, decisions) + + meta = _hook_meta( + transcript_fp=transcript_fp, clips_fp=clips_fp, config=config + ) + try: + _write_cache( + work_dir, + clips_with_hooks=updated, + decisions=decisions, + meta=meta, + raw_response=raw, + ) + except Exception as e: # noqa: BLE001 - cache failure is not fatal + logger.warning("Failed to write hook cache (%s); continuing.", e) + + return updated diff --git a/src/humeo/hook_library.py b/src/humeo/hook_library.py new file mode 100644 index 0000000000000000000000000000000000000000..19da6a94a81027aef80e12ba1217694361a3b675 --- /dev/null +++ b/src/humeo/hook_library.py @@ -0,0 +1,193 @@ +"""Parse and retrieve viral hook examples from a local zip or directory.""" + +from __future__ import annotations + +import hashlib +import os +import re +import zipfile +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +from humeo.config import PipelineConfig + +_ENTRY_RE = re.compile( + r"^\s*\d+\.\s*Hook:\s*(?P.+?)
Example:\s*(?P.+?)
Psychology:\s*(?P.+?)\s*$", + re.IGNORECASE, +) +_TOKEN_RE = re.compile(r"[a-z0-9']+") + + +@dataclass(frozen=True) +class HookExample: + category: str + hook: str + example: str + psychology: str + + +_LIB_CACHE: dict[str, list[HookExample]] = {} + + +def resolve_hook_library_path(config: PipelineConfig | None = None) -> Path | None: + if config is not None and config.hook_library_path is not None: + return Path(config.hook_library_path) + raw = (os.environ.get("HUMEO_HOOK_LIBRARY_PATH") or "").strip() + if raw: + return Path(raw).expanduser() + return None + + +def require_hook_library_path(config: PipelineConfig | None = None) -> Path: + path = resolve_hook_library_path(config) + if path is None: + raise FileNotFoundError( + "HUMEO_HOOK_LIBRARY_PATH is required for the hook retrieval workflow." + ) + if not path.exists(): + raise FileNotFoundError(f"Hook library path does not exist: {path}") + return path + + +def hook_library_fingerprint(path: Path | None) -> str: + if path is None: + return "" + if not path.exists(): + return "" + hasher = hashlib.sha256() + if path.is_file(): + hasher.update(path.read_bytes()) + return hasher.hexdigest() + + for md_path in sorted(p for p in path.rglob("*.md") if p.is_file()): + hasher.update(str(md_path.relative_to(path)).encode("utf-8")) + hasher.update(md_path.read_bytes()) + return hasher.hexdigest() + + +def _tokenize(text: str) -> set[str]: + return {m.group(0) for m in _TOKEN_RE.finditer(text.lower()) if len(m.group(0)) > 2} + + +def _ordered_tokens(text: str) -> list[str]: + return [m.group(0) for m in _TOKEN_RE.finditer(text.lower()) if len(m.group(0)) > 2] + + +def _iter_markdown_files(path: Path) -> Iterable[tuple[str, str]]: + if path.is_file(): + with zipfile.ZipFile(path) as zf: + for name in sorted(n for n in zf.namelist() if n.endswith(".md")): + yield name, zf.read(name).decode("utf-8", errors="replace") + return + + for md_path in sorted(p for p in path.rglob("*.md") if p.is_file()): + yield str(md_path.relative_to(path)).replace("\\", "/"), md_path.read_text( + encoding="utf-8", errors="replace" + ) + + +def _category_from_name(name: str) -> str: + stem = Path(name).stem + stem = stem.replace("_Hooks", "").replace("_", " ").strip() + return stem + + +def _parse_examples(path: Path) -> list[HookExample]: + examples: list[HookExample] = [] + for name, content in _iter_markdown_files(path): + category = _category_from_name(name) + for raw_line in content.splitlines(): + line = raw_line.strip() + if not line or not line[0].isdigit(): + continue + match = _ENTRY_RE.match(line) + if not match: + continue + examples.append( + HookExample( + category=category, + hook=match.group("hook").strip(), + example=match.group("example").strip(), + psychology=match.group("psychology").strip(), + ) + ) + return examples + + +def load_hook_library(path: Path | None) -> list[HookExample]: + if path is None: + return [] + fingerprint = hook_library_fingerprint(path) + if not fingerprint: + return [] + cached = _LIB_CACHE.get(fingerprint) + if cached is not None: + return cached + parsed = _parse_examples(path) + _LIB_CACHE[fingerprint] = parsed + return parsed + + +def retrieve_hook_examples( + query_text: str, + *, + topic: str = "", + path: Path | None, + limit: int = 8, +) -> list[HookExample]: + items = load_hook_library(path) + if not items: + return [] + + query_tokens = _tokenize(f"{topic} {query_text}") + query_phrases = [ + " ".join(pair) + for pair in zip(_ordered_tokens(f"{topic} {query_text}"), _ordered_tokens(f"{topic} {query_text}")[1:]) + 
] + if not query_tokens: + return items[:limit] + + scored: list[tuple[tuple[int, int, int], HookExample]] = [] + for item in items: + hook_tokens = _tokenize(item.hook) + example_tokens = _tokenize(item.example) + category_tokens = _tokenize(item.category) + hook_overlap = len(query_tokens & hook_tokens) + example_overlap = len(query_tokens & example_tokens) + category_overlap = len(query_tokens & category_tokens) + overlap = hook_overlap + example_overlap + category_overlap + if overlap == 0: + continue + psychology_overlap = len(query_tokens & _tokenize(item.psychology)) + phrase_bonus = sum(1 for phrase in query_phrases if phrase in item.example.lower()) + scored.append( + ( + ( + phrase_bonus * 5 + example_overlap * 3 + hook_overlap + category_overlap, + phrase_bonus, + example_overlap, + category_overlap + psychology_overlap, + ), + item, + ) + ) + + if not scored: + return items[:limit] + + scored.sort(key=lambda pair: pair[0], reverse=True) + return [item for _, item in scored[:limit]] + + +def format_hook_examples(examples: list[HookExample]) -> str: + if not examples: + return "" + lines: list[str] = [] + for idx, item in enumerate(examples, start=1): + lines.append( + f"{idx}. [{item.category}] Hook: {item.hook}\n" + f" Example: {item.example}\n" + f" Psychology: {item.psychology}" + ) + return "\n".join(lines) diff --git a/src/humeo/ingest.py b/src/humeo/ingest.py new file mode 100644 index 0000000000000000000000000000000000000000..edbfb2a6f221acd4639f140b14f2a1f66ccf7279 --- /dev/null +++ b/src/humeo/ingest.py @@ -0,0 +1,564 @@ +""" +Step 1 - Ingestion: Download video and generate word-level transcript. + +Responsibilities: + - Download source video from YouTube using yt-dlp. + - Extract audio track for transcription. + - Generate word-level timestamped transcript. +""" + +import json +import logging +import os +import shutil +import subprocess +from math import ceil +from pathlib import Path + +import httpx + +from humeo.video_cache import local_source_matches, write_local_source_info + +logger = logging.getLogger(__name__) + +OPENAI_MAX_UPLOAD_BYTES = 25 * 1024 * 1024 +OPENAI_TARGET_UPLOAD_BYTES = 20 * 1024 * 1024 +OPENAI_MIN_CHUNK_SEC = 300.0 +ELEVENLABS_TRANSCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text" +TRANSCRIPT_META_FILENAME = "transcript.meta.json" +ELEVENLABS_SCRIBE_MODEL = "scribe_v2" +_ELEVENLABS_SEGMENT_MAX_GAP_SEC = 0.65 +_ELEVENLABS_SEGMENT_MAX_DURATION_SEC = 6.0 +_ELEVENLABS_SEGMENT_MAX_WORDS = 18 + + +def stage_local_video(source: str | Path, output_dir: Path) -> Path: + """ + Copy a local source video into ``output_dir/source.mp4`` for cacheable reruns. 
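+
+    Illustrative call (paths assumed): ``stage_local_video("~/talks/episode.mp4",
+    Path(".humeo_work/ep1"))`` copies the file to ``.humeo_work/ep1/source.mp4``,
+    records the original path for cache checks, and returns the staged path.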
+ """ + source_path = Path(source).expanduser().resolve(strict=False) + if not source_path.is_file(): + raise FileNotFoundError(f"Local source video does not exist: {source_path}") + + output_dir.mkdir(parents=True, exist_ok=True) + staged_path = output_dir / "source.mp4" + staged_resolved = staged_path.resolve(strict=False) + + if source_path == staged_resolved: + logger.info("Using local source video in place: %s", source_path) + write_local_source_info(output_dir, source_path) + return staged_path + + if staged_path.exists() and local_source_matches(output_dir, str(source_path)): + logger.info("Local source already staged at: %s", staged_path) + return staged_path + + if source_path.suffix.lower() != ".mp4": + logger.warning( + "Local source uses %s; staging it as source.mp4 anyway.", + source_path.suffix or "", + ) + + action = "Replacing" if staged_path.exists() else "Staging" + logger.info("%s local video: %s -> %s", action, source_path, staged_path) + shutil.copy2(source_path, staged_path) + write_local_source_info(output_dir, source_path) + return staged_path + + +def download_video(youtube_url: str, output_dir: Path) -> Path: + """ + Download the best quality video+audio from YouTube. + + Returns the path to the downloaded MP4 file. + """ + output_template = str(output_dir / "source.%(ext)s") + cmd = [ + "yt-dlp", + "--format", "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best", + "--merge-output-format", "mp4", + "--output", output_template, + "--no-playlist", + "--write-info-json", + "--quiet", + youtube_url, + ] + + logger.info("Downloading video: %s", youtube_url) + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + if result.stderr: + logger.warning(result.stderr.strip()) + + # yt-dlp should produce source.mp4 + video_path = output_dir / "source.mp4" + if not video_path.exists(): + # Fallback: find any mp4 in the output dir + mp4_files = list(output_dir.glob("source.*")) + if mp4_files: + video_path = mp4_files[0] + else: + raise FileNotFoundError(f"Download failed - no output found in {output_dir}") + + logger.info("Downloaded to: %s", video_path) + return video_path + + +def extract_audio(video_path: Path, output_dir: Path) -> Path: + """ + Extract audio track from video as WAV (required by most ASR models). 
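+
+    Equivalent command line (paths illustrative):
+
+        ffmpeg -y -i source.mp4 -vn -acodec pcm_s16le -ar 16000 -ac 1 source_audio.wav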
+ """ + audio_path = output_dir / "source_audio.wav" + cmd = [ + "ffmpeg", "-y", + "-i", str(video_path), + "-vn", # no video + "-acodec", "pcm_s16le", # raw PCM + "-ar", "16000", # 16kHz sample rate (standard for ASR) + "-ac", "1", # mono + str(audio_path), + ] + + logger.info("Extracting audio to: %s", audio_path) + subprocess.run(cmd, check=True, capture_output=True) + return audio_path + + +def _resolve_elevenlabs_api_key() -> str: + key = (os.environ.get("ELEVENLABS_API_KEY") or "").strip() + if key: + return key + raise ValueError("Set ELEVENLABS_API_KEY to use ElevenLabs Scribe v2 transcription.") + + +def _elevenlabs_no_verbatim_enabled() -> bool: + raw = (os.environ.get("ELEVENLABS_NO_VERBATIM") or "true").strip().lower() + return raw not in {"0", "false", "no", "off"} + + +def resolved_transcribe_settings() -> dict[str, object]: + provider = (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "elevenlabs").strip().lower() + if provider in ("", "auto"): + if (os.environ.get("ELEVENLABS_API_KEY") or "").strip(): + provider = "elevenlabs" + else: + provider = "openai" + + if provider in ("api",): + provider = "openai" + if provider in ("local",): + provider = "whisperx" + + settings: dict[str, object] = {"provider": provider} + if provider == "elevenlabs": + settings.update( + { + "model_id": ELEVENLABS_SCRIBE_MODEL, + "no_verbatim": _elevenlabs_no_verbatim_enabled(), + } + ) + return settings + + +def transcript_cache_valid(output_dir: Path) -> bool: + transcript_path = output_dir / "transcript.json" + meta_path = output_dir / TRANSCRIPT_META_FILENAME + if not transcript_path.is_file() or not meta_path.is_file(): + return False + try: + meta = json.loads(meta_path.read_text(encoding="utf-8")) + except Exception: + return False + return meta == resolved_transcribe_settings() + + +def _write_transcript(output_dir: Path, transcript: dict) -> None: + transcript_path = output_dir / "transcript.json" + with open(transcript_path, "w", encoding="utf-8") as f: + json.dump(transcript, f, indent=2, ensure_ascii=False) + with open(output_dir / TRANSCRIPT_META_FILENAME, "w", encoding="utf-8") as f: + json.dump(resolved_transcribe_settings(), f, indent=2, ensure_ascii=False) + f.write("\n") + + +def _normalize_elevenlabs_word(raw_word: dict) -> dict | None: + if not isinstance(raw_word, dict): + return None + if str(raw_word.get("type", "word")).strip().lower() not in {"word", ""}: + return None + text = str(raw_word.get("text", raw_word.get("word", ""))).strip() + if not text: + return None + try: + start = float(raw_word["start"]) + end = float(raw_word["end"]) + except (KeyError, TypeError, ValueError): + return None + if end <= start: + return None + return {"word": text, "start": start, "end": end} + + +def _segment_words_into_transcript(words: list[dict], *, language: str) -> dict: + segments: list[dict] = [] + chunk: list[dict] = [] + + def flush() -> None: + if not chunk: + return + segments.append( + { + "start": chunk[0]["start"], + "end": chunk[-1]["end"], + "text": " ".join(str(word["word"]) for word in chunk).strip(), + "words": list(chunk), + } + ) + chunk.clear() + + for word in words: + if chunk: + gap = float(word["start"]) - float(chunk[-1]["end"]) + dur = float(word["end"]) - float(chunk[0]["start"]) + if ( + gap >= _ELEVENLABS_SEGMENT_MAX_GAP_SEC + or dur >= _ELEVENLABS_SEGMENT_MAX_DURATION_SEC + or len(chunk) >= _ELEVENLABS_SEGMENT_MAX_WORDS + ): + flush() + chunk.append(word) + flush() + return {"segments": segments, "language": language} + + +def 
_normalize_elevenlabs_response(data: dict) -> dict: + words = [ + word + for raw_word in data.get("words", []) or [] + if (word := _normalize_elevenlabs_word(raw_word)) is not None + ] + language = str( + data.get("language_code") or data.get("language") or "en" + ).strip() or "en" + return _segment_words_into_transcript(words, language=language) + + +def _transcribe_elevenlabs_scribe(audio_path: Path) -> dict: + headers = {"xi-api-key": _resolve_elevenlabs_api_key()} + form = { + "model_id": ELEVENLABS_SCRIBE_MODEL, + "timestamps_granularity": "word", + "diarize": "false", + "tag_audio_events": "false", + "file_format": "pcm_s16le_16", + "no_verbatim": "true" if _elevenlabs_no_verbatim_enabled() else "false", + } + with audio_path.open("rb") as handle: + files = {"file": (audio_path.name, handle, "audio/wav")} + response = httpx.post( + ELEVENLABS_TRANSCRIBE_URL, + headers=headers, + data=form, + files=files, + timeout=600.0, + ) + response.raise_for_status() + return _normalize_elevenlabs_response(response.json()) + + +def _transcribe_whisperx_local(audio_path: Path) -> dict: + """Word-level transcript via WhisperX (local). Raises ImportError if not installed.""" + import whisperx + + logger.info("Transcribing with WhisperX...") + device = "cpu" # Use "cuda" if GPU available + model = whisperx.load_model("base", device=device, compute_type="int8") + audio = whisperx.load_audio(str(audio_path)) + result = model.transcribe(audio, batch_size=16) + + align_model, metadata = whisperx.load_align_model( + language_code=result["language"], device=device + ) + result = whisperx.align( + result["segments"], align_model, metadata, audio, device, + return_char_alignments=False, + ) + + logger.info("Transcription complete: %d segments", len(result["segments"])) + return result + + +def transcribe_whisperx(audio_path: Path, output_dir: Path) -> dict: + """ + Transcribe audio for word-level timestamps. + + Provider is controlled by **HUMEO_TRANSCRIBE_PROVIDER** (default ``auto``): + + - ``auto`` β€” WhisperX if installed, else OpenAI Whisper API. + - ``openai`` / ``api`` β€” OpenAI Whisper API (uses ``OPENAI_API_KEY``), even when WhisperX is installed. + - ``whisperx`` / ``local`` β€” WhisperX only; fails clearly if not installed. + + The result is written to ``output_dir / "transcript.json"``. Re-runs with an + existing transcript are skipped by the pipeline before this function runs. + """ + settings = resolved_transcribe_settings() + provider = str(settings["provider"]) + + if provider == "elevenlabs": + logger.info( + "Transcribing with ElevenLabs Scribe v2 (no_verbatim=%s).", + bool(settings.get("no_verbatim", False)), + ) + result = _transcribe_elevenlabs_scribe(audio_path) + elif provider == "openai": + logger.info( + "Transcribing with OpenAI Whisper API (HUMEO_TRANSCRIBE_PROVIDER=%s).", + provider, + ) + result = _transcribe_openai_api(audio_path) + elif provider == "whisperx": + try: + result = _transcribe_whisperx_local(audio_path) + except ImportError as e: + raise RuntimeError( + "WhisperX requested (HUMEO_TRANSCRIBE_PROVIDER=whisperx) but whisperx is not installed. " + "Install with: uv sync --extra whisper" + ) from e + else: + raise RuntimeError( + f"Unknown HUMEO_TRANSCRIBE_PROVIDER={provider!r}. " + "Use elevenlabs, openai, or whisperx." + ) + + _write_transcript(output_dir, result) + + return result + + +def _transcribe_openai_api(audio_path: Path) -> dict: + """ + Fallback transcription using OpenAI's Whisper API. + Requires OPENAI_API_KEY environment variable. 
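+
+    Worked example for the chunking below (numbers illustrative): a 60 MB
+    16 kHz mono PCM WAV (~33 minutes) exceeds the 25 MB upload limit, so the
+    planner targets ~20 MB per chunk; 60 / 20 gives 3 roughly equal ~11-minute
+    chunks, each transcribed separately and then merged with time offsets.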
+ """ + from openai import OpenAI + + client = OpenAI() + + work_dir = audio_path.parent / "openai_transcribe" + work_dir.mkdir(parents=True, exist_ok=True) + duration_sec = _probe_media_duration(audio_path) + chunk_ranges = _plan_openai_chunk_ranges( + duration_sec=duration_sec, + file_size_bytes=audio_path.stat().st_size, + ) + + if len(chunk_ranges) == 1: + return _transcribe_openai_file(client, audio_path) + + logger.info("Audio exceeds OpenAI upload limit; transcribing in %d chunks.", len(chunk_ranges)) + chunk_transcripts: list[dict] = [] + for idx, (offset_sec, chunk_duration_sec) in enumerate(chunk_ranges, start=1): + chunk_path = work_dir / f"{audio_path.stem}_part_{idx:03d}.wav" + if not chunk_path.exists(): + _extract_openai_audio_chunk( + input_path=audio_path, + output_path=chunk_path, + offset_sec=offset_sec, + duration_sec=chunk_duration_sec, + ) + logger.info( + "Transcribing chunk %d/%d (%.1fs-%.1fs)", + idx, + len(chunk_ranges), + offset_sec, + offset_sec + chunk_duration_sec, + ) + chunk_transcript = _transcribe_openai_file(client, chunk_path) + chunk_transcripts.append(_offset_transcript_timestamps(chunk_transcript, offset_sec)) + + return _merge_transcripts(chunk_transcripts) + + +def _extract_openai_audio_chunk( + input_path: Path, + output_path: Path, + offset_sec: float, + duration_sec: float, +) -> Path: + cmd = [ + "ffmpeg", + "-y", + "-loglevel", + "error", + "-ss", + f"{offset_sec:.3f}", + "-t", + f"{duration_sec:.3f}", + "-i", + str(input_path), + "-vn", + "-acodec", + "pcm_s16le", + "-ac", + "1", + "-ar", + "16000", + str(output_path), + ] + subprocess.run(cmd, check=True, capture_output=True) + return output_path + + +def _probe_media_duration(media_path: Path) -> float: + cmd = [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "json", + str(media_path), + ] + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + data = json.loads(result.stdout) + return float(data["format"]["duration"]) + + +def _plan_openai_chunk_ranges( + *, + duration_sec: float, + file_size_bytes: int, + max_upload_bytes: int = OPENAI_MAX_UPLOAD_BYTES, + target_upload_bytes: int = OPENAI_TARGET_UPLOAD_BYTES, +) -> list[tuple[float, float]]: + if file_size_bytes <= max_upload_bytes: + return [(0.0, duration_sec)] + + chunk_sec = max( + OPENAI_MIN_CHUNK_SEC, + duration_sec * (target_upload_bytes / file_size_bytes), + ) + chunk_count = max(2, ceil(duration_sec / chunk_sec)) + exact_chunk_sec = duration_sec / chunk_count + + ranges: list[tuple[float, float]] = [] + for idx in range(chunk_count): + start = idx * exact_chunk_sec + end = min(duration_sec, (idx + 1) * exact_chunk_sec) + ranges.append((round(start, 3), round(end - start, 3))) + return ranges + + +def _transcribe_openai_file(client, audio_path: Path) -> dict: + with open(audio_path, "rb") as f: + response = client.audio.transcriptions.create( + model="whisper-1", + file=f, + response_format="verbose_json", + timestamp_granularities=["word", "segment"], + ) + return _normalize_openai_response(response) + + +def _normalize_openai_response(response: object) -> dict: + data = response.model_dump() if hasattr(response, "model_dump") else response + if not isinstance(data, dict): + raise TypeError(f"Unexpected transcription payload type: {type(data)!r}") + + top_words = [_normalize_word(word) for word in data.get("words", []) or []] + segments: list[dict] = [] + word_index = 0 + + for raw_segment in data.get("segments", []) or []: + segment = raw_segment.model_dump() if 
hasattr(raw_segment, "model_dump") else raw_segment + if not isinstance(segment, dict): + continue + + start = float(segment.get("start", 0.0)) + end = float(segment.get("end", 0.0)) + text = str(segment.get("text", "")).strip() + + segment_words = [_normalize_word(word) for word in segment.get("words", []) or []] + if not segment_words and top_words: + while word_index < len(top_words) and top_words[word_index]["end"] <= start: + word_index += 1 + + probe_index = word_index + while probe_index < len(top_words) and top_words[probe_index]["start"] < end: + word = top_words[probe_index] + if word["end"] > start: + segment_words.append(word) + probe_index += 1 + word_index = probe_index + + segments.append( + { + "start": start, + "end": end, + "text": text, + "words": segment_words, + } + ) + + if not segments and top_words: + segments.append( + { + "start": top_words[0]["start"], + "end": top_words[-1]["end"], + "text": " ".join(word["word"] for word in top_words).strip(), + "words": top_words, + } + ) + + return { + "segments": segments, + "language": str(data.get("language", "en") or "en"), + } + + +def _normalize_word(raw_word: object) -> dict: + word = raw_word.model_dump() if hasattr(raw_word, "model_dump") else raw_word + if not isinstance(word, dict): + return {"word": "", "start": 0.0, "end": 0.0} + return { + "word": str(word.get("word", "")).strip(), + "start": float(word.get("start", 0.0)), + "end": float(word.get("end", 0.0)), + } + + +def _offset_transcript_timestamps(transcript: dict, offset_sec: float) -> dict: + shifted_segments = [] + for segment in transcript.get("segments", []): + shifted_segments.append( + { + "start": float(segment["start"]) + offset_sec, + "end": float(segment["end"]) + offset_sec, + "text": segment["text"], + "words": [ + { + "word": word["word"], + "start": float(word["start"]) + offset_sec, + "end": float(word["end"]) + offset_sec, + } + for word in segment.get("words", []) + ], + } + ) + return { + "segments": shifted_segments, + "language": transcript.get("language", "en"), + } + + +def _merge_transcripts(transcripts: list[dict]) -> dict: + merged_segments = [] + language = "en" + for transcript in transcripts: + merged_segments.extend(transcript.get("segments", [])) + if transcript.get("language"): + language = transcript["language"] + return { + "segments": merged_segments, + "language": language, + } diff --git a/src/humeo/interactive.py b/src/humeo/interactive.py new file mode 100644 index 0000000000000000000000000000000000000000..64e193248cdbecc27cc8713103ba3ced458f3025 --- /dev/null +++ b/src/humeo/interactive.py @@ -0,0 +1,140 @@ +"""Plain stdin interactive gates for the pipeline.""" + +from __future__ import annotations + +from pathlib import Path + +from humeo_core.schemas import ApprovalResult, Clip, RatingFeedback + +_ISSUE_MAP = { + "a": "wrong_moments", + "b": "bad_cuts", + "c": "boring", + "d": "confusing", + "e": "wrong_layout", + "f": "length_off", + "g": "other", +} + + +def _preview(text: str, limit: int = 100) -> str: + compact = " ".join(text.split()) + if len(compact) <= limit: + return compact + return compact[: limit - 3].rstrip() + "..." 
+ + +def approve_clips(clips: list[Clip]) -> ApprovalResult: + """Prompt the user to approve or refine the selected clips.""" + clip_ids = [clip.clip_id for clip in clips] + + for clip in clips: + print( + f'[{clip.clip_id}] score={clip.virality_score:.2f} ' + f'duration={clip.duration_sec:.1f}s "{clip.topic}"' + ) + print(f' "{_preview(clip.transcript)}"') + + print() + print("Actions:") + print(" numbers in order (e.g. '3,1,5') β€” select these clips to proceed") + print(" 'all' β€” accept all clips as-is") + print(" 'refine ' β€” re-run selection with steering") + print(" 'quit' β€” abort pipeline") + print() + + while True: + raw = input("> ").strip() + lowered = raw.lower() + + if lowered == "all": + return ApprovalResult(action="accept_all", selected_ids=list(clip_ids)) + if lowered == "quit": + return ApprovalResult(action="quit") + if lowered.startswith("refine"): + note = raw[6:].strip() + if not note: + print("Refine requires a note. Try: refine more emotional clips") + continue + return ApprovalResult(action="refine", steering_note=note) + + tokens = [token.strip() for token in raw.split(",") if token.strip()] + if not tokens: + print("Enter clip numbers, 'all', 'refine ', or 'quit'.") + continue + + selected_ids: list[str] = [] + seen_ids: set[str] = set() + invalid = False + for token in tokens: + clip_id: str | None = None + if token in clip_ids: + clip_id = token + elif token.isdigit(): + idx = int(token) + if 1 <= idx <= len(clips): + clip_id = clips[idx - 1].clip_id + + if clip_id is None: + print(f"Unknown clip selection: {token}") + invalid = True + break + if clip_id in seen_ids: + print(f"Duplicate clip selection: {token}") + invalid = True + break + seen_ids.add(clip_id) + selected_ids.append(clip_id) + + if invalid: + continue + + return ApprovalResult(action="proceed", selected_ids=selected_ids) + + +def rate_output(outputs: list[Path]) -> RatingFeedback: + """Prompt the user to rate the rendered outputs.""" + print("Outputs:") + for path in outputs: + print(f" {path}") + print() + print("Watch them, then rate:") + print(" 1. slop 2. good 3. great") + + while True: + rating_raw = input("> ").strip() + if rating_raw in {"1", "2", "3"}: + rating = int(rating_raw) + break + print("Enter 1, 2, or 3.") + + if rating == 3: + return RatingFeedback(rating=3) + + print("What's wrong? 
(space-separated letters, or empty for skip):") + print(" [a] wrong_moments [b] bad_cuts [c] boring [d] confusing") + print(" [e] wrong_layout [f] length_off [g] other (free text)") + + while True: + issues_raw = input("> ").strip() + if not issues_raw: + return RatingFeedback(rating=rating) + + tokens = issues_raw.lower().split() + issues: list[str] = [] + invalid = [token for token in tokens if token not in _ISSUE_MAP] + if invalid: + print(f"Unknown issue selection: {' '.join(invalid)}") + continue + + for token in tokens: + issue = _ISSUE_MAP[token] + if issue not in issues: + issues.append(issue) + + free_text = None + if "other" in issues: + other_text = input("> ").strip() + free_text = other_text or None + + return RatingFeedback(rating=rating, issues=issues, free_text=free_text) diff --git a/src/humeo/layout_vision.py b/src/humeo/layout_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..55e7c3e42689c1e4b4ee6b023d5dec3ebd5aa0a6 --- /dev/null +++ b/src/humeo/layout_vision.py @@ -0,0 +1,1582 @@ +"""Per-clip layout + bbox via Gemini vision (no pixel heuristics in the product pipeline).""" + +from __future__ import annotations + +import base64 +import hashlib +import json +import logging +import os +import struct +import subprocess +from collections.abc import Iterable +from io import BytesIO +from pathlib import Path +from typing import Any + +from google import genai +from google.genai import types +from openai import OpenAI + +from humeo_core.schemas import ( + BoundingBox, + LayoutInstruction, + LayoutKind, + Scene, + SceneClassification, + SceneRegions, + TimedCenterPoint, +) +from humeo_core.primitives.vision import layout_instruction_from_regions + +from humeo.config import GEMINI_MODEL, GEMINI_VISION_MODEL, PipelineConfig +from humeo.env import ( + OPENROUTER_BASE_URL, + current_llm_provider, + model_name_for_provider, + openrouter_default_headers, + resolve_gemini_api_key, + resolve_llm_provider, + resolve_openrouter_api_key, +) +from humeo.gemini_generate import gemini_generate_config + +logger = logging.getLogger(__name__) + +LAYOUT_VISION_CACHE_VERSION = 8 +LAYOUT_VISION_META = "layout_vision.meta.json" +LAYOUT_VISION_JSON = "layout_vision.json" +TRACKING_SAMPLE_FRACTIONS = tuple(i / 10.0 for i in range(1, 10)) +TRACKING_MIN_SPREAD_NORM = 0.08 +TRACKING_OUTLIER_DELTA_NORM = 0.16 +TRACKING_OUTLIER_NEIGHBOR_MAX_NORM = 0.10 +TRACKING_DEADBAND_NORM = 0.025 +TRACKING_MIN_USABLE_POINTS = 5 +TRACKING_UNSTABLE_JUMP_NORM = 0.18 +FOCUS_SWITCH_LEAD_SEC = 0.35 +SPEAKER_FOLLOW_MAX_INTERVAL_SEC = 2.0 +TWO_SPEAKER_ACTIVE_ZOOM = 1.28 +TWO_SPEAKER_BOTH_ZOOM = 1.0 +TWO_SPEAKER_WIDE_ACTIVE_ZOOM = 1.12 +TWO_SPEAKER_BOTH_FIT_MARGIN = 0.88 +REPLICATE_SAM2_VIDEO_PINNED = ( + "meta/sam-2-video:2d7219877ca847f463d749d9b224e62f7b078fe035d60a74b58889b455d5cbad" +) +_MIN_SPLIT_STRIP_FRAC = 0.2 +_SPLIT_TOP_RATIO_MIN = 0.32 +_SPLIT_TOP_RATIO_MAX = 0.48 +_SPLIT_FACE_REGION_MIN_HEIGHT = 0.62 +_SPLIT_FACE_REGION_HEIGHT_MULT = 2.0 +_SPLIT_FACE_TOP_PAD_MULT = 0.30 + +GEMINI_LAYOUT_VISION_PROMPT = """You are framing a vertical short (9:16) from a 16:9 video frame. + +HARD RULE: the final short shows AT MOST TWO on-screen items. An "item" is one +of person (a human speaker) or chart (slide, graph, data visual, screenshare). +That gives exactly five layouts to choose from. 
+ +Return ONLY a JSON object with this exact shape: +{ + "layout": "zoom_call_center" | "sit_center" | "split_chart_person" | "split_two_persons" | "split_two_charts", + "person_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "face_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "second_person_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "second_face_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "second_chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 1.0, "y2": 1.0} | null, + "reason": "short rationale" +} + +Bbox rules: +- All bbox coordinates are normalized 0..1 (left/top = 0, right/bottom = 1). Require x2 > x1 and y2 > y1 when a bbox is non-null. +- person_bbox / second_person_bbox: tight box around each speaker's head AND upper body. If two speakers are visible, ``person_bbox`` is the LEFT speaker and ``second_person_bbox`` is the RIGHT speaker (by x-center). +- face_bbox / second_face_bbox: TIGHT box around the SPEAKER'S FACE ONLY (forehead to chin, ear to ear). This is NOT the full body β€” exclude torso, arms, shoulders, tank top, mug, table. The face bbox drives horizontal framing in the 9:16 crop, so putting torso or arms in it will push the face off-screen. + * If the subject is shown in profile, the face_bbox still surrounds only the visible half of the head (ear to nose, forehead to chin). It should be roughly square-ish, not a tall body rectangle. + * ``face_bbox`` matches ``person_bbox`` (same speaker), ``second_face_bbox`` matches ``second_person_bbox``. + * Set face bbox to null ONLY if no face is visible at all (back of head, occluded, off-frame). +- chart_bbox / second_chart_bbox: slide, chart, graph, or large on-screen graphic. If two charts are visible, ``chart_bbox`` is the LEFT chart and ``second_chart_bbox`` is the RIGHT chart. +- The two bboxes of the same kind must not overlap meaningfully; they should partition the source frame into distinct regions. + +Layout selection (pick exactly one): +- zoom_call_center: ONE person, tight webcam / video-call headshot filling much of the frame. person_bbox + face_bbox set; others null. +- sit_center: ONE person, interview / seated framing, or when unsure. person_bbox + face_bbox set; others null. +- split_chart_person: ONE chart + ONE person in distinct regions (webinar / explainer). person_bbox + face_bbox + chart_bbox set; second_* null. +- split_two_persons: TWO visible speakers (interview two-up, podcast panel). person_bbox + face_bbox AND second_person_bbox + second_face_bbox set; chart bboxes null. +- split_two_charts: TWO charts / slides side-by-side. chart_bbox AND second_chart_bbox set; person/face bboxes null. + +When in doubt prefer ``sit_center``. Never output more than two of {person, chart} items in total. +No markdown. JSON only.""" + +ACTIVE_SPEAKER_VISION_PROMPT = """You are analyzing a single frame from a two-person talking video. + +Return ONLY a JSON object: +{ + "speaker": "left" | "right" | "both" | "unclear", + "reason": "short rationale" +} + +Rules: +- "left" means the LEFT visible person appears to be the one speaking in this exact frame. +- "right" means the RIGHT visible person appears to be the one speaking in this exact frame. +- Use visible cues only: open mouth mid-word, facial expression while talking, hand gesture timing, body engagement. +- If both appear to be talking at once, return "both". +- If it is impossible to tell from this frame, return "unclear". 
+- No markdown. JSON only.""" + + +def _openai_message_text(content: object) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + text = item.get("text") + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return "" + + +def _json_object_from_vision_response(raw: object) -> dict[str, Any]: + if isinstance(raw, dict): + return raw + if isinstance(raw, list): + for item in raw: + if isinstance(item, dict): + return item + raise TypeError(f"Expected vision JSON object, got {type(raw).__name__}") + + +def _clips_fingerprint(clips_path: Path) -> str: + if not clips_path.is_file(): + return "" + return hashlib.sha256(clips_path.read_bytes()).hexdigest() + + +def layout_cache_valid( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + vision_model: str, + segmentation_provider: str = "off", + segmentation_model: str = "meta/sam-2-video", +) -> bool: + meta_path = work_dir / LAYOUT_VISION_META + data_path = work_dir / LAYOUT_VISION_JSON + if not meta_path.is_file() or not data_path.is_file(): + return False + try: + meta: dict[str, Any] = json.loads(meta_path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return False + return ( + meta.get("layout_vision_cache_version") == LAYOUT_VISION_CACHE_VERSION + and + meta.get("transcript_sha256") == transcript_fp + and meta.get("clips_sha256") == clips_fp + and meta.get("gemini_vision_model") == vision_model + and meta.get("segmentation_provider", "off") == segmentation_provider + and meta.get("segmentation_model", "meta/sam-2-video") == segmentation_model + and ( + current_llm_provider() is None + or ( + current_llm_provider() == "google" + and meta.get("llm_backend") in (None, "google") + ) + or meta.get("llm_backend") == current_llm_provider() + ) + ) + + +def load_layout_cache(work_dir: Path) -> dict[str, dict[str, Any]] | None: + p = work_dir / LAYOUT_VISION_JSON + if not p.is_file(): + return None + try: + data = json.loads(p.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return None + clips = data.get("clips") + return clips if isinstance(clips, dict) else None + + +def write_layout_cache( + work_dir: Path, + *, + transcript_fp: str, + clips_fp: str, + vision_model: str, + clips_payload: dict[str, dict[str, Any]], + segmentation_provider: str = "off", + segmentation_model: str = "meta/sam-2-video", +) -> None: + work_dir.mkdir(parents=True, exist_ok=True) + meta = { + "layout_vision_cache_version": LAYOUT_VISION_CACHE_VERSION, + "transcript_sha256": transcript_fp, + "clips_sha256": clips_fp, + "gemini_vision_model": vision_model, + "llm_backend": current_llm_provider() or "google", + "segmentation_provider": segmentation_provider, + "segmentation_model": segmentation_model, + } + (work_dir / LAYOUT_VISION_META).write_text( + json.dumps(meta, indent=2) + "\n", encoding="utf-8" + ) + (work_dir / LAYOUT_VISION_JSON).write_text( + json.dumps({"clips": clips_payload}, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + logger.info("Wrote %s and %s", LAYOUT_VISION_META, LAYOUT_VISION_JSON) + + +def _png_dims(path: Path) -> tuple[int, int] | None: + try: + with path.open("rb") as f: + head = f.read(24) + if head[:8] != b"\x89PNG\r\n\x1a\n": + return None + width, height = struct.unpack(">II", head[16:24]) + return int(width), int(height) + except Exception: + return None + + +def _jpeg_dims(path: Path) -> tuple[int, int] | None: + try: + 
with path.open("rb") as f: + if f.read(2) != b"\xff\xd8": + return None + sof_markers = { + 0xC0, + 0xC1, + 0xC2, + 0xC3, + 0xC5, + 0xC6, + 0xC7, + 0xC9, + 0xCA, + 0xCB, + 0xCD, + 0xCE, + 0xCF, + } + while True: + marker_start = f.read(1) + if not marker_start: + return None + if marker_start != b"\xff": + continue + marker = f.read(1) + while marker == b"\xff": + marker = f.read(1) + if not marker: + return None + marker_byte = marker[0] + if marker_byte in (0xD8, 0xD9, 0x01) or 0xD0 <= marker_byte <= 0xD7: + continue + seg_len_bytes = f.read(2) + if len(seg_len_bytes) != 2: + return None + seg_len = struct.unpack(">H", seg_len_bytes)[0] + if seg_len < 2: + return None + if marker_byte in sof_markers: + frame_header = f.read(5) + if len(frame_header) != 5: + return None + _, height, width = struct.unpack(">BHH", frame_header) + return int(width), int(height) + f.seek(seg_len - 2, 1) + except Exception: + return None + + +def _keyframe_dimensions(keyframe_path: str) -> tuple[int, int] | None: + path = Path(keyframe_path) + try: + from PIL import Image # type: ignore + + with Image.open(path) as img: + width, height = img.size + return int(width), int(height) + except Exception: + pass + + png_dims = _png_dims(path) + if png_dims is not None: + return png_dims + return _jpeg_dims(path) + + +def _normalize_bbox_payload( + raw: dict[str, Any], image_size: tuple[int, int] | None +) -> dict[str, Any]: + if image_size is None: + return dict(raw) + + width, height = image_size + normalized = dict(raw) + x_values = [ + float(normalized[key]) + for key in ("x1", "x2") + if isinstance(normalized.get(key), (int, float)) + ] + y_values = [ + float(normalized[key]) + for key in ("y1", "y2") + if isinstance(normalized.get(key), (int, float)) + ] + + if not x_values and not y_values: + return normalized + + use_thousand_grid = False + if any(v > 1.0 for v in x_values + y_values): + max_coord = max(x_values + y_values) + fits_image_pixels = ( + all(v <= float(width) for v in x_values) + and all(v <= float(height) for v in y_values) + ) + if max_coord <= 1000.0 and not fits_image_pixels: + use_thousand_grid = True + + x_scale = 1000.0 if use_thousand_grid else float(width) + y_scale = 1000.0 if use_thousand_grid else float(height) + + axis_scales = { + "x1": x_scale, + "x2": x_scale, + "y1": y_scale, + "y2": y_scale, + } + for key, axis_scale in axis_scales.items(): + value = normalized.get(key) + if not isinstance(value, (int, float)): + continue + coord = float(value) + if coord > 1.0 and axis_scale > 0.0: + coord = coord / axis_scale + normalized[key] = max(0.0, min(coord, 1.0)) + return normalized + + +def _parse_bbox( + raw: object, *, image_size: tuple[int, int] | None = None +) -> BoundingBox | None: + if not raw or not isinstance(raw, dict): + return None + try: + return BoundingBox.model_validate(_normalize_bbox_payload(raw, image_size)) + except Exception: + return None + + +def _instruction_from_gemini_json( + scene_id: str, + data: dict[str, Any], + *, + image_size: tuple[int, int] | None = None, +) -> LayoutInstruction: + """Translate Gemini's JSON into a validated :class:`LayoutInstruction`. + + Falls back to ``sit_center`` whenever the LLM returns something the + contract doesn't support, so a bad vision call can never crash the + pipeline. Also downgrades "two-item" layouts when the second bbox is + missing -- e.g. ``split_two_persons`` with only one person_bbox drops + to ``sit_center`` rather than rendering a silently-broken split. 
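+
+    Illustrative downgrade (payload abbreviated): ``{"layout":
+    "split_chart_person", "person_bbox": {...}, "chart_bbox": null}`` renders as
+    ``sit_center`` because the chart bbox that split requires is missing.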
+ """ + + layout_str = str(data.get("layout", "sit_center")).strip() + try: + kind = LayoutKind(layout_str) + except ValueError: + kind = LayoutKind.SIT_CENTER + + pb = _parse_bbox(data.get("person_bbox"), image_size=image_size) + fb = _parse_bbox(data.get("face_bbox"), image_size=image_size) + cb = _parse_bbox(data.get("chart_bbox"), image_size=image_size) + p2 = _parse_bbox(data.get("second_person_bbox"), image_size=image_size) + f2 = _parse_bbox(data.get("second_face_bbox"), image_size=image_size) + c2 = _parse_bbox(data.get("second_chart_bbox"), image_size=image_size) + reason = str(data.get("reason", ""))[:400] + + # Downgrade any split that is missing its required bboxes, so we never + # emit a split layout that will render as garbage. + if kind == LayoutKind.SPLIT_CHART_PERSON and (pb is None or cb is None): + kind = LayoutKind.SIT_CENTER if pb is not None else LayoutKind.SIT_CENTER + if kind == LayoutKind.SPLIT_TWO_PERSONS and (pb is None or p2 is None): + kind = LayoutKind.SIT_CENTER + if kind == LayoutKind.SPLIT_TWO_CHARTS and (cb is None or c2 is None): + kind = LayoutKind.SIT_CENTER + + regions = SceneRegions( + scene_id=scene_id, person_bbox=pb, chart_bbox=cb, raw_reason=reason + ) + classification = SceneClassification( + scene_id=scene_id, layout=kind, confidence=1.0, reason=reason + ) + instr = layout_instruction_from_regions( + regions, classification, clip_id=scene_id + ) + + updates: dict[str, Any] = {} + + # CENTERING FIX: the single-person 9:16 crop is driven by ``person_x_norm``. + # A ``person_bbox`` that spans head + torso + arms is fine for framing + # *extent* but its center_x can drift far from the actual face when the + # subject is in profile or asymmetric (one arm up, mug on the table, etc). + # Prefer the tight ``face_bbox`` center when the model gave us one so the + # face lands in the visual center of the vertical crop instead of the + # torso doing. + face_center = _face_center_x(fb, pb) + if face_center is not None: + updates["person_x_norm"] = face_center + + if kind == LayoutKind.SPLIT_CHART_PERSON and pb is not None and cb is not None: + render_person = _render_safe_split_person_region(pb, fb) + updates["split_chart_region"] = cb + updates["split_person_region"] = render_person + updates["top_band_ratio"] = _split_chart_person_top_band_ratio(cb, render_person) + elif kind == LayoutKind.SPLIT_TWO_PERSONS and pb is not None and p2 is not None: + # Order by x-center so ``split_person_region`` is always the LEFT speaker. + left, right = sorted((pb, p2), key=lambda b: b.center_x) + updates["split_person_region"] = left + updates["split_second_person_region"] = right + elif kind == LayoutKind.SPLIT_TWO_CHARTS and cb is not None and c2 is not None: + left, right = sorted((cb, c2), key=lambda b: b.center_x) + updates["split_chart_region"] = left + updates["split_second_chart_region"] = right + + if updates: + instr = instr.model_copy(update=updates) + return instr + + +def _face_center_x( + face: BoundingBox | None, person: BoundingBox | None +) -> float | None: + """Pick a horizontal center to aim the 9:16 crop at. + + Priority: + 1. ``face`` bbox center when it looks reasonable (narrow, plausibly + inside the matching person bbox). + 2. No override (caller keeps the person-bbox center, or the default 0.5 + when neither was provided). + + We sanity-check the face box because Gemini sometimes echoes the full + person bbox into ``face_bbox``. 
If the face bbox is as wide as the + person bbox, it gives us nothing new; fall back to the person center + rather than pretending we have a tighter signal. + """ + if face is None: + return None + face_w = max(0.0, face.x2 - face.x1) + if face_w <= 0.0: + return None + # A real face in a 16:9 frame is rarely wider than ~35% of frame width, + # even for tight webcam framing. A face "bbox" that's wider than that + # almost certainly includes torso and is no better than person_bbox. + if face_w > 0.40: + return None + # If we have a person bbox too, require the face center to sit inside it + # β€” otherwise the model got confused and matched the wrong subject. + if person is not None: + if not (person.x1 - 0.02 <= face.center_x <= person.x2 + 0.02): + return None + return float(face.center_x) + + +def _render_safe_split_person_region( + person: BoundingBox, + face: BoundingBox | None, +) -> BoundingBox: + """Bias split speaker crops toward head-and-shoulders instead of torso.""" + + if face is None or _face_center_x(face, person) is None: + return person + + face_h = max(0.0, face.y2 - face.y1) + if face_h <= 0.0: + return person + + target_h = min( + person.y2 - person.y1, + max(_SPLIT_FACE_REGION_MIN_HEIGHT, face_h * _SPLIT_FACE_REGION_HEIGHT_MULT), + ) + top = max(0.0, min(person.y1, face.y1 - face_h * _SPLIT_FACE_TOP_PAD_MULT)) + bottom = min(person.y2, top + target_h) + if bottom - top < target_h: + top = max(0.0, bottom - target_h) + if bottom - top <= face_h: + return person + + return person.model_copy(update={"y1": top, "y2": bottom}) + + +def _split_chart_person_top_band_ratio( + chart: BoundingBox, + person: BoundingBox, +) -> float: + """Allocate top/bottom band height from the chart/person aspect needs.""" + + seam = (chart.x2 + person.x1) / 2.0 + seam = max(_MIN_SPLIT_STRIP_FRAC, min(1.0 - _MIN_SPLIT_STRIP_FRAC, seam)) + chart_w = max(1e-6, seam) + person_w = max(1e-6, 1.0 - seam) + chart_need = max(1e-6, (chart.y2 - chart.y1) / chart_w) + person_need = max(1e-6, (person.y2 - person.y1) / person_w) + ratio = chart_need / (chart_need + person_need) + return round(max(_SPLIT_TOP_RATIO_MIN, min(_SPLIT_TOP_RATIO_MAX, ratio)), 3) + + +def _person_center_x_from_data( + data: dict[str, Any], image_size: tuple[int, int] | None = None +) -> float | None: + person_bbox = _parse_bbox(data.get("person_bbox"), image_size=image_size) + face_bbox = _parse_bbox(data.get("face_bbox"), image_size=image_size) + face_center = _face_center_x(face_bbox, person_bbox) + if face_center is not None: + return face_center + if person_bbox is not None: + return float(person_bbox.center_x) + return None + + +def _tracking_sample_times(duration_sec: float) -> list[float]: + seen: set[float] = set() + out: list[float] = [] + for fraction in TRACKING_SAMPLE_FRACTIONS: + t_sec = max(0.0, min(duration_sec, duration_sec * fraction)) + key = round(t_sec, 3) + if key in seen: + continue + seen.add(key) + out.append(t_sec) + return out + + +def _tracking_points_from_centers( + duration_sec: float, centers: list[tuple[float, float]] +) -> list[TimedCenterPoint]: + deduped: list[tuple[float, float]] = [] + for t_sec, x_norm in sorted(centers, key=lambda item: item[0]): + clamped_t = max(0.0, min(duration_sec, float(t_sec))) + clamped_x = max(0.0, min(1.0, float(x_norm))) + if deduped and abs(clamped_t - deduped[-1][0]) < 1e-6: + deduped[-1] = (clamped_t, clamped_x) + else: + deduped.append((clamped_t, clamped_x)) + + if len(deduped) < 2: + return [] + + filtered = list(deduped) + for idx in range(1, len(filtered) - 
1): + prev_x = filtered[idx - 1][1] + curr_t, curr_x = filtered[idx] + next_x = filtered[idx + 1][1] + if ( + abs(prev_x - next_x) <= TRACKING_OUTLIER_NEIGHBOR_MAX_NORM + and abs(curr_x - prev_x) >= TRACKING_OUTLIER_DELTA_NORM + and abs(curr_x - next_x) >= TRACKING_OUTLIER_DELTA_NORM + ): + filtered[idx] = (curr_t, (prev_x + next_x) / 2.0) + + smoothed = list(filtered) + for idx in range(1, len(filtered) - 1): + prev_x = filtered[idx - 1][1] + curr_t, curr_x = filtered[idx] + next_x = filtered[idx + 1][1] + median_x = sorted((prev_x, curr_x, next_x))[1] + if abs(curr_x - median_x) > TRACKING_DEADBAND_NORM: + smoothed[idx] = (curr_t, median_x) + + if len(smoothed) >= 5: + wider_smoothed = list(smoothed) + for idx in range(1, len(smoothed) - 1): + window = smoothed[max(0, idx - 2) : min(len(smoothed), idx + 3)] + median_x = sorted(x for _, x in window)[len(window) // 2] + curr_t, curr_x = smoothed[idx] + if abs(curr_x - median_x) >= TRACKING_OUTLIER_DELTA_NORM: + wider_smoothed[idx] = (curr_t, median_x) + smoothed = wider_smoothed + + filtered = list(smoothed) + for idx in range(1, len(filtered)): + prev_t, prev_x = filtered[idx - 1] + curr_t, curr_x = filtered[idx] + if abs(curr_x - prev_x) < TRACKING_DEADBAND_NORM: + filtered[idx] = (curr_t, prev_x) + + spread = max(x for _, x in filtered) - min(x for _, x in filtered) + if spread < TRACKING_MIN_SPREAD_NORM: + stable_x = sum(x for _, x in filtered) / len(filtered) + return [ + TimedCenterPoint(t_sec=0.0, x_norm=stable_x), + TimedCenterPoint(t_sec=duration_sec, x_norm=stable_x), + ] + + if filtered[0][0] > 0.0: + filtered.insert(0, (0.0, filtered[0][1])) + else: + filtered[0] = (0.0, filtered[0][1]) + + if filtered[-1][0] < duration_sec: + filtered.append((duration_sec, filtered[-1][1])) + else: + filtered[-1] = (duration_sec, filtered[-1][1]) + + return [TimedCenterPoint(t_sec=t_sec, x_norm=x_norm) for t_sec, x_norm in filtered] + + +def _tracking_is_unstable(points: list[TimedCenterPoint]) -> bool: + if len(points) < TRACKING_MIN_USABLE_POINTS: + return True + return any( + abs(points[idx].x_norm - points[idx - 1].x_norm) > TRACKING_UNSTABLE_JUMP_NORM + for idx in range(1, len(points)) + ) + + +def _interpolate_tracking_x(points: list[TimedCenterPoint], t_sec: float) -> float | None: + if not points: + return None + if t_sec <= points[0].t_sec: + return float(points[0].x_norm) + if t_sec >= points[-1].t_sec: + return float(points[-1].x_norm) + for idx in range(1, len(points)): + left = points[idx - 1] + right = points[idx] + if right.t_sec < t_sec: + continue + span = right.t_sec - left.t_sec + if span <= 1e-6: + return float(right.x_norm) + alpha = (t_sec - left.t_sec) / span + return float(left.x_norm + (right.x_norm - left.x_norm) * alpha) + return float(points[-1].x_norm) + + +def _speaker_seed_boxes( + data: dict[str, Any], image_size: tuple[int, int] | None +) -> tuple[BoundingBox, BoundingBox] | None: + first_person = _parse_bbox(data.get("person_bbox"), image_size=image_size) + first_face = _parse_bbox(data.get("face_bbox"), image_size=image_size) + second_person = _parse_bbox(data.get("second_person_bbox"), image_size=image_size) + second_face = _parse_bbox(data.get("second_face_bbox"), image_size=image_size) + left = first_face or first_person + right = second_face or second_person + if left is None or right is None: + return None + ordered = sorted((left, right), key=lambda box: box.center_x) + return ordered[0], ordered[1] + + +def _nearest_seed_side( + center_x: float, + *, + left_seed: BoundingBox, + right_seed: BoundingBox, 
+) -> str: + left_delta = abs(center_x - left_seed.center_x) + right_delta = abs(center_x - right_seed.center_x) + return "left" if left_delta <= right_delta else "right" + + +def _focus_frame_visible_speaker_centers( + data: dict[str, Any] | None, + image_size: tuple[int, int] | None, + *, + left_seed: BoundingBox, + right_seed: BoundingBox, +) -> tuple[dict[str, float], bool]: + if not data: + return {}, False + + first_person = _parse_bbox(data.get("person_bbox"), image_size=image_size) + first_face = _parse_bbox(data.get("face_bbox"), image_size=image_size) + second_person = _parse_bbox(data.get("second_person_bbox"), image_size=image_size) + second_face = _parse_bbox(data.get("second_face_bbox"), image_size=image_size) + + visible_boxes = [box for box in (first_face or first_person, second_face or second_person) if box] + if not visible_boxes: + return {}, False + + if len(visible_boxes) >= 2: + ordered = sorted(visible_boxes, key=lambda box: box.center_x) + return {"left": ordered[0].center_x, "right": ordered[1].center_x}, True + + only_box = visible_boxes[0] + side = _nearest_seed_side(only_box.center_x, left_seed=left_seed, right_seed=right_seed) + return {side: only_box.center_x}, False + + +def _two_speaker_full_width_span_norm(image_size: tuple[int, int] | None) -> float: + if image_size is None: + return 1.0 + width, height = image_size + if width <= 0 or height <= 0: + return 1.0 + target_aspect = 9 / 16 + if width / height >= target_aspect: + return min(1.0, (height * target_aspect) / width) + return 1.0 + + +def _can_fit_both_speakers( + left_x: float, + right_x: float, + *, + image_size: tuple[int, int] | None, +) -> bool: + span = abs(right_x - left_x) + allowed = _two_speaker_full_width_span_norm(image_size) * TWO_SPEAKER_BOTH_FIT_MARGIN + return span <= allowed + + +def _speaker_follow_sample_times(duration_sec: float) -> list[float]: + seen: set[float] = set() + out: list[float] = [] + dense_times: list[float] = [] + if duration_sec > 0: + steps = max(1, int(duration_sec / SPEAKER_FOLLOW_MAX_INTERVAL_SEC)) + dense_times = [ + min(duration_sec, idx * SPEAKER_FOLLOW_MAX_INTERVAL_SEC) + for idx in range(1, steps + 1) + ] + for t_sec in [0.0, *_tracking_sample_times(duration_sec), *dense_times, duration_sec]: + key = round(max(0.0, min(duration_sec, t_sec)), 3) + if key in seen: + continue + seen.add(key) + out.append(key) + return out + + +def _resolve_speaker_focus_samples( + samples: list[tuple[float, str]], + *, + default_side: str = "left", +) -> list[tuple[float, str]]: + normalized: list[tuple[float, str | None]] = [] + allowed = {"left", "right", "both"} + for t_sec, side in samples: + normalized.append((float(t_sec), side if side in allowed else None)) + + out: list[tuple[float, str]] = [] + for idx, (t_sec, side) in enumerate(normalized): + if side is not None: + out.append((t_sec, side)) + continue + + prev_side = out[-1][1] if out else None + next_side: str | None = None + for _, future_side in normalized[idx + 1 :]: + if future_side is not None: + next_side = future_side + break + + resolved_side: str + if prev_side is not None and next_side is not None: + resolved_side = prev_side if prev_side == next_side else "both" + else: + resolved_side = prev_side or next_side or default_side + out.append((t_sec, resolved_side)) + return out + + +def _tracking_points_from_focus_states( + duration_sec: float, + framings: list[tuple[float, float, float]], +) -> list[TimedCenterPoint]: + deduped: list[tuple[float, float, float]] = [] + for t_sec, x_norm, zoom in 
sorted(framings, key=lambda item: item[0]): + clamped_t = max(0.0, min(duration_sec, float(t_sec))) + clamped_x = max(0.0, min(1.0, float(x_norm))) + clamped_zoom = max(1.0, min(4.0, float(zoom))) + if deduped and abs(clamped_t - deduped[-1][0]) < 1e-6: + deduped[-1] = (clamped_t, clamped_x, clamped_zoom) + else: + deduped.append((clamped_t, clamped_x, clamped_zoom)) + + if len(deduped) < 2: + return [] + + if deduped[0][0] > 0.0: + deduped.insert(0, (0.0, deduped[0][1], deduped[0][2])) + else: + deduped[0] = (0.0, deduped[0][1], deduped[0][2]) + + if deduped[-1][0] < duration_sec: + deduped.append((duration_sec, deduped[-1][1], deduped[-1][2])) + else: + deduped[-1] = (duration_sec, deduped[-1][1], deduped[-1][2]) + + expanded: list[tuple[float, float, float]] = [deduped[0]] + for t_sec, x_norm, zoom in deduped[1:]: + prev_t, prev_x, prev_zoom = expanded[-1] + switch_changed = ( + abs(x_norm - prev_x) > TRACKING_DEADBAND_NORM + or abs(zoom - prev_zoom) > 0.05 + ) + if switch_changed: + hold_t = max(prev_t, min(t_sec, t_sec - FOCUS_SWITCH_LEAD_SEC)) + if hold_t - prev_t > 1e-6: + expanded.append((hold_t, prev_x, prev_zoom)) + if abs(t_sec - expanded[-1][0]) < 1e-6: + expanded[-1] = (t_sec, x_norm, zoom) + else: + expanded.append((t_sec, x_norm, zoom)) + + return [ + TimedCenterPoint(t_sec=t_sec, x_norm=x_norm, zoom=zoom) + for t_sec, x_norm, zoom in expanded + ] + + +def _nearest_non_both_focus_side( + resolved_focus: list[tuple[float, str]], + start_idx: int, + *, + step: int, +) -> str | None: + idx = start_idx + while 0 <= idx < len(resolved_focus): + side = resolved_focus[idx][1] + if side in ("left", "right"): + return side + idx += step + return None + + +def _extract_frame_at_time(source_path: Path, time_sec: float, output_path: Path) -> Path: + output_path.parent.mkdir(parents=True, exist_ok=True) + subprocess.run( + [ + "ffmpeg", + "-y", + "-loglevel", + "error", + "-ss", + f"{time_sec:.3f}", + "-i", + str(source_path), + "-frames:v", + "1", + "-q:v", + "2", + str(output_path), + ], + check=True, + capture_output=True, + ) + return output_path + + +def _probe_video_fps(source_path: Path) -> float: + result = subprocess.run( + [ + "ffprobe", + "-v", + "error", + "-select_streams", + "v:0", + "-show_entries", + "stream=r_frame_rate", + "-of", + "default=noprint_wrappers=1:nokey=1", + str(source_path), + ], + check=False, + capture_output=True, + text=True, + ) + rate = (result.stdout or "").strip() + if "/" in rate: + num, den = rate.split("/", 1) + try: + return max(1.0, float(num) / max(float(den), 1.0)) + except ValueError: + return 30.0 + try: + return max(1.0, float(rate)) + except ValueError: + return 30.0 + + +def _segmentation_center_x_from_url(mask_url: str) -> float | None: + try: + import httpx + from PIL import Image # type: ignore + except ImportError: + return None + + response = httpx.get(mask_url, timeout=120.0) + response.raise_for_status() + with Image.open(BytesIO(response.content)) as image: + image = image.convert("L") + width, height = image.size + pixels = image.load() + xs: list[int] = [] + for y in range(height): + for x in range(width): + if pixels[x, y] > 16: + xs.append(x) + if not xs or width <= 0: + return None + return float(sum(xs) / len(xs) / width) + + +def _segmentation_mask_urls(output: object) -> list[str]: + def _coerce_urls(items: Iterable[object]) -> list[str]: + urls: list[str] = [] + for item in items: + if item is None: + continue + if isinstance(item, (str, Path)): + text = str(item).strip() + else: + url = getattr(item, "url", None) + 
text = str(url).strip() if isinstance(url, str) else str(item).strip() + if text: + urls.append(text) + return urls + + if isinstance(output, dict): + for key in ("black_white_masks", "masks", "output"): + value = output.get(key) + if isinstance(value, (str, bytes, bytearray)) or value is None: + continue + try: + urls = _coerce_urls(value) + except TypeError: + continue + if urls: + return urls + return [] + if isinstance(output, (str, bytes, bytearray)) or output is None: + return [] + try: + return _coerce_urls(output) + except TypeError: + return [] + + +def _infer_person_tracking_with_segmentation( + scene: Scene, + *, + source_video: Path, + segmentation_model: str, + initial_data: dict[str, Any] | None = None, + initial_image_size: tuple[int, int] | None = None, + seed_bbox: BoundingBox | None = None, + object_id: str = "speaker", +) -> tuple[list[TimedCenterPoint], dict[str, Any] | None]: + token = (os.environ.get("REPLICATE_API_TOKEN") or "").strip() + if not token: + raise RuntimeError("REPLICATE_API_TOKEN is not set") + if initial_image_size is None: + raise RuntimeError("Segmentation fallback requires the keyframe dimensions") + if seed_bbox is None and initial_data is None: + raise RuntimeError("Segmentation fallback requires an initial vision bbox") + + if seed_bbox is None: + face_bbox = _parse_bbox(initial_data.get("face_bbox"), image_size=initial_image_size) + person_bbox = _parse_bbox(initial_data.get("person_bbox"), image_size=initial_image_size) + seed_bbox = face_bbox or person_bbox + if seed_bbox is None: + raise RuntimeError("No seed bbox available for segmentation fallback") + + try: + import replicate + except ImportError as exc: + raise RuntimeError("replicate package is not installed") from exc + + width, height = initial_image_size + fps = _probe_video_fps(source_video) + midpoint_frame = max(0, int(round((scene.duration / 2.0) * fps))) + output_frame_interval = max(1, int(round(max(1.0, scene.duration * fps) / 10.0))) + click_x = int(round(seed_bbox.center_x * width)) + click_y = int(round(seed_bbox.center_y * height)) + prompt_frames = [0] + if midpoint_frame > 0: + prompt_frames.append(midpoint_frame) + prompt_coordinates = ",".join(f"[{click_x},{click_y}]" for _ in prompt_frames) + prompt_labels = ",".join("1" for _ in prompt_frames) + prompt_frame_str = ",".join(str(frame_idx) for frame_idx in prompt_frames) + prompt_object_ids = ",".join(object_id for _ in prompt_frames) + run_input = { + "input_video": None, + "click_coordinates": prompt_coordinates, + "click_labels": prompt_labels, + "click_frames": prompt_frame_str, + "click_object_ids": prompt_object_ids, + "mask_type": "binary", + "annotation_type": "mask", + "output_video": False, + "output_format": "png", + "output_frame_interval": output_frame_interval, + } + + with source_video.open("rb") as handle: + client = replicate.Client(api_token=token) + run_input["input_video"] = handle + try: + output = client.run(segmentation_model, input=run_input) + resolved_model = segmentation_model + except Exception as exc: + if ":" in segmentation_model or "404" not in str(exc): + raise + handle.seek(0) + output = client.run(REPLICATE_SAM2_VIDEO_PINNED, input=run_input) + resolved_model = REPLICATE_SAM2_VIDEO_PINNED + + urls = _segmentation_mask_urls(output) + if not urls: + raise RuntimeError("Segmentation fallback returned no masks") + + centers: list[tuple[float, float]] = [] + for idx, mask_url in enumerate(urls): + center_x = _segmentation_center_x_from_url(mask_url) + if center_x is None: + continue + 
rel_time = min(scene.duration, (idx * output_frame_interval) / fps) + centers.append((rel_time, center_x)) + + points = _tracking_points_from_centers(scene.duration, centers) + detail = { + "provider": "replicate", + "model": resolved_model, + "seed_point_px": [click_x, click_y], + "seed_frame": midpoint_frame, + "prompt_frames": prompt_frames, + "output_frame_interval": output_frame_interval, + "mask_count": len(urls), + } + return points, detail + + +def _infer_two_speaker_focus_tracking_with_segmentation( + scene: Scene, + *, + source_video: Path, + tracking_dir: Path, + model_name: str, + segmentation_model: str, + initial_data: dict[str, Any], + initial_image_size: tuple[int, int] | None, +) -> tuple[list[TimedCenterPoint], dict[str, Any] | None]: + seeds = _speaker_seed_boxes(initial_data, initial_image_size) + if seeds is None: + raise RuntimeError("Two-speaker SAM follow requires both speaker bboxes") + + left_seed, right_seed = seeds + left_points, left_detail = _infer_person_tracking_with_segmentation( + scene, + source_video=source_video, + segmentation_model=segmentation_model, + initial_data=initial_data, + initial_image_size=initial_image_size, + seed_bbox=left_seed, + object_id="left_speaker", + ) + right_points, right_detail = _infer_person_tracking_with_segmentation( + scene, + source_video=source_video, + segmentation_model=segmentation_model, + initial_data=initial_data, + initial_image_size=initial_image_size, + seed_bbox=right_seed, + object_id="right_speaker", + ) + if not left_points or not right_points: + raise RuntimeError("Two-speaker SAM follow did not return both speaker tracks") + + focus_dir = tracking_dir / scene.scene_id / "speaker_focus" + focus_samples: list[dict[str, Any]] = [] + focus_choices: list[tuple[float, str]] = [] + for rel_time in _speaker_follow_sample_times(max(0.0, scene.duration)): + abs_time = scene.start_time + rel_time + frame_path = focus_dir / f"{scene.scene_id}_{int(round(rel_time * 1000)):06d}.jpg" + visible_centers: dict[str, float] = {} + both_visible = False + try: + _extract_frame_at_time(source_video, abs_time, frame_path) + frame_image_size = _keyframe_dimensions(str(frame_path)) + layout_data: dict[str, Any] | None = None + layout_error: str | None = None + try: + layout_data = _call_gemini_vision(str(frame_path), model_name) + visible_centers, both_visible = _focus_frame_visible_speaker_centers( + layout_data, + frame_image_size, + left_seed=left_seed, + right_seed=right_seed, + ) + except Exception as exc: + layout_error = str(exc) + + data = _call_active_speaker_vision(str(frame_path), model_name) + speaker = str(data.get("speaker", "unclear")).strip().lower() + if speaker not in ("left", "right", "both", "unclear"): + speaker = "unclear" + if speaker == "unclear" and len(visible_centers) == 1 and not both_visible: + speaker = next(iter(visible_centers)) + focus_choices.append((rel_time, speaker)) + sample = { + "time_sec": rel_time, + "frame_path": str(frame_path), + "speaker": speaker, + "raw": data, + "visible_centers": visible_centers, + "both_visible": both_visible, + } + if layout_data is not None: + sample["layout_raw"] = layout_data + if layout_error: + sample["layout_error"] = layout_error + focus_samples.append(sample) + except Exception as exc: + focus_choices.append((rel_time, "unclear")) + focus_samples.append( + { + "time_sec": rel_time, + "frame_path": str(frame_path), + "speaker": "unclear", + "visible_centers": visible_centers, + "both_visible": both_visible, + "error": str(exc), + } + ) + + resolved_focus = 
_resolve_speaker_focus_samples(focus_choices, default_side="left") + framings: list[tuple[float, float, float]] = [] + for idx, (rel_time, speaker) in enumerate(resolved_focus): + sample = focus_samples[idx] if idx < len(focus_samples) else {} + sample_visible_centers = sample.get("visible_centers", {}) + frame_left_x = ( + float(sample_visible_centers["left"]) + if isinstance(sample_visible_centers, dict) and "left" in sample_visible_centers + else None + ) + frame_right_x = ( + float(sample_visible_centers["right"]) + if isinstance(sample_visible_centers, dict) and "right" in sample_visible_centers + else None + ) + both_visible = bool(sample.get("both_visible")) + + left_x = frame_left_x if frame_left_x is not None else _interpolate_tracking_x(left_points, rel_time) + right_x = ( + frame_right_x if frame_right_x is not None else _interpolate_tracking_x(right_points, rel_time) + ) + if left_x is None: + left_x = left_seed.center_x + if right_x is None: + right_x = right_seed.center_x + + prev_side = _nearest_non_both_focus_side(resolved_focus, idx - 1, step=-1) + next_side = _nearest_non_both_focus_side(resolved_focus, idx + 1, step=1) + should_widen = False + if both_visible and _can_fit_both_speakers(left_x, right_x, image_size=initial_image_size): + if speaker == "both": + should_widen = True + elif ( + prev_side is not None + and next_side is not None + and prev_side != next_side + ): + should_widen = True + + if should_widen: + x_norm = (left_x + right_x) / 2.0 + zoom = TWO_SPEAKER_BOTH_ZOOM + elif speaker == "left": + x_norm = left_x + zoom = TWO_SPEAKER_ACTIVE_ZOOM + elif speaker == "right": + x_norm = right_x + zoom = TWO_SPEAKER_ACTIVE_ZOOM + else: + fallback_side = prev_side or next_side or "left" + x_norm = left_x if fallback_side == "left" else right_x + zoom = TWO_SPEAKER_WIDE_ACTIVE_ZOOM + framings.append((rel_time, x_norm, zoom)) + + points = _tracking_points_from_focus_states(scene.duration, framings) + detail = { + "mode": "two_speaker_follow", + "left_segmentation": left_detail, + "right_segmentation": right_detail, + "focus_samples": focus_samples, + "resolved_focus": [ + {"time_sec": rel_time, "speaker": speaker} for rel_time, speaker in resolved_focus + ], + "framing_samples": [ + {"time_sec": rel_time, "x_norm": x_norm, "zoom": zoom} + for rel_time, x_norm, zoom in framings + ], + } + return points, detail + + +def _infer_person_tracking( + scene: Scene, + *, + source_video: Path, + tracking_dir: Path, + model_name: str, + initial_data: dict[str, Any] | None = None, + initial_image_size: tuple[int, int] | None = None, +) -> tuple[list[TimedCenterPoint], list[dict[str, Any]]]: + duration_sec = max(0.0, scene.duration) + if duration_sec <= 0.0: + return [], [] + + midpoint_rel = duration_sec / 2.0 + centers: list[tuple[float, float]] = [] + samples: list[dict[str, Any]] = [] + + if initial_data is not None: + center_x = _person_center_x_from_data(initial_data, image_size=initial_image_size) + samples.append( + { + "sample_kind": "midpoint_keyframe", + "time_sec": midpoint_rel, + "frame_path": scene.keyframe_path, + "center_x_norm": center_x, + "raw": initial_data, + } + ) + if center_x is not None: + centers.append((midpoint_rel, center_x)) + + scene_tracking_dir = tracking_dir / scene.scene_id + for rel_time in _tracking_sample_times(duration_sec): + if abs(rel_time - midpoint_rel) < 1e-3: + continue + abs_time = scene.start_time + rel_time + frame_path = scene_tracking_dir / f"{scene.scene_id}_{int(round(rel_time * 1000)):06d}.jpg" + try: + 
_extract_frame_at_time(source_video, abs_time, frame_path) + data = _call_gemini_vision(str(frame_path), model_name) + image_size = _keyframe_dimensions(str(frame_path)) + center_x = _person_center_x_from_data(data, image_size=image_size) + samples.append( + { + "sample_kind": "tracking_frame", + "time_sec": rel_time, + "frame_path": str(frame_path), + "center_x_norm": center_x, + "raw": data, + } + ) + if center_x is not None: + centers.append((rel_time, center_x)) + except Exception as e: + logger.warning( + "Speaker tracking sample failed for %s at %.2fs: %s", + scene.scene_id, + rel_time, + e, + ) + samples.append( + { + "sample_kind": "tracking_frame", + "time_sec": rel_time, + "frame_path": str(frame_path), + "error": str(e), + } + ) + + return _tracking_points_from_centers(duration_sec, centers), samples + + +def _call_vision_json(keyframe_path: str, model_name: str, prompt: str) -> dict[str, Any]: + path = Path(keyframe_path) + data = path.read_bytes() + mime = "image/jpeg" if path.suffix.lower() in (".jpg", ".jpeg") else "image/png" + provider = resolve_llm_provider() + resolved_model = model_name_for_provider(model_name, provider) + + if provider == "google": + client = genai.Client(api_key=resolve_gemini_api_key()) + response = client.models.generate_content( + model=resolved_model, + contents=[ + types.Part.from_text(text=prompt), + types.Part.from_bytes(data=data, mime_type=mime), + ], + config=gemini_generate_config( + temperature=0.2, + response_mime_type="application/json", + ), + ) + if not response.text: + raise RuntimeError("Gemini vision returned empty response") + return _json_object_from_vision_response(json.loads(response.text)) + + client = OpenAI( + api_key=resolve_openrouter_api_key(), + base_url=OPENROUTER_BASE_URL, + default_headers=openrouter_default_headers(), + ) + data_url = f"data:{mime};base64,{base64.b64encode(data).decode('ascii')}" + response = client.chat.completions.create( + model=resolved_model, + messages=[ + {"role": "system", "content": prompt}, + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this keyframe and return only JSON."}, + {"type": "image_url", "image_url": {"url": data_url}}, + ], + }, + ], + temperature=0.2, + response_format={"type": "json_object"}, + ) + text = _openai_message_text(response.choices[0].message.content) + if not text: + raise RuntimeError("OpenRouter vision returned empty response") + return _json_object_from_vision_response(json.loads(text)) + + +def _call_gemini_vision(keyframe_path: str, model_name: str) -> dict[str, Any]: + return _call_vision_json(keyframe_path, model_name, GEMINI_LAYOUT_VISION_PROMPT) + + +def _call_active_speaker_vision(frame_path: str, model_name: str) -> dict[str, Any]: + return _call_vision_json(frame_path, model_name, ACTIVE_SPEAKER_VISION_PROMPT) + + +def infer_layout_instructions( + scenes: list[Scene], + *, + gemini_vision_model: str, + source_video: Path, + tracking_dir: Path, + source_videos_by_scene: dict[str, Path] | None = None, + segmentation_provider: str = "off", + segmentation_model: str = "meta/sam-2-video", +) -> tuple[dict[str, LayoutInstruction], dict[str, dict[str, Any]]]: + """Return ``(clip_id -> LayoutInstruction, clip_id -> raw_gemini_json)``.""" + + out: dict[str, LayoutInstruction] = {} + raw_by_clip: dict[str, dict[str, Any]] = {} + model_name = gemini_vision_model.strip() + + for s in scenes: + sid = s.scene_id + if not s.keyframe_path: + logger.warning("No keyframe for %s; using sit_center.", sid) + out[sid] = LayoutInstruction(clip_id=sid, 
layout=LayoutKind.SIT_CENTER) + raw_by_clip[sid] = {"error": "no keyframe", "layout": "sit_center"} + continue + try: + data = _call_gemini_vision(s.keyframe_path, model_name) + image_size = _keyframe_dimensions(s.keyframe_path) + instr = _instruction_from_gemini_json( + sid, + data, + image_size=image_size, + ) + raw_data = dict(data) + speaker_follow_applied = False + tracking_source = ( + source_videos_by_scene.get(sid, source_video) + if source_videos_by_scene + else source_video + ) + if instr.layout == LayoutKind.SPLIT_TWO_PERSONS and segmentation_provider == "replicate": + try: + focus_points, focus_detail = _infer_two_speaker_focus_tracking_with_segmentation( + s, + source_video=tracking_source, + tracking_dir=tracking_dir, + model_name=model_name, + segmentation_model=segmentation_model, + initial_data=data, + initial_image_size=image_size, + ) + if focus_detail: + raw_data["speaker_follow_tracking"] = focus_detail + if focus_points: + instr = LayoutInstruction( + clip_id=sid, + layout=LayoutKind.SIT_CENTER, + zoom=focus_points[0].zoom or 1.0, + person_x_norm=focus_points[0].x_norm, + person_tracking=focus_points, + ) + speaker_follow_applied = True + except Exception as exc: + raw_data["speaker_follow_tracking"] = {"error": str(exc)} + if instr.layout in (LayoutKind.SIT_CENTER, LayoutKind.ZOOM_CALL_CENTER): + if speaker_follow_applied: + raw_by_clip[sid] = raw_data + out[sid] = instr + continue + tracking_points: list[TimedCenterPoint] = [] + tracking_samples: list[dict[str, Any]] = [] + attempted_segmentation = False + + if segmentation_provider == "replicate": + attempted_segmentation = True + try: + sam_points, sam_detail = _infer_person_tracking_with_segmentation( + s, + source_video=tracking_source, + segmentation_model=segmentation_model, + initial_data=data, + initial_image_size=image_size, + ) + if sam_points: + tracking_points = sam_points + if sam_detail: + raw_data["segmentation_tracking"] = sam_detail + except Exception as exc: + raw_data["segmentation_tracking"] = {"error": str(exc)} + if not tracking_points: + tracking_points, tracking_samples = _infer_person_tracking( + s, + source_video=tracking_source, + tracking_dir=tracking_dir, + model_name=model_name, + initial_data=data, + initial_image_size=image_size, + ) + if ( + segmentation_provider == "replicate" + and _tracking_is_unstable(tracking_points) + and not attempted_segmentation + ): + try: + sam_points, sam_detail = _infer_person_tracking_with_segmentation( + s, + source_video=tracking_source, + segmentation_model=segmentation_model, + initial_data=data, + initial_image_size=image_size, + ) + if sam_points: + tracking_points = sam_points + if sam_detail: + raw_data["segmentation_tracking"] = sam_detail + except Exception as exc: + raw_data.setdefault("segmentation_tracking", {"error": str(exc)}) + if tracking_points: + instr = instr.model_copy(update={"person_tracking": tracking_points}) + if tracking_samples: + raw_data["person_tracking_samples"] = tracking_samples + raw_by_clip[sid] = raw_data + out[sid] = instr + except Exception as e: + logger.warning("Gemini vision failed for %s: %s β€” defaulting sit_center", sid, e) + out[sid] = LayoutInstruction(clip_id=sid, layout=LayoutKind.SIT_CENTER) + raw_by_clip[sid] = {"error": str(e), "layout": "sit_center"} + + return out, raw_by_clip + + +def _apply_layout_hint_fallbacks( + instructions: dict[str, LayoutInstruction], + raw_by_clip: dict[str, dict[str, Any]], + layout_hints_by_clip: dict[str, LayoutKind], +) -> None: + for clip_id, hint in 
layout_hints_by_clip.items(): + instr = instructions.get(clip_id) + raw = raw_by_clip.get(clip_id) + if instr is None or raw is None or "error" not in raw: + continue + if instr.layout != LayoutKind.SIT_CENTER: + continue + instructions[clip_id] = instr.model_copy(update={"layout": hint}) + updated_raw = dict(raw) + updated_raw["layout"] = hint.value + updated_raw["layout_hint_fallback"] = hint.value + raw_by_clip[clip_id] = updated_raw + + +def resolved_vision_model(config: PipelineConfig) -> str: + if config.gemini_vision_model: + return config.gemini_vision_model.strip() + if GEMINI_VISION_MODEL: + return GEMINI_VISION_MODEL + return (config.gemini_model or GEMINI_MODEL).strip() + + +def run_layout_vision_stage( + work_dir: Path, + scenes: list[Scene], + *, + source_video: Path, + source_videos_by_scene: dict[str, Path] | None = None, + transcript_fp: str, + clips_path: Path, + config: PipelineConfig, +) -> dict[str, LayoutInstruction]: + """Load cache or call Gemini vision for each keyframe; persist JSON artifacts.""" + from humeo.clip_selector import load_clips + + clips_fp = _clips_fingerprint(clips_path) + vm = resolved_vision_model(config) + layout_hints_by_clip = { + clip.clip_id: hint + for clip in load_clips(clips_path) + if (hint := (clip.layout_hint or clip.layout)) is not None + } + + if ( + not config.force_layout_vision + and layout_cache_valid( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + vision_model=vm, + segmentation_provider=config.segmentation_provider, + segmentation_model=config.segmentation_model, + ) + ): + cached = load_layout_cache(work_dir) + if cached: + logger.info("Layout vision cache hit; skipping Gemini vision calls.") + return { + k: LayoutInstruction.model_validate(v["instruction"]) + for k, v in cached.items() + if isinstance(v, dict) and "instruction" in v + } + + instructions, raw_by_clip = infer_layout_instructions( + scenes, + gemini_vision_model=vm, + source_video=source_video, + tracking_dir=work_dir / "layout_tracking", + source_videos_by_scene=source_videos_by_scene, + segmentation_provider=config.segmentation_provider, + segmentation_model=config.segmentation_model, + ) + _apply_layout_hint_fallbacks(instructions, raw_by_clip, layout_hints_by_clip) + + payload: dict[str, dict[str, Any]] = {} + for sid, instr in instructions.items(): + payload[sid] = { + "instruction": json.loads(instr.model_dump_json()), + "raw": raw_by_clip.get(sid, {}), + } + write_layout_cache( + work_dir, + transcript_fp=transcript_fp, + clips_fp=clips_fp, + vision_model=vm, + clips_payload=payload, + segmentation_provider=config.segmentation_provider, + segmentation_model=config.segmentation_model, + ) + return instructions diff --git a/src/humeo/pipeline.py b/src/humeo/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..e390087d9af1f129d1000860f89bebbd1f51e087 --- /dev/null +++ b/src/humeo/pipeline.py @@ -0,0 +1,797 @@ +"""End-to-end product pipeline.""" + +import dataclasses +import json +import logging +import re +from pathlib import Path + +from humeo_core.primitives.ingest import extract_keyframes +from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RatingFeedback, RenderTheme, Scene + +from humeo import interactive, session_state +from humeo.clip_assembly import apply_render_spans, assemble_clip, write_clip_plan +from humeo.clip_selection_cache import cache_valid, load_meta, transcript_fingerprint, write_artifacts +from humeo.clip_selector import ( + clip_quality_priority_score, + load_clips, + 
renumber_clips_dense, + save_clips, + select_clips, +) +from humeo.config import MAX_CLIP_DURATION_SEC, MIN_CLIP_DURATION_SEC, PipelineConfig +from humeo.content_pruning import run_content_pruning_stage, snap_render_windows_to_sentence_boundaries +from humeo.cutter import generate_ass +from humeo.hook_detector import run_hook_detection_stage +from humeo.hook_library import resolve_hook_library_path +from humeo.ingest import ( + download_video, + extract_audio, + stage_local_video, + transcript_cache_valid, + transcribe_whisperx, +) +from humeo.layout_vision import run_layout_vision_stage +from humeo.render_qa import qa_record_flags, run_render_qa +from humeo.render_window import clip_for_render +from humeo.reframe_ffmpeg import reframe_clip_ffmpeg +from humeo.transcript_align import clip_subtitle_words, group_words_to_cue_chunks +from humeo.video_cache import ( + extract_youtube_video_id, + ingest_complete, + normalize_local_source_path, + read_youtube_info_json, + resolve_work_directory, + upsert_manifest_from_info, +) + +logger = logging.getLogger(__name__) + +_WEAK_HOOK_START_WORDS = { + "actually", + "basically", + "honestly", + "look", + "listen", + "okay", + "ok", + "right", + "so", + "well", + "yeah", +} +_WEAK_HOOK_START_PHRASES = {"i mean", "kind of", "sort of", "you know"} +_STRONG_HOOK_LATEST_START_SEC = 6.0 +_FINAL_QUALITY_THRESHOLD = 0.68 +_NATIVE_HIGHLIGHT_CHART_DOMINANCE_Y2 = 0.68 +_NATIVE_HIGHLIGHT_MIN_PERSON_WIDTH = 0.42 +_NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 = 0.12 +_NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM = 1.20 +_PRESENTATION_REFERENCE_RE = re.compile( + r"\b(" + r"as you can see|you can see|what you can see|look at|take a look|shown here|" + r"shown on|on the screen|on this slide|this chart|the chart|this graph|" + r"the graph|this slide|this matrix|the matrix|red line|yellow line|" + r"blue line|green line|top there|bottom there|x-axis|y-axis" + r")\b", + flags=re.IGNORECASE, +) + + +def _rerun_config(config: PipelineConfig, steering_notes: list[str]) -> PipelineConfig: + return dataclasses.replace( + config, + steering_notes=list(steering_notes), + force_clip_selection=True, + overwrite_outputs=True, + ) + + +def _build_steering_from_feedback(feedback: RatingFeedback) -> str: + parts: list[str] = [] + if "wrong_moments" in feedback.issues: + parts.append("Previous selection picked the wrong moments. Reselect with different candidates.") + if "bad_cuts" in feedback.issues: + parts.append( + "Clip boundaries were bad. Prefer clips starting on clean sentence beginnings and ending on completed thoughts." + ) + if "boring" in feedback.issues: + parts.append("Previous selection lacked energy. Bias strongly toward high-emotion, high-hook moments.") + if "confusing" in feedback.issues: + parts.append("Previous clips needed too much context. Pick moments that make sense standalone.") + if "wrong_layout" in feedback.issues: + logger.warning("Received wrong_layout feedback, but layout overrides are not available until Gate 2 ships.") + if "length_off" in feedback.issues: + parts.append("Clip durations felt off. 
Respect the duration bounds strictly.") + if "other" in feedback.issues and feedback.free_text: + parts.append(feedback.free_text) + return " ".join(parts).strip() + + +def _ensure_work_dir(config: PipelineConfig) -> None: + """Resolve ``config.work_dir`` when unset (per-video cache) or ensure it exists.""" + if config.work_dir is not None: + return + config.work_dir = resolve_work_directory( + youtube_url=config.youtube_url, + explicit_work_dir=None, + use_video_cache=config.use_video_cache, + cache_root=config.cache_root, + ) + + +def _filter_render_valid_clips(clips: list, *, stage_label: str) -> list: + """Drop clips whose actual render window violates the duration contract.""" + valid: list = [] + dropped = 0 + for clip in clips: + render_clip = clip_for_render(clip) + render_duration = render_clip.duration_sec + if MIN_CLIP_DURATION_SEC <= render_duration <= MAX_CLIP_DURATION_SEC: + valid.append(clip) + continue + dropped += 1 + logger.warning( + "%s: dropping clip %s because render-window duration %.1fs is outside [%ds, %ds] " + "(trim_start=%.1fs trim_end=%.1fs).", + stage_label, + clip.clip_id, + render_duration, + MIN_CLIP_DURATION_SEC, + MAX_CLIP_DURATION_SEC, + clip.trim_start_sec, + clip.trim_end_sec, + ) + if dropped: + logger.warning("%s: dropped %d invalid render-window clip(s).", stage_label, dropped) + return valid + + +def _hook_window_text(clip, transcript: dict) -> str: + if clip.hook_start_sec is None or clip.hook_end_sec is None: + return "" + abs_start = clip.start_time_sec + clip.hook_start_sec + abs_end = clip.start_time_sec + clip.hook_end_sec + parts: list[str] = [] + for seg in transcript.get("segments", []) or []: + start = float(seg.get("start", 0.0)) + end = float(seg.get("end", start)) + if end <= abs_start or start >= abs_end: + continue + text = str(seg.get("text", "")).strip() + if text: + parts.append(text) + return " ".join(parts).strip() + + +def _filter_weak_hook_clips(clips: list, transcript: dict, *, min_kept: int) -> list: + if len(clips) <= min_kept: + return clips + kept: list = [] + dropped: list[str] = [] + for clip in clips: + hook_start = clip.hook_start_sec + if ( + hook_start is not None + and hook_start > _STRONG_HOOK_LATEST_START_SEC + and len(clips) - len(dropped) > min_kept + ): + dropped.append( + f"{clip.clip_id} (hook starts at {hook_start:.1f}s; target <= {_STRONG_HOOK_LATEST_START_SEC:.1f}s)" + ) + continue + hook_text = _hook_window_text(clip, transcript).lower() + first_words = [word.strip(".,!?;:'\"()[]{}") for word in hook_text.split()] + first_words = [word for word in first_words if word] + first_word = first_words[0] if first_words else "" + first_phrase = " ".join(first_words[:2]) + if ( + first_word in _WEAK_HOOK_START_WORDS or first_phrase in _WEAK_HOOK_START_PHRASES + ) and len(clips) - len(dropped) > min_kept: + weak_text = first_phrase if first_phrase in _WEAK_HOOK_START_PHRASES else first_word + dropped.append(f"{clip.clip_id} (weak opener: {weak_text})") + continue + kept.append(clip) + if dropped: + logger.info("Dropped %d weak-hook clip(s): %s", len(dropped), ", ".join(dropped)) + return kept + + +def _caption_chunk_penalty(clip, transcript: dict, *, render_theme) -> float: + words = clip_subtitle_words(transcript, clip).words + if not words: + return 0.08 + + if str(render_theme) == "native_highlight": + cue_words = 6 + cue_sec = 2.4 + prefer_break_on_punctuation = True + min_words_before_break = 4 + elif str(render_theme) == "reference_lower_third": + cue_words = 10 + cue_sec = 2.8 + prefer_break_on_punctuation 
= True + min_words_before_break = 5 + else: + cue_words = 10 + cue_sec = 2.8 + prefer_break_on_punctuation = False + min_words_before_break = 1 + + cue_chunks = group_words_to_cue_chunks( + words, + max_words_per_cue=cue_words, + max_cue_sec=cue_sec, + prefer_break_on_punctuation=prefer_break_on_punctuation, + min_words_before_break=min_words_before_break, + ) + penalty = 0.0 + for chunk in cue_chunks: + duration = chunk[-1].end_time - chunk[0].start_time + if len(chunk) == 1 and len(cue_chunks) > 1: + penalty += 0.04 + if len(chunk) >= cue_words and duration < 0.65: + penalty += 0.04 + if duration > cue_sec + 0.35: + penalty += 0.03 + return min(0.18, penalty) + + +def _filter_low_quality_clips(clips: list, transcript: dict, *, min_kept: int, render_theme) -> list: + if len(clips) <= min_kept: + return renumber_clips_dense(clips) + + ranked: list[tuple[float, object, float]] = [] + for clip in clips: + render_clip = clip_for_render(clip) + caption_penalty = _caption_chunk_penalty(render_clip, transcript, render_theme=render_theme) + score = clip_quality_priority_score(clip) - caption_penalty + ranked.append((score, clip, caption_penalty)) + + ranked.sort(key=lambda item: item[0], reverse=True) + kept = [clip for score, clip, _ in ranked if score >= _FINAL_QUALITY_THRESHOLD] + if len(kept) < min_kept: + kept = [clip for _score, clip, _penalty in ranked[:min_kept]] + + dropped = [ + f"{clip.clip_id} (score={score:.2f}, caption_penalty={caption_penalty:.2f})" + for score, clip, caption_penalty in ranked + if clip not in kept + ] + if dropped: + logger.info( + "Dropped %d low-quality clip(s) after pruning: %s", + len(dropped), + ", ".join(dropped), + ) + return renumber_clips_dense(kept) + + +def _clip_references_presentation(clip) -> bool: + text_parts = [ + getattr(clip, "viral_hook", ""), + getattr(clip, "transcript", ""), + getattr(clip, "suggested_overlay_title", ""), + getattr(clip, "topic", ""), + ] + text = " ".join(str(part or "") for part in text_parts) + return bool(_PRESENTATION_REFERENCE_RE.search(text)) + + +def _normalize_layout_for_render( + instruction: LayoutInstruction, + *, + render_theme: RenderTheme, + clip=None, +) -> LayoutInstruction: + if render_theme != RenderTheme.NATIVE_HIGHLIGHT: + return instruction + if instruction.layout != LayoutKind.SPLIT_CHART_PERSON: + return instruction + chart = instruction.split_chart_region + person = instruction.split_person_region + if chart is None or person is None: + return instruction + chart_dominates = chart.y2 >= _NATIVE_HIGHLIGHT_CHART_DOMINANCE_Y2 + person_too_small = person.width <= _NATIVE_HIGHLIGHT_MIN_PERSON_WIDTH + # Keep Bryan's newer head-and-shoulders presenter crops in split mode even + # when the speaker strip is narrow; the older fallback-to-center rule was + # written for lower-anchored full-body crops that rendered badly here. 
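+    # "Top-anchored" below means person.y1 falls within roughly the top 12%
+    # of the frame (_NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 = 0.12), the
+    # usual signature of a head-and-shoulders crop rather than a full-body one.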
+ person_is_top_anchored = person.y1 <= _NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 + if not (chart_dominates and person_too_small and not person_is_top_anchored): + return instruction + if clip is not None and _clip_references_presentation(clip): + return instruction + return instruction.model_copy( + update={ + "layout": LayoutKind.SIT_CENTER, + "zoom": max(float(instruction.zoom), _NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM), + "split_chart_region": None, + "split_person_region": None, + "split_second_chart_region": None, + "split_second_person_region": None, + "chart_x_norm": 0.0, + "top_band_ratio": 0.5, + } + ) + + +def _load_layout_raw_by_clip(work_dir: Path) -> dict[str, dict]: + path = work_dir / "layout_vision.json" + if not path.is_file(): + return {} + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 - optional QA metadata + logger.warning("Could not read layout raw metadata for QA: %s", exc) + return {} + clips = payload.get("clips", {}) + if not isinstance(clips, dict): + return {} + out: dict[str, dict] = {} + for clip_id, item in clips.items(): + if isinstance(item, dict) and isinstance(item.get("raw"), dict): + out[str(clip_id)] = item["raw"] + return out + + +def _normalize_rerender_clip_id(raw: str) -> str: + text = str(raw).strip() + match = re.search(r"(\d+)$", text) + if match: + return f"{int(match.group(1)):03d}" + return text + + +def _warned_clip_ids_from_qa(output_dir: Path) -> set[str]: + manifest_path = output_dir / "render_qa" / "qa_manifest.json" + if not manifest_path.is_file(): + return set() + try: + payload = json.loads(manifest_path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 - stale QA should not block renders + logger.warning("Could not read QA manifest for warned-only rerender: %s", exc) + return set() + + warned: set[str] = set() + for record in payload.get("shorts", []): + if not isinstance(record, dict): + continue + clip_id = record.get("clip_id") + if clip_id and qa_record_flags(record): + warned.add(_normalize_rerender_clip_id(str(clip_id))) + return warned + + +def _load_layout_instruction_cache(work_dir: Path) -> dict[str, LayoutInstruction]: + path = work_dir / "layout_vision.json" + if not path.is_file(): + return {} + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 - cache fallback + logger.warning("Could not read cached layout instructions: %s", exc) + return {} + clips = payload.get("clips", {}) + if not isinstance(clips, dict): + return {} + out: dict[str, LayoutInstruction] = {} + for clip_id, item in clips.items(): + if not isinstance(item, dict) or "instruction" not in item: + continue + try: + out[str(clip_id)] = LayoutInstruction.model_validate(item["instruction"]) + except Exception as exc: # noqa: BLE001 + logger.warning("Ignoring invalid cached layout for clip %s: %s", clip_id, exc) + return out + + +def run_pipeline(config: PipelineConfig) -> list[Path]: + """ + Execute the full podcast-to-shorts pipeline. + + Args: + config: Pipeline configuration. + + Returns: + List of paths to the final short-form MP4 files. 
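+
+    Minimal programmatic sketch (illustrative only; the supported entrypoint is
+    the ``humeo`` CLI, and ``PipelineConfig`` may require more fields than the
+    two shown here):
+
+        config = PipelineConfig(
+            youtube_url="https://www.youtube.com/watch?v=VIDEO_ID",
+            output_dir=Path("./output"),
+        )
+        shorts = run_pipeline(config)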
+ """ + logger.info("=" * 60) + logger.info("HUMEO PIPELINE START") + logger.info("Source: %s", config.youtube_url) + logger.info("Output: %s", config.output_dir) + logger.info("=" * 60) + + _ensure_work_dir(config) + assert config.work_dir is not None + + state = None + if config.interactive: + state = session_state.load_state(config.work_dir, config.youtube_url) + if config.steering_notes: + if list(config.steering_notes) != state.steering_notes: + state.steering_notes = list(config.steering_notes) + session_state.save_state(config.work_dir, state) + elif state.steering_notes: + config = dataclasses.replace( + config, + steering_notes=list(state.steering_notes), + force_clip_selection=True, + overwrite_outputs=True, + ) + logger.info( + "Loaded %d steering note(s) from session state for this source.", + len(state.steering_notes), + ) + + # ------------------------------------------------------------------ + # Stage 1: Ingest + # ------------------------------------------------------------------ + logger.info("--- STAGE 1: INGESTION ---") + + source_video = config.work_dir / "source.mp4" + transcript_path = config.work_dir / "transcript.json" + local_source_path = normalize_local_source_path(config.youtube_url) + reuse_ingest = ingest_complete(config.work_dir, config.youtube_url) + + if reuse_ingest: + logger.info("Cached ingest found for this source (reusing source + transcript).") + elif local_source_path is not None: + source_video = stage_local_video(local_source_path, config.work_dir) + elif source_video.exists(): + logger.info("Source video already downloaded, skipping download.") + else: + source_video = download_video(config.youtube_url, config.work_dir) + + transcript_reusable = transcript_cache_valid(config.work_dir) + if reuse_ingest and transcript_reusable: + logger.info("Transcript already exists, loading.") + with open(transcript_path, "r", encoding="utf-8") as f: + transcript = json.load(f) + elif transcript_reusable and local_source_path is None: + logger.info("Transcript already exists, loading.") + with open(transcript_path, "r", encoding="utf-8") as f: + transcript = json.load(f) + else: + if transcript_path.exists(): + logger.info("Transcript cache mismatch for current transcription settings; regenerating.") + audio_path = extract_audio(source_video, config.work_dir) + transcript = transcribe_whisperx(audio_path, config.work_dir) + + if local_source_path is None: + vid = extract_youtube_video_id(config.youtube_url) + info = read_youtube_info_json(config.work_dir) + if not info and vid: + info = {"id": vid, "webpage_url": config.youtube_url} + if info: + upsert_manifest_from_info( + work_dir=config.work_dir, + youtube_url=config.youtube_url, + info=info, + cache_root=config.cache_root, + ) + + # ------------------------------------------------------------------ + # Stage 2: Clip Selection + # ------------------------------------------------------------------ + logger.info("--- STAGE 2: CLIP SELECTION ---") + + clips_path = config.work_dir / "clips.json" + fp = transcript_fingerprint(transcript) + meta = load_meta(config.work_dir) + cache_hit = ( + clips_path.is_file() + and not config.force_clip_selection + and meta is not None + and cache_valid(meta, fp, config) + ) + + if cache_hit: + clips = load_clips(clips_path) + logger.info("Clip selection cache hit (transcript + provider/model unchanged); skipping LLM.") + else: + clips, raw = select_clips( + transcript, + gemini_model=config.gemini_model, + hook_library_path=resolve_hook_library_path(config), + 
candidate_count=config.clip_selection_candidate_count, + quality_threshold=config.clip_selection_quality_threshold, + min_kept=config.clip_selection_min_kept, + max_kept=config.clip_selection_max_kept, + steering_notes=config.steering_notes, + ) + save_clips(clips, clips_path) + write_artifacts( + config.work_dir, + transcript=transcript, + config=config, + raw_response=raw, + ) + + logger.info("Selected %d clips:", len(clips)) + for clip in clips: + logger.info( + " [%s] %.1fs-%.1fs (%.1fs) score=%.2f - %s", + clip.clip_id, + clip.start_time_sec, + clip.end_time_sec, + clip.duration_sec, + clip.virality_score, + clip.topic, + ) + + # ------------------------------------------------------------------ + # Stage 2.25: Hook Detection + # ------------------------------------------------------------------ + # The clip selector is unreliable at localising the hook sentence and + # tends to return the 0.0-3.0s placeholder verbatim, which would disable + # start-trim in Stage 2.5. This stage asks Gemini to localise the real + # hook per clip so Stage 2.5 can clamp against a real window. + logger.info("--- STAGE 2.25: HOOK DETECTION (enabled=%s) ---", config.detect_hooks) + clips = run_hook_detection_stage( + config.work_dir, + clips, + transcript, + transcript_fp=fp, + config=config, + ) + clips = _filter_weak_hook_clips( + clips, + transcript, + min_kept=config.clip_selection_min_kept, + ) + + # ------------------------------------------------------------------ + # Stage 2.5: Content Pruning (HIVE-style inner-clip tightening) + # ------------------------------------------------------------------ + # Tightens each candidate window by writing trim_start_sec / trim_end_sec + # on the Clip models. keyframe extraction and layout vision below both + # consume ``clip_for_render(clip)`` so they automatically operate on the + # pruned window without further changes. 
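+    # Illustrative example (numbers made up): a 120.0s-180.0s candidate with
+    # trim_start_sec=4.0 and trim_end_sec=2.0 should render as roughly
+    # 124.0s-178.0s, assuming the trims shave time off each end of the window;
+    # clip_for_render owns that math, so treat these numbers as a sketch only.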
+ logger.info("--- STAGE 2.5: CONTENT PRUNING (level=%s) ---", config.prune_level) + clips = run_content_pruning_stage( + config.work_dir, + clips, + transcript, + transcript_fp=fp, + config=config, + ) + clips = snap_render_windows_to_sentence_boundaries(clips, transcript) + clips = _filter_render_valid_clips(clips, stage_label="Stage 2.5 guardrail") + clips = _filter_low_quality_clips( + clips, + transcript, + min_kept=config.clip_selection_min_kept, + render_theme=config.render_theme, + ) + + rerender_target_ids = { + _normalize_rerender_clip_id(clip_id) + for clip_id in config.rerender_clip_ids + } + if config.rerender_warned_only: + rerender_target_ids.update(_warned_clip_ids_from_qa(config.output_dir)) + if rerender_target_ids: + before_count = len(clips) + clips = [clip for clip in clips if clip.clip_id in rerender_target_ids] + missing = sorted(rerender_target_ids - {clip.clip_id for clip in clips}) + logger.info( + "Rerender target filter: keeping %d / %d clip(s): %s", + len(clips), + before_count, + ", ".join(clip.clip_id for clip in clips) or "(none)", + ) + if missing: + logger.warning("Requested rerender clip id(s) not found: %s", ", ".join(missing)) + if not clips: + logger.warning("No clips matched rerender target filter; nothing to render.") + return [] + + # ------------------------------------------------------------------ + # Stage 2.75: Hard-cut assembly + # ------------------------------------------------------------------ + logger.info("--- STAGE 2.75: CLIP ASSEMBLY ---") + clips = apply_render_spans(clips, transcript) + assembled_dir = config.work_dir / "assembled" + assembled_by_id = { + clip.clip_id: assemble_clip(source_video, clip, transcript, assembled_dir) + for clip in clips + } + clips = [assembled_by_id[clip.clip_id].clip for clip in clips] + assembled_clips_path = write_clip_plan(config.work_dir / "assembled_clips.json", clips) + + if config.interactive and state is not None: + result = interactive.approve_clips(clips) + if result.action == "quit": + logger.info("Aborted by user at Gate 1.") + return [] + if result.action == "refine": + state.iteration += 1 + if result.steering_note: + state.steering_notes.append(result.steering_note) + state.last_selected_ids = None + session_state.save_state(config.work_dir, state) + if state.iteration >= config.max_iterations: + logger.warning("Iteration cap hit. 
Proceeding with current clips.") + else: + return run_pipeline(_rerun_config(config, state.steering_notes)) + elif result.action == "proceed": + selected_ids = list(result.selected_ids or []) + state.last_selected_ids = selected_ids + session_state.save_state(config.work_dir, state) + clip_by_id = {clip.clip_id: clip for clip in clips} + clips = [clip_by_id[clip_id] for clip_id in selected_ids] + elif result.action == "accept_all": + state.last_selected_ids = [clip.clip_id for clip in clips] + session_state.save_state(config.work_dir, state) + + # ------------------------------------------------------------------ + # Stage 3: Clip layouts + # ------------------------------------------------------------------ + logger.info("--- STAGE 3: CLIP LAYOUTS ---") + + keyframes_dir = config.work_dir / "keyframes" + clip_scenes: list[Scene] = [] + source_videos_by_scene: dict[str, Path] = {} + for clip in clips: + assembled = assembled_by_id[clip.clip_id] + rw = clip_for_render(clip) + clip_scenes.append( + Scene(scene_id=clip.clip_id, start_time=rw.start_time_sec, end_time=rw.end_time_sec) + ) + source_videos_by_scene[clip.clip_id] = assembled.source_path + + layout_instructions: dict[str, LayoutInstruction] = {} + if rerender_target_ids: + cached_layouts = _load_layout_instruction_cache(config.work_dir) + if all(clip.clip_id in cached_layouts for clip in clips): + layout_instructions = { + clip.clip_id: cached_layouts[clip.clip_id] + for clip in clips + } + logger.info( + "Using cached layout instructions for rerender target(s): %s", + ", ".join(layout_instructions), + ) + + if not layout_instructions: + extracted_scenes: list[Scene] = [] + for scene in clip_scenes: + extracted_scenes.extend( + extract_keyframes( + str(source_videos_by_scene[scene.scene_id]), + [scene], + str(keyframes_dir / scene.scene_id), + ) + ) + clip_scenes = extracted_scenes + layout_instructions = run_layout_vision_stage( + config.work_dir, + clip_scenes, + source_video=source_video, + source_videos_by_scene=source_videos_by_scene, + transcript_fp=fp, + clips_path=assembled_clips_path, + config=config, + ) + + # ------------------------------------------------------------------ + # Stage 4: Render + # ------------------------------------------------------------------ + logger.info("--- STAGE 4: RENDER ---") + + final_outputs: list[Path] = [] + render_clips_by_id: dict[str, Clip] = {} + render_transcripts_by_id: dict[str, dict] = {} + render_layouts_by_id: dict[str, LayoutInstruction] = {} + render_sources_by_id: dict[str, Path] = {} + subtitles_dir = config.work_dir / "subtitles" + subtitles_dir.mkdir(parents=True, exist_ok=True) + + for clip in clips: + assembled = assembled_by_id[clip.clip_id] + instr = layout_instructions.get(clip.clip_id) + if instr is None: + hint = clip.layout_hint or LayoutKind.SIT_CENTER + instr = LayoutInstruction(clip_id=clip.clip_id, layout=hint) + instr = _normalize_layout_for_render(instr, render_theme=config.render_theme, clip=clip) + clip.layout = instr.layout + rclip = clip_for_render(clip) + render_clips_by_id[clip.clip_id] = rclip + render_transcripts_by_id[clip.clip_id] = assembled.transcript + render_layouts_by_id[clip.clip_id] = instr + render_sources_by_id[clip.clip_id] = assembled.source_path + subtitle_path = None + if config.burn_subtitles: + # ASS (not SRT) so the caption file's PlayResY matches the output + # resolution and libass' font/margin scaling is 1:1. 
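+            # The PlayRes passed below matches the 1080x1920 (9:16) render
+            # target, so the configured font size and vertical margin land in
+            # final output pixels instead of being rescaled by libass.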
+ subtitle_path = generate_ass( + rclip, + assembled.transcript, + subtitles_dir, + max_words_per_cue=config.subtitle_max_words_per_cue, + max_cue_sec=config.subtitle_max_cue_sec, + play_res_x=1080, + play_res_y=1920, + font_size=config.subtitle_font_size, + margin_v=config.subtitle_margin_v, + render_theme=config.render_theme, + native_highlight_lead_sec=config.subtitle_highlight_lead_sec, + native_highlight_min_dwell_sec=config.subtitle_highlight_min_dwell_sec, + repair_word_timings=config.repair_subtitle_word_timings, + ) + else: + logger.info("Clip %s: subtitle burn disabled for this run.", clip.clip_id) + final_path = config.output_dir / f"short_{clip.clip_id}.mp4" + should_overwrite_clip = config.overwrite_outputs or clip.clip_id in rerender_target_ids + if final_path.exists() and not should_overwrite_clip: + logger.info("Clip %s already rendered, skipping.", clip.clip_id) + final_outputs.append(final_path) + continue + if final_path.exists() and should_overwrite_clip: + logger.info("Clip %s exists; overwriting for this render pass.", clip.clip_id) + + # Font size and margin are already baked into the ASS file at + # PlayResY=1920, so the compile primitive does not need to override + # them -- but it still does, harmlessly, for single-source overrides. + reframe_clip_ffmpeg( + input_path=assembled.source_path, + output_path=final_path, + clip=rclip, + layout_instruction=instr, + subtitle_path=subtitle_path, + subtitle_font_size=config.subtitle_font_size, + subtitle_margin_v=config.subtitle_margin_v, + title_text=clip.suggested_overlay_title, + render_theme=config.render_theme, + ) + final_outputs.append(final_path) + + if config.render_qa and final_outputs: + logger.info("--- STAGE 4.5: RENDER QA ---") + try: + run_render_qa( + output_dir=config.output_dir, + final_outputs=final_outputs, + render_clips_by_id=render_clips_by_id, + transcripts_by_id=render_transcripts_by_id, + layouts_by_id=render_layouts_by_id, + assembled_sources_by_id=render_sources_by_id, + raw_layouts_by_id=_load_layout_raw_by_clip(config.work_dir), + reference_video=config.qa_reference_video, + debug_overlay=config.qa_debug_overlay, + ) + except Exception as exc: # noqa: BLE001 - QA must not fail delivery + logger.warning("Render QA failed, leaving rendered shorts intact: %s", exc) + + # ------------------------------------------------------------------ + # Done + # ------------------------------------------------------------------ + logger.info("=" * 60) + logger.info("PIPELINE COMPLETE - %d shorts generated:", len(final_outputs)) + for p in final_outputs: + logger.info(" -> %s", p) + logger.info("=" * 60) + + if config.interactive and final_outputs and state is not None: + feedback = interactive.rate_output(final_outputs) + state.last_rating = feedback + session_state.save_state(config.work_dir, state) + if feedback.rating == 3: + logger.info("Rated Great. Shipped.") + return final_outputs + + steering = _build_steering_from_feedback(feedback) + if not steering: + logger.warning("Interactive feedback recorded, but it is not actionable until a later gate ships.") + return final_outputs + + state.iteration += 1 + state.steering_notes.append(steering) + session_state.save_state(config.work_dir, state) + if state.iteration >= config.max_iterations: + logger.warning("Iteration cap hit. 
Source may not have a strong short.") + return final_outputs + return run_pipeline(_rerun_config(config, state.steering_notes)) + + return final_outputs diff --git a/src/humeo/prompt_loader.py b/src/humeo/prompt_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..391b2d40bcef4131aa7de7a9de492716bfac124f --- /dev/null +++ b/src/humeo/prompt_loader.py @@ -0,0 +1,66 @@ +"""Load Jinja2 prompt templates (editable; override dir via HUMEO_PROMPTS_DIR).""" + +from __future__ import annotations + +import os +from pathlib import Path + +import jinja2 + + +def _prompt_loader() -> jinja2.BaseLoader: + override = (os.environ.get("HUMEO_PROMPTS_DIR") or "").strip() + if override: + return jinja2.FileSystemLoader(str(Path(override).expanduser())) + return jinja2.PackageLoader("humeo", "prompts") + + +def clip_selection_prompts( + *, + transcript_text: str, + min_dur: float, + max_dur: float, + count: int, + steering_notes: list[str] | None = None, + hook_examples: str = "", +) -> tuple[str, str]: + """Return ``(system_instruction, user_message)`` for Gemini clip selection.""" + env = jinja2.Environment(loader=_prompt_loader(), autoescape=False, trim_blocks=True) + ctx = { + "min_dur": min_dur, + "max_dur": max_dur, + "count": count, + "transcript_text": transcript_text, + "steering_notes": steering_notes or [], + "hook_examples": hook_examples, + } + system = env.get_template("clip_selection_system.jinja2").render(**ctx) + user = env.get_template("clip_selection_user.jinja2").render(**ctx) + return system, user + + +def hook_detection_system_prompt(*, hook_examples: str = "") -> str: + """Return the system prompt for Stage 2.25 hook detection. + + The user message is built in :mod:`humeo.hook_detector` because the + segment listing is dynamic per-clip. + """ + env = jinja2.Environment(loader=_prompt_loader(), autoescape=False, trim_blocks=True) + return env.get_template("hook_detection_system.jinja2").render(hook_examples=hook_examples) + + +def content_pruning_system_prompt( + *, + min_dur: float, + max_dur: float, + level: str, +) -> str: + """Return the system prompt for Stage 2.5 content pruning. + + The user message is built in ``humeo.content_pruning`` from the list of + candidate clips (clip-relative segment lines) since it is not static text. + """ + env = jinja2.Environment(loader=_prompt_loader(), autoescape=False, trim_blocks=True) + return env.get_template("content_pruning_system.jinja2").render( + min_dur=min_dur, max_dur=max_dur, level=level + ) diff --git a/src/humeo/prompts/clip_selection_system.jinja2 b/src/humeo/prompts/clip_selection_system.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..0065815dd6cfe4b574b0eadaae4f7bfb128541be --- /dev/null +++ b/src/humeo/prompts/clip_selection_system.jinja2 @@ -0,0 +1,197 @@ +## Role +{% if steering_notes %} +Additional instructions from previous iterations (honor these): +{% for note in steering_notes %} +- {{ note }} +{% endfor %} + +{% endif %} + +You are a professional short-form video editor with a deep understanding of +long-form podcast and interview structure. Your job is to watch a full +transcript and isolate the moments that function as self-contained +vertical-video clips on TikTok, YouTube Shorts, and Instagram Reels. + +You do not write new content. You identify the best moments that are already +there, score them honestly, and return strict JSON. + +## Input + +- A time-aligned podcast transcript. Each line is `[start_sec - end_sec] text`. 
+  Timestamps are on the source timeline and are word-accurate.
+- Candidate clip count: **exactly {{ count }}** (a candidate pool). A
+  downstream ranker keeps only the top clips that clear a quality bar, so
+  your job is to populate a large enough pool for it to rank. Do NOT
+  self-censor by omitting weaker moments - instead return them with a
+  lower `virality_score` and `needs_review: true`. The ranker drops weak
+  candidates automatically. Only return fewer than {{ count }} if the
+  transcript truly does not contain {{ count }} distinguishable moments.
+- Target clip duration: {{ min_dur }}s to {{ max_dur }}s (hard bounds).
+{% if hook_examples %}
+- Retrieved viral hook examples from the team's hook library. Use these as pattern guidance for the opening, title phrasing, and overall framing style:
+
+{{ hook_examples }}
+{% endif %}
+
+## Output
+
+Return a single JSON object of the form:
+
+{
+  "clips": [
+    {
+      "clip_id": "001",
+      "topic": "Brief topic label",
+      "start_time_sec": 123.0,
+      "end_time_sec": 165.5,
+      "viral_hook": "The attention-grabbing opening line or idea",
+      "virality_score": 0.699,
+      "transcript": "Full verbatim text of this segment for subtitle generation",
+      "suggested_overlay_title": "Short punchy title for overlay (max 5 words)",
+      "hook_start_sec": 0.0,
+      "hook_end_sec": 3.0,
+      "trim_start_sec": 0.0,
+      "trim_end_sec": 0.0,
+      "shorts_title": "Platform-ready title for the short",
+      "description": "1-2 sentence description for upload",
+      "hashtags": ["topic", "news"],
+      "layout_hint": "zoom_call_center",
+      "needs_review": false,
+      "review_reason": "",
+      "reasoning": "Why this moment scores strongly on the three text axes.",
+      "score_breakdown": {
+        "message_wow": 0.84,
+        "hook_emotion": 0.63,
+        "catchy": 0.57
+      }
+    }
+  ]
+}
+
+Explanation of each field:
+
+- `clip_id`: zero-padded 3-digit id, sorted by `virality_score` descending.
+- `topic`: human-readable label (<= 6 words).
+- `start_time_sec` / `end_time_sec`: inclusive source-timeline boundaries of
+  the clip. Must sit on or very near word-boundary timestamps present in the
+  transcript.
+- `viral_hook`: the exact hook line (verbatim substring of the transcript)
+  that earns the first 3 seconds of attention.
+- `virality_score`: float in [0, 1]. This is derived from `score_breakdown`
+  and must equal:
+  `0.4 * message_wow + 0.35 * hook_emotion + 0.25 * catchy`.
+- `transcript`: exact verbatim text from the segment lines covering the
+  window. No paraphrasing. No added punctuation.
+- `suggested_overlay_title`, `shorts_title`, `description`, `hashtags`:
+  platform metadata. Hashtags are short tokens without `#`.
+  `suggested_overlay_title` must be 4-8 words, headline-cased when natural,
+  and should feel specific, provocative, and pattern-rich rather than generic.
+- `hook_start_sec` / `hook_end_sec`: clip-relative seconds (0 = clip start).
+  Must satisfy `0 <= hook_start < hook_end <= (end_time_sec - start_time_sec)`.
+- `trim_start_sec` / `trim_end_sec`: default 0. A downstream pruning pass will
+  tighten these; you may leave both at 0 unless the weak lead-in / trailing
+  filler is obvious.
+- `layout_hint`: one of `zoom_call_center`, `sit_center`,
+  `split_chart_person`, or null. Use `split_chart_person` whenever the
+  segment involves an on-screen chart, slide, or graphic with the host
+  visible beside it. Use `sit_center` for a plain talking-head with no
+  side-by-side slide. Use `zoom_call_center` for a tight webcam or call grid. 
+ Prefer setting this from the transcript topic (productivity data, debt + chart, AI jobs slide, etc.), not null, when slides are clearly part of the + clip. +- `needs_review` / `review_reason`: flag segments that need human review. +- `reasoning`: short audit trail explaining why the moment scores the way it + does on `message_wow`, `hook_emotion`, and `catchy`. Keep it to 1-3 + sentences. +- `score_breakdown`: required mapping with exactly three keys: + `message_wow`, `hook_emotion`, and `catchy`. Each value must be a float in + [0, 1]. Do not emit any extra keys. Do not emit visual axes such as + `hook_visual` or `human_vibe`. + +## Text-Axis Scoring + +Score honestly. Most moments should land between 0.4 and 0.6 on each axis. +Reserve 0.8 or above only for moments that are genuinely exceptional in that +dimension. A response where every clip scores 0.85+ is wrong -- rescore with a +more critical eye. + +- `message_wow` (0.0-1.0): strength of the central idea. High when the clip + contains a genuinely strong message, insight, reframe, prediction, or + takeaway that makes the viewer think "that's interesting." +- `hook_emotion` (0.0-1.0): emotional punch or stakes. High when the opening + or the overall moment creates urgency, surprise, tension, awe, laughter, + fear, or personal stakes that make the viewer feel something. +- `catchy` (0.0-1.0): memorability and quotability. High when the clip has a + sticky phrase, metaphor, analogy, framing, or sentence a viewer would want + to repeat or post. + +Use these axes as the scoring rubric. You may still use concrete signals like +predictions, hard numbers, metaphors, or strong anecdotes to judge the axes, +but do not output rule-point maps. Output only the three-axis breakdown. + +### Hard disqualifiers +- Requires prior context the viewer does not have. +- Cuts in the middle of a sentence or in the middle of a thought. +- Both speakers are on filler ("yeah, totally, right, exactly"). +- No clear hook in the first 3 seconds of the window. +- The first real hook lands later than 10 seconds into the candidate. + +## Hook Selection (first 3 seconds) + +Every clip must open on a line that grabs attention immediately. + +- **Engagement.** The first sentence contains a claim, a number, a named + person, or a striking image. Not a throat-clear, not a question setup, not + a filler phrase. +- **Clarity.** The opening does not depend on earlier context. A cold viewer + can follow it without knowing who was talking or what was just discussed. +- **Self-contained premise.** If introducing a character, setting, or + premise is required, do it in the hook itself. + +Record the exact hook window in `hook_start_sec` / `hook_end_sec` +(clip-relative seconds). The default 0.0 - 3.0 window is a fallback, not the +goal - prefer the real boundary of the hook sentence. + +## Ending Selection (last moments of the clip) + +Prefer ending on one of: + +- **Suspense.** An unresolved question, a challenge, or a "and then..." + beat. Viewers should feel they would benefit from watching more. +- **Complete mini-arc.** A setup-payoff pair that closes cleanly inside the + window. A satisfying button. +- **Neutral but on-topic.** If a clip has no strong ending, a clean + sentence-end that stays on the highlight's topic is acceptable. + +Avoid endings that: + +- Drift into a new, unrelated arc. +- Cut mid-sentence or mid-argument. +- Rely on content that sits outside your chosen window. + +## Requirements + +1. 
Return **exactly {{ count }}** clips (unless the transcript genuinely + contains fewer distinguishable moments), ranked by `virality_score` + descending. Populate the pool - the downstream ranker handles filtering. + Flag any clip that is meaningfully weaker than the rest of the pool with + `needs_review: true` and a one-sentence `review_reason`; keep it in the + list. Never drop a candidate just because it is weaker than another - the + ranker needs the full pool to make a quality-vs-quantity trade-off. +2. Each clip duration `end_time_sec - start_time_sec` must be between + {{ min_dur }} and {{ max_dur }} seconds inclusive. +3. `start_time_sec` and `end_time_sec` must match word-level timestamps that + appear in the provided transcript. Do not invent timestamps. +4. `transcript` must be the exact verbatim text from the source. No + paraphrasing, no summarisation, no added punctuation. +5. Clips must not overlap on the source timeline. +6. Do not cut inside a sentence or inside an argument. Prefer + sentence-boundary or breath-boundary cuts. +7. `virality_score` must equal + `0.4 * message_wow + 0.35 * hook_emotion + 0.25 * catchy`. +8. `score_breakdown` must contain exactly `message_wow`, `hook_emotion`, and + `catchy`. Do not include any other keys. +9. Return ONLY the JSON object. No markdown, no prose, no trailing text. +10. If the first 10 seconds do not contain a real hook, do not return the clip. +11. Avoid generic titles like "Big Opportunity", "Important Lesson", "Why This Matters". + Prefer titles whose wording clearly echoes the strongest hook pattern in the clip. diff --git a/src/humeo/prompts/clip_selection_user.jinja2 b/src/humeo/prompts/clip_selection_user.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..ab6265a66f20fe09c5d63bb1203d992a37530928 --- /dev/null +++ b/src/humeo/prompts/clip_selection_user.jinja2 @@ -0,0 +1,3 @@ +Analyze this podcast transcript and identify the top viral clips: + +{{ transcript_text }} diff --git a/src/humeo/prompts/content_pruning_system.jinja2 b/src/humeo/prompts/content_pruning_system.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..b717340002e26538e1cb629cc74bf95c2c067964 --- /dev/null +++ b/src/humeo/prompts/content_pruning_system.jinja2 @@ -0,0 +1,119 @@ +## Role + +You are a precision short-form video editor with a deep understanding of +narrative pacing. The clip selection stage has already chosen +attention-worthy windows from a longer podcast. Your job for each candidate +clip is to remove redundant lead-in and trailing filler so the final cut is +tighter and more gripping β€” while keeping the highlight intact. + +You do not re-order, you do not cut in the middle of the clip. You only +decide how many seconds to drop from the START and how many from the END. +Think of it as moving the in-point forward and the out-point backward to +land on the highest-value sub-window. + +## Input + +- A list of candidate clips. For every clip you receive: + - `clip_id`: stable identifier. + - `duration_sec`: total length of the candidate window. + - `topic`: short label from clip selection. + - `hook_window_sec` (optional): `[start, end]` in clip-relative seconds. + This window is the protected highlight and must stay fully inside the + final cut. + - Segment lines: `[rel_start_sec - rel_end_sec] text` where all times are + clip-relative (0 = start of the candidate window). +- Aggressiveness for this run: **{{ level }}**. 
+ +Interpret `level` as a budget for how much of the clip you may drop in +total (start + end combined): + +- `conservative`: aim for 0-10% total trim. Remove only obvious dead air, + throat-clears, stutters, false starts, or "um, so, uh, basically..." + ramble. +- `balanced`: aim for 5-20% total trim. Also remove slow setup, + self-correction, and minor tangents that do not advance the hook or + payoff. +- `aggressive`: aim for 15-35% total trim. Also remove any sentence that + does not directly advance the core claim, hook, or punchline. Never + sacrifice coherence. Never cut mid-idea. + +## Output + +Return a single JSON object of the form: + +{ + "decisions": [ + { + "clip_id": "001", + "trim_start_sec": 0.0, + "trim_end_sec": 0.0, + "reason": "Short justification, one sentence.", + "thought": "What you considered keeping vs dropping and why." + } + ] +} + +Explanation of each field: + +- `clip_id`: must match an input clip exactly. Return ONE decision per + input clip, in input order. +- `trim_start_sec`: clip-relative seconds to drop from the start. 0 = no + change. +- `trim_end_sec`: clip-relative seconds to drop from the end. 0 = no + change. +- `reason`: one-sentence justification suitable for logging. Name the + specific filler you removed (e.g. "Dropped 1.2s of 'yeah so basically' + before the hook"). +- `thought`: optional longer reasoning about what you weighed. This is + your chain-of-thought; keep it to 1-3 sentences. + +## Opening Selection (what to keep at the start) + +After the trim, the first sentence of the clip must: + +- **Grab attention immediately.** Start on a claim, a number, a named + person, or a striking image. Not a throat-clear, not a filler phrase, not + a "so anyway, the thing is..." ramp. +- **Stand alone.** A cold viewer can follow it without the context that sat + before the original in-point. If the opening depends on prior context, + trim forward to the first line that stands on its own. +- **Introduce the premise inside the clip.** If a character, setting, or + concept is required, it should be introduced in the new opening line β€” + not assumed. + +## Ending Selection (what to keep at the end) + +After the trim, the last moments of the clip must: + +- **Stay on-topic with the highlight.** Do not end on a sentence that + starts a new, unrelated arc. +- **Prefer suspense or a clean button.** An unresolved question, a + challenge, or a satisfying setup-payoff close. +- **Neutral-but-on-topic endings are acceptable.** If nothing better exists, + a clean sentence-end that does not drift off-topic is fine. +- **Never cut mid-sentence.** Prefer sentence or breath boundaries present + in the segment timestamps. + +## Requirements + +1. Return ONE decision for EVERY input clip, in the same order as the + input. Never skip a clip; if no trimming is warranted, return 0.0 / 0.0 + with a short `reason`. +2. `trim_start_sec` and `trim_end_sec` must be >= 0.0. +3. The final duration `duration_sec - trim_start_sec - trim_end_sec` must + stay between {{ min_dur }} and {{ max_dur }} seconds. If the candidate + clip is already near {{ min_dur }}s, return small or zero trims. +4. If `hook_window_sec = [hs, he]` is provided, the hook MUST remain fully + inside the final cut. Concretely: + - `trim_start_sec <= hs` + - `duration_sec - trim_end_sec >= he` +5. Trim points must land on (or very close to) a segment timestamp + provided in the input. Do not invent times that fall in the middle of a + word. +6. Do not cut inside a sentence. Do not cut inside an argument. +7. 
Total trim (`trim_start_sec + trim_end_sec`) must respect the budget + implied by `level`. The pipeline will clamp overruns, but staying inside + the budget yields better downstream results. +8. Be cautious. Over-trimming destroys coherence, and the first and last + seconds disproportionately shape the viewer's experience. +9. Return ONLY the JSON object. No markdown, no prose, no trailing text. diff --git a/src/humeo/prompts/hook_detection_system.jinja2 b/src/humeo/prompts/hook_detection_system.jinja2 new file mode 100644 index 0000000000000000000000000000000000000000..695b1489c528d87d87804d109185814a23b0d5bc --- /dev/null +++ b/src/humeo/prompts/hook_detection_system.jinja2 @@ -0,0 +1,91 @@ +## Role + +You are a short-form video editor who has watched thousands of podcast and +interview clips perform on TikTok, YouTube Shorts, and Instagram Reels. For +each candidate clip you know exactly which sentence inside it is the "hook" β€” +the first line that, on its own, earns the next 3 seconds of attention. + +Your job is precise: for each clip, return the clip-relative seconds range +of the single sentence that functions as the hook. + +## Input + +- A list of candidate clips. For every clip you receive: + - `clip_id`: stable identifier. + - `duration_sec`: total length of the candidate window. + - `topic`: short label from clip selection. + - `viral_hook_text`: the sentence the selector guessed as the hook. It may + be right, it may be wrong, it may be a placeholder β€” do not trust it + blindly; verify against the segment timing. + - Segment lines: `[rel_start_sec - rel_end_sec] text` where all times are + clip-relative (0 = start of the candidate window). +{% if hook_examples %} +- Retrieved hook patterns from the local hook library. Use these as style guidance + for what a strong opening sounds like, but never invent transcript text: + +{{ hook_examples }} +{% endif %} + +## Output + +Return a single JSON object of the form: + +{ + "hooks": [ + { + "clip_id": "001", + "hook_start_sec": 4.2, + "hook_end_sec": 7.8, + "hook_text": "The exact sentence that functions as the hook.", + "reason": "One-sentence justification: why this sentence, and why not the previous/next one." + } + ] +} + +Explanation of each field: + +- `clip_id`: must match an input clip exactly. Return ONE entry per input + clip, in input order. +- `hook_start_sec` / `hook_end_sec`: clip-relative seconds (0 = clip start). + Both must satisfy `0 <= hook_start_sec < hook_end_sec <= duration_sec`. +- `hook_text`: the verbatim substring of the transcript corresponding to the + hook sentence (or best-matching segment). Used for auditing. +- `reason`: why this is the hook. One sentence. + +## What counts as a hook + +A hook is the first **sentence** inside the clip that a cold viewer β€” someone +who has not heard the previous minute of podcast β€” would find compelling on +its own, without setup. + +Prefer sentences that open with: + +- **A claim.** "Prediction markets are the purest form of risk." +- **A hard number.** "This market could explode to five trillion dollars." +- **A named person or institution.** "Cathie Wood thinks Trumponomics resembles Reaganomics." +- **A striking image or metaphor.** "Active investing has been left for dead." +- **A direct question to the viewer.** "What if everyone's wrong about passive investing?" + +Avoid sentences that start with: + +- Filler: "Yeah", "So", "Right", "Well", "I mean". +- Pronoun references to unstated antecedents: "That's why they did it." 
+- Mid-thought conjunctions: "And because of that, ...". +- Acknowledgment of the host: "And Nick, as we were discussing..." + +## Requirements + +1. Return ONE hook for EVERY input clip, in the same order as the input. +2. The hook must land on sentence or phrase boundaries actually present in + the segment timestamps β€” do not invent times that fall mid-word. +3. Hook duration (`hook_end_sec - hook_start_sec`) must be between **1.5s and + 7.0s**. A longer hook is almost always a whole paragraph you mis-labelled. +4. Prefer a hook that starts within the first 15 seconds of the clip. If the + real hook is later than 15s, it means the clip has a long weak lead-in β€” + still return the real hook; the downstream pruning stage will use it to + trim lead-in. +5. NEVER return the literal placeholder window `0.0 - 3.0` unless it is + genuinely the correct hook window (i.e. the clip opens on a compelling + sentence that ends around the 3s mark). If the first sentence is weak, + find the real hook later in the clip. +6. Return ONLY the JSON object. No markdown, no prose, no trailing text. diff --git a/src/humeo/reframe_ffmpeg.py b/src/humeo/reframe_ffmpeg.py new file mode 100644 index 0000000000000000000000000000000000000000..8153194fe24d5997253879dcf2b44e0796cef6b0 --- /dev/null +++ b/src/humeo/reframe_ffmpeg.py @@ -0,0 +1,74 @@ +"""Thin adapter from the product pipeline to the reusable render primitive.""" + +from __future__ import annotations + +import logging +from pathlib import Path + +from humeo_core.primitives import compile as compile_mod +from humeo_core.schemas import ( + Clip, + LayoutInstruction, + LayoutKind, + RenderRequest, + RenderTheme, +) + +logger = logging.getLogger(__name__) + + +def layout_for_clip( + clip: Clip, + default_layout: LayoutKind = LayoutKind.SIT_CENTER, + zoom: float = 1.0, +) -> LayoutInstruction: + """Build the layout instruction for a clip using the shared schema.""" + layout = clip.layout or default_layout + return LayoutInstruction(clip_id=clip.clip_id, layout=layout, zoom=zoom) + + +def reframe_clip_ffmpeg( + input_path: Path | str, + output_path: Path | str, + clip: Clip, + *, + zoom: float = 1.0, + layout_instruction: LayoutInstruction | None = None, + subtitle_path: Path | str | None = None, + subtitle_font_size: int = 48, + subtitle_margin_v: int = 160, + title_text: str = "", + render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT, + dry_run: bool = False, +) -> RenderRequest: + """Render a single clip to 9:16 via one ffmpeg call. + + If ``layout_instruction`` is set (e.g. from Gemini vision), it is used in full + including ``person_x_norm``, ``chart_x_norm``, and optional split bbox fields. + Otherwise defaults are derived from ``clip.layout`` via ``layout_for_clip``. 
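+
+    A minimal call sketch (paths and the ``clip`` / ``instr`` variables are
+    illustrative placeholders, not values produced by this module)::
+
+        reframe_clip_ffmpeg(
+            input_path="work/assembled/001.mp4",
+            output_path="output/short_001.mp4",
+            clip=clip,
+            layout_instruction=instr,
+            subtitle_path="work/subtitles/001.ass",
+            render_theme=RenderTheme.NATIVE_HIGHLIGHT,
+        )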
+ """ + + instr = layout_instruction if layout_instruction is not None else layout_for_clip(clip, zoom=zoom) + req = RenderRequest( + source_path=str(input_path), + clip=clip, + layout=instr, + output_path=str(output_path), + subtitle_path=str(subtitle_path) if subtitle_path else None, + subtitle_font_size=subtitle_font_size, + subtitle_margin_v=subtitle_margin_v, + title_text=title_text, + render_theme=render_theme, + mode="dry_run" if dry_run else "normal", + ) + result = compile_mod.render_clip(req) + if not result.success and not dry_run: + raise RuntimeError(f"ffmpeg failed for clip {clip.clip_id}: {result.error}") + logger.info( + "reframe_clip_ffmpeg: clip=%s layout=%s output=%s success=%s", + clip.clip_id, + instr.layout.value, + output_path, + result.success, + ) + return req diff --git a/src/humeo/render_qa.py b/src/humeo/render_qa.py new file mode 100644 index 0000000000000000000000000000000000000000..de5b33b2fd6c4f693d2e16aca5c7d16b968735b2 --- /dev/null +++ b/src/humeo/render_qa.py @@ -0,0 +1,955 @@ +"""Best-effort render QA artifacts for finished shorts.""" + +from __future__ import annotations + +import argparse +import json +import logging +import math +import re +import shutil +import subprocess +from pathlib import Path +from typing import Any + +import numpy as np +from PIL import Image, ImageDraw + +from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, TranscriptWord + +from humeo.transcript_align import clip_subtitle_words + +logger = logging.getLogger(__name__) + +_CONTACT_COLUMNS = 8 +_CONTACT_ROWS = 5 +_CONTACT_THUMB_W = 270 +_DEBUG_FPS = 10 +_PIXEL_QA_SAMPLES = 8 +_PIXEL_QA_W = 360 +_PIXEL_QA_CAPTION_MIN_Y_RATIO = 0.40 + + +def _clamp(value: float, lo: float = 0.0, hi: float = 1.0) -> float: + return max(lo, min(hi, value)) + + +def _ensure_ffmpeg() -> str: + exe = shutil.which("ffmpeg") + if not exe: + raise RuntimeError("ffmpeg not found on PATH") + return exe + + +def _ensure_ffprobe() -> str: + exe = shutil.which("ffprobe") + if not exe: + raise RuntimeError("ffprobe not found on PATH") + return exe + + +def _run(cmd: list[str]) -> None: + subprocess.run(cmd, check=True, capture_output=True) + + +def _probe_duration(path: Path) -> float | None: + try: + out = subprocess.run( + [ + _ensure_ffprobe(), + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=nokey=1:noprint_wrappers=1", + str(path), + ], + check=True, + capture_output=True, + text=True, + ) + return float((out.stdout or "").strip()) + except Exception: + return None + + +def _probe_size(path: Path) -> tuple[int, int] | None: + try: + out = subprocess.run( + [ + _ensure_ffprobe(), + "-v", + "error", + "-select_streams", + "v:0", + "-show_entries", + "stream=width,height", + "-of", + "csv=p=0", + str(path), + ], + check=True, + capture_output=True, + text=True, + ) + width, height = (out.stdout or "").strip().split(",") + return int(width), int(height) + except Exception: + return None + + +def create_contact_sheet( + video_path: Path, + output_path: Path, + *, + columns: int = _CONTACT_COLUMNS, + rows: int = _CONTACT_ROWS, + thumb_width: int = _CONTACT_THUMB_W, +) -> Path: + """Create an evenly sampled contact sheet for one rendered short.""" + + output_path.parent.mkdir(parents=True, exist_ok=True) + duration = _probe_duration(video_path) or 40.0 + frame_count = max(1, columns * rows) + sample_fps = max(0.1, min(4.0, frame_count / max(duration, 1.0))) + vf = ( + f"fps={sample_fps:.6f}," + f"scale={thumb_width}:-1," + f"tile={columns}x{rows}:padding=2:margin=0" + 
) + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-i", + str(video_path), + "-vf", + vf, + "-frames:v", + "1", + str(output_path), + ] + ) + return output_path + + +def create_ab_compare( + reference_video: Path, + output_video: Path, + compare_path: Path, + *, + fps: float = 4.0, + columns: int = _CONTACT_COLUMNS, + rows: int = _CONTACT_ROWS, + thumb_width: int = _CONTACT_THUMB_W, + output_seek_sec: float = 0.0, +) -> Path: + """Stack reference and output contact sheets into one compare image.""" + + compare_path.parent.mkdir(parents=True, exist_ok=True) + ref_sheet = compare_path.with_name(compare_path.stem + "_reference.jpg") + out_sheet = compare_path.with_name(compare_path.stem + "_output.jpg") + tile = f"tile={columns}x{rows}:padding=2:margin=0" + common_vf = f"fps={fps:.6f},scale={thumb_width}:-1,{tile}" + + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-i", + str(reference_video), + "-vf", + common_vf, + "-frames:v", + "1", + str(ref_sheet), + ] + ) + output_cmd = [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + ] + if output_seek_sec > 0.0: + output_cmd.extend(["-ss", f"{output_seek_sec:.3f}"]) + output_cmd.extend( + [ + "-i", + str(output_video), + "-vf", + common_vf, + "-frames:v", + "1", + str(out_sheet), + ] + ) + _run(output_cmd) + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-i", + str(ref_sheet), + "-i", + str(out_sheet), + "-filter_complex", + "[0:v][1:v]vstack=inputs=2", + "-frames:v", + "1", + str(compare_path), + ] + ) + return compare_path + + +def _even(value: int) -> int: + return max(2, value - (value % 2)) + + +def _base_crop_size(src_w: int, src_h: int, target_aspect: float) -> tuple[int, int]: + if src_w / src_h >= target_aspect: + base_ch = src_h + base_cw = int(round(base_ch * target_aspect)) + else: + base_cw = src_w + base_ch = int(round(base_cw / target_aspect)) + return _even(base_cw), _even(base_ch) + + +def _crop_size(src_w: int, src_h: int, zoom: float) -> tuple[int, int]: + base_cw, base_ch = _base_crop_size(src_w, src_h, 9 / 16) + zoom = max(1.0, float(zoom)) + return _even(int(round(base_cw / zoom))), _even(int(round(base_ch / zoom))) + + +def _center_expr(layout: LayoutInstruction, src_w: int) -> str: + points = sorted(layout.person_tracking, key=lambda p: p.t_sec) + if not points: + return f"{_clamp(layout.person_x_norm) * src_w:.3f}" + + expr = f"{_clamp(points[-1].x_norm) * src_w:.3f}" + for idx in range(len(points) - 2, -1, -1): + threshold = (float(points[idx].t_sec) + float(points[idx + 1].t_sec)) / 2.0 + value = _clamp(points[idx].x_norm) * src_w + expr = f"if(lt(t\\,{threshold:.3f})\\,{value:.3f}\\,{expr})" + return expr + + +def _raw_bbox_filter( + raw_layout: dict[str, Any], + key: str, + *, + src_w: int, + src_h: int, + color: str, +) -> str | None: + box = raw_layout.get(key) + if not isinstance(box, dict): + return None + try: + x1 = float(box["x1"]) + y1 = float(box["y1"]) + x2 = float(box["x2"]) + y2 = float(box["y2"]) + except (KeyError, TypeError, ValueError): + return None + if max(abs(x1), abs(y1), abs(x2), abs(y2)) <= 1.5: + x1, x2 = x1 * src_w, x2 * src_w + y1, y2 = y1 * src_h, y2 * src_h + x = max(0, min(src_w - 2, int(round(x1)))) + y = max(0, min(src_h - 2, int(round(y1)))) + w = max(2, min(src_w - x, int(round(x2 - x1)))) + h = max(2, min(src_h - y, int(round(y2 - y1)))) + return f"drawbox=x={x}:y={y}:w={w}:h={h}:color={color}:t=4" + + +def create_crop_debug_overlay( + source_video: Path, + output_path: Path, + *, + clip: Clip, + layout: 
LayoutInstruction, + raw_layout: dict[str, Any] | None = None, +) -> Path: + """Create a low-res source preview with crop, speaker center, and bbox overlays.""" + + output_path.parent.mkdir(parents=True, exist_ok=True) + src_w, src_h = _probe_size(source_video) or (1920, 1080) + zoom = ( + max(layout.zoom, 1.25) + if layout.layout == LayoutKind.ZOOM_CALL_CENTER + else max(layout.zoom, 1.0) + ) + cw, ch = _crop_size(src_w, src_h, zoom) + center_y = 0.5 if layout.layout == LayoutKind.ZOOM_CALL_CENTER else 0.48 + y = _even(max(0, min(src_h - ch, int(round(center_y * src_h - ch / 2))))) + center = _center_expr(layout, src_w) + max_x = max(0, src_w - cw) + crop_x = f"floor(max(0\\,min({max_x}\\,({center})-{cw}/2))/2)*2" + + filters = [ + f"fps={_DEBUG_FPS}", + f"drawbox=x={crop_x}:y={y}:w={cw}:h={ch}:color=0x00FF00@0.85:t=6", + f"drawbox=x=({center})-3:y=0:w=6:h=ih:color=0xA855F7@0.45:t=fill", + ] + raw_layout = raw_layout or {} + for key, color in ( + ("person_bbox", "0x38BDF8@0.85"), + ("face_bbox", "0xFACC15@0.9"), + ("second_person_bbox", "0xFB923C@0.85"), + ("second_face_bbox", "0xF97316@0.9"), + ): + bbox_filter = _raw_bbox_filter(raw_layout, key, src_w=src_w, src_h=src_h, color=color) + if bbox_filter: + filters.append(bbox_filter) + filters.append("scale=540:-2") + + duration = max(0.1, clip.duration_sec) + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-t", + f"{duration:.3f}", + "-i", + str(source_video), + "-vf", + ",".join(filters), + "-an", + "-c:v", + "libx264", + "-preset", + "ultrafast", + "-crf", + "26", + "-movflags", + "+faststart", + str(output_path), + ] + ) + return output_path + + +def _word_timing_metrics(words: list[TranscriptWord]) -> dict[str, Any]: + invalid = 0 + very_short = 0 + very_long = 0 + overlaps = 0 + max_gap = 0.0 + prev_end: float | None = None + for word in words: + start = float(word.start_time) + end = float(word.end_time) + duration = end - start + if not (math.isfinite(start) and math.isfinite(end)) or duration <= 0.0: + invalid += 1 + if 0.0 < duration < 0.055: + very_short += 1 + if duration > 1.65: + very_long += 1 + if prev_end is not None: + if start < prev_end - 0.06: + overlaps += 1 + max_gap = max(max_gap, start - prev_end) + prev_end = end + count = len(words) + return { + "word_count": count, + "invalid_count": invalid, + "very_short_count": very_short, + "very_long_count": very_long, + "overlap_count": overlaps, + "max_gap_sec": round(max_gap, 3), + } + + +def _tracking_metrics(layout: LayoutInstruction) -> dict[str, Any]: + points = sorted(layout.person_tracking, key=lambda p: p.t_sec) + jumps = [ + abs(float(points[idx].x_norm) - float(points[idx - 1].x_norm)) + for idx in range(1, len(points)) + ] + edge_count = sum(1 for p in points if p.x_norm < 0.16 or p.x_norm > 0.84) + return { + "tracking_sample_count": len(points), + "max_tracking_jump_norm": round(max(jumps) if jumps else 0.0, 4), + "edge_sample_count": edge_count, + } + + +def _bbox_from_mask(mask: np.ndarray) -> tuple[int, int, int, int] | None: + ys, xs = np.where(mask) + if len(xs) == 0 or len(ys) == 0: + return None + return int(xs.min()), int(ys.min()), int(xs.max()) + 1, int(ys.max()) + 1 + + +def _expand_bbox( + bbox: tuple[int, int, int, int], + *, + width: int, + height: int, + pad_x: int, + pad_y: int, +) -> tuple[int, int, int, int]: + x1, y1, x2, y2 = bbox + return ( + max(0, x1 - pad_x), + max(0, y1 - pad_y), + min(width, x2 + pad_x), + min(height, y2 + pad_y), + ) + + +def _bbox_area(bbox: tuple[int, int, int, int] | None) -> int: + if bbox is 
None: + return 0 + x1, y1, x2, y2 = bbox + return max(0, x2 - x1) * max(0, y2 - y1) + + +def _bbox_intersection_area( + first: tuple[int, int, int, int] | None, + second: tuple[int, int, int, int] | None, +) -> int: + if first is None or second is None: + return 0 + ax1, ay1, ax2, ay2 = first + bx1, by1, bx2, by2 = second + return _bbox_area((max(ax1, bx1), max(ay1, by1), min(ax2, bx2), min(ay2, by2))) + + +def _sample_final_frames( + video_path: Path, + frames_dir: Path, + *, + sample_count: int = _PIXEL_QA_SAMPLES, + width: int = _PIXEL_QA_W, +) -> list[tuple[float, Path]]: + duration = _probe_duration(video_path) or 0.0 + if duration <= 0.0: + return [] + frames_dir.mkdir(parents=True, exist_ok=True) + samples: list[tuple[float, Path]] = [] + for idx in range(max(1, sample_count)): + time_sec = duration * float(idx + 1) / float(sample_count + 1) + frame_path = frames_dir / f"frame_{idx + 1:03d}.jpg" + try: + _run( + [ + _ensure_ffmpeg(), + "-y", + "-loglevel", + "error", + "-ss", + f"{time_sec:.3f}", + "-i", + str(video_path), + "-frames:v", + "1", + "-vf", + f"scale={width}:-2", + str(frame_path), + ] + ) + except Exception as exc: # noqa: BLE001 - keep QA warning-based + logger.warning( + "Pixel QA frame sample failed for %s at %.2fs: %s", + video_path, + time_sec, + exc, + ) + continue + if frame_path.is_file(): + samples.append((time_sec, frame_path)) + return samples + + +def _caption_masks(arr: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + rgb = arr.astype(np.int16) + r = rgb[:, :, 0] + g = rgb[:, :, 1] + b = rgb[:, :, 2] + purple = ( + (r >= 85) + & (r <= 190) + & (g >= 35) + & (g <= 155) + & (b >= 145) + & ((b - r) >= 32) + & ((r - g) >= 8) + ) + white = (r >= 205) & (g >= 205) & (b >= 205) + return purple, white + + +def _frame_pixel_record(frame_path: Path, *, time_sec: float) -> dict[str, Any]: + image = Image.open(frame_path).convert("RGB") + arr = np.asarray(image) + height, width = arr.shape[:2] + brightness = float(arr.mean() / 255.0) + contrast = float(arr.std() / 255.0) + blank = brightness < 0.035 or contrast < 0.025 + + purple, white = _caption_masks(arr) + y_grid = np.arange(height)[:, None] + x_grid = np.arange(width)[None, :] + caption_region = y_grid >= int(round(height * _PIXEL_QA_CAPTION_MIN_Y_RATIO)) + purple = purple & caption_region + purple_bbox = _bbox_from_mask(purple) + caption_bbox = None + if purple_bbox is not None: + expanded = _expand_bbox( + purple_bbox, + width=width, + height=height, + pad_x=max(36, width // 8), + pad_y=max(14, height // 34), + ) + ex1, ey1, ex2, ey2 = expanded + nearby_white = ( + white + & (x_grid >= ex1) + & (x_grid <= ex2) + & (y_grid >= ey1) + & (y_grid <= ey2) + ) + caption_bbox = _bbox_from_mask(purple | nearby_white) + if caption_bbox is not None: + caption_bbox = _expand_bbox( + caption_bbox, + width=width, + height=height, + pad_x=4, + pad_y=4, + ) + + face_safe_zone = ( + int(round(width * 0.10)), + int(round(height * 0.06)), + int(round(width * 0.90)), + int(round(height * 0.52)), + ) + caption_area = _bbox_area(caption_bbox) + overlap_area = _bbox_intersection_area(caption_bbox, face_safe_zone) + overlap_ratio = overlap_area / max(1, caption_area) + edge_hit = False + edge_bbox = purple_bbox or caption_bbox + if edge_bbox is not None: + x1, y1, x2, y2 = edge_bbox + edge_margin_x = max(2, int(round(width * 0.015))) + edge_margin_y = max(2, int(round(height * 0.01))) + edge_hit = ( + x1 <= edge_margin_x + or x2 >= width - edge_margin_x + or y2 >= height - edge_margin_y + ) + + flags: list[str] = [] + if blank: + 
flags.append("blank_or_flat_frame") + if edge_hit: + flags.append("caption_edge_clip_check") + if caption_bbox is not None and overlap_ratio >= 0.18: + flags.append("caption_face_safe_zone_check") + + return { + "time_sec": round(time_sec, 3), + "frame_path": str(frame_path), + "brightness": round(brightness, 4), + "contrast": round(contrast, 4), + "caption_bbox": list(caption_bbox) if caption_bbox is not None else None, + "purple_bbox": list(purple_bbox) if purple_bbox is not None else None, + "face_safe_zone": list(face_safe_zone), + "caption_face_safe_zone_overlap": round(overlap_ratio, 4), + "flags": flags, + } + + +def _draw_bbox( + draw: ImageDraw.ImageDraw, + bbox: list[int] | tuple[int, int, int, int] | None, + *, + color: str, + width: int = 3, +) -> None: + if not bbox: + return + draw.rectangle(tuple(int(v) for v in bbox), outline=color, width=width) + + +def _write_pixel_qa_sheet(records: list[dict[str, Any]], output_path: Path) -> Path | None: + if not records: + return None + frames: list[Image.Image] = [] + for record in records: + frame_path = Path(str(record.get("frame_path", ""))) + if not frame_path.is_file(): + continue + img = Image.open(frame_path).convert("RGB") + draw = ImageDraw.Draw(img) + has_warning = bool(record.get("flags")) + _draw_bbox(draw, record.get("face_safe_zone"), color="#22c55e", width=2) + _draw_bbox(draw, record.get("caption_bbox"), color="#ef4444" if has_warning else "#a855f7") + label = f"{record.get('time_sec', 0):.1f}s" + if has_warning: + label += " " + ",".join(str(flag) for flag in record.get("flags", [])) + draw.rectangle((0, 0, img.width, 24), fill=(0, 0, 0)) + draw.text((6, 5), label, fill=(255, 255, 255)) + frames.append(img) + if not frames: + return None + + columns = min(4, len(frames)) + rows = int(math.ceil(len(frames) / columns)) + tile_w = max(frame.width for frame in frames) + tile_h = max(frame.height for frame in frames) + sheet = Image.new("RGB", (columns * tile_w, rows * tile_h), (12, 12, 12)) + for idx, frame in enumerate(frames): + x = (idx % columns) * tile_w + y = (idx // columns) * tile_h + sheet.paste(frame, (x, y)) + output_path.parent.mkdir(parents=True, exist_ok=True) + sheet.save(output_path, quality=92) + return output_path + + +def analyze_rendered_pixels(video_path: Path, qa_dir: Path, *, clip_id: str) -> dict[str, Any]: + """Sample rendered frames and run simple pixel-level QA checks.""" + + frames_dir = qa_dir / f"short_{clip_id}_pixel_frames" + records: list[dict[str, Any]] = [] + sheet: Path | None = None + try: + samples = _sample_final_frames(video_path, frames_dir) + for time_sec, frame_path in samples: + records.append(_frame_pixel_record(frame_path, time_sec=time_sec)) + sheet = _write_pixel_qa_sheet(records, qa_dir / f"short_{clip_id}_pixel_qa.jpg") + finally: + shutil.rmtree(frames_dir, ignore_errors=True) + + sample_count = len(records) + caption_seen = sum(1 for record in records if record.get("caption_bbox") is not None) + blank_count = sum(1 for record in records if "blank_or_flat_frame" in record.get("flags", [])) + edge_hits = sum(1 for record in records if "caption_edge_clip_check" in record.get("flags", [])) + safe_zone_hits = sum( + 1 for record in records if "caption_face_safe_zone_check" in record.get("flags", []) + ) + min_contrast = min((float(record.get("contrast", 0.0)) for record in records), default=0.0) + mean_brightness = ( + sum(float(record.get("brightness", 0.0)) for record in records) / sample_count + if sample_count + else 0.0 + ) + + score = 1.0 + if sample_count == 0: + score 
= 0.0 + else: + missing_ratio = max(0.0, (max(2, sample_count // 4) - caption_seen) / max(1, sample_count)) + score -= (blank_count / sample_count) * 0.55 + score -= (edge_hits / sample_count) * 0.28 + score -= (safe_zone_hits / sample_count) * 0.35 + score -= missing_ratio * 0.20 + score = _clamp(score) + + flags: list[str] = [] + if sample_count == 0: + flags.append("pixel_qa_no_samples") + if blank_count: + flags.append("blank_or_flat_frame") + if edge_hits: + flags.append("caption_edge_clip_check") + if safe_zone_hits: + flags.append("caption_face_safe_zone_check") + if sample_count and caption_seen < max(2, sample_count // 4): + flags.append("caption_pixels_sparse_check") + + return { + "pixel_score": round(score, 3), + "flags": flags, + "sample_count": sample_count, + "caption_seen_frames": caption_seen, + "blank_frame_count": blank_count, + "caption_edge_hit_count": edge_hits, + "caption_face_safe_zone_hit_count": safe_zone_hits, + "mean_brightness": round(mean_brightness, 4), + "min_contrast": round(min_contrast, 4), + "annotated_sheet": str(sheet) if sheet is not None else None, + "frames": [ + { + "time_sec": record["time_sec"], + "caption_bbox": record["caption_bbox"], + "flags": record["flags"], + } + for record in records + ], + } + + +def score_short( + output_video: Path, + *, + clip: Clip, + transcript: dict, + layout: LayoutInstruction, +) -> dict[str, Any]: + """Return lightweight, deterministic QA scores for one rendered short.""" + + words = clip_subtitle_words(transcript, clip).words + word_metrics = _word_timing_metrics(words) + tracking = _tracking_metrics(layout) + width_height = _probe_size(output_video) + duration = _probe_duration(output_video) + + word_count = max(1, int(word_metrics["word_count"])) + caption_score = 1.0 + caption_score -= (word_metrics["invalid_count"] / word_count) * 0.55 + caption_score -= (word_metrics["very_short_count"] / word_count) * 0.22 + caption_score -= (word_metrics["very_long_count"] / word_count) * 0.20 + caption_score -= (word_metrics["overlap_count"] / word_count) * 0.28 + if word_metrics["word_count"] == 0: + caption_score = 0.25 + caption_score = _clamp(caption_score) + + sample_count = max(1, int(tracking["tracking_sample_count"])) + max_jump = float(tracking["max_tracking_jump_norm"]) + speaker_score = 1.0 + speaker_score -= (int(tracking["edge_sample_count"]) / sample_count) * 0.35 + speaker_score -= max(0.0, max_jump - 0.18) * 1.4 + if layout.layout not in (LayoutKind.SIT_CENTER, LayoutKind.ZOOM_CALL_CENTER): + speaker_score = max(0.82, speaker_score) + speaker_score = _clamp(speaker_score) + + crop_jump_score = _clamp(1.0 - max(0.0, max_jump - 0.12) * 2.1) + video_score = 1.0 + if width_height != (1080, 1920): + video_score -= 0.18 + if duration is None or duration <= 0.0: + video_score -= 0.35 + video_score = _clamp(video_score) + + overall = ( + caption_score * 0.35 + + speaker_score * 0.30 + + crop_jump_score * 0.20 + + video_score * 0.15 + ) + flags: list[str] = [] + if caption_score < 0.82: + flags.append("caption_timing_check") + if speaker_score < 0.82: + flags.append("speaker_centering_check") + if crop_jump_score < 0.82: + flags.append("crop_jump_check") + if video_score < 0.9: + flags.append("video_probe_check") + + return { + "overall_score": round(overall, 3), + "caption_score": round(caption_score, 3), + "speaker_centering_score": round(speaker_score, 3), + "crop_jump_score": round(crop_jump_score, 3), + "video_score": round(video_score, 3), + "flags": flags, + "video": { + "duration_sec": round(duration, 3) 
if duration is not None else None, + "size": list(width_height) if width_height else None, + }, + "word_timing": word_metrics, + "tracking": tracking, + } + + +def _clip_id_from_output(path: Path) -> str: + match = re.search(r"short_([^\\/]+?)\.mp4$", path.name, flags=re.IGNORECASE) + return match.group(1) if match else path.stem + + +def qa_record_flags(record: dict[str, Any]) -> list[str]: + flags: list[str] = [] + score = record.get("score") + if isinstance(score, dict): + flags.extend(str(flag) for flag in score.get("flags", []) if str(flag)) + pixel_qa = record.get("pixel_qa") + if isinstance(pixel_qa, dict): + flags.extend(str(flag) for flag in pixel_qa.get("flags", []) if str(flag)) + if record.get("errors"): + flags.append("qa_error") + return list(dict.fromkeys(flags)) + + +def qa_summary_lines(manifest_path: Path) -> list[str]: + if not manifest_path.is_file(): + return [] + try: + payload = json.loads(manifest_path.read_text(encoding="utf-8")) + except Exception: + return [] + records = payload.get("shorts", []) + if not isinstance(records, list): + return [] + lines: list[str] = [] + for record in records: + if not isinstance(record, dict): + continue + clip_id = str(record.get("clip_id", "")).strip() + if not clip_id: + continue + flags = qa_record_flags(record) + status = "WARN " + ", ".join(flags) if flags else "OK" + lines.append(f"short_{clip_id} {status}") + return lines + + +def run_render_qa( + *, + output_dir: Path, + final_outputs: list[Path], + render_clips_by_id: dict[str, Clip], + transcripts_by_id: dict[str, dict], + layouts_by_id: dict[str, LayoutInstruction], + assembled_sources_by_id: dict[str, Path], + raw_layouts_by_id: dict[str, dict[str, Any]] | None = None, + reference_video: Path | None = None, + debug_overlay: bool = True, +) -> Path: + """Create QA artifacts for all rendered shorts and return the manifest path.""" + + qa_dir = output_dir / "render_qa" + qa_dir.mkdir(parents=True, exist_ok=True) + raw_layouts_by_id = raw_layouts_by_id or {} + manifest_path = qa_dir / "qa_manifest.json" + records_by_id: dict[str, dict[str, Any]] = {} + if manifest_path.is_file(): + try: + existing = json.loads(manifest_path.read_text(encoding="utf-8")) + for item in existing.get("shorts", []): + if isinstance(item, dict) and item.get("clip_id"): + records_by_id[str(item["clip_id"])] = item + except Exception as exc: # noqa: BLE001 - stale QA should not block updates + logger.warning("Ignoring stale QA manifest at %s: %s", manifest_path, exc) + + for video_path in final_outputs: + clip_id = _clip_id_from_output(video_path) + clip = render_clips_by_id.get(clip_id) + transcript = transcripts_by_id.get(clip_id) + layout = layouts_by_id.get(clip_id) + record: dict[str, Any] = { + "clip_id": clip_id, + "output": str(video_path), + "artifacts": {}, + "errors": [], + } + + try: + sheet = create_contact_sheet(video_path, qa_dir / f"short_{clip_id}_contact.jpg") + record["artifacts"]["contact_sheet"] = str(sheet) + except Exception as exc: # noqa: BLE001 - QA must not fail the render + record["errors"].append(f"contact_sheet: {exc}") + logger.warning("Render QA contact sheet failed for %s: %s", clip_id, exc) + + if reference_video is not None and reference_video.is_file(): + try: + compare = create_ab_compare( + reference_video, + video_path, + qa_dir / f"short_{clip_id}_ab_compare.jpg", + ) + record["artifacts"]["ab_compare"] = str(compare) + except Exception as exc: # noqa: BLE001 + record["errors"].append(f"ab_compare: {exc}") + logger.warning("Render QA A/B compare failed for 
%s: %s", clip_id, exc) + + if debug_overlay and clip is not None and layout is not None: + source = assembled_sources_by_id.get(clip_id) + if source is not None and source.is_file(): + try: + debug = create_crop_debug_overlay( + source, + qa_dir / f"short_{clip_id}_crop_debug.mp4", + clip=clip, + layout=layout, + raw_layout=raw_layouts_by_id.get(clip_id), + ) + record["artifacts"]["crop_debug_overlay"] = str(debug) + except Exception as exc: # noqa: BLE001 + record["errors"].append(f"crop_debug_overlay: {exc}") + logger.warning("Render QA crop debug failed for %s: %s", clip_id, exc) + + try: + pixel_qa = analyze_rendered_pixels(video_path, qa_dir, clip_id=clip_id) + record["pixel_qa"] = pixel_qa + if pixel_qa.get("annotated_sheet"): + record["artifacts"]["pixel_qa_sheet"] = pixel_qa["annotated_sheet"] + except Exception as exc: # noqa: BLE001 + record["errors"].append(f"pixel_qa: {exc}") + logger.warning("Render QA pixel checks failed for %s: %s", clip_id, exc) + pixel_qa = None + + if clip is not None and transcript is not None and layout is not None: + score = score_short( + video_path, + clip=clip, + transcript=transcript, + layout=layout, + ) + if isinstance(pixel_qa, dict): + pixel_score = float(pixel_qa.get("pixel_score", 0.0)) + score["pixel_score"] = round(pixel_score, 3) + merged_flags = list( + dict.fromkeys(score.get("flags", []) + pixel_qa.get("flags", [])) + ) + score["flags"] = merged_flags + score["overall_score"] = round( + _clamp(float(score["overall_score"]) * 0.80 + pixel_score * 0.20), + 3, + ) + record["score"] = score + else: + record["errors"].append("score: missing clip, transcript, or layout") + + records_by_id[clip_id] = record + + manifest: dict[str, Any] = { + "shorts": [records_by_id[key] for key in sorted(records_by_id)] + } + manifest_path.write_text( + json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + logger.info("Render QA manifest written: %s", manifest_path) + logger.info("Render QA summary:") + for line in qa_summary_lines(manifest_path): + logger.info(" %s", line) + return manifest_path + + +def _main() -> None: + parser = argparse.ArgumentParser(description="Create a reference/output A/B contact sheet.") + parser.add_argument("--reference", type=Path, required=True, help="Reference video path.") + parser.add_argument( + "--output-video", + type=Path, + required=True, + help="Rendered output video path.", + ) + parser.add_argument("--out", type=Path, required=True, help="Compare image output path.") + parser.add_argument("--fps", type=float, default=4.0, help="Contact-sheet sample FPS.") + args = parser.parse_args() + create_ab_compare(args.reference, args.output_video, args.out, fps=args.fps) + print(args.out) + + +if __name__ == "__main__": + _main() diff --git a/src/humeo/render_window.py b/src/humeo/render_window.py new file mode 100644 index 0000000000000000000000000000000000000000..461cd06fb243dc4d621e743a4f1dff1cbb239993 --- /dev/null +++ b/src/humeo/render_window.py @@ -0,0 +1,48 @@ +"""Map LLM clip timing (segment + trim + hook) to one ffmpeg source window. + +``humeo_core.primitives.compile`` already cuts with ``-ss`` / ``-t`` from ``Clip``; +this module is the single place that turns trim/hook fields into concrete bounds. +""" + +from __future__ import annotations + +from humeo_core.schemas import Clip + + +def effective_export_bounds(clip: Clip) -> tuple[float, float]: + """Return ``(start_sec, end_sec)`` on the source timeline for the exported short. + + 1. 
**Trim** narrows ``[start_time_sec, end_time_sec]``.
+    2. ``render_spans``, when present, override the contiguous trimmed window.
+    3. Hook fields remain metadata and do not change the export window.
+    """
+    if clip.render_spans:
+        return clip.render_spans[0].start_time_sec, clip.render_spans[-1].end_time_sec
+
+    s0 = clip.start_time_sec
+    s1 = clip.end_time_sec
+
+    t_lo = s0 + clip.trim_start_sec
+    t_hi = s1 - clip.trim_end_sec
+    if t_hi <= t_lo:
+        t_lo, t_hi = s0, s1
+
+    return t_lo, t_hi
+
+
+def clip_for_render(clip: Clip) -> Clip:
+    """Copy with ``start``/``end`` set to the actual cut; trim/hook cleared."""
+    t0, t1 = effective_export_bounds(clip)
+    return clip.model_copy(
+        update={
+            "start_time_sec": t0,
+            "end_time_sec": t1,
+            "trim_start_sec": 0.0,
+            "trim_end_sec": 0.0,
+            "hook_start_sec": None,
+            "hook_end_sec": None,
+        }
+    )
diff --git a/src/humeo/session_state.py b/src/humeo/session_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6abeb0d4cbac5f90d2481316e0bdd91c3a2bd92
--- /dev/null
+++ b/src/humeo/session_state.py
@@ -0,0 +1,67 @@
+"""Persist and load interactive session state."""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+from humeo.video_cache import extract_youtube_video_id, normalize_local_source_path
+from humeo_core.schemas import SessionState
+
+logger = logging.getLogger(__name__)
+
+SESSION_STATE_FILENAME = "session_state.json"
+
+
+def source_key_for_url(youtube_url: str) -> str:
+    video_id = extract_youtube_video_id(youtube_url)
+    if video_id:
+        return f"youtube:{video_id}"
+    local_path = normalize_local_source_path(youtube_url)
+    if local_path is not None:
+        return f"local:{local_path}"
+    return f"url:{youtube_url}"
+
+
+def fresh_state(youtube_url: str) -> SessionState:
+    return SessionState(source_key=source_key_for_url(youtube_url))
+
+
+def load_state(work_dir: Path, youtube_url: str) -> SessionState:
+    """Load session state, resetting on corruption or source mismatch."""
+    path = work_dir / SESSION_STATE_FILENAME
+    state = fresh_state(youtube_url)
+    if not path.is_file():
+        return state
+
+    try:
+        with open(path, encoding="utf-8") as f:
+            payload = json.load(f)
+        loaded = SessionState.model_validate(payload)
+    except Exception as exc:
+        logger.warning("Session state at %s is invalid. Starting fresh. Error: %s", path, exc)
+        return state
+
+    if loaded.source_key and loaded.source_key != state.source_key:
+        logger.warning(
+            "Session state at %s belongs to %s, not %s. 
Starting fresh.", + path, + loaded.source_key, + state.source_key, + ) + return state + + if not loaded.source_key: + loaded.source_key = state.source_key + return loaded + + +def save_state(work_dir: Path, state: SessionState) -> Path: + """Persist session state to the work dir.""" + work_dir.mkdir(parents=True, exist_ok=True) + path = work_dir / SESSION_STATE_FILENAME + with open(path, "w", encoding="utf-8") as f: + f.write(state.model_dump_json(indent=2)) + f.write("\n") + return path diff --git a/src/humeo/transcript_align.py b/src/humeo/transcript_align.py new file mode 100644 index 0000000000000000000000000000000000000000..da1cb690a684c60277fb9207ee2b76b9f77fbc8b --- /dev/null +++ b/src/humeo/transcript_align.py @@ -0,0 +1,290 @@ +"""Map source-timeline ASR words to per-clip subtitle timings (t=0 at clip in-point).""" + +from __future__ import annotations + +from humeo_core.schemas import Clip, ClipSubtitleWords, RenderTheme, TranscriptWord + +# Whisper / WhisperX / OpenAI-normalized segment shapes +_MAX_WORDS_PER_CUE = 8 +_MAX_CUE_SEC = 4.0 +_PUNCTUATION_BREAK_CHARS = (".", "?", "!", ";", ":") +_SENTENCE_RESTART_WORDS = frozenset( + { + "And", + "But", + "Did", + "Now", + "So", + "That", + "Then", + "This", + "Those", + "What", + "When", + "Where", + "Why", + } +) + + +def _iter_words_from_segments(transcript: dict) -> list[TranscriptWord]: + out: list[TranscriptWord] = [] + for seg in transcript.get("segments", []) or []: + words = seg.get("words") or [] + if words: + for raw in words: + w = str(raw.get("word", "")).strip() + if not w: + continue + out.append( + TranscriptWord( + word=w, + start_time=float(raw["start"]), + end_time=float(raw["end"]), + ) + ) + continue + # Segment-level only (no word list): treat whole segment as one token + text = str(seg.get("text", "")).strip() + if text: + out.append( + TranscriptWord( + word=text, + start_time=float(seg.get("start", 0.0)), + end_time=float(seg.get("end", 0.0)), + ) + ) + return out + + +def clip_subtitle_words(transcript: dict, clip: Clip) -> ClipSubtitleWords: + """Words overlapping ``clip`` with times shifted to start at 0 (clip-local).""" + clip_start = clip.start_time_sec + clip_end = clip.end_time_sec + words = _iter_words_from_segments(transcript) + local: list[TranscriptWord] = [] + for w in words: + if w.end_time <= clip_start or w.start_time >= clip_end: + continue + t0 = max(w.start_time, clip_start) - clip_start + t1 = min(w.end_time, clip_end) - clip_start + if t1 <= t0: + continue + local.append(TranscriptWord(word=w.word, start_time=t0, end_time=t1)) + + if local: + return ClipSubtitleWords(words=local) + + return ClipSubtitleWords(words=_fallback_even_words(clip)) + + +def _fallback_even_words(clip: Clip) -> list[TranscriptWord]: + """Even split over clip duration when no word timestamps exist.""" + text = (clip.transcript or "").strip() + if not text: + return [] + parts = text.split() + if not parts: + return [] + d = clip.duration_sec + step = d / len(parts) + out: list[TranscriptWord] = [] + for i, p in enumerate(parts): + out.append( + TranscriptWord( + word=p, + start_time=i * step, + end_time=(i + 1) * step if i < len(parts) - 1 else d, + ) + ) + return out + + +def _looks_like_sentence_restart(prev_word: str, next_word: str) -> bool: + prev = prev_word.rstrip("\"')]}") + nxt = next_word.lstrip("\"'([{") + if not prev or not nxt: + return False + if nxt in _SENTENCE_RESTART_WORDS: + return True + return any(ch.isdigit() for ch in prev) and nxt[0].isupper() + + +def clip_words_to_srt_lines( + words: 
list[TranscriptWord], + *, + max_words_per_cue: int = _MAX_WORDS_PER_CUE, + max_cue_sec: float = _MAX_CUE_SEC, + prefer_break_on_punctuation: bool = False, + min_words_before_break: int = 1, +) -> list[tuple[float, float, str]]: + """Group words into SRT cues: max N words and max duration per cue.""" + chunks = group_words_to_cue_chunks( + words, + max_words_per_cue=max_words_per_cue, + max_cue_sec=max_cue_sec, + prefer_break_on_punctuation=prefer_break_on_punctuation, + min_words_before_break=min_words_before_break, + ) + return [ + (chunk[0].start_time, chunk[-1].end_time, " ".join(w.word for w in chunk)) + for chunk in chunks + ] + + +def group_words_to_cue_chunks( + words: list[TranscriptWord], + *, + max_words_per_cue: int = _MAX_WORDS_PER_CUE, + max_cue_sec: float = _MAX_CUE_SEC, + prefer_break_on_punctuation: bool = False, + min_words_before_break: int = 1, +) -> list[list[TranscriptWord]]: + """Group words into timed cue chunks while preserving per-word timings.""" + if not words: + return [] + max_words_per_cue = max(1, int(max_words_per_cue)) + max_cue_sec = max(0.2, float(max_cue_sec)) + min_words_before_break = max(1, int(min_words_before_break)) + chunks_out: list[list[TranscriptWord]] = [] + i = 0 + n = len(words) + while i < n: + chunk: list[TranscriptWord] = [words[i]] + t0 = words[i].start_time + end_t = words[i].end_time + j = i + 1 + while j < n: + w = words[j] + if len(chunk) >= max_words_per_cue: + break + if w.start_time - t0 > max_cue_sec: + break + if ( + prefer_break_on_punctuation + and (len(chunk) >= 2 or end_t - t0 >= 0.45) + and _looks_like_sentence_restart(chunk[-1].word, w.word) + ): + break + chunk.append(w) + end_t = w.end_time + j += 1 + if ( + prefer_break_on_punctuation + and len(chunk) >= min_words_before_break + and chunk[-1].word.rstrip("\"')]}").endswith(_PUNCTUATION_BREAK_CHARS) + ): + break + chunks_out.append(chunk) + i = j + return chunks_out + + +def format_srt(lines: list[tuple[float, float, str]]) -> str: + blocks: list[str] = [] + for idx, (start, end, text) in enumerate(lines, start=1): + blocks.append( + f"{idx}\n{_fmt_time(start)} --> {_fmt_time(end)}\n{text}\n" + ) + return "\n".join(blocks) + + +def _fmt_time(seconds: float) -> str: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + millis = int(round((seconds % 1) * 1000)) + if millis >= 1000: + millis = 999 + return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" + + +# --------------------------------------------------------------------------- +# ASS / SubStation Alpha output (the format libass natively renders) +# --------------------------------------------------------------------------- + + +def _fmt_ass_time(seconds: float) -> str: + """ASS time format: ``H:MM:SS.cs`` (centiseconds).""" + seconds = max(0.0, seconds) + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = seconds % 60 + whole = int(secs) + cs = int(round((secs - whole) * 100)) + if cs >= 100: + cs = 99 + return f"{hours:d}:{minutes:02d}:{whole:02d}.{cs:02d}" + + +def _escape_ass_text(text: str) -> str: + """Escape characters that are significant to the ASS dialogue parser.""" + return ( + text.replace("\\", r"\\") + .replace("{", r"\{") + .replace("}", r"\}") + .replace("\n", r"\N") + ) + + +def format_ass( + lines: list[tuple[float, float, str]], + *, + play_res_x: int, + play_res_y: int, + font_size: int, + margin_v: int, + margin_h: int = 60, + font_name: str = "Arial", + render_theme: RenderTheme = RenderTheme.LEGACY, +) -> str: + 
"""Render captions as an ASS script whose PlayRes matches the output video. + + Why this exists: libass' font/margin scaling multiplies every pixel-ish + value by ``video_height / PlayResY``. The default ``PlayResY=288`` blew + ``FontSize=48`` up to ~320 output pixels and pushed ``MarginV`` to the + middle of the frame. Pinning ``PlayResY`` to the actual output height + makes that scale factor exactly 1.0, so ``font_size`` and ``margin_v`` + below are honest output pixel values. + """ + + if render_theme == RenderTheme.REFERENCE_LOWER_THIRD: + style_line = ( + f"Style: Default,{font_name},{font_size},&H00FFFFFF,&H000000FF," + "&H00000000,&H00000000,-1,0,0,0,100,100,-1,0,1,3,0,2," + f"{margin_h},{margin_h},{margin_v},0\n" + ) + else: + style_line = ( + f"Style: Default,{font_name},{font_size},&H00FFFFFF,&H000000FF," + f"&H00000000,&H70000000,-1,0,0,0,100,100,0,0,4,0,0,2," + f"{margin_h},{margin_h},{margin_v},0\n" + ) + + header = ( + "[Script Info]\n" + "ScriptType: v4.00+\n" + f"PlayResX: {play_res_x}\n" + f"PlayResY: {play_res_y}\n" + "WrapStyle: 0\n" + "ScaledBorderAndShadow: yes\n" + "YCbCr Matrix: None\n" + "\n" + "[V4+ Styles]\n" + "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, " + "OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, " + "ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, " + "Alignment, MarginL, MarginR, MarginV, Encoding\n" + + style_line + + "\n" + "[Events]\n" + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n" + ) + + events = [] + for start, end, text in lines: + events.append( + f"Dialogue: 0,{_fmt_ass_time(start)},{_fmt_ass_time(end)},Default,," + f"0,0,0,,{_escape_ass_text(text)}" + ) + return header + "\n".join(events) + ("\n" if events else "") diff --git a/src/humeo/video_cache.py b/src/humeo/video_cache.py new file mode 100644 index 0000000000000000000000000000000000000000..7de242b349afb0410c740805d5381dbb32a9dfb1 --- /dev/null +++ b/src/humeo/video_cache.py @@ -0,0 +1,231 @@ +"""Video ingest cache: YouTube id β†’ work directory + manifest on disk.""" + +from __future__ import annotations + +import hashlib +import json +import logging +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from pydantic import BaseModel, Field + +from humeo.env import default_humeo_cache_root + +logger = logging.getLogger(__name__) + +# Typical watch / short / embed URLs (11-char id). 
+_YOUTUBE_ID_RE = re.compile( + r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/)([a-zA-Z0-9_-]{11})" +) + +MANIFEST_VERSION = 1 +MANIFEST_NAME = "video_cache_manifest.json" +LOCAL_SOURCE_INFO_NAME = "source.local.json" + + +class VideoCacheEntry(BaseModel): + """One row in the global cache manifest (machine-checkable, Pydantic-only).""" + + video_id: str + url: str = "" + title: str = "" + channel: str = "" + work_dir: str + source_mp4: str + transcript_json: str + downloaded_at: str = "" # ISO 8601 UTC when ingest completed + + +class VideoCacheManifest(BaseModel): + version: int = MANIFEST_VERSION + entries: dict[str, VideoCacheEntry] = Field(default_factory=dict) + + +def extract_youtube_video_id(url: str) -> str | None: + """Return the 11-character video id, or None if not a recognized YouTube URL.""" + m = _YOUTUBE_ID_RE.search(url) + return m.group(1) if m else None + + +def looks_like_local_source(source: str) -> bool: + """Return True when ``source`` should be treated as a local file path.""" + if extract_youtube_video_id(source): + return False + return "://" not in source + + +def normalize_local_source_path(source: str) -> Path | None: + """Return an absolute local path for ``source`` when it is file-like.""" + if not looks_like_local_source(source): + return None + return Path(source).expanduser().resolve(strict=False) + + +def local_source_cache_key(source: str) -> str | None: + """Return a stable cache key for a local source path.""" + path = normalize_local_source_path(source) + if path is None: + return None + stem = re.sub(r"[^a-zA-Z0-9]+", "-", path.stem).strip("-").lower() or "video" + digest = hashlib.sha256(str(path).encode("utf-8")).hexdigest()[:16] + return f"{stem}-{digest}" + + +def _local_source_info_path(work_dir: Path) -> Path: + return work_dir / LOCAL_SOURCE_INFO_NAME + + +def read_local_source_info(work_dir: Path) -> dict[str, str]: + """Read ``source.local.json`` when present.""" + path = _local_source_info_path(work_dir) + if not path.is_file(): + return {} + with open(path, encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict): + return {} + return {str(k): str(v) for k, v in data.items()} + + +def write_local_source_info(work_dir: Path, source_path: Path) -> Path: + """Persist the original local source path used for ``source.mp4``.""" + work_dir.mkdir(parents=True, exist_ok=True) + path = _local_source_info_path(work_dir) + payload = {"local_source_path": str(Path(source_path).expanduser().resolve(strict=False))} + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2) + f.write("\n") + return path + + +def local_source_matches(work_dir: Path, source: str) -> bool: + """Return True when ``work_dir`` already contains the same local source.""" + path = normalize_local_source_path(source) + if path is None: + return False + info = read_local_source_info(work_dir) + return info.get("local_source_path") == str(path) + + +def manifest_path(cache_root: Path | None = None) -> Path: + root = cache_root if cache_root is not None else default_humeo_cache_root() + root.mkdir(parents=True, exist_ok=True) + return root / MANIFEST_NAME + + +def load_manifest(cache_root: Path | None = None) -> VideoCacheManifest: + path = manifest_path(cache_root) + if not path.exists(): + return VideoCacheManifest() + with open(path, encoding="utf-8") as f: + data: Any = json.load(f) + return VideoCacheManifest.model_validate(data) + + +def save_manifest(manifest: VideoCacheManifest, cache_root: Path | None = 
None) -> Path: + path = manifest_path(cache_root) + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + f.write(manifest.model_dump_json(indent=2)) + return path + + +def resolve_work_directory( + *, + youtube_url: str, + explicit_work_dir: Path | None, + use_video_cache: bool, + cache_root: Path | None, +) -> Path: + """Pick the directory for ``source.mp4``, ``transcript.json``, ``clips.json``, etc. + + - If ``explicit_work_dir`` is set (CLI ``--work-dir``), use it. + - Else if video cache is disabled, use ``.humeo_work``. + - Else if the source is a local file path, use ``<cache_root>/local/<local_key>/``. + - Else if the source has no YouTube id, use ``.humeo_work``. + - Else use ``<cache_root>/videos/<video_id>/`` (creates parents as needed). + """ + if explicit_work_dir is not None: + p = Path(explicit_work_dir).resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + if not use_video_cache: + p = Path(".humeo_work").resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + local_key = local_source_cache_key(youtube_url) + if local_key: + root = cache_root if cache_root is not None else default_humeo_cache_root() + p = (root / "local" / local_key).resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + vid = extract_youtube_video_id(youtube_url) + if not vid: + p = Path(".humeo_work").resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + root = cache_root if cache_root is not None else default_humeo_cache_root() + p = (root / "videos" / vid).resolve() + p.mkdir(parents=True, exist_ok=True) + return p + + +def ingest_complete(work_dir: Path, source: str | None = None) -> bool: + """Return True if both video and transcript exist and match the current source.""" + complete = (work_dir / "source.mp4").is_file() and (work_dir / "transcript.json").is_file() + if not complete: + return False + if source is None: + return True + local_path = normalize_local_source_path(source) + if local_path is None: + return True + return local_source_matches(work_dir, source) + + +def read_youtube_info_json(work_dir: Path) -> dict[str, Any]: + """Read ``source.info.json`` written by yt-dlp ``--write-info-json``.""" + p = work_dir / "source.info.json" + if not p.is_file(): + return {} + with open(p, encoding="utf-8") as f: + return json.load(f) + + +def upsert_manifest_from_info( + *, + work_dir: Path, + youtube_url: str, + info: dict[str, Any], + cache_root: Path | None = None, +) -> None: + """Merge or add a manifest entry after successful ingest.""" + vid = (info.get("id") or extract_youtube_video_id(youtube_url) or "").strip() + if not vid: + logger.debug("No video id for manifest; skipping.") + return + + now = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + wd = work_dir.resolve() + entry = VideoCacheEntry( + video_id=vid, + url=str(info.get("webpage_url") or youtube_url), + title=str(info.get("title") or ""), + channel=str(info.get("channel") or info.get("uploader") or ""), + work_dir=str(wd), + source_mp4=str((wd / "source.mp4").resolve()), + transcript_json=str((wd / "transcript.json").resolve()), + downloaded_at=now, + ) + + manifest = load_manifest(cache_root) + manifest.entries[vid] = entry + path = save_manifest(manifest, cache_root) + logger.info("Updated video cache manifest: %s", path)
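The export-window rule in `src/humeo/render_window.py` above is easiest to verify with concrete numbers. Below is a minimal standalone sketch of the same precedence (`render_spans` first, then trim, then untrimmed fallback); `MiniClip` is a simplified stand-in for illustration, not the real `humeo_core.schemas.Clip`, which has more fields.

```python
from dataclasses import dataclass, field


@dataclass
class Span:
    start_time_sec: float
    end_time_sec: float


@dataclass
class MiniClip:
    # Simplified stand-in; field names mirror the diff's Clip schema.
    start_time_sec: float
    end_time_sec: float
    trim_start_sec: float = 0.0
    trim_end_sec: float = 0.0
    render_spans: list[Span] = field(default_factory=list)


def export_bounds(clip: MiniClip) -> tuple[float, float]:
    # render_spans, when present, win outright.
    if clip.render_spans:
        return clip.render_spans[0].start_time_sec, clip.render_spans[-1].end_time_sec
    lo = clip.start_time_sec + clip.trim_start_sec
    hi = clip.end_time_sec - clip.trim_end_sec
    # A trim that inverts the window falls back to the untrimmed segment.
    if hi <= lo:
        return clip.start_time_sec, clip.end_time_sec
    return lo, hi


print(export_bounds(MiniClip(100.0, 160.0, trim_start_sec=3.0, trim_end_sec=5.0)))    # (103.0, 155.0)
print(export_bounds(MiniClip(100.0, 160.0, trim_start_sec=40.0, trim_end_sec=40.0)))  # (100.0, 160.0)
```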
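`src/humeo/transcript_align.py` does two jobs in sequence: shift source-timeline word timings so the clip in-point becomes t=0, then pack words into cues of at most 8 words and 4 seconds. A rough sketch of both steps on plain tuples, ignoring the module's Pydantic types and its punctuation/sentence-restart heuristics:

```python
Word = tuple[str, float, float]  # (text, start_sec, end_sec) on the source timeline


def clip_local_words(words: list[Word], clip_start: float, clip_end: float) -> list[Word]:
    """Keep words overlapping the clip and shift them so the clip starts at t=0."""
    out: list[Word] = []
    for text, s, e in words:
        if e <= clip_start or s >= clip_end:
            continue
        out.append((text, max(s, clip_start) - clip_start, min(e, clip_end) - clip_start))
    return out


def group_cues(words: list[Word], max_words: int = 8, max_sec: float = 4.0) -> list[list[Word]]:
    """Greedy packing: start a new cue when the word or duration budget is exceeded."""
    cues: list[list[Word]] = []
    for w in words:
        # Duration is measured from the first word's start in the current cue.
        if cues and len(cues[-1]) < max_words and w[1] - cues[-1][0][1] <= max_sec:
            cues[-1].append(w)
        else:
            cues.append([w])
    return cues


src = [("Welcome", 120.2, 120.6), ("back", 120.6, 120.9), ("everyone", 121.0, 121.5)]
local = clip_local_words(src, clip_start=120.0, clip_end=150.0)
print(group_cues(local))  # one cue with all three words, times shifted by 120 s
```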
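The scaling argument in the `format_ass` docstring can be checked directly: libass multiplies pixel-style values by `output_height / PlayResY`. A few lines of arithmetic, assuming a 1080x1920 vertical render (the output size is an assumption here, not stated in this diff):

```python
def libass_scaled(px_value: float, play_res_y: int, output_height: int) -> float:
    # libass scales pixel-ish style values by output_height / PlayResY.
    return px_value * output_height / play_res_y


# Default PlayResY=288 inflates FontSize=48 to ~320 output pixels on a 1920-tall frame.
print(libass_scaled(48, play_res_y=288, output_height=1920))   # 320.0
# Pinning PlayResY to the output height makes the factor exactly 1.0.
print(libass_scaled(48, play_res_y=1920, output_height=1920))  # 48.0
```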
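`src/humeo/video_cache.py` keys local sources by a slugified filename stem plus a truncated SHA-256 of the absolute path, and YouTube sources by their 11-character id. A standalone sketch of that derivation; unlike the real `looks_like_local_source`, this simplified helper does not reject other `://` URLs, and the filenames are illustrative:

```python
import hashlib
import re
from pathlib import Path

# Same watch/short/embed patterns as the diff's _YOUTUBE_ID_RE (11-char ids).
_YT = re.compile(
    r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/)([a-zA-Z0-9_-]{11})"
)


def cache_key(source: str) -> str:
    """YouTube id when recognized, else slugified stem plus 16 hex chars of SHA-256(path)."""
    m = _YT.search(source)
    if m:
        return m.group(1)
    path = Path(source).expanduser().resolve(strict=False)
    stem = re.sub(r"[^a-zA-Z0-9]+", "-", path.stem).strip("-").lower() or "video"
    return f"{stem}-{hashlib.sha256(str(path).encode('utf-8')).hexdigest()[:16]}"


print(cache_key("https://youtu.be/dQw4w9WgXcQ"))  # dQw4w9WgXcQ
print(cache_key("My Podcast EP12.mp4"))           # my-podcast-ep12-<16 hex chars>
```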
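A usage example of the work-directory resolution defined above, assuming the repo is installed so `humeo.video_cache` is importable; the cache root and video id are illustrative, and the resolved directory is created as a side effect:

```python
from pathlib import Path

from humeo.video_cache import extract_youtube_video_id, resolve_work_directory

url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
print(extract_youtube_video_id(url))  # dQw4w9WgXcQ

# With the cache enabled and no --work-dir, a YouTube source lands in
# <cache_root>/videos/<video_id>/.
work_dir = resolve_work_directory(
    youtube_url=url,
    explicit_work_dir=None,
    use_video_cache=True,
    cache_root=Path("/tmp/humeo-cache"),
)
print(work_dir)  # /tmp/humeo-cache/videos/dQw4w9WgXcQ
```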