moonlantern1 committed
Commit eda316b · verified · 1 parent: 0bc3d15

Deploy ClipForge Docker Space

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .dockerignore +15 -0
  2. .gitattributes +1 -0
  3. Dockerfile +21 -0
  4. LICENSE +21 -0
  5. README.md +199 -10
  6. app.py +808 -0
  7. humeo-core/.gitignore +9 -0
  8. humeo-core/LICENSE +21 -0
  9. humeo-core/README.md +165 -0
  10. humeo-core/docs/ARCHITECTURE.md +128 -0
  11. humeo-core/docs/MCP_USAGE.md +100 -0
  12. humeo-core/examples/render_request.json +23 -0
  13. humeo-core/pyproject.toml +46 -0
  14. humeo-core/src/humeo_core.egg-info/PKG-INFO +197 -0
  15. humeo-core/src/humeo_core.egg-info/SOURCES.txt +33 -0
  16. humeo-core/src/humeo_core.egg-info/dependency_links.txt +1 -0
  17. humeo-core/src/humeo_core.egg-info/entry_points.txt +3 -0
  18. humeo-core/src/humeo_core.egg-info/requires.txt +21 -0
  19. humeo-core/src/humeo_core.egg-info/top_level.txt +1 -0
  20. humeo-core/src/humeo_core/__init__.py +49 -0
  21. humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf +0 -0
  22. humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt +93 -0
  23. humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt +93 -0
  24. humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf +3 -0
  25. humeo-core/src/humeo_core/primitives/__init__.py +1 -0
  26. humeo-core/src/humeo_core/primitives/classify.py +232 -0
  27. humeo-core/src/humeo_core/primitives/compile.py +602 -0
  28. humeo-core/src/humeo_core/primitives/face_detect.py +135 -0
  29. humeo-core/src/humeo_core/primitives/ingest.py +187 -0
  30. humeo-core/src/humeo_core/primitives/layouts.py +707 -0
  31. humeo-core/src/humeo_core/primitives/select_clips.py +150 -0
  32. humeo-core/src/humeo_core/primitives/vision.py +210 -0
  33. humeo-core/src/humeo_core/schemas.py +518 -0
  34. humeo-core/src/humeo_core/server.py +332 -0
  35. humeo-core/tests/__init__.py +0 -0
  36. humeo-core/tests/test_classify.py +39 -0
  37. humeo-core/tests/test_compile.py +329 -0
  38. humeo-core/tests/test_face_detect.py +73 -0
  39. humeo-core/tests/test_layout_bbox.py +17 -0
  40. humeo-core/tests/test_layouts.py +312 -0
  41. humeo-core/tests/test_schemas.py +267 -0
  42. humeo-core/tests/test_select_clips.py +49 -0
  43. humeo-core/tests/test_server_tools.py +93 -0
  44. humeo-core/tests/test_vision.py +228 -0
  45. pyproject.toml +56 -0
  46. src/humeo.egg-info/PKG-INFO +223 -0
  47. src/humeo.egg-info/SOURCES.txt +58 -0
  48. src/humeo.egg-info/dependency_links.txt +1 -0
  49. src/humeo.egg-info/entry_points.txt +2 -0
  50. src/humeo.egg-info/requires.txt +19 -0
.dockerignore ADDED
@@ -0,0 +1,15 @@
+ .git
+ .env
+ .env.*
+ !.env.example
+ .venv
+ __pycache__
+ .pytest_cache
+ .humeo_*
+ .tmp_review_frames
+ .tmp_review_frames_ticketc
+ output
+ output*
+ *.log
+ *.zip
+ *.pyc
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf filter=lfs diff=lfs merge=lfs -text
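Note: the rule above keeps the new font binary in Git LFS, so a plain clone checks out only a small pointer stub until `git lfs pull` runs. A minimal sketch for spotting an un-smudged pointer; the pointer prefix is part of the public LFS spec, and this helper is illustrative, not something the repo ships:

```python
from pathlib import Path

# A Git LFS pointer stub is a tiny text file that starts with this line.
LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"


def is_lfs_pointer(path: Path) -> bool:
    """True if the checked-out file is an un-smudged Git LFS pointer stub."""
    with path.open("rb") as fh:
        return fh.read(len(LFS_POINTER_PREFIX)) == LFS_POINTER_PREFIX


# The font below is the file covered by the .gitattributes rule added above;
# without `git lfs pull`, a fresh clone sees only the small pointer file.
font = Path("humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf")
print("pointer stub" if is_lfs_pointer(font) else "real binary")
```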
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ FROM python:3.12-slim-bookworm
+
+ ENV PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PORT=7860
+
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y ffmpeg && \
+     rm -rf /var/lib/apt/lists/*
+
+ COPY . /app
+
+ RUN pip install --upgrade pip && \
+     pip install ./humeo-core && \
+     pip install .
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 NotABot
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,199 @@
- ---
- title: Clipforge
- emoji: 🏆
- colorFrom: blue
- colorTo: gray
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: ClipForge
+ sdk: docker
+ app_port: 7860
+ ---
+
+ # ClipForge
+
+ Current default preset:
+
+ - `native_highlight` captions
+ - OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages
+ - Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available
+ - ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set
+
+ Long podcast or interview → vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render.
+
+ **Architecture (static HTML, GitHub Pages):**
+ [https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html)
+
+ ## Hugging Face Space
+
+ This repo includes a Hugging Face Docker Space entrypoint in `app.py` with the ClipForge upload/link UI.
+
+ - Paste a YouTube/video URL or upload one local video file
+ - Watch live pipeline progress in the ClipForge UI
+ - Preview and download rendered `short_*.mp4` clips from the UI
+ - Regenerate from the same source with a steering prompt
+
+ Required Space secrets:
+
+ - `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
+ - `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
+
+ If `HUMEO_TRANSCRIBE_PROVIDER` is not set, the Space uses ElevenLabs when
+ `ELEVENLABS_API_KEY` exists, otherwise OpenAI Whisper.
+
+ ## Repo layout
+
+ | Path | Role |
+ |------|------|
+ | `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters |
+ | `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server |
+
+ ## Pipeline (actual order)
+
+ ```text
+ YouTube URL
+   → ingest (source.mp4, transcript.json)
+   → clip selection (Gemini → clips.json)
+   → hook detection (Gemini → hooks.json)
+   → content pruning (Gemini → prune.json)
+   → keyframes + layout vision (Gemini vision → layout_vision.json)
+   → ASS subtitles + humeo-core ffmpeg render → short_<id>.mp4
+ ```
+
+ Details: **`docs/PIPELINE.md`**.
+
+ ## Five layouts
+
+ A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**).
+
+ ## Requirements
+
+ - **Python** ≥ 3.10
+ - **`uv`** — install: [astral.sh/uv](https://docs.astral.sh/uv/)
+ - **`ffmpeg`** — on `PATH` for extract/render
+ - **API keys** — see **`docs/ENVIRONMENT.md`**
+   - `GOOGLE_API_KEY` or `GEMINI_API_KEY` — preferred for Gemini stages
+   - `OPENROUTER_API_KEY` — supported fallback for those same Gemini-like stages when Google keys are unavailable
+   - `OPENAI_API_KEY` — if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`)
+
+ Copy **`.env.example`** → **`.env`** (never commit `.env`).
+
+ ## Install
+
+ ```bash
+ uv venv
+ uv sync
+ ```
+
+ Optional local WhisperX (heavy; Windows often uses OpenAI API instead):
+
+ ```bash
+ uv sync --extra whisper
+ ```
+
+ ## Run
+
+ ```bash
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
+ humeo --long-to-shorts "C:\path\to\video.mp4"
+ ```
+
+ Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**).
+
+ ## CLI guide (all flags)
+
+ Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`.
+
+ ### Required
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). |
+
+ ### Paths and cache behavior
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). |
+ | `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). |
+ | `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. |
+ | `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). |
+ | `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. |
+
+ ### Model selection and stage forcing
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). |
+ | `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). |
+ | `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. |
+ | `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. |
+ | `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. |
+ | `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. |
+ | `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). |
+
+ ### Pruning and subtitles
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). |
+ | `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). |
+ | `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). |
+ | `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). |
+ | `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). |
+
+ ### Logging
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--verbose`, `-v` | Enable debug logging. |
+
+ ### Common command recipes
+
+ ```bash
+ # Basic run
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
+
+ # Local MP4
+ humeo --long-to-shorts "C:\path\to\video.mp4"
+
+ # Full fresh run for debugging / prompt tuning
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose
+
+ # Re-run only clip selection after prompt edits
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection
+
+ # Keep intermediates in a fixed local folder
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work
+
+ # Compare different prune levels on same source
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive
+ ```
+
+ ## Documentation
+
+ | Doc | Purpose |
+ |-----|---------|
+ | **`docs/README.md`** | Index of all files under `docs/` |
+ | **`docs/STUDY_ORDER.md`** | Read order for onboarding |
+ | **`docs/PIPELINE.md`** | Stages, caches, JSON contracts |
+ | **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout |
+ | **`docs/SHARING.md`** | How to share logs/docs/video without bloating git |
+ | **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example |
+ | **`docs/full_run_output.txt`** | Example full run log (text) |
+ | **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping §9 |
+ | **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap |
+ | **`docs/TODO.md`** | Backlog |
+ | **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) |
+ | **`docs/SOLUTIONS.md`** | Design rationale |
+ | **`TERMINOLOGY.md`** | Glossary |
+
+ ## Tests
+
+ ```bash
+ uv sync --extra dev
+ uv run pytest
+ ```
+
+ ## Sharing outputs
+
+ `output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**.
+
+ ## License
+
+ See **`LICENSE`** (root) and **`humeo-core/LICENSE`**.
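For orientation, the pipeline diagram in the README above names one JSON artifact per stage. A small inspection sketch, assuming those files land directly in a fixed `--work-dir` and final renders default to `./output` per the flag table; per-video cache layouts can differ, and the JSON schemas live in `docs/PIPELINE.md`, not assumed here:

```python
from pathlib import Path

# Stage artifact names come from the pipeline diagram above; their schemas
# are documented in docs/PIPELINE.md and are not assumed here.
STAGE_ARTIFACTS = [
    "transcript.json",
    "clips.json",
    "hooks.json",
    "prune.json",
    "layout_vision.json",
]


def summarize_run(work_dir: str = ".humeo_work", output_dir: str = "output") -> None:
    """Print which intermediate artifacts and rendered shorts a run has produced."""
    root = Path(work_dir)
    for name in STAGE_ARTIFACTS:
        path = root / name
        if path.exists():
            print(f"{name:<20} {path.stat().st_size / 1024:8.1f} KB")
        else:
            print(f"{name:<20} missing (stage not reached, or cached per-video)")
    # Final renders default to ./output per the --output flag above.
    for clip in sorted(Path(output_dir).glob("short_*.mp4")):
        print(f"rendered: {clip.name}")


if __name__ == "__main__":
    summarize_run()
```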
app.py ADDED
@@ -0,0 +1,808 @@
+ from __future__ import annotations
+
+ import html
+ import json
+ import logging
+ import os
+ import queue
+ import re
+ import shutil
+ import subprocess
+ import sys
+ import tempfile
+ import threading
+ import time
+ import traceback
+ import uuid
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Annotated
+
+
+ def _bootstrap_local_paths() -> None:
+     repo_root = Path(__file__).resolve().parent
+     for candidate in (repo_root / "src", repo_root / "humeo-core" / "src"):
+         candidate_str = str(candidate)
+         if candidate.is_dir() and candidate_str not in sys.path:
+             sys.path.insert(0, candidate_str)
+
+
+ _bootstrap_local_paths()
+ if not (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip():
+     os.environ["HUMEO_TRANSCRIBE_PROVIDER"] = (
+         "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai"
+     )
+
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
+
+ from humeo.config import PipelineConfig
+ from humeo.pipeline import run_pipeline
+
+
+ APP_TITLE = "ClipForge"
+ LOG_FORMAT = "%(asctime)s | %(levelname)-7s | %(name)s | %(message)s"
+ MAX_LOG_LINES = 700
+ LLM_KEY_NAMES = ("GOOGLE_API_KEY", "GEMINI_API_KEY", "OPENROUTER_API_KEY")
+
+
+ class QueueLogHandler(logging.Handler):
+     def __init__(self, sink: queue.Queue[str]):
+         super().__init__()
+         self._sink = sink
+
+     def emit(self, record: logging.LogRecord) -> None:
+         try:
+             self._sink.put_nowait(self.format(record))
+         except Exception:
+             pass
+
+
+ @dataclass
+ class ClipFile:
+     name: str
+     url: str
+     duration: str
+
+
+ @dataclass
+ class Job:
+     id: str
+     run_root: Path
+     output_dir: Path
+     work_dir: Path
+     source: str
+     source_path: Path | None = None
+     steering_note: str | None = None
+     status: str = "Queued"
+     nav_status: str = "Processing..."
+     error: str | None = None
+     done: bool = False
+     created_at: float = field(default_factory=time.time)
+     logs: list[str] = field(default_factory=list)
+     clips: dict[str, ClipFile] = field(default_factory=dict)
+     steps: list[dict[str, object]] = field(
+         default_factory=lambda: [
+             {"name": "Uploading video", "pct": 100, "state": "done"},
+             {"name": "Generating transcript", "pct": 5, "state": "active"},
+             {"name": "Choosing short clips", "pct": 0, "state": "pending"},
+             {"name": "Producing clips", "pct": 0, "state": "pending"},
+             {"name": "Adding subtitles & light edits", "pct": 0, "state": "pending"},
+         ]
+     )
+
+
+ JOBS: dict[str, Job] = {}
+ JOBS_LOCK = threading.Lock()
+
+
+ def _append_log(job: Job, line: str) -> None:
+     job.logs.append(line)
+     if len(job.logs) > MAX_LOG_LINES:
+         job.logs = job.logs[-MAX_LOG_LINES:]
+
+
+ def _set_step(job: Job, idx: int, pct: int, state: str = "active") -> None:
+     for step_idx, step in enumerate(job.steps):
+         if step_idx < idx:
+             step["pct"] = 100
+             step["state"] = "done"
+         elif step_idx == idx:
+             step["pct"] = max(int(step.get("pct", 0)), min(100, pct))
+             step["state"] = state
+         elif step.get("state") != "done":
+             step["state"] = "pending"
+
+
+ def _update_stage_from_log(job: Job, line: str) -> None:
+     if "STAGE 1: INGESTION" in line:
+         job.status = "Generating transcript"
+         _set_step(job, 1, 15)
+     elif "Transcribing" in line:
+         job.status = "Generating transcript"
+         _set_step(job, 1, 45)
+     elif "Transcript already exists" in line or "Transcription complete" in line:
+         _set_step(job, 1, 90)
+     elif "STAGE 2: CLIP SELECTION" in line:
+         job.status = "Choosing short clips"
+         _set_step(job, 2, 20)
+     elif "STAGE 2.25: HOOK DETECTION" in line:
+         job.status = "Finding hooks"
+         _set_step(job, 2, 55)
+     elif "STAGE 2.5: CONTENT PRUNING" in line:
+         job.status = "Tightening clip windows"
+         _set_step(job, 2, 78)
+     elif "STAGE 2.75: CLIP ASSEMBLY" in line:
+         job.status = "Assembling clips"
+         _set_step(job, 3, 18)
+     elif "STAGE 3: CLIP LAYOUTS" in line:
+         job.status = "Choosing layout"
+         _set_step(job, 3, 38)
+     elif "STAGE 4: RENDER" in line:
+         job.status = "Producing clips"
+         _set_step(job, 3, 62)
+     elif "reframe_clip_ffmpeg" in line:
+         _set_step(job, 4, min(90, 20 + len(job.clips) * 12))
+     elif "RENDER QA" in line or "Render QA summary" in line:
+         job.status = "Checking clips"
+         _set_step(job, 4, 82)
+     elif "PIPELINE COMPLETE" in line:
+         job.status = "Complete"
+         job.nav_status = "Done"
+         for step in job.steps:
+             step["pct"] = 100
+             step["state"] = "done"
+
+
+ def _install_log_handler(message_queue: queue.Queue[str]) -> tuple[logging.Handler, int, dict[str, int]]:
+     handler = QueueLogHandler(message_queue)
+     handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt="%H:%M:%S"))
+
+     root_logger = logging.getLogger()
+     previous_level = root_logger.level
+     root_logger.addHandler(handler)
+     root_logger.setLevel(logging.INFO)
+
+     previous_logger_levels: dict[str, int] = {}
+     for logger_name in ("urllib3", "httpx", "httpcore"):
+         logger = logging.getLogger(logger_name)
+         previous_logger_levels[logger_name] = logger.level
+         logger.setLevel(logging.WARNING)
+
+     return handler, previous_level, previous_logger_levels
+
+
+ def _remove_log_handler(
+     handler: logging.Handler,
+     previous_root_level: int,
+     previous_logger_levels: dict[str, int],
+ ) -> None:
+     root_logger = logging.getLogger()
+     root_logger.removeHandler(handler)
+     root_logger.setLevel(previous_root_level)
+     for logger_name, level in previous_logger_levels.items():
+         logging.getLogger(logger_name).setLevel(level)
+
+
+ def _duration_label(path: Path) -> str:
+     try:
+         result = subprocess.run(
+             [
+                 "ffprobe",
+                 "-v",
+                 "error",
+                 "-show_entries",
+                 "format=duration",
+                 "-of",
+                 "default=noprint_wrappers=1:nokey=1",
+                 str(path),
+             ],
+             check=True,
+             capture_output=True,
+             text=True,
+             timeout=15,
+         )
+         total = max(0, int(round(float(result.stdout.strip()))))
+     except Exception:
+         total = 0
+     return f"{total // 60}:{total % 60:02d}" if total else "0:00"
+
+
+ def _publish_files(job: Job) -> None:
+     for path in sorted(job.output_dir.glob("short_*.mp4")):
+         if path.name not in job.clips and path.is_file():
+             job.clips[path.name] = ClipFile(
+                 name=path.name,
+                 url=f"/api/jobs/{job.id}/files/{path.name}",
+                 duration=_duration_label(path),
+             )
+
+
+ def _validate_credentials() -> None:
+     if not any((os.environ.get(name) or "").strip() for name in LLM_KEY_NAMES):
+         raise HTTPException(
+             status_code=400,
+             detail="Missing LLM secret. Set GOOGLE_API_KEY, GEMINI_API_KEY, or OPENROUTER_API_KEY in the Space secrets.",
+         )
+
+     provider = (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip().lower()
+     if provider in {"", "auto"}:
+         provider = "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai"
+     if provider == "elevenlabs" and not (os.environ.get("ELEVENLABS_API_KEY") or "").strip():
+         raise HTTPException(status_code=400, detail="Missing ELEVENLABS_API_KEY Space secret.")
+     if provider in {"openai", "api"} and not (os.environ.get("OPENAI_API_KEY") or "").strip():
+         raise HTTPException(status_code=400, detail="Missing OPENAI_API_KEY Space secret.")
+
+
+ def _safe_url(value: str | None) -> str | None:
+     value = (value or "").strip()
+     if not value:
+         return None
+     if not re.match(r"^https?://", value, flags=re.I):
+         raise HTTPException(status_code=400, detail="Paste a valid http(s) video URL.")
+     return value
+
+
+ def _snapshot(job: Job) -> dict[str, object]:
+     return {
+         "id": job.id,
+         "status": job.status,
+         "nav_status": job.nav_status,
+         "done": job.done,
+         "error": job.error,
+         "logs": "\n".join(job.logs[-MAX_LOG_LINES:]),
+         "steps": job.steps,
+         "clips": [clip.__dict__ for clip in job.clips.values()],
+     }
+
+
+ def _run_job(job_id: str) -> None:
+     with JOBS_LOCK:
+         job = JOBS[job_id]
+     message_queue: queue.Queue[str] = queue.Queue()
+     handler, previous_root_level, previous_logger_levels = _install_log_handler(message_queue)
+
+     def drain_queue() -> None:
+         with JOBS_LOCK:
+             local_job = JOBS[job_id]
+             while True:
+                 try:
+                     line = message_queue.get_nowait()
+                 except queue.Empty:
+                     break
+                 _append_log(local_job, line)
+                 _update_stage_from_log(local_job, line)
+             _publish_files(local_job)
+
+     try:
+         with JOBS_LOCK:
+             _append_log(job, f"Prepared source: {job.source}")
+             _append_log(job, f"Run id: {job.id}")
+             _set_step(job, 1, 8)
+
+         config = PipelineConfig(
+             source=job.source,
+             youtube_url=job.source,
+             output_dir=job.output_dir,
+             work_dir=job.work_dir,
+             use_video_cache=False,
+             clean_run=True,
+             interactive=False,
+             prune_level="balanced",
+             overwrite_outputs=True,
+             steering_notes=[job.steering_note] if job.steering_note else [],
+         )
+
+         worker_error: str | None = None
+         outputs: list[Path] = []
+
+         def pipeline_worker() -> None:
+             nonlocal outputs, worker_error
+             try:
+                 outputs = run_pipeline(config)
+             except Exception as exc:
+                 worker_error = str(exc)
+                 for line in traceback.format_exc().splitlines():
+                     if line.strip():
+                         message_queue.put_nowait(line)
+
+         thread = threading.Thread(target=pipeline_worker, daemon=True)
+         thread.start()
+         while thread.is_alive():
+             drain_queue()
+             time.sleep(0.35)
+         drain_queue()
+
+         with JOBS_LOCK:
+             local_job = JOBS[job_id]
+             for output in outputs:
+                 if Path(output).exists():
+                     local_job.clips[Path(output).name] = ClipFile(
+                         name=Path(output).name,
+                         url=f"/api/jobs/{job_id}/files/{Path(output).name}",
+                         duration=_duration_label(Path(output)),
+                     )
+             if worker_error:
+                 local_job.error = worker_error
+                 local_job.status = f"Failed: {worker_error}"
+                 local_job.nav_status = "Failed"
+             else:
+                 local_job.status = "Complete" if local_job.clips else "Complete - no clips generated"
+                 local_job.nav_status = "Done"
+                 for step in local_job.steps:
+                     step["pct"] = 100
+                     step["state"] = "done"
+             local_job.done = True
+     finally:
+         _remove_log_handler(handler, previous_root_level, previous_logger_levels)
+
+
+ async def _stage_upload(uploaded_file: UploadFile, run_root: Path) -> Path:
+     suffix = Path(uploaded_file.filename or "input.mp4").suffix or ".mp4"
+     staged_path = run_root / f"input{suffix}"
+     with staged_path.open("wb") as handle:
+         while chunk := await uploaded_file.read(1024 * 1024):
+             handle.write(chunk)
+     return staged_path
+
+
+ app = FastAPI(title=APP_TITLE)
+
+
+ @app.get("/", response_class=HTMLResponse)
+ def index() -> str:
+     return INDEX_HTML
+
+
+ @app.post("/api/jobs")
+ async def create_job(
+     video_url: Annotated[str | None, Form()] = None,
+     regen_prompt: Annotated[str | None, Form()] = None,
+     source_job_id: Annotated[str | None, Form()] = None,
+     file: Annotated[UploadFile | None, File()] = None,
+ ) -> JSONResponse:
+     _validate_credentials()
+     job_id = uuid.uuid4().hex[:12]
+     run_root = Path(tempfile.mkdtemp(prefix=f"clipforge-{job_id}-"))
+     work_dir = run_root / "work"
+     output_dir = run_root / "output"
+     work_dir.mkdir(parents=True, exist_ok=True)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     source_path: Path | None = None
+     source = _safe_url(video_url)
+     source_job_id = (source_job_id or "").strip()
+     if source_job_id:
+         with JOBS_LOCK:
+             previous = JOBS.get(source_job_id)
+         if previous is None:
+             raise HTTPException(status_code=404, detail="Previous job not found for regeneration.")
+         if previous.source_path and previous.source_path.exists():
+             source_path = run_root / previous.source_path.name
+             shutil.copy2(previous.source_path, source_path)
+             source = str(source_path)
+         else:
+             source = previous.source
+     elif file is not None:
+         source_path = await _stage_upload(file, run_root)
+         source = str(source_path)
+
+     if not source:
+         raise HTTPException(status_code=400, detail="Upload a video file or paste a video URL first.")
+
+     job = Job(
+         id=job_id,
+         run_root=run_root,
+         output_dir=output_dir,
+         work_dir=work_dir,
+         source=source,
+         source_path=source_path,
+         steering_note=(regen_prompt or "").strip() or None,
+     )
+     with JOBS_LOCK:
+         JOBS[job_id] = job
+
+     threading.Thread(target=_run_job, args=(job_id,), daemon=True).start()
+     return JSONResponse(_snapshot(job))
+
+
+ @app.get("/api/jobs/{job_id}")
+ def get_job(job_id: str) -> JSONResponse:
+     with JOBS_LOCK:
+         job = JOBS.get(job_id)
+         if job is None:
+             raise HTTPException(status_code=404, detail="Job not found.")
+         _publish_files(job)
+         return JSONResponse(_snapshot(job))
+
+
+ @app.get("/api/jobs/{job_id}/files/{filename}")
+ def get_job_file(job_id: str, filename: str) -> FileResponse:
+     with JOBS_LOCK:
+         job = JOBS.get(job_id)
+         if job is None:
+             raise HTTPException(status_code=404, detail="Job not found.")
+         path = (job.output_dir / Path(filename).name).resolve(strict=False)
+         if job.output_dir.resolve(strict=False) not in path.parents or not path.is_file():
+             raise HTTPException(status_code=404, detail="File not found.")
+         return FileResponse(path, media_type="video/mp4", filename=path.name)
+
+
+ @app.get("/health")
+ def health() -> dict[str, str]:
+     return {"ok": "true"}
+
+
+ INDEX_HTML = r"""<!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>ClipForge - Video to Clips</title>
+   <link rel="preconnect" href="https://fonts.googleapis.com">
+   <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:ital,wght@0,300;0,400;0,500;0,600;1,300;1,400&family=DM+Sans:wght@300;400;500&display=swap" rel="stylesheet">
+   <style>
+     :root {
+       --cream: #F7F2E9; --champagne: #EDE3CC; --champagne-deep: #D9C9A6;
+       --gold: #B8924A; --gold-light: #D4AA6A; --ink: #2A1F0E;
+       --ink-soft: #5C4A2E; --ink-muted: #9A8560; --white: #FDFAF4;
+       --surface: #F0E9D8; --border: #DDD0B3; --success: #6B8C5A;
+       --radius: 12px; --radius-lg: 20px;
+     }
+     * { margin: 0; padding: 0; box-sizing: border-box; }
+     body { font-family: 'DM Sans', sans-serif; background: var(--cream); color: var(--ink); min-height: 100vh; overflow-x: hidden; }
+     nav { display: flex; align-items: center; justify-content: space-between; padding: 20px 32px; border-bottom: 1px solid var(--border); background: var(--white); position: sticky; top: 0; z-index: 100; }
+     .logo { font-family: 'Cormorant Garamond', serif; font-size: 1.6rem; font-weight: 600; color: var(--ink); letter-spacing: 0.02em; }
+     .logo span { color: var(--gold); }
+     .screen { display: none; animation: fadeIn 0.5s ease; }
+     .screen.active { display: block; }
+     @keyframes fadeIn { from { opacity: 0; transform: translateY(8px); } to { opacity: 1; transform: translateY(0); } }
+     #screen-input { display: flex; flex-direction: column; align-items: center; justify-content: center; min-height: calc(100vh - 65px); padding: 40px 20px; text-align: center; }
+     .eyebrow { font-size: 0.75rem; letter-spacing: 0.18em; text-transform: uppercase; color: var(--gold); font-weight: 500; margin-bottom: 16px; }
+     .hero-title { font-family: 'Cormorant Garamond', serif; font-size: clamp(2rem, 5vw, 3.6rem); font-weight: 500; line-height: 1.15; color: var(--ink); max-width: 620px; margin-bottom: 12px; }
+     .hero-title em { font-style: italic; color: var(--gold); }
+     .hero-sub { font-size: 0.95rem; color: var(--ink-muted); margin-bottom: 48px; font-weight: 300; }
+     .input-card { background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 36px; width: 100%; max-width: 520px; box-shadow: 0 8px 32px rgba(42,31,14,0.07); }
+     .mode-tabs { display: flex; background: var(--surface); border-radius: 10px; padding: 4px; margin-bottom: 28px; gap: 4px; }
+     .mode-tab { flex: 1; padding: 10px 0; border: none; background: transparent; border-radius: 8px; font-family: 'DM Sans', sans-serif; font-size: 0.85rem; font-weight: 500; color: var(--ink-muted); cursor: pointer; transition: all 0.2s; }
+     .mode-tab.active { background: var(--white); color: var(--ink); box-shadow: 0 2px 8px rgba(42,31,14,0.1); }
+     .input-section { display: none; } .input-section.active { display: block; }
+     .input-label { font-size: 0.78rem; letter-spacing: 0.08em; text-transform: uppercase; color: var(--ink-muted); margin-bottom: 8px; display: block; font-weight: 500; text-align:left; }
+     .yt-input { width: 100%; padding: 14px 16px; border: 1.5px solid var(--border); border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.9rem; background: var(--cream); color: var(--ink); outline: none; transition: border-color 0.2s; }
+     .yt-input:focus { border-color: var(--gold); } .yt-input::placeholder { color: var(--ink-muted); }
+     .upload-zone { border: 2px dashed var(--champagne-deep); border-radius: var(--radius); padding: 36px 20px; text-align: center; cursor: pointer; transition: all 0.2s; background: var(--cream); }
+     .upload-zone:hover, .upload-zone.dragover { border-color: var(--gold); background: var(--champagne); }
+     .upload-icon { width: 44px; height: 44px; background: var(--champagne); border-radius: 50%; display: flex; align-items: center; justify-content: center; margin: 0 auto 12px; font-size: 1.2rem; }
+     .upload-text { font-size: 0.9rem; color: var(--ink-soft); font-weight: 400; }
+     .upload-sub { font-size: 0.78rem; color: var(--ink-muted); margin-top: 4px; }
+     .convert-btn { width: 100%; margin-top: 28px; padding: 16px; background: var(--ink); color: var(--cream); border: none; border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.95rem; font-weight: 500; cursor: pointer; letter-spacing: 0.03em; transition: all 0.2s; position: relative; overflow: hidden; }
+     .convert-btn:hover { background: var(--ink-soft); transform: translateY(-1px); box-shadow: 0 6px 20px rgba(42,31,14,0.2); } .convert-btn:active { transform: translateY(0); }
+     .convert-btn:disabled { opacity: .65; cursor: progress; transform:none; }
+     #screen-processing { max-width: 780px; margin: 0 auto; padding: 48px 20px 80px; }
+     .processing-header { text-align: center; margin-bottom: 40px; }
+     .processing-title { font-family: 'Cormorant Garamond', serif; font-size: 2rem; font-weight: 500; color: var(--ink); margin-bottom: 6px; }
+     .processing-sub { font-size: 0.88rem; color: var(--ink-muted); font-weight: 300; }
+     .pipeline { background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 28px; box-shadow: 0 4px 20px rgba(42,31,14,0.06); margin-bottom: 32px; }
+     .pipeline-step { display: flex; align-items: flex-start; gap: 16px; padding: 16px 0; border-bottom: 1px solid var(--champagne); opacity: 0.4; transition: opacity 0.4s; }
+     .pipeline-step:last-child { border-bottom: none; } .pipeline-step.active, .pipeline-step.done { opacity: 1; }
+     .step-icon { width: 36px; height: 36px; flex-shrink: 0; background: var(--surface); border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 1rem; transition: all 0.4s; border: 1.5px solid var(--border); }
+     .pipeline-step.active .step-icon { background: var(--champagne); border-color: var(--gold); }
+     .pipeline-step.done .step-icon { background: var(--gold); border-color: var(--gold); color: white; font-size: 0.85rem; }
+     .step-content { flex: 1; padding-top: 4px; }
+     .step-name { font-size: 0.9rem; font-weight: 500; color: var(--ink); margin-bottom: 8px; display: flex; align-items: center; justify-content: space-between; }
+     .step-pct { font-size: 0.8rem; color: var(--gold); font-weight: 500; }
+     .progress-track { height: 6px; background: var(--surface); border-radius: 99px; overflow: hidden; }
+     .progress-fill { height: 100%; border-radius: 99px; background: linear-gradient(90deg, var(--gold-light), var(--gold)); width: 0%; transition: width 0.25s ease; }
+     .pipeline-step.done .progress-fill { width: 100%; background: var(--gold); }
+     .tips-section { margin-bottom: 40px; }
+     .tips-label { font-size: 0.72rem; letter-spacing: 0.14em; text-transform: uppercase; color: var(--ink-muted); margin-bottom: 12px; font-weight: 500; }
+     .tip-card { background: var(--champagne); border-radius: var(--radius); padding: 14px 18px; font-size: 0.85rem; color: var(--ink-soft); display: flex; align-items: flex-start; gap: 10px; margin-bottom: 8px; line-height: 1.5; }
+     .tip-dot { color: var(--gold); margin-top: 2px; flex-shrink: 0; }
+     .clips-section { margin-top: 8px; }
+     .clips-title { font-family: 'Cormorant Garamond', serif; font-size: 1.4rem; font-weight: 500; color: var(--ink); margin-bottom: 6px; }
+     .clips-sub { font-size: 0.82rem; color: var(--ink-muted); margin-bottom: 20px; font-weight: 300; }
+     .clips-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 16px; }
+     .clip-card { border-radius: var(--radius); overflow: hidden; cursor: pointer; background: var(--white); border: 1px solid var(--border); box-shadow: 0 2px 10px rgba(42,31,14,0.06); transition: all 0.2s; animation: clipAppear 0.5s ease both; }
+     .clip-card:hover { transform: translateY(-3px); box-shadow: 0 8px 24px rgba(42,31,14,0.13); }
+     @keyframes clipAppear { from { opacity: 0; transform: scale(0.9) translateY(10px); } to { opacity: 1; transform: scale(1) translateY(0); } }
+     .clip-thumb { aspect-ratio: 9/16; display: flex; align-items: center; justify-content: center; position: relative; overflow: hidden; }
+     .clip-play { width: 44px; height: 44px; background: rgba(255,255,255,0.88); border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 1.1rem; z-index: 2; box-shadow: 0 2px 12px rgba(0,0,0,0.2); transition: transform 0.2s; }
+     .clip-card:hover .clip-play { transform: scale(1.1); }
+     .clip-meta { padding: 10px 12px; } .clip-num { font-size: 0.72rem; color: var(--ink-muted); text-transform: uppercase; letter-spacing: 0.08em; font-weight: 500; }
+     .clip-dur { font-size: 0.82rem; color: var(--ink); font-weight: 400; margin-top: 2px; }
+     .clip-download { margin-top: 8px; display:inline-block; font-size:.74rem; color:var(--gold); text-decoration:none; }
+     .regen-section { margin-top: 56px; background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 32px; display: none; animation: fadeIn 0.5s ease; box-shadow: 0 4px 20px rgba(42,31,14,0.06); }
+     .regen-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 500; margin-bottom: 6px; }
+     .regen-sub { font-size: 0.85rem; color: var(--ink-muted); margin-bottom: 20px; font-weight: 300; }
+     .regen-textarea { width: 100%; min-height: 100px; padding: 14px 16px; border: 1.5px solid var(--border); border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.9rem; background: var(--cream); color: var(--ink); outline: none; resize: vertical; transition: border-color 0.2s; line-height: 1.6; margin-bottom: 14px; }
+     .regen-textarea:focus { border-color: var(--gold); } .regen-textarea::placeholder { color: var(--ink-muted); }
+     .regen-row { display: flex; gap: 10px; align-items: center; flex-wrap: wrap; }
+     .chip { padding: 7px 14px; background: var(--champagne); border: 1px solid var(--border); border-radius: 99px; font-size: 0.78rem; color: var(--ink-soft); cursor: pointer; transition: all 0.15s; font-weight: 400; white-space: nowrap; }
+     .chip:hover { background: var(--champagne-deep); color: var(--ink); border-color: var(--gold); }
+     .regen-btn { margin-left: auto; padding: 12px 24px; background: var(--ink); color: var(--cream); border: none; border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.88rem; font-weight: 500; cursor: pointer; transition: all 0.2s; white-space: nowrap; }
+     .regen-btn:hover { background: var(--ink-soft); }
+     .modal-overlay { display: none; position: fixed; inset: 0; background: rgba(42,31,14,0.65); backdrop-filter: blur(6px); z-index: 500; align-items: center; justify-content: center; padding: 20px; animation: fadeIn 0.25s ease; }
+     .modal-overlay.open { display: flex; }
+     .modal-box { background: var(--white); border-radius: var(--radius-lg); width: 100%; max-width: 390px; overflow: hidden; box-shadow: 0 24px 64px rgba(42,31,14,0.25); animation: slideUp 0.3s ease; }
+     @keyframes slideUp { from { opacity: 0; transform: translateY(20px) scale(0.97); } to { opacity: 1; transform: translateY(0) scale(1); } }
+     .modal-video { aspect-ratio: 9/16; max-height: 70vh; display: flex; align-items: center; justify-content: center; position: relative; background:var(--ink); }
+     .modal-video video { width:100%; height:100%; object-fit:contain; background:#000; }
+     .modal-footer { padding: 16px 20px; border-top: 1px solid var(--border); display: flex; align-items: center; justify-content: space-between; gap:12px; }
+     .modal-clip-label { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 500; }
+     .modal-actions { display:flex; align-items:center; gap:8px; }
+     .modal-close, .modal-download { padding: 8px 14px; background: var(--surface); border: 1px solid var(--border); border-radius: 8px; font-family: 'DM Sans', sans-serif; font-size: 0.82rem; cursor: pointer; transition: all 0.15s; color:var(--ink); text-decoration:none; }
+     .modal-close:hover, .modal-download:hover { background: var(--champagne); }
+     .log-panel { display:none; margin-top:24px; background:var(--ink); color:var(--cream); border-radius:12px; padding:14px; font:12px/1.45 ui-monospace, SFMono-Regular, Consolas, monospace; white-space:pre-wrap; max-height:240px; overflow:auto; text-align:left; }
+     @media (max-width: 600px) { nav { padding: 16px 20px; } .input-card { padding: 24px 20px; } #screen-processing { padding: 32px 16px 60px; } .pipeline { padding: 20px 16px; } .clips-grid { grid-template-columns: repeat(2, 1fr); gap: 10px; } .regen-section { padding: 22px 18px; } .regen-btn { width: 100%; margin-left: 0; } .regen-row { flex-direction: column; align-items: flex-start; } }
+     .thumb-1 { background: linear-gradient(135deg, #D4A96A 0%, #8B5E3C 100%); } .thumb-2 { background: linear-gradient(135deg, #7A9E8A 0%, #3D6650 100%); }
+     .thumb-3 { background: linear-gradient(135deg, #9E8A7A 0%, #5C3E2E 100%); } .thumb-4 { background: linear-gradient(135deg, #8A7A9E 0%, #4A3866 100%); }
+     .thumb-5 { background: linear-gradient(135deg, #9E9A7A 0%, #5C5820 100%); } .thumb-6 { background: linear-gradient(135deg, #C4856A 0%, #7A3020 100%); }
+     .thumb-7 { background: linear-gradient(135deg, #7AABBE 0%, #2A5A6E 100%); } .thumb-8 { background: linear-gradient(135deg, #9EAA7A 0%, #4A5E20 100%); }
+     .thumb-9 { background: linear-gradient(135deg, #AA7A9E 0%, #5E2060 100%); } .thumb-0 { background: linear-gradient(135deg, #D4C36A 0%, #8B7820 100%); }
+     .spin { display: inline-block; width: 14px; height: 14px; border: 2px solid var(--border); border-top-color: var(--gold); border-radius: 50%; animation: spin 0.8s linear infinite; }
+     @keyframes spin { to { transform: rotate(360deg); } }
+   </style>
+ </head>
+ <body>
+   <nav>
+     <div class="logo">Clip<span>Forge</span></div>
+     <div style="font-size:0.8rem;color:var(--ink-muted);font-weight:300;display:none" id="nav-status">Processing...</div>
+   </nav>
+   <div class="screen active" id="screen-input">
+     <div style="display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:calc(100vh - 65px);padding:40px 20px;text-align:center;">
+       <div class="eyebrow">AI Video Editor</div>
+       <h1 class="hero-title">Convert your long video to <em>short clips</em> for social media</h1>
+       <p class="hero-sub">Paste a link or upload a file - we handle the rest</p>
+       <div class="input-card">
+         <div class="mode-tabs">
+           <button class="mode-tab active" onclick="switchMode('yt')">Link</button>
+           <button class="mode-tab" onclick="switchMode('upload')">Upload File</button>
+         </div>
+         <div class="input-section active" id="mode-yt">
+           <label class="input-label">Video URL</label>
+           <input class="yt-input" type="text" placeholder="https://youtube.com/watch?v=..." id="yt-url">
+         </div>
+         <div class="input-section" id="mode-upload">
+           <input type="file" id="file-input" accept="video/mp4,video/quicktime,video/*" hidden>
+           <div class="upload-zone" id="upload-zone" onclick="openUpload()">
+             <div class="upload-icon">File</div>
+             <div class="upload-text">Click to browse or drag & drop</div>
+             <div class="upload-sub">MP4, MOV, AVI - up to your Space limit</div>
+           </div>
+         </div>
+         <button class="convert-btn" id="convert-btn" onclick="startProcessing()">Convert to Clips -></button>
+       </div>
+     </div>
+   </div>
+   <div class="screen" id="screen-processing">
+     <div class="processing-header">
+       <div class="eyebrow">Working on it</div>
+       <h2 class="processing-title">Your clips are being crafted</h2>
+       <p class="processing-sub" id="processing-sub">Sit back - long videos can take a little while</p>
+     </div>
+     <div class="pipeline" id="pipeline">
+       <div class="pipeline-step" id="step-0"><div class="step-icon">Up</div><div class="step-content"><div class="step-name">Uploading video <span class="step-pct" id="pct-0">0%</span></div><div class="progress-track"><div class="progress-fill" id="fill-0"></div></div></div></div>
+       <div class="pipeline-step" id="step-1"><div class="step-icon">Text</div><div class="step-content"><div class="step-name">Generating transcript <span class="step-pct" id="pct-1"></span></div><div class="progress-track"><div class="progress-fill" id="fill-1"></div></div></div></div>
+       <div class="pipeline-step" id="step-2"><div class="step-icon">Cut</div><div class="step-content"><div class="step-name">Choosing short clips <span class="step-pct" id="pct-2"></span></div><div class="progress-track"><div class="progress-fill" id="fill-2"></div></div></div></div>
+       <div class="pipeline-step" id="step-3"><div class="step-icon">Film</div><div class="step-content"><div class="step-name">Producing clips <span class="step-pct" id="pct-3"></span></div><div class="progress-track"><div class="progress-fill" id="fill-3"></div></div></div></div>
+       <div class="pipeline-step" id="step-4"><div class="step-icon">Edit</div><div class="step-content"><div class="step-name">Adding subtitles &amp; light edits <span class="step-pct" id="pct-4"></span></div><div class="progress-track"><div class="progress-fill" id="fill-4"></div></div></div></div>
+     </div>
+     <div class="tips-section" id="tips-section">
+       <div class="tips-label">Tips while you wait</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> Clips are automatically trimmed around the strongest hook.</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> The system can pick centered speaker or split presentation layout per clip.</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> Word-by-word subtitles are added by default.</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> You can regenerate with different instructions after the first batch.</div>
+     </div>
+     <div class="clips-section" id="clips-section" style="display:none">
+       <div class="clips-title">Your clips</div>
+       <p class="clips-sub" id="clips-sub-text">Tap any clip to preview</p>
+       <div class="clips-grid" id="clips-grid"></div>
+     </div>
+     <div class="regen-section" id="regen-section">
+       <div class="regen-title">Produce a different set</div>
+       <p class="regen-sub">Describe what you're looking for and we'll re-cut your video</p>
+       <textarea class="regen-textarea" placeholder="e.g. Focus on the funniest moments, keep clips under 30 seconds, add a text hook at the start..." id="regen-prompt"></textarea>
+       <div class="regen-row">
+         <span class="chip" onclick="setChip('Highlight key insights')">Key insights</span>
+         <span class="chip" onclick="setChip('Funny & entertaining moments')">Funny moments</span>
+         <span class="chip" onclick="setChip('Emotional or inspiring clips')">Emotional</span>
+         <span class="chip" onclick="setChip('Fast-paced, high energy edits')">High energy</span>
+         <button class="regen-btn" onclick="triggerRegen()">Regenerate Clips -></button>
+       </div>
+     </div>
+     <pre class="log-panel" id="log-panel"></pre>
+   </div>
+   <div class="modal-overlay" id="modal" onclick="closeModal(event)">
+     <div class="modal-box">
+       <div class="modal-video" id="modal-video"><div class="clip-play" style="width:56px;height:56px;font-size:1.4rem;background:rgba(255,255,255,0.9)">▶</div></div>
+       <div class="modal-footer">
+         <div class="modal-clip-label" id="modal-label">Clip 1</div>
+         <div class="modal-actions"><a class="modal-download" id="modal-download" href="#" download>Download</a><button class="modal-close" onclick="document.getElementById('modal').classList.remove('open')">Close</button></div>
+       </div>
+     </div>
+   </div>
+   <script>
+     let currentMode = 'yt';
+     let selectedFile = null;
+     let currentJobId = null;
+     let renderedClips = [];
+     const iconLabels = ['Up','Text','Cut','Film','Edit'];
+
+     function switchMode(m) {
+       currentMode = m;
+       document.querySelectorAll('.mode-tab').forEach((t,i) => t.classList.toggle('active', (i===0 && m==='yt') || (i===1 && m==='upload')));
+       document.getElementById('mode-yt').classList.toggle('active', m==='yt');
+       document.getElementById('mode-upload').classList.toggle('active', m==='upload');
+     }
+
+     function openUpload() { document.getElementById('file-input').click(); }
+
+     function setSelectedFile(file) {
+       selectedFile = file;
+       const zone = document.getElementById('upload-zone');
+       zone.innerHTML = `<div class="upload-icon">OK</div><div class="upload-text" style="color:var(--gold)">File selected: ${escapeHtml(file.name)}</div><div class="upload-sub">Ready to convert</div>`;
+     }
+
+     const uploadZone = document.getElementById('upload-zone');
+     document.getElementById('file-input').addEventListener('change', e => { if (e.target.files[0]) setSelectedFile(e.target.files[0]); });
+     uploadZone.addEventListener('dragover', e => { e.preventDefault(); uploadZone.classList.add('dragover'); });
+     uploadZone.addEventListener('dragleave', () => uploadZone.classList.remove('dragover'));
+     uploadZone.addEventListener('drop', e => { e.preventDefault(); uploadZone.classList.remove('dragover'); if (e.dataTransfer.files[0]) setSelectedFile(e.dataTransfer.files[0]); });
+
+     function escapeHtml(s) {
+       return String(s).replace(/[&<>"']/g, c => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#039;'}[c]));
+     }
+
+     async function createJob(extraPrompt = '') {
+       const form = new FormData();
+       if (extraPrompt && currentJobId) {
+         form.append('source_job_id', currentJobId);
+         form.append('regen_prompt', extraPrompt);
+       } else if (currentMode === 'upload') {
+         if (!selectedFile) throw new Error('Choose a video file first.');
+         form.append('file', selectedFile);
+       } else {
+         const url = document.getElementById('yt-url').value.trim();
+         if (!url) throw new Error('Paste a video URL first.');
+         form.append('video_url', url);
+       }
+       const res = await fetch('/api/jobs', { method: 'POST', body: form });
+       const data = await res.json();
+       if (!res.ok) throw new Error(data.detail || 'Could not start job.');
+       return data;
+     }
+
+     async function startProcessing() {
+       const btn = document.getElementById('convert-btn');
+       try {
+         btn.disabled = true;
+         btn.textContent = 'Starting...';
+         const job = await createJob();
+         currentJobId = job.id;
+         renderedClips = [];
+         document.getElementById('clips-grid').innerHTML = '';
+         document.getElementById('screen-input').classList.remove('active');
+         document.getElementById('screen-processing').classList.add('active');
+         document.getElementById('nav-status').style.display = 'block';
+         syncJob(job);
+         pollJob(job.id);
+       } catch (err) {
+         alert(err.message || err);
+       } finally {
+         btn.disabled = false;
+         btn.textContent = 'Convert to Clips ->';
+       }
+     }
+
+     async function pollJob(id) {
+       let done = false;
+       while (!done && currentJobId === id) {
+         await new Promise(r => setTimeout(r, 1400));
+         const res = await fetch(`/api/jobs/${id}`);
+         const job = await res.json();
+         syncJob(job);
+         done = job.done;
+       }
+     }
+
+     function syncJob(job) {
+       document.getElementById('nav-status').textContent = job.nav_status || 'Processing...';
+       document.getElementById('processing-sub').textContent = job.error ? job.error : job.status;
+       document.getElementById('log-panel').textContent = job.logs || '';
+       (job.steps || []).forEach((step, i) => {
+         const el = document.getElementById(`step-${i}`);
+         const fill = document.getElementById(`fill-${i}`);
+         const pct = document.getElementById(`pct-${i}`);
+         el.classList.toggle('active', step.state === 'active');
+         el.classList.toggle('done', step.state === 'done');
+         el.querySelector('.step-icon').innerHTML = step.state === 'done' ? '✓' : (step.state === 'active' ? '<span class="spin"></span>' : iconLabels[i]);
+         fill.style.width = `${step.pct || 0}%`;
+         pct.textContent = step.pct ? `${Math.floor(step.pct)}%` : '';
+       });
+       (job.clips || []).forEach((clip, idx) => {
+         if (!renderedClips.some(c => c.name === clip.name)) {
+           renderedClips.push(clip);
+           addClip(renderedClips.length - 1);
+         }
+       });
+       if (renderedClips.length) {
+         document.getElementById('clips-section').style.display = 'block';
+         document.getElementById('clips-sub-text').textContent = job.done
+           ? `All ${renderedClips.length} clip${renderedClips.length > 1 ? 's' : ''} ready - tap to preview`
+           : `${renderedClips.length} clip${renderedClips.length > 1 ? 's' : ''} ready - more coming...`;
+       }
+       if (job.done) {
+         document.getElementById('regen-section').style.display = 'block';
+         if (job.error) document.getElementById('log-panel').style.display = 'block';
+       }
+     }
+
+     function addClip(idx) {
+       const clip = renderedClips[idx];
+       const grid = document.getElementById('clips-grid');
+       const card = document.createElement('div');
+       card.className = 'clip-card';
+       card.innerHTML = `<div class="clip-thumb thumb-${idx % 10}"><div class="clip-play">▶</div></div><div class="clip-meta"><div class="clip-num">Clip ${idx + 1}</div><div class="clip-dur">${clip.duration || '0:00'}</div><a class="clip-download" href="${clip.url}" download onclick="event.stopPropagation()">Download</a></div>`;
+       card.onclick = () => openModal(idx);
+       grid.appendChild(card);
+     }
+
+     function openModal(idx) {
+       const clip = renderedClips[idx];
+       const modal = document.getElementById('modal');
+       const video = document.getElementById('modal-video');
+       video.className = 'modal-video';
+       video.innerHTML = `<video src="${clip.url}" controls autoplay playsinline></video>`;
+       document.getElementById('modal-label').textContent = `Clip ${idx + 1}`;
+       document.getElementById('modal-download').href = clip.url;
+       modal.classList.add('open');
+     }
+
+     function closeModal(e) {
+       if (e.target === document.getElementById('modal')) {
+         document.getElementById('modal').classList.remove('open');
+         document.getElementById('modal-video').innerHTML = '';
+       }
+     }
+
+     function setChip(text) {
+       const ta = document.getElementById('regen-prompt');
+       ta.value = text;
+       ta.focus();
+     }
+
+     async function triggerRegen() {
+       const prompt = document.getElementById('regen-prompt').value.trim();
+       if (!prompt) { document.getElementById('regen-prompt').focus(); return; }
+       if (!currentJobId) { alert('Run a video first.'); return; }
+       renderedClips = [];
+       document.getElementById('clips-grid').innerHTML = '';
+       document.getElementById('clips-section').style.display = 'none';
+       document.getElementById('regen-section').style.display = 'none';
+       document.getElementById('nav-status').textContent = 'Regenerating...';
+       document.querySelectorAll('.pipeline-step').forEach((s, i) => {
+         s.classList.remove('active', 'done');
+         s.querySelector('.step-icon').innerHTML = iconLabels[i];
+         document.getElementById(`fill-${i}`).style.width = '0%';
+         document.getElementById(`pct-${i}`).textContent = '';
+       });
+       window.scrollTo({ top: 0, behavior: 'smooth' });
+       try {
+         const job = await createJob(prompt);
+         currentJobId = job.id;
+         syncJob(job);
+         pollJob(job.id);
+       } catch (err) {
+         alert(err.message || err);
+       }
+     }
+   </script>
+ </body>
+ </html>"""
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))
humeo-core/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ .pytest_cache/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ .env
humeo-core/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 NotABot
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
humeo-core/README.md ADDED
@@ -0,0 +1,165 @@
1
+ # humeo-core
2
+
3
+ **Reusable-rocket MCP server for long-video → 9:16 shorts.**
4
+
5
+ First-principles design, from the HIVE paper + Bryan's rocket analogy:
6
+ we don't build doors and windows (general subject-tracker UI, retraining
7
+ models). We build the **container** (schemas), **landing gear** (deterministic
8
+ local extraction), and **five thrusters** (the five 9:16 layouts this video
9
+ format actually uses). Everything else is pluggable.
10
+
11
+ ## The rocket, in one picture
12
+
13
+ ```
14
+ ┌──────────────────────────────────────────┐
15
+ │ Control panel (MCP tools) │ <- any MCP client
16
+ └───────────────────┬──────────────────────┘
17
+ │ strict JSON
18
+ ┌────────────────┬───────────┼────────────────┬─────────────────┐
19
+ ▼ ▼ ▼ ▼ ▼
20
+ ingest classify_scenes select_clips plan_layout render_clip
21
+ (scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile,
22
+ keyframes + classifier) heuristic + pure filter dry-run safe)
23
+ transcript) LLM-ready) math)
24
+
25
+
26
+ ┌────────────────────┐
27
+ │ LayoutKind │
28
+ │ ──────────────── │
29
+ │ zoom_call_center │
30
+ │ sit_center │
31
+ │ split_chart_person│
32
+ │ split_two_persons │
33
+ │ split_two_charts │
34
+ └────────────────────┘
35
+ ```
36
+
37
+ Only the classifier and clip-selector have optional LLM hooks; everything
38
+ else is deterministic, local, and cheap.
39
+
40
+ ## Why five layouts? (the "max 2 items" rule)
41
+
42
+ The hard constraint for this format: **a short shows at most two on-screen
43
+ items** — where an "item" is a `person` (a human speaker) or a `chart`
44
+ (slide, graph, data visual, screenshare). That gives exactly five recipes:
45
+
46
+ 1. **`zoom_call_center`** — 1 person, tight zoom-call / webcam framing.
47
+ 2. **`sit_center`** — 1 person, interview / seated framing.
48
+ 3. **`split_chart_person`** — 1 chart + 1 person, stacked vertically
49
+ (default: **even 50/50** top/bottom, chart on top).
50
+ 4. **`split_two_persons`** — 2 speakers, stacked vertically.
51
+ 5. **`split_two_charts`** — 2 charts, stacked vertically.
52
+
53
+ Because the geometry is bounded, we do NOT need a general subject-tracker
54
+ ML model or a drag-to-highlight UI. We need five small, correct pieces of
55
+ crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py`
56
+ is.
57
+
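+ A minimal sketch of the enum side of that rule (assuming `LayoutKind` is the
+ string enum exported by `humeo_core`, as the JSON-contract list below
+ suggests):
+ 
+ ```python
+ from humeo_core import LayoutKind
+ 
+ # One enum value per recipe above.
+ for kind in LayoutKind:
+     print(kind.value)
+ # zoom_call_center, sit_center, split_chart_person,
+ # split_two_persons, split_two_charts
+ ```
+ 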
58
+ See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms
59
+ used across these docs (subject, crop, band, seam, bbox, layout, etc.).
60
+
61
+ ## Install
62
+
63
+ ```bash
64
+ uv venv
65
+ uv sync
66
+ ```
67
+
68
+ External requirements: `ffmpeg` and `ffprobe` on PATH.
69
+
70
+ `scenedetect` requires OpenCV. Install `opencv-python-headless` or
71
+ `opencv-python` alongside `scenedetect`.
72
+
73
+ ## Use it as an MCP server
74
+
75
+ ```bash
76
+ humeo-core # stdio transport (primary console script)
77
+ # humeo-mcp # same entrypoint — kept so existing MCP configs keep working
78
+ ```
79
+
80
+ Example Cursor/Claude Desktop config:
81
+
82
+ ```json
83
+ {
84
+ "mcpServers": {
85
+ "humeo": { "command": "humeo-core" }
86
+ }
87
+ }
88
+ ```
89
+
90
+ Tools exposed:
91
+
92
+ | Tool | Purpose |
93
+ | --------------------------------- | --------------------------------------------------------------------------- |
94
+ | `list_layouts` | Enumerate the 5 supported layouts. |
95
+ | `ingest` | Scene detection + keyframe extraction (+ optional transcript). |
96
+ | `classify_scenes` | Pixel-heuristic per-scene layout classification. |
97
+ | `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). |
98
+ | `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. |
99
+ | `select_clips` | Heuristic clip picker over a word-level transcript. |
100
+ | `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. |
101
+ | `build_render_cmd` | Build the ffmpeg command (no execution) — review before spend. |
102
+ | `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. |
103
+
104
+ Resource: `humeo://layouts` (JSON listing of the 5 layouts).
105
+
106
+ ### Three interchangeable region detectors
107
+
108
+ All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used:
109
+
110
+ ```
111
+ classify.py (pixel variance, no ML)
112
+ face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg
113
+ vision.py (multimodal LLM + OCR bboxes)
114
+ ```
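+ 
+ A sketch of that shared contract (field names from the JSON-contract list
+ below; the values are illustrative, not real detector output):
+ 
+ ```python
+ from humeo_core import BoundingBox, SceneRegions
+ 
+ regions = SceneRegions(
+     scene_id="scene_003",
+     person_bbox=BoundingBox(x1=0.55, y1=0.05, x2=1.00, y2=1.00,
+                             label="person", confidence=0.9),
+     chart_bbox=BoundingBox(x1=0.00, y1=0.10, x2=0.52, y2=0.95,
+                            label="chart", confidence=0.8),
+     ocr_text="",
+     raw_reason="illustrative example",
+ )
+ ```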
115
+
116
+ ## JSON contracts (non-negotiable)
117
+
118
+ All tools take and return Pydantic-validated JSON. The contracts live in
119
+ [`src/humeo_core/schemas.py`](src/humeo_core/schemas.py):
120
+
121
+ - `Scene` `{scene_id, start_time, end_time, keyframe_path?}`
122
+ - `TranscriptWord` `{word, start_time, end_time}`
123
+ - `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}`
124
+ - `SceneClassification` `{scene_id, layout, confidence, reason}`
125
+ - `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized)
126
+ - `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}`
127
+ - `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}`
128
+ - `ClipPlan` `{source_path, clips[]}`
129
+ - `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}`
130
+ - `RenderRequest` / `RenderResult`
131
+
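+ A minimal sketch of what "non-negotiable" means in practice (pydantic v2
+ `model_validate`; the omitted `LayoutInstruction` fields are assumed to have
+ defaults, as the examples in `docs/MCP_USAGE.md` suggest):
+ 
+ ```python
+ from humeo_core import LayoutInstruction
+ 
+ raw = {"clip_id": "001", "layout": "split_chart_person", "top_band_ratio": 0.5}
+ instr = LayoutInstruction.model_validate(raw)   # raises on malformed input
+ print(instr.model_dump_json())
+ ```
+ 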
132
+ ## First-principles decisions (what we intentionally did NOT build)
133
+
134
+ - **No giant subject-tracker ML.** The video format has 5 fixed layouts
135
+ (with a hard "max 2 items" rule); pixel-level tracking is not needed.
136
+ - **No drag-and-highlight UI.** An MCP tool is a better "UI" for an
137
+ agent-first workflow. If a human wants to override, they pass a
138
+ `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` /
139
+ `zoom`.
140
+ - **No end-to-end video→video model.** The HIVE paper's core insight is
141
+ that decomposed orchestration beats monolithic generation. We reify
142
+ that insight as nine small, composable tools.
143
+
144
+ ## Extending the pilot
145
+
146
+ - Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`.
147
+ - Plug a real reasoning model into `select_clips_with_llm(text_fn)`.
148
+ - Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)`
149
+ to get per-scene bboxes + OCR text, then feed the results back through
150
+ `classify_scenes_with_vision`. This is the scene-change → v3 images →
151
+ LLM+OCR → bbox path; see `../docs/SOLUTIONS.md §4` for rationale.
152
+ - All enforce strict JSON outputs, so bad model output can't corrupt
153
+ downstream stages.
154
+
155
+ ## Testing
156
+
157
+ ```bash
158
+ python -m pytest
159
+ ```
160
+
161
+ See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale.
162
+
163
+ ## License
164
+
165
+ MIT
humeo-core/docs/ARCHITECTURE.md ADDED
@@ -0,0 +1,128 @@
1
+ # Architecture — Reusable Rocket
2
+
3
+ > *"We don't need to build the door or windows — just a container with landing
4
+ > gear and thrusters that move in different directions."*
5
+ > — Bryan
6
+
7
+ That analogy maps exactly onto this MCP:
8
+
9
+ | Rocket part | Codebase | Purpose |
10
+ | --------------- | ---------------------------------------------------------------- | ----------------------------------------------------------------------- |
11
+ | Container | `src/humeo_core/schemas.py` | Strict JSON contracts every stage reads/writes. |
12
+ | Landing gear | `src/humeo_core/primitives/ingest.py` | Deterministic local extraction (scenes, keyframes, transcript). |
13
+ | Thrusters (×5) | `src/humeo_core/primitives/layouts.py` | Five fixed 9:16 crop/compose recipes (max 2 on-screen items). |
14
+ | Pilot | `primitives/classify.py` + `primitives/select_clips.py` | Heuristic + LLM-ready decision makers. |
15
+ | Compiler | `src/humeo_core/primitives/compile.py` | Deterministic ffmpeg assembly. |
16
+ | Control panel | `src/humeo_core/server.py` | MCP tool surface exposing every primitive to agents and clients. |
18
+
19
+ ## First-principles reasoning
20
+
21
+ The HIVE paper's core insight is that good short-video editing requires
22
+ **staged reasoning with strict intermediate artifacts**, not a single
23
+ giant model call. Three consequences flow from that:
24
+
25
+ 1. **Extraction must be local and deterministic.** No model call should
26
+ ever touch raw video bytes. `ingest.py` runs ffprobe + PySceneDetect
27
+ + ffmpeg + (optional) faster-whisper. Everything it emits is JSON or
28
+ a file path.
29
+
30
+ 2. **Reasoning must be decomposed into narrow sub-tasks.** Classifying a
31
+ scene's layout is a completely different task from selecting a viral
32
+ clip. Each has its own schema, its own prompt, its own validation.
33
+ This is why `primitives/` is five files instead of one.
34
+
35
+ 3. **Every model call must emit schema-validated JSON.** Free-form model
36
+ output is not allowed to enter the pipeline. `classify_scenes_with_llm`
37
+ and `select_clips_with_llm` both `model_validate(...)` the raw output
38
+ before returning; parse failures degrade gracefully to `SIT_CENTER` +
39
+ low confidence, not crashes.
40
+
41
+ ## Why only five layouts?
42
+
43
+ The hard rule for this format: **a short shows at most two on-screen
44
+ items**, where an "item" is a `person` or a `chart`. That gives exactly
45
+ five recipes — all implemented as pure functions from
46
+ `LayoutInstruction` to an ffmpeg filtergraph string in `layouts.py`:
47
+
48
+ | Layout | Items | Recipe |
49
+ | ---------------------- | --------------- | --------------------------------------------- |
50
+ | `zoom_call_center` | 1 person | tight centered 9:16 crop (zoom ≥ 1.25). |
51
+ | `sit_center` | 1 person | wider centered 9:16 crop. |
52
+ | `split_chart_person` | 1 chart + 1 person | source partitioned L/R by bboxes, stacked. |
53
+ | `split_two_persons` | 2 persons | L/R speakers, stacked top/bottom. |
54
+ | `split_two_charts` | 2 charts | L/R charts, stacked top/bottom. |
55
+
56
+ A general subject-tracker ML model is orders of magnitude more expensive
57
+ and less reliable than five hand-written crop recipes. If a new geometry
58
+ ever shows up in future source videos, adding a sixth thruster is
59
+ strictly additive: write a new `plan_*` function, add it to `_DISPATCH`,
60
+ add an enum variant. No existing code has to change.
61
+
62
+ ## 9:16 layout math
63
+
64
+ Source is assumed 16:9 (1920×1080 by default, but probed per-clip).
65
+ Target is 1080×1920. For each layout:
66
+
67
+ ### `zoom_call_center` and `sit_center`
68
+
69
+ Standard centered aspect-ratio crop to 9:16, then scale to 1080×1920:
70
+
71
+ ```
72
+ crop=cw:ch:x:y,scale=1080:1920:flags=lanczos,setsar=1[vout]
73
+ ```
74
+
75
+ `cw`, `ch` are the largest 9:16 window that fits in the source, divided
76
+ by `zoom`. `x`, `y` center the window on `person_x_norm` / 0.5.
77
+ Dimensions are rounded to even values so libx264 is happy. The window is
78
+ clamped inside the source so a high `person_x_norm` never crops outside.
79
+
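+ A pure-Python sketch of that window math (it mirrors the prose above; the
+ exact rounding/clamping order in `layouts.py` may differ):
+ 
+ ```python
+ def crop_window(src_w: int, src_h: int, zoom: float, person_x_norm: float):
+     # Largest 9:16 window that fits a 16:9 source, shrunk by zoom,
+     # rounded to even dimensions (libx264), then clamped inside the frame.
+     ch = int(src_h / zoom) // 2 * 2
+     cw = int(src_h * 9 / 16 / zoom) // 2 * 2
+     x = int(min(max(person_x_norm * src_w - cw / 2, 0), src_w - cw))
+     y = int(min(max(0.5 * src_h - ch / 2, 0), src_h - ch))
+     return cw, ch, x, y
+ 
+ # 1920x1080 source, zoom=1.25, subject at x=0.83 -> (486, 864, 1350, 108)
+ print(crop_window(1920, 1080, 1.25, 0.83))
+ ```
+ 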
80
+ ### Split layouts (`split_chart_person`, `split_two_persons`, `split_two_charts`)
81
+
82
+ All three splits share one recipe — only the items differ:
83
+
84
+ 1. **Horizontal partition.** The source is cut at a single vertical seam
85
+ so the two source strips are **complementary** (no overlap, no gap).
86
+ When both bboxes are set (Gemini vision), the seam is the midpoint
87
+ between `left.x2` and `right.x1`. Otherwise the seam defaults to
88
+ either an even 50/50 (two-of-a-kind splits) or a 2/3 | 1/3 split
89
+ (legacy `split_chart_person` fallback).
90
+ 2. **Vertical crop.** Each strip's vertical extent comes from the
91
+ corresponding bbox when provided, so each item **fills** its output
92
+ band instead of being lost in full-height source context.
93
+ 3. **Cover-scale to the band.** Each strip is scaled with
94
+ `force_original_aspect_ratio=increase` + center-cropped to the band
95
+ dimensions. Bands are always fully painted; no letterbox bars.
96
+ 4. **Stack.** Two branches produced by `split=2` are `vstack`-ed into
97
+ the final 1080×1920.
98
+
99
+ **Band heights** are controlled by `LayoutInstruction.top_band_ratio`,
100
+ which defaults to **0.5** (even 50/50 — the symmetric look Bryan asked
101
+ for after the uneven Cathie Wood shorts). Legacy 60/40 is still reachable
102
+ by setting `top_band_ratio=0.6`.
103
+
104
+ **Stack order** (for `split_chart_person`) is controlled by
105
+ `focus_stack_order`: chart-on-top (default) or person-on-top.
106
+
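+ A sketch of the seam and band arithmetic above (step 1 plus the band-height
+ rule; `layouts.py` remains the source of truth):
+ 
+ ```python
+ def seam_and_bands(left_x2=None, right_x1=None, top_band_ratio=0.5, out_h=1920):
+     if left_x2 is not None and right_x1 is not None:
+         seam = (left_x2 + right_x1) / 2   # midpoint between the two bboxes
+     else:
+         seam = 0.5   # even fallback (2/3 | 1/3 for legacy split_chart_person)
+     top_h = int(out_h * top_band_ratio) // 2 * 2   # even for libx264
+     return seam, top_h, out_h - top_h
+ 
+ # Vision bboxes ending at x=0.52 / starting at x=0.55 -> seam 0.535, 960/960
+ print(seam_and_bands(0.52, 0.55))
+ ```
+ 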
107
+ ## Extensibility story
108
+
109
+ - **Smarter classifier:** implement `LLMVisionFn` with any multimodal
110
+ model and pass it to `classify_scenes_with_llm`. The fallback heuristic
111
+ stays available for offline runs and tests.
112
+ - **Smarter clip selector:** same pattern, `LLMTextFn` → `select_clips_with_llm`.
113
+ - **New layout:** add a `plan_*` planner, register in `_DISPATCH`, add a
114
+ `LayoutKind` variant. Tests in `test_layouts.py` automatically iterate
115
+ over all `LayoutKind`s, so the dispatch coverage test will catch a
116
+ missing registration immediately.
117
+
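+ What "strictly additive" looks like in practice — a hypothetical sixth
+ thruster (the `plan_*` signature and `_DISPATCH` registration shape are
+ assumptions; only the base recipe string is real ffmpeg filter syntax):
+ 
+ ```python
+ def plan_pip_overlay(instruction) -> str:
+     # Hypothetical layout: start from the centered 9:16 base crop; a real
+     # sixth layout would add its own compose steps before emitting [vout].
+     return (
+         "crop=ih*9/16:ih:(iw-ih*9/16)/2:0,"
+         "scale=1080:1920:flags=lanczos,setsar=1[vout]"
+     )
+ 
+ # layouts.py: _DISPATCH[LayoutKind.PIP_OVERLAY] = plan_pip_overlay
+ # schemas.py: add PIP_OVERLAY = "pip_overlay" to LayoutKind
+ ```
+ 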
118
+ ## What we intentionally did NOT build
119
+
120
+ - Drag-and-highlight subject-selector UI.
121
+ - A general ML subject-tracker.
122
+ - A monolithic video-in-video-out model.
123
+ - Any network calls in the core library. The MCP server is stdio-only;
124
+ the CLI runs fully offline.
125
+
126
+ This keeps the rocket **reusable**: the same primitives power the MCP
127
+ server, the CLI, a Python library, and (soon) a web UI if that's ever
128
+ warranted.
humeo-core/docs/MCP_USAGE.md ADDED
@@ -0,0 +1,100 @@
1
+ # Using humeo-core from an MCP client
2
+
3
+ The installed console command is **`humeo-core`**. For backward compatibility,
4
+ **`humeo-mcp`** is also registered (same entrypoint); either name works as
+ the `"command"` value, provided the install's scripts are on `PATH`.
6
+
7
+ ## 1. Add to your client
8
+
9
+ `claude_desktop_config.json` or `.cursor/mcp.json`:
10
+
11
+ ```json
12
+ {
13
+ "mcpServers": {
14
+ "humeo": {
15
+ "command": "humeo-core"
16
+ }
17
+ }
18
+ }
19
+ ```
20
+
21
+ ## 2. A typical agent plan
22
+
23
+ ```
24
+ → humeo.list_layouts()
25
+ # discover the 5 layouts (max 2 on-screen items per short)
26
+
27
+ → humeo.ingest(source_path="/abs/long.mp4", work_dir="/abs/work", with_transcript=true)
28
+ # IngestResult: scenes[], keyframes, transcript_words[]
29
+
30
+ → humeo.classify_scenes(scenes=<IngestResult.scenes>)
31
+ # SceneClassification[] — one layout per scene
32
+
33
+ → humeo.select_clips(
34
+ source_path=..., transcript_words=..., duration_sec=...,
35
+ target_count=5, min_sec=30, max_sec=60
36
+ )
37
+ # ClipPlan — top non-overlapping clips
38
+
39
+ # For each clip, pick the layout of the scene its midpoint falls in,
40
+ # build a LayoutInstruction, and:
41
+
42
+ → humeo.build_render_cmd(request={...})
43
+ # dry-run: returns the exact ffmpeg argv, no execution
44
+
45
+ → humeo.render_clip(request={..., "mode": "normal"})
46
+ # actually renders the 9:16 MP4
47
+ ```
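+ 
+ The same plan can be driven programmatically — a sketch using the official
+ `mcp` Python SDK (already a dependency via `mcp[cli]`; the tool names are
+ real, the argument payloads abbreviated):
+ 
+ ```python
+ import asyncio
+ 
+ from mcp import ClientSession, StdioServerParameters
+ from mcp.client.stdio import stdio_client
+ 
+ async def main() -> None:
+     server = StdioServerParameters(command="humeo-core")
+     async with stdio_client(server) as (read, write):
+         async with ClientSession(read, write) as session:
+             await session.initialize()
+             layouts = await session.call_tool("list_layouts", arguments={})
+             print(layouts)
+ 
+ asyncio.run(main())
+ ```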
48
+
49
+ ## 3. Strict JSON all the way
50
+
51
+ Every request/response is validated against the schemas in
52
+ [`schemas.py`](../src/humeo_core/schemas.py). Invalid input is rejected
53
+ *before* ffmpeg is touched, so a confused agent can't accidentally
54
+ rm-rf your disk or burn GPU hours.
55
+
56
+ ## 4. Override knobs
57
+
58
+ `LayoutInstruction` accepts:
59
+
60
+ - `zoom`, `person_x_norm`, `chart_x_norm` — single-subject knobs.
61
+ - `split_chart_region`, `split_person_region`,
62
+ `split_second_chart_region`, `split_second_person_region` —
63
+ normalized bboxes that drive split-layout cropping.
64
+ - `top_band_ratio` — fraction of output height used by the top band
65
+ (default 0.5 = even 50/50, the symmetric look).
66
+ - `focus_stack_order` — for `split_chart_person`, chart-on-top vs
67
+ person-on-top.
68
+
69
+ Example: chart + person with a precise bbox crop and an even split.
70
+
71
+ ```json
72
+ {
73
+ "clip_id": "001",
74
+ "layout": "split_chart_person",
75
+ "split_chart_region": {"x1": 0.00, "y1": 0.10, "x2": 0.52, "y2": 0.95},
76
+ "split_person_region": {"x1": 0.55, "y1": 0.05, "x2": 1.00, "y2": 1.00},
77
+ "top_band_ratio": 0.5,
78
+ "focus_stack_order": "chart_then_person"
79
+ }
80
+ ```
81
+
82
+ Example: two-speaker interview.
83
+
84
+ ```json
85
+ {
86
+ "clip_id": "002",
87
+ "layout": "split_two_persons",
88
+ "split_person_region": {"x1": 0.02, "y1": 0.05, "x2": 0.48, "y2": 0.95},
89
+ "split_second_person_region": {"x1": 0.52, "y1": 0.05, "x2": 0.98, "y2": 0.95}
90
+ }
91
+ ```
92
+
93
+ ## 5. When to stay in dry-run
94
+
95
+ - You want to show an approval UI before spending CPU.
96
+ - You want to diff the planned ffmpeg commands against a previous run.
97
+ - You're building tests.
98
+
99
+ `mode="dry_run"` is always safe, never writes output, and returns the
100
+ exact argv list.
humeo-core/examples/render_request.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "source_path": "/absolute/path/to/long.mp4",
3
+ "clip": {
4
+ "clip_id": "001",
5
+ "topic": "Prediction Market Explosion",
6
+ "start_time_sec": 289.0,
7
+ "end_time_sec": 331.5,
8
+ "viral_hook": "Prediction markets could explode to $5 trillion.",
9
+ "virality_score": 0.94,
10
+ "transcript": "Full text for subtitle generation...",
11
+ "suggested_overlay_title": "$5T Prediction Markets"
12
+ },
13
+ "layout": {
14
+ "clip_id": "001",
15
+ "layout": "split_chart_person",
16
+ "zoom": 1.0,
17
+ "person_x_norm": 0.83,
18
+ "chart_x_norm": 0.0
19
+ },
20
+ "output_path": "/absolute/path/to/out/clip_001.mp4",
21
+ "title_text": "$5T Prediction Markets",
22
+ "mode": "dry_run"
23
+ }
humeo-core/pyproject.toml ADDED
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "humeo-core"
7
+ version = "0.1.0"
8
+ description = "Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Humeo" }]
13
+ keywords = ["mcp", "video", "shorts", "ffmpeg", "editing", "humeo", "hive"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.10",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ ]
20
+ dependencies = [
21
+ "mcp[cli]>=1.2.0",
22
+ "pydantic>=2.0",
23
+ "scenedetect>=0.6",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ transcribe = ["faster-whisper>=1.0"]
28
+ download = ["yt-dlp>=2024.0"]
29
+ face = ["mediapipe>=0.10", "opencv-python>=4.8"]
30
+ vision = ["Pillow>=10.0"]
31
+ dev = ["pytest>=7", "pytest-asyncio>=0.23", "Pillow>=10.0"]
32
+
33
+ [project.scripts]
34
+ humeo-core = "humeo_core.server:main"
35
+ # Backward-compatible entry point (same module); existing MCP configs may still call `humeo-mcp`.
36
+ humeo-mcp = "humeo_core.server:main"
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+
41
+ [tool.setuptools.package-data]
42
+ humeo_core = ["assets/fonts/*"]
43
+
44
+ [tool.pytest.ini_options]
45
+ testpaths = ["tests"]
46
+ addopts = "-ra -q"
humeo-core/src/humeo_core.egg-info/PKG-INFO ADDED
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: humeo-core
3
+ Version: 0.1.0
4
+ Summary: Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints).
5
+ Author: Humeo
6
+ License: MIT
7
+ Keywords: mcp,video,shorts,ffmpeg,editing,humeo,hive
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: mcp[cli]>=1.2.0
16
+ Requires-Dist: pydantic>=2.0
17
+ Requires-Dist: scenedetect>=0.6
18
+ Provides-Extra: transcribe
19
+ Requires-Dist: faster-whisper>=1.0; extra == "transcribe"
20
+ Provides-Extra: download
21
+ Requires-Dist: yt-dlp>=2024.0; extra == "download"
22
+ Provides-Extra: face
23
+ Requires-Dist: mediapipe>=0.10; extra == "face"
24
+ Requires-Dist: opencv-python>=4.8; extra == "face"
25
+ Provides-Extra: vision
26
+ Requires-Dist: Pillow>=10.0; extra == "vision"
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7; extra == "dev"
29
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
30
+ Requires-Dist: Pillow>=10.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # humeo-core
34
+
35
+ **Reusable-rocket MCP server for long-video → 9:16 shorts.**
36
+
37
+ First-principles design, from the HIVE paper + Bryan's rocket analogy:
38
+ we don't build doors and windows (general subject-tracker UI, retraining
39
+ models). We build the **container** (schemas), **landing gear** (deterministic
40
+ local extraction), and **five thrusters** (the five 9:16 layouts this video
41
+ format actually uses). Everything else is pluggable.
42
+
43
+ ## The rocket, in one picture
44
+
45
+ ```
46
+ ┌──────────────────────────────────────────┐
47
+ │ Control panel (MCP tools) │ <- any MCP client
48
+ └───────────────────┬──────────────────────┘
49
+ │ strict JSON
50
+ ┌────────────────┬───────────┼────────────────┬─────────────────┐
51
+ ▼ ▼ ▼ ▼ ▼
52
+ ingest classify_scenes select_clips plan_layout render_clip
53
+ (scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile,
54
+ keyframes + classifier) heuristic + pure filter dry-run safe)
55
+ transcript) LLM-ready) math)
56
+
57
+
58
+ ┌────────────────────┐
59
+ │ LayoutKind │
60
+ │ ──────────────── │
61
+ │ zoom_call_center │
62
+ │ sit_center │
63
+ │ split_chart_person│
64
+ │ split_two_persons │
65
+ │ split_two_charts │
66
+ └────────────────────┘
67
+ ```
68
+
69
+ Only the classifier and clip-selector have optional LLM hooks; everything
70
+ else is deterministic, local, and cheap.
71
+
72
+ ## Why five layouts? (the "max 2 items" rule)
73
+
74
+ The hard constraint for this format: **a short shows at most two on-screen
75
+ items** — where an "item" is a `person` (a human speaker) or a `chart`
76
+ (slide, graph, data visual, screenshare). That gives exactly five recipes:
77
+
78
+ 1. **`zoom_call_center`** — 1 person, tight zoom-call / webcam framing.
79
+ 2. **`sit_center`** — 1 person, interview / seated framing.
80
+ 3. **`split_chart_person`** — 1 chart + 1 person, stacked vertically
81
+ (default: **even 50/50** top/bottom, chart on top).
82
+ 4. **`split_two_persons`** — 2 speakers, stacked vertically.
83
+ 5. **`split_two_charts`** — 2 charts, stacked vertically.
84
+
85
+ Because the geometry is bounded, we do NOT need a general subject-tracker
86
+ ML model or a drag-to-highlight UI. We need five small, correct pieces of
87
+ crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py`
88
+ is.
89
+
90
+ See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms
91
+ used across these docs (subject, crop, band, seam, bbox, layout, etc.).
92
+
93
+ ## Install
94
+
95
+ ```bash
96
+ uv venv
97
+ uv sync
98
+ ```
99
+
100
+ External requirements: `ffmpeg` and `ffprobe` on PATH.
101
+
102
+ `scenedetect` requires OpenCV. Install `opencv-python-headless` or
103
+ `opencv-python` alongside `scenedetect`.
104
+
105
+ ## Use it as an MCP server
106
+
107
+ ```bash
108
+ humeo-core # stdio transport (primary console script)
109
+ # humeo-mcp # same entrypoint — kept so existing MCP configs keep working
110
+ ```
111
+
112
+ Example Cursor/Claude Desktop config:
113
+
114
+ ```json
115
+ {
116
+ "mcpServers": {
117
+ "humeo": { "command": "humeo-core" }
118
+ }
119
+ }
120
+ ```
121
+
122
+ Tools exposed:
123
+
124
+ | Tool | Purpose |
125
+ | --------------------------------- | --------------------------------------------------------------------------- |
126
+ | `list_layouts` | Enumerate the 5 supported layouts. |
127
+ | `ingest` | Scene detection + keyframe extraction (+ optional transcript). |
128
+ | `classify_scenes` | Pixel-heuristic per-scene layout classification. |
129
+ | `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). |
130
+ | `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. |
131
+ | `select_clips` | Heuristic clip picker over a word-level transcript. |
132
+ | `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. |
133
+ | `build_render_cmd` | Build the ffmpeg command (no execution) — review before spend. |
134
+ | `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. |
135
+
136
+ Resource: `humeo://layouts` (JSON listing of the 5 layouts).
137
+
138
+ ### Three interchangeable region detectors
139
+
140
+ All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used:
141
+
142
+ ```
143
+ classify.py (pixel variance, no ML)
144
+ face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg
145
+ vision.py (multimodal LLM + OCR bboxes)
146
+ ```
147
+
148
+ ## JSON contracts (non-negotiable)
149
+
150
+ All tools take and return Pydantic-validated JSON. The contracts live in
151
+ [`src/humeo_core/schemas.py`](src/humeo_core/schemas.py):
152
+
153
+ - `Scene` `{scene_id, start_time, end_time, keyframe_path?}`
154
+ - `TranscriptWord` `{word, start_time, end_time}`
155
+ - `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}`
156
+ - `SceneClassification` `{scene_id, layout, confidence, reason}`
157
+ - `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized)
158
+ - `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}`
159
+ - `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}`
160
+ - `ClipPlan` `{source_path, clips[]}`
161
+ - `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}`
162
+ - `RenderRequest` / `RenderResult`
163
+
164
+ ## First-principles decisions (what we intentionally did NOT build)
165
+
166
+ - **No giant subject-tracker ML.** The video format has 5 fixed layouts
167
+ (with a hard "max 2 items" rule); pixel-level tracking is not needed.
168
+ - **No drag-and-highlight UI.** An MCP tool is a better "UI" for an
169
+ agent-first workflow. If a human wants to override, they pass a
170
+ `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` /
171
+ `zoom`.
172
+ - **No end-to-end video→video model.** The HIVE paper's core insight is
173
+ that decomposed orchestration beats monolithic generation. We reify
174
+ that insight as nine small, composable tools.
175
+
176
+ ## Extending the pilot
177
+
178
+ - Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`.
179
+ - Plug a real reasoning model into `select_clips_with_llm(text_fn)`.
180
+ - Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)`
181
+ to get per-scene bboxes + OCR text, then feed the results back through
182
+ `classify_scenes_with_vision`. This is the scene-change → v3 images →
183
+ LLM+OCR → bbox path; see `../docs/SOLUTIONS.md §4` for rationale.
184
+ - All enforce strict JSON outputs, so bad model output can't corrupt
185
+ downstream stages.
186
+
187
+ ## Testing
188
+
189
+ ```bash
190
+ python -m pytest
191
+ ```
192
+
193
+ See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale.
194
+
195
+ ## License
196
+
197
+ MIT
humeo-core/src/humeo_core.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,33 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/humeo_core/__init__.py
5
+ src/humeo_core/schemas.py
6
+ src/humeo_core/server.py
7
+ src/humeo_core.egg-info/PKG-INFO
8
+ src/humeo_core.egg-info/SOURCES.txt
9
+ src/humeo_core.egg-info/dependency_links.txt
10
+ src/humeo_core.egg-info/entry_points.txt
11
+ src/humeo_core.egg-info/requires.txt
12
+ src/humeo_core.egg-info/top_level.txt
13
+ src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf
14
+ src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt
15
+ src/humeo_core/assets/fonts/SourceSans3-OFL.txt
16
+ src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf
17
+ src/humeo_core/primitives/__init__.py
18
+ src/humeo_core/primitives/classify.py
19
+ src/humeo_core/primitives/compile.py
20
+ src/humeo_core/primitives/face_detect.py
21
+ src/humeo_core/primitives/ingest.py
22
+ src/humeo_core/primitives/layouts.py
23
+ src/humeo_core/primitives/select_clips.py
24
+ src/humeo_core/primitives/vision.py
25
+ tests/test_classify.py
26
+ tests/test_compile.py
27
+ tests/test_face_detect.py
28
+ tests/test_layout_bbox.py
29
+ tests/test_layouts.py
30
+ tests/test_schemas.py
31
+ tests/test_select_clips.py
32
+ tests/test_server_tools.py
33
+ tests/test_vision.py
humeo-core/src/humeo_core.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
humeo-core/src/humeo_core.egg-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ humeo-core = humeo_core.server:main
3
+ humeo-mcp = humeo_core.server:main
humeo-core/src/humeo_core.egg-info/requires.txt ADDED
@@ -0,0 +1,21 @@
1
+ mcp[cli]>=1.2.0
2
+ pydantic>=2.0
3
+ scenedetect>=0.6
4
+
5
+ [dev]
6
+ pytest>=7
7
+ pytest-asyncio>=0.23
8
+ Pillow>=10.0
9
+
10
+ [download]
11
+ yt-dlp>=2024.0
12
+
13
+ [face]
14
+ mediapipe>=0.10
15
+ opencv-python>=4.8
16
+
17
+ [transcribe]
18
+ faster-whisper>=1.0
19
+
20
+ [vision]
21
+ Pillow>=10.0
humeo-core/src/humeo_core.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ humeo_core
humeo-core/src/humeo_core/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ """humeo-core: reusable-rocket MCP primitives for long-video-to-shorts editing.
2
+
3
+ First-principles design (rocket analogy):
4
+ Container -> schemas.py (strict JSON contracts)
5
+ Landing gear -> primitives/ingest.py, primitives/compile.py (deterministic local)
6
+ Thrusters -> primitives/layouts.py (5 fixed 9:16 layouts, max 2 items)
7
+ Pilot -> primitives/classify.py, primitives/select_clips.py (heuristic, LLM-ready)
8
+ Control panel -> server.py (FastMCP tools that expose all primitives)
9
+ """
10
+
11
+ from .schemas import (
12
+ BoundingBox,
13
+ Clip,
14
+ ClipPlan,
15
+ ClipRenderSpan,
16
+ ClipSubtitleWords,
17
+ FocusStackOrder,
18
+ IngestResult,
19
+ LayoutInstruction,
20
+ LayoutKind,
21
+ RenderRequest,
22
+ RenderResult,
23
+ RenderTheme,
24
+ Scene,
25
+ SceneClassification,
26
+ SceneRegions,
27
+ TranscriptWord,
28
+ )
29
+
30
+ __all__ = [
31
+ "BoundingBox",
32
+ "Clip",
33
+ "ClipPlan",
34
+ "ClipRenderSpan",
35
+ "ClipSubtitleWords",
36
+ "FocusStackOrder",
37
+ "IngestResult",
38
+ "LayoutInstruction",
39
+ "LayoutKind",
40
+ "RenderRequest",
41
+ "RenderResult",
42
+ "RenderTheme",
43
+ "Scene",
44
+ "SceneClassification",
45
+ "SceneRegions",
46
+ "TranscriptWord",
47
+ ]
48
+
49
+ __version__ = "0.1.0"
humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf ADDED
Binary file (95.1 kB).
humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt ADDED
@@ -0,0 +1,93 @@
1
+ Copyright 2020 The League Spartan Project Authors (https://github.com/theleagueof/league-spartan)
2
+
3
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
4
+ This license is copied below, and is also available with a FAQ at:
5
+ https://scripts.sil.org/OFL
6
+
7
+
8
+ -----------------------------------------------------------
9
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
10
+ -----------------------------------------------------------
11
+
12
+ PREAMBLE
13
+ The goals of the Open Font License (OFL) are to stimulate worldwide
14
+ development of collaborative font projects, to support the font creation
15
+ efforts of academic and linguistic communities, and to provide a free and
16
+ open framework in which fonts may be shared and improved in partnership
17
+ with others.
18
+
19
+ The OFL allows the licensed fonts to be used, studied, modified and
20
+ redistributed freely as long as they are not sold by themselves. The
21
+ fonts, including any derivative works, can be bundled, embedded,
22
+ redistributed and/or sold with any software provided that any reserved
23
+ names are not used by derivative works. The fonts and derivatives,
24
+ however, cannot be released under any other type of license. The
25
+ requirement for fonts to remain under this license does not apply
26
+ to any document created using the fonts or their derivatives.
27
+
28
+ DEFINITIONS
29
+ "Font Software" refers to the set of files released by the Copyright
30
+ Holder(s) under this license and clearly marked as such. This may
31
+ include source files, build scripts and documentation.
32
+
33
+ "Reserved Font Name" refers to any names specified as such after the
34
+ copyright statement(s).
35
+
36
+ "Original Version" refers to the collection of Font Software components as
37
+ distributed by the Copyright Holder(s).
38
+
39
+ "Modified Version" refers to any derivative made by adding to, deleting,
40
+ or substituting -- in part or in whole -- any of the components of the
41
+ Original Version, by changing formats or by porting the Font Software to a
42
+ new environment.
43
+
44
+ "Author" refers to any designer, engineer, programmer, technical
45
+ writer or other person who contributed to the Font Software.
46
+
47
+ PERMISSION & CONDITIONS
48
+ Permission is hereby granted, free of charge, to any person obtaining
49
+ a copy of the Font Software, to use, study, copy, merge, embed, modify,
50
+ redistribute, and sell modified and unmodified copies of the Font
51
+ Software, subject to the following conditions:
52
+
53
+ 1) Neither the Font Software nor any of its individual components,
54
+ in Original or Modified Versions, may be sold by itself.
55
+
56
+ 2) Original or Modified Versions of the Font Software may be bundled,
57
+ redistributed and/or sold with any software, provided that each copy
58
+ contains the above copyright notice and this license. These can be
59
+ included either as stand-alone text files, human-readable headers or
60
+ in the appropriate machine-readable metadata fields within text or
61
+ binary files as long as those fields can be easily viewed by the user.
62
+
63
+ 3) No Modified Version of the Font Software may use the Reserved Font
64
+ Name(s) unless explicit written permission is granted by the corresponding
65
+ Copyright Holder. This restriction only applies to the primary font name as
66
+ presented to the users.
67
+
68
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
69
+ Software shall not be used to promote, endorse or advertise any
70
+ Modified Version, except to acknowledge the contribution(s) of the
71
+ Copyright Holder(s) and the Author(s) or with their explicit written
72
+ permission.
73
+
74
+ 5) The Font Software, modified or unmodified, in part or in whole,
75
+ must be distributed entirely under this license, and must not be
76
+ distributed under any other license. The requirement for fonts to
77
+ remain under this license does not apply to any document created
78
+ using the Font Software.
79
+
80
+ TERMINATION
81
+ This license becomes null and void if any of the above conditions are
82
+ not met.
83
+
84
+ DISCLAIMER
85
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
87
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
88
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
89
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
91
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
93
+ OTHER DEALINGS IN THE FONT SOFTWARE.
humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt ADDED
@@ -0,0 +1,93 @@
1
+ Copyright 2010-2020 Adobe (http://www.adobe.com/), with Reserved Font Name 'Source'. All Rights Reserved. Source is a trademark of Adobe in the United States and/or other countries.
2
+
3
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
4
+
5
+ This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL
6
+
7
+
8
+ -----------------------------------------------------------
9
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
10
+ -----------------------------------------------------------
11
+
12
+ PREAMBLE
13
+ The goals of the Open Font License (OFL) are to stimulate worldwide
14
+ development of collaborative font projects, to support the font creation
15
+ efforts of academic and linguistic communities, and to provide a free and
16
+ open framework in which fonts may be shared and improved in partnership
17
+ with others.
18
+
19
+ The OFL allows the licensed fonts to be used, studied, modified and
20
+ redistributed freely as long as they are not sold by themselves. The
21
+ fonts, including any derivative works, can be bundled, embedded,
22
+ redistributed and/or sold with any software provided that any reserved
23
+ names are not used by derivative works. The fonts and derivatives,
24
+ however, cannot be released under any other type of license. The
25
+ requirement for fonts to remain under this license does not apply
26
+ to any document created using the fonts or their derivatives.
27
+
28
+ DEFINITIONS
29
+ "Font Software" refers to the set of files released by the Copyright
30
+ Holder(s) under this license and clearly marked as such. This may
31
+ include source files, build scripts and documentation.
32
+
33
+ "Reserved Font Name" refers to any names specified as such after the
34
+ copyright statement(s).
35
+
36
+ "Original Version" refers to the collection of Font Software components as
37
+ distributed by the Copyright Holder(s).
38
+
39
+ "Modified Version" refers to any derivative made by adding to, deleting,
40
+ or substituting -- in part or in whole -- any of the components of the
41
+ Original Version, by changing formats or by porting the Font Software to a
42
+ new environment.
43
+
44
+ "Author" refers to any designer, engineer, programmer, technical
45
+ writer or other person who contributed to the Font Software.
46
+
47
+ PERMISSION & CONDITIONS
48
+ Permission is hereby granted, free of charge, to any person obtaining
49
+ a copy of the Font Software, to use, study, copy, merge, embed, modify,
50
+ redistribute, and sell modified and unmodified copies of the Font
51
+ Software, subject to the following conditions:
52
+
53
+ 1) Neither the Font Software nor any of its individual components,
54
+ in Original or Modified Versions, may be sold by itself.
55
+
56
+ 2) Original or Modified Versions of the Font Software may be bundled,
57
+ redistributed and/or sold with any software, provided that each copy
58
+ contains the above copyright notice and this license. These can be
59
+ included either as stand-alone text files, human-readable headers or
60
+ in the appropriate machine-readable metadata fields within text or
61
+ binary files as long as those fields can be easily viewed by the user.
62
+
63
+ 3) No Modified Version of the Font Software may use the Reserved Font
64
+ Name(s) unless explicit written permission is granted by the corresponding
65
+ Copyright Holder. This restriction only applies to the primary font name as
66
+ presented to the users.
67
+
68
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
69
+ Software shall not be used to promote, endorse or advertise any
70
+ Modified Version, except to acknowledge the contribution(s) of the
71
+ Copyright Holder(s) and the Author(s) or with their explicit written
72
+ permission.
73
+
74
+ 5) The Font Software, modified or unmodified, in part or in whole,
75
+ must be distributed entirely under this license, and must not be
76
+ distributed under any other license. The requirement for fonts to
77
+ remain under this license does not apply to any document created
78
+ using the Font Software.
79
+
80
+ TERMINATION
81
+ This license becomes null and void if any of the above conditions are
82
+ not met.
83
+
84
+ DISCLAIMER
85
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
87
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
88
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
89
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
91
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
93
+ OTHER DEALINGS IN THE FONT SOFTWARE.
humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e3ab05ccd7cb94907c31005bb5bec1d5432f0b096a2b782976e217a540eb6c
3
+ size 395372
humeo-core/src/humeo_core/primitives/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Primitives: deterministic, composable building blocks of the rocket."""
humeo-core/src/humeo_core/primitives/classify.py ADDED
@@ -0,0 +1,232 @@
1
+ """Scene classifier: assigns one of the 5 layouts to each scene.
2
+
3
+ Two backends share the same contract:
4
+
5
+ * ``classify_scenes_heuristic`` — no model call. Uses keyframe pixel analysis
+ (column-brightness variance over a down-sampled grayscale keyframe; no face
+ detection, no ML) to guess which of the 5 layouts fits best. Fully offline, deterministic.
8
+ Note: the heuristic only picks between ``SIT_CENTER`` / ``ZOOM_CALL_CENTER`` /
9
+ ``SPLIT_CHART_PERSON``; the two-of-a-kind splits (``SPLIT_TWO_PERSONS`` /
10
+ ``SPLIT_TWO_CHARTS``) are only selectable by the vision-LLM backend.
11
+ * ``classify_scenes_with_llm`` — pluggable LLM hook. Takes a callable
12
+ ``(image_path, prompt) -> str`` so the caller (MCP client or test) can
13
+ wire up whatever multimodal model they want. Enforces strict JSON output.
14
+
15
+ Even without a model, the heuristic is good enough for many real inputs and
16
+ keeps the whole pipeline runnable with zero external dependencies.
17
+ """
18
+
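+ # Example usage (a sketch; assumes an ``IngestResult`` named ``result`` from
+ # ingest.py):
+ #
+ #     from humeo_core.primitives.classify import classify_scenes_heuristic
+ #     labels = classify_scenes_heuristic(result.scenes)
+ #     for c in labels:
+ #         print(c.scene_id, c.layout.value, round(c.confidence, 2), c.reason)
+ 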
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import struct
24
+ from typing import Callable, Iterable
25
+
26
+ from ..schemas import LayoutKind, Scene, SceneClassification
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Tiny PNG/JPEG reader → down-sampled grayscale column profile
31
+ # ---------------------------------------------------------------------------
32
+ # We intentionally avoid a hard dependency on Pillow. If Pillow is available
33
+ # we use it; otherwise the classifier degrades to a safe low-confidence
+ # default (``_png_dims`` below stays as a dependency-free dimension reader).
35
+
36
+
37
+ def _load_grayscale(path: str) -> tuple[list[list[int]], int, int] | None:
38
+ try:
39
+ from PIL import Image # type: ignore
40
+
41
+ img = Image.open(path).convert("L")
42
+ w, h = img.size
43
+ # Down-sample to at most 128 cols x 72 rows for cheap analysis.
44
+ tw = min(128, w)
45
+ th = min(72, h)
46
+ img = img.resize((tw, th))
47
+ px = list(img.getdata())
48
+ grid = [px[i * tw : (i + 1) * tw] for i in range(th)]
49
+ return grid, tw, th
50
+ except Exception:
51
+ return None
52
+
53
+
54
+ def _png_dims(path: str) -> tuple[int, int] | None:
55
+ try:
56
+ with open(path, "rb") as f:
57
+ head = f.read(24)
58
+ if head[:8] != b"\x89PNG\r\n\x1a\n":
59
+ return None
60
+ w, h = struct.unpack(">II", head[16:24])
61
+ return int(w), int(h)
62
+ except Exception:
63
+ return None
64
+
65
+
66
+ def _column_profile(grid: list[list[int]]) -> list[float]:
67
+ if not grid:
68
+ return []
69
+ h = len(grid)
70
+ w = len(grid[0])
71
+ out: list[float] = []
72
+ for x in range(w):
73
+ s = 0
74
+ for y in range(h):
75
+ s += grid[y][x]
76
+ out.append(s / h)
77
+ return out
78
+
79
+
80
+ def _variance(values: Iterable[float]) -> float:
81
+ vs = list(values)
82
+ if not vs:
83
+ return 0.0
84
+ m = sum(vs) / len(vs)
85
+ return sum((v - m) ** 2 for v in vs) / len(vs)
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Heuristic classifier
90
+ # ---------------------------------------------------------------------------
91
+
92
+
93
+ def _classify_one_heuristic(keyframe_path: str | None) -> SceneClassification:
94
+ if not keyframe_path or not os.path.exists(keyframe_path):
95
+ return SceneClassification(
96
+ scene_id="?",
97
+ layout=LayoutKind.SIT_CENTER,
98
+ confidence=0.3,
99
+ reason="no keyframe available — defaulting to SIT_CENTER",
100
+ )
101
+
102
+ gs = _load_grayscale(keyframe_path)
103
+ if gs is None:
104
+ # Can't read pixels: still return a safe default with low confidence.
105
+ return SceneClassification(
106
+ scene_id="?",
107
+ layout=LayoutKind.SIT_CENTER,
108
+ confidence=0.25,
109
+ reason="PIL unavailable or image unreadable — defaulting to SIT_CENTER",
110
+ )
111
+
112
+ grid, w, h = gs
113
+ cols = _column_profile(grid)
114
+
115
+ def _split_contrast(left: list[float], right: list[float]) -> float:
116
+ lm = sum(left) / max(1, len(left))
117
+ rm = sum(right) / max(1, len(right))
118
+ lv = _variance(left)
119
+ rv = _variance(right)
120
+ between = (lm - rm) ** 2
121
+ within = (lv + rv) / 2.0 + 1e-6
122
+ return between / within
123
+
124
+ # Left/right halves — good for symmetric two-up scenes.
125
+ mid = max(1, w // 2)
126
+ split_halves = _split_contrast(cols[:mid], cols[mid:])
127
+
128
+ # Left 2/3 vs right 1/3 — matches explainer slides (chart + talking head).
129
+ t = max(1, w // 3)
130
+ left_two_thirds = cols[: 2 * t]
131
+ right_one_third = cols[2 * t :]
132
+ split_thirds = _split_contrast(left_two_thirds, right_one_third)
133
+
134
+ split_score = max(split_halves, split_thirds)
135
+ # Overall column variance: low variance → flat composition (zoom call).
136
+ overall_var = _variance(cols)
137
+
138
+ # Threshold tuned on Ark-style 2/3 chart + 1/3 speaker; "thirds" score catches
139
+ # layouts where half-vs-half contrast was too weak (e.g. clip 005 vs 004).
140
+ if split_score > 20.0:
141
+ return SceneClassification(
142
+ scene_id="?",
143
+ layout=LayoutKind.SPLIT_CHART_PERSON,
144
+ confidence=min(0.95, 0.5 + split_score / 200.0),
145
+ reason=(
146
+ f"chart/person contrast (halves={split_halves:.1f}, "
147
+ f"thirds={split_thirds:.1f} → max={split_score:.1f})"
148
+ ),
149
+ )
150
+ if overall_var < 100.0:
151
+ return SceneClassification(
152
+ scene_id="?",
153
+ layout=LayoutKind.ZOOM_CALL_CENTER,
154
+ confidence=0.7,
155
+ reason=f"low column variance ({overall_var:.1f}) — flat centered framing",
156
+ )
157
+ return SceneClassification(
158
+ scene_id="?",
159
+ layout=LayoutKind.SIT_CENTER,
160
+ confidence=0.6,
161
+ reason=f"moderate composition (score={split_score:.1f}, var={overall_var:.1f})",
162
+ )
163
+
164
+
165
+ def classify_scenes_heuristic(scenes: list[Scene]) -> list[SceneClassification]:
166
+ out: list[SceneClassification] = []
167
+ for s in scenes:
168
+ r = _classify_one_heuristic(s.keyframe_path)
169
+ out.append(r.model_copy(update={"scene_id": s.scene_id}))
170
+ return out
171
+
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # LLM-backed classifier (caller provides the model hook)
175
+ # ---------------------------------------------------------------------------
176
+
177
+
178
+ LLMVisionFn = Callable[[str, str], str]
179
+ """Signature: (image_path, prompt) -> raw model string (expected JSON)."""
180
+
181
+
182
+ CLASSIFIER_PROMPT = """You are a scene layout classifier for a short-video editor.
183
+ Return ONLY a JSON object of the form:
184
+ {"layout": "<one of: zoom_call_center | sit_center | split_chart_person>",
185
+ "confidence": <0..1 float>,
186
+ "reason": "<=15 words"}
187
+
188
+ Layout definitions:
189
+ - zoom_call_center: one person on a video call (webcam grid / talking head tight crop), subject centered.
190
+ - sit_center: one person sitting in frame, subject centered, wider framing than a zoom call.
191
+ - split_chart_person: an explainer scene with a chart/graphic on the LEFT (~2/3 of frame) and a person on the RIGHT (~1/3).
192
+
193
+ Pick the single best match. No prose, no markdown, JSON only.
194
+ """
195
+
196
+
197
+ def classify_scenes_with_llm(
198
+ scenes: list[Scene], vision_fn: LLMVisionFn
199
+ ) -> list[SceneClassification]:
200
+ out: list[SceneClassification] = []
201
+ for s in scenes:
202
+ if not s.keyframe_path:
203
+ out.append(
204
+ SceneClassification(
205
+ scene_id=s.scene_id,
206
+ layout=LayoutKind.SIT_CENTER,
207
+ confidence=0.2,
208
+ reason="no keyframe",
209
+ )
210
+ )
211
+ continue
212
+ raw = vision_fn(s.keyframe_path, CLASSIFIER_PROMPT)
213
+ try:
214
+ data = json.loads(raw)
215
+ out.append(
216
+ SceneClassification(
217
+ scene_id=s.scene_id,
218
+ layout=LayoutKind(data["layout"]),
219
+ confidence=float(data.get("confidence", 0.5)),
220
+ reason=str(data.get("reason", ""))[:200],
221
+ )
222
+ )
223
+ except Exception as e:
224
+ out.append(
225
+ SceneClassification(
226
+ scene_id=s.scene_id,
227
+ layout=LayoutKind.SIT_CENTER,
228
+ confidence=0.25,
229
+ reason=f"LLM parse error: {e!r}",
230
+ )
231
+ )
232
+ return out
humeo-core/src/humeo_core/primitives/compile.py ADDED
@@ -0,0 +1,602 @@
+ """Compiler: assemble a final 9:16 clip from source + clip + layout instruction.
+
+ Builds the ffmpeg invocation, optionally runs it. Keeping ``dry_run`` as a
+ first-class mode means the MCP server can return the exact command without
+ executing — ideal for an agent that wants to review before spending CPU.
+
+ Rendering order is fixed and intentional:
+
+ 1. **Cut + crop/compose.** ``plan_layout`` produces the base filtergraph
+    that takes the source, applies the layout-specific crops, and emits a
+    labelled ``[vout]`` at the exact output resolution (e.g. 1080x1920).
+ 2. **Overlay title** (``drawtext``) — skipped for split layouts because
+    the source itself already has a slide/chart title and an extra overlay
+    just obscures content.
+ 3. **Subtitles.** ``subtitles`` filter runs **last** so text is drawn over
+    the finished composition, not the source. ``original_size`` is pinned
+    to the output resolution so libass coordinate math (MarginV, FontSize)
+    is in *output pixels*, not libass's default PlayResY=288 — which was
+    the bug behind the "subtitles blocked / floating in the middle" look.
+ 4. **Mux** with the source audio stream (``0:a:0``).
+ """
+
+ from __future__ import annotations
+
+ import os
+ import shutil
+ import subprocess
+ import tempfile
+ from pathlib import Path
+
+ from ..schemas import RenderRequest, RenderResult, RenderTheme, SPLIT_LAYOUTS
+ from .layouts import plan_layout
+
+
+ def _ensure_ffmpeg() -> str:
+     exe = shutil.which("ffmpeg")
+     if not exe:
+         raise RuntimeError("ffmpeg not found on PATH")
+     return exe
+
+
+ def _ensure_windows_fontconfig() -> dict[str, str]:
+     """Return subprocess env with a minimal fontconfig setup on Windows.
+
+     Some Windows FFmpeg builds ship libass + fontconfig but do not bundle a
+     default fontconfig config, which makes subtitle rendering fail with:
+
+     ``Fontconfig error: Cannot load default config file: No such file: (null)``
+
+     We generate a tiny config that points fontconfig at ``C:/Windows/Fonts`` and
+     a writable cache dir under ``%LOCALAPPDATA%/humeo``. Non-Windows platforms
+     pass through the existing environment unchanged.
+     """
+     env = os.environ.copy()
+     if os.name != "nt":
+         return env
+     if env.get("FONTCONFIG_FILE"):
+         return env
+
+     local_appdata = Path(
+         env.get("LOCALAPPDATA", str(Path(tempfile.gettempdir()) / "humeo-local"))
+     )
+     cfg_dir = local_appdata / "humeo" / "fontconfig"
+     cache_dir = local_appdata / "humeo" / "fontconfig-cache"
+     cfg_dir.mkdir(parents=True, exist_ok=True)
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     cfg_file = cfg_dir / "fonts.conf"
+     windows_fonts = Path(env.get("WINDIR", r"C:\Windows")) / "Fonts"
+     if not cfg_file.exists():
+         cfg_file.write_text(
+             "\n".join(
+                 [
+                     '<?xml version="1.0"?>',
+                     "<fontconfig>",
+                     f"  <dir>{windows_fonts.as_posix()}</dir>",
+                     f"  <cachedir>{cache_dir.as_posix()}</cachedir>",
+                     "</fontconfig>",
+                     "",
+                 ]
+             ),
+             encoding="utf-8",
+         )
+
+     env["FONTCONFIG_PATH"] = str(cfg_dir)
+     env["FONTCONFIG_FILE"] = str(cfg_file)
+     return env
+
+
+ def _escape_drawtext(text: str) -> str:
+     # drawtext quoting is brittle across ffmpeg builds. Keep it simple:
+     # collapse whitespace, drop apostrophes, and escape the characters
+     # that are still significant to the filter parser.
+     safe = " ".join(text.split()).replace("'", "")
+     return safe.replace("\\", "\\\\").replace(":", "\\:")
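
For reference, a quick check of the escaping behaviour (the input string is hypothetical):

```python
# Sketch: apostrophes are dropped, whitespace collapsed, and colons escaped
# for the ffmpeg filter parser.
assert _escape_drawtext("It's  9:16") == "Its 9\\:16"
```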
+
+
+ # ---------------------------------------------------------------------------
+ # Title overlay planning
+ # ---------------------------------------------------------------------------
+ #
+ # ffmpeg ``drawtext`` does not wrap text by itself; whatever you hand it is
+ # emitted as a single line. With a fixed 72px font and no width budget, the
+ # "Prediction Markets vs Derivatives" title on a 1080px canvas would spill
+ # past both edges and show up clipped (the user reported exactly this bug).
+ #
+ # The helpers below plan a title layout BEFORE it hits drawtext:
+ #
+ # 1. Short titles (fit at 72px single line): emit the existing single
+ #    ``drawtext`` call unchanged so golden tests and previously-calibrated
+ #    visuals stay byte-for-byte identical.
+ # 2. Long titles: split at the best word boundary into two balanced lines and
+ #    emit two stacked ``drawtext`` filters at a slightly smaller font
+ #    (60px / 52px / 44px, auto-shrinking until both lines fit).
+ # 3. Single-word titles that still overflow: shrink the single line until it
+ #    fits, then hard-truncate with an ellipsis as a last resort.
+ #
+ # The character-width estimate is deliberately conservative (0.55 * fontsize)
+ # so mixed-case prose with wide letters like W/M still clears the margin.
+ # Calibrated visually against Arial Bold on 1080p output.
+
+ _TITLE_PRIMARY_SIZE = 72  # Current "hero" title size; preserved for short titles.
+ _TITLE_MIN_SIZE = 44  # Readability floor at 1080x1920 output.
+ _TITLE_MARGIN_PX = 60  # Horizontal safe-area on each side.
+ _TITLE_Y_TOP = 80  # Pixel offset of the top title baseline (matches pre-P2 look).
+ _TITLE_CHAR_WIDTH_RATIO = 0.55
+ _TITLE_LINE_SPACING_RATIO = 1.3
+
+ # Keep the overlay font explicit. Without a ``font=`` directive, drawtext
+ # falls back to fontconfig's "Sans", which resolves to a serif (Times New
+ # Roman) on default Windows installs — the "ugly serif title" bug reported
+ # against v1. Arial matches the ASS subtitle ``Fontname`` below so the
+ # title and captions read as a single typographic family. Keep this in
+ # sync with the ``Fontname=Arial`` in the subtitle filter if it ever
+ # changes.
+ _TITLE_FONT_NAME = "Arial"
+ _REFERENCE_TITLE_FONT_NAME = "League Spartan"
+ _REFERENCE_CAPTION_FONT_NAME = "Source Sans 3"
+ _REFERENCE_TITLE_BAR_X = 28
+ _REFERENCE_TITLE_BAR_Y = 32
+ _REFERENCE_TITLE_BAR_W = 1024
+ _REFERENCE_TITLE_BAR_H = 148
+ _REFERENCE_TITLE_TEXT_X = 72
+ _REFERENCE_TITLE_TEXT_Y = 54
+ _REFERENCE_TITLE_SIZE = 64
+ _REFERENCE_CAPTION_BAR_X = 0
+ _REFERENCE_CAPTION_BAR_W = 1080
+ _REFERENCE_CAPTION_BAR_H = 120
+ _REFERENCE_CAPTION_TEXT_MARGIN_L = 92
+ _REFERENCE_CAPTION_TEXT_MARGIN_R = 92
+
+
+ def _fonts_dir() -> Path:
+     return Path(__file__).resolve().parents[1] / "assets" / "fonts"
+
+
+ def _bundled_font_path(filename: str) -> Path | None:
+     path = _fonts_dir() / filename
+     return path if path.is_file() else None
+
+
+ def _title_char_px(size_px: int) -> float:
+     return size_px * _TITLE_CHAR_WIDTH_RATIO
+
+
+ def _title_fits(text: str, size_px: int, usable_w: int) -> bool:
+     return int(len(text) * _title_char_px(size_px)) <= usable_w
+
+
+ def _wrap_title_two_lines(text: str) -> tuple[str, str]:
+     """Split ``text`` at the word boundary that most balances the two halves.
+
+     Returns ``(line1, line2)``. If ``text`` has fewer than two words, returns
+     ``(text, "")`` and the caller should fall back to single-line shrinking.
+     """
+     words = text.split()
+     if len(words) < 2:
+         return text, ""
+     best_idx = 1
+     best_delta = 10**9
+     for i in range(1, len(words)):
+         left = " ".join(words[:i])
+         right = " ".join(words[i:])
+         delta = abs(len(left) - len(right))
+         if delta < best_delta:
+             best_delta = delta
+             best_idx = i
+     return " ".join(words[:best_idx]), " ".join(words[best_idx:])
+
+
+ def _drawtext_font_arg() -> str:
+     """Return a drawtext font selector that is stable on the current platform."""
+     if os.name == "nt":
+         arial = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts" / "arial.ttf"
+         if arial.is_file():
+             return f"fontfile='{_escape_filter_path(str(arial))}'"
+     return f"font={_TITLE_FONT_NAME}"
+
+
+ def _reference_title_font_arg() -> str:
+     bundled = _bundled_font_path("LeagueSpartan-Bold-static.ttf") or _bundled_font_path(
+         "LeagueSpartan-Bold.ttf"
+     )
+     if bundled is not None:
+         return f"fontfile='{_escape_filter_path(str(bundled))}'"
+     return f"font={_REFERENCE_TITLE_FONT_NAME}"
+
+
+ def _drawtext_single(text: str, size: int, y: int) -> str:
+     esc = _escape_drawtext(text)
+     return (
+         f"drawtext=text='{esc}':"
+         "expansion=none:"
+         f"{_drawtext_font_arg()}:"
+         f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:"
+         f"x=(w-text_w)/2:y={y}"
+     )
+
+
+ def _drawtext_two(line1: str, line2: str, size: int, y_top: int) -> str:
+     """Two drawtext filters chained by comma — one ffmpeg filter chain, two lines."""
+     esc1 = _escape_drawtext(line1)
+     esc2 = _escape_drawtext(line2)
+     y_bottom = y_top + int(round(size * _TITLE_LINE_SPACING_RATIO))
+     return (
+         f"drawtext=text='{esc1}':"
+         "expansion=none:"
+         f"{_drawtext_font_arg()}:"
+         f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:"
+         f"x=(w-text_w)/2:y={y_top},"
+         f"drawtext=text='{esc2}':"
+         "expansion=none:"
+         f"{_drawtext_font_arg()}:"
+         f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:"
+         f"x=(w-text_w)/2:y={y_bottom}"
+     )
+
+
+ def plan_title_drawtext(title_text: str, out_w: int = 1080) -> str | None:
+     """Return the ``drawtext`` filter fragment for ``title_text`` or None to skip.
+
+     The returned string is intended to be spliced into the main filtergraph
+     between the ``[v_prepad]`` and ``[vout]`` labels by
+     :func:`build_ffmpeg_cmd`. It does NOT include those labels itself.
+
+     Backward compatibility: when the title fits on one line at the original
+     72px size, the output is identical to the pre-P2 single-``drawtext``
+     form (same x/y/fontsize/borderw), so golden ffmpeg tests stay green.
+     """
+     text = " ".join((title_text or "").split())
+     if not text:
+         return None
+     usable_w = max(1, out_w - 2 * _TITLE_MARGIN_PX)
+
+     if _title_fits(text, _TITLE_PRIMARY_SIZE, usable_w):
+         return _drawtext_single(text, _TITLE_PRIMARY_SIZE, _TITLE_Y_TOP)
+
+     line1, line2 = _wrap_title_two_lines(text)
+     if line2:
+         for size in (60, 52, _TITLE_MIN_SIZE):
+             if _title_fits(line1, size, usable_w) and _title_fits(line2, size, usable_w):
+                 return _drawtext_two(line1, line2, size, _TITLE_Y_TOP)
+
+     for size in (64, 56, 52, _TITLE_MIN_SIZE):
+         if _title_fits(text, size, usable_w):
+             return _drawtext_single(text, size, _TITLE_Y_TOP)
+
+     max_chars = max(4, int(usable_w / _title_char_px(_TITLE_MIN_SIZE)))
+     truncated = text[: max_chars - 1].rstrip() + "..."
+     return _drawtext_single(truncated, _TITLE_MIN_SIZE, _TITLE_Y_TOP)
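
A quick sanity check of the planner (assertions assume the default constants above):

```python
# Sketch: a 33-character title cannot fit at 72px inside the 960px budget,
# so the planner balances it into "Prediction Markets" / "vs Derivatives"
# and emits two chained drawtext calls at the next size down (60px).
frag = plan_title_drawtext("Prediction Markets vs Derivatives")
assert frag is not None
assert frag.count("drawtext=") == 2 and "fontsize=60" in frag
```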
+
+
+ def _reference_title_fragment(title_text: str, out_w: int = 1080) -> str:
+     bar_w = min(_REFERENCE_TITLE_BAR_W, max(320, out_w - 2 * _REFERENCE_TITLE_BAR_X))
+     accent_w = 16
+     title = " ".join((title_text or "").split())
+     usable_w = max(220, bar_w - (_REFERENCE_TITLE_TEXT_X - _REFERENCE_TITLE_BAR_X) - 30)
+     text_filters: list[str] = []
+     if title:
+         if _title_fits(title, _REFERENCE_TITLE_SIZE, usable_w):
+             esc = _escape_drawtext(title)
+             text_filters.append(
+                 f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:"
+                 f"fontcolor=white:fontsize={_REFERENCE_TITLE_SIZE}:"
+                 "borderw=1.2:bordercolor=0x101010@0.18:"
+                 f"x={_REFERENCE_TITLE_TEXT_X}:"
+                 f"y={_REFERENCE_TITLE_TEXT_Y}"
+             )
+         else:
+             line1, line2 = _wrap_title_two_lines(title)
+             two_line_size = 54
+             while (
+                 line2
+                 and two_line_size > 42
+                 and not (
+                     _title_fits(line1, two_line_size, usable_w)
+                     and _title_fits(line2, two_line_size, usable_w)
+                 )
+             ):
+                 two_line_size -= 2
+             if line2 and _title_fits(line1, two_line_size, usable_w) and _title_fits(line2, two_line_size, usable_w):
+                 y_top = 36
+                 y_bottom = y_top + int(round(two_line_size * 1.08))
+                 for line, y in ((line1, y_top), (line2, y_bottom)):
+                     esc = _escape_drawtext(line)
+                     text_filters.append(
+                         f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:"
+                         f"fontcolor=white:fontsize={two_line_size}:"
+                         "borderw=1.2:bordercolor=0x101010@0.18:"
+                         f"x={_REFERENCE_TITLE_TEXT_X}:y={y}"
+                     )
+             else:
+                 size = _REFERENCE_TITLE_SIZE
+                 while title and not _title_fits(title, size, usable_w) and size > 38:
+                     size -= 2
+                 if title and not _title_fits(title, size, usable_w):
+                     max_chars = max(8, int(usable_w / _title_char_px(size)))
+                     title = title[: max_chars - 1].rstrip() + "..."
+                 esc = _escape_drawtext(title)
+                 text_filters.append(
+                     f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:"
+                     f"fontcolor=white:fontsize={size}:"
+                     "borderw=1.2:bordercolor=0x101010@0.18:"
+                     f"x={_REFERENCE_TITLE_TEXT_X}:"
+                     f"y={_REFERENCE_TITLE_TEXT_Y}"
+                 )
+     text_filter = f",{','.join(text_filters)}" if text_filters else ""
+     return (
+         f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:"
+         f"w={bar_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x1F1F1F@0.84:t=fill,"
+         f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:"
+         f"w={accent_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x2A2453@0.98:t=fill"
+         f"{text_filter}"
+     )
+
+
+ def _reference_caption_bar_fragment(
+     *,
+     out_w: int = 1080,
+     out_h: int = 1920,
+     margin_v: int = 166,
+     font_size: int = 38,
+ ) -> str:
+     bar_w = min(_REFERENCE_CAPTION_BAR_W, max(320, out_w - 2 * _REFERENCE_CAPTION_BAR_X))
+     bar_h = max(_REFERENCE_CAPTION_BAR_H, int(round(font_size * 2.05)))
+     bar_y = max(
+         _REFERENCE_TITLE_BAR_Y + _REFERENCE_TITLE_BAR_H + 36,
+         out_h - max(40, margin_v) - bar_h,
+     )
+     return (
+         f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:"
+         f"w={bar_w}:h={bar_h}:color=0x6570E6@1.0:t=fill,"
+         f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:"
+         f"w={bar_w}:h=3:color=0xE4E7FF@0.14:t=fill"
+     )
+
+
+ def _escape_filter_path(path: str) -> str:
+     return path.replace("\\", "/").replace(":", "\\:").replace("'", "\\'")
+
+
+ def _has_audio_stream(media_path: str) -> bool:
+     probe = shutil.which("ffprobe")
+     if not probe:
+         return False
+     out = subprocess.run(
+         [
+             probe,
+             "-v",
+             "error",
+             "-select_streams",
+             "a:0",
+             "-show_entries",
+             "stream=codec_type",
+             "-of",
+             "csv=p=0",
+             media_path,
+         ],
+         check=False,
+         capture_output=True,
+         text=True,
+     )
+     return out.returncode == 0 and "audio" in (out.stdout or "").lower()
+
+
+ def build_ffmpeg_cmd(
+     req: RenderRequest,
+     *,
+     src_w: int = 1920,
+     src_h: int = 1080,
+     include_audio: bool = True,
+ ) -> list[str]:
+     exe = _ensure_ffmpeg() if req.mode != "dry_run" else "ffmpeg"
+
+     plan = plan_layout(
+         req.layout, out_w=req.width, out_h=req.height, src_w=src_w, src_h=src_h
+     )
+     fg = plan.filtergraph
+
+     if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD:
+         chrome_parts = [
+             _reference_title_fragment(req.title_text, out_w=req.width),
+             _reference_caption_bar_fragment(
+                 out_w=req.width,
+                 out_h=req.height,
+                 margin_v=min(req.subtitle_margin_v, 136),
+                 font_size=max(req.subtitle_font_size, 124),
+             )
+             if req.subtitle_path
+             else "",
+         ]
+         fg = fg.replace(
+             "[vout]",
+             f"[v_prepad];[v_prepad]{','.join(part for part in chrome_parts if part)}[vout]",
+         )
+     elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT:
+         # The native-highlight theme mirrors the reference short in
+         # videoplayback (12): no separate top title card, just centered
+         # floating captions with per-word highlight timing.
+         pass
+     else:
+         # Skip the drawtext title overlay on split layouts: the top band already
+         # shows a slide/chart with its own baked-in title, so adding an overlay
+         # on top of that is pure noise (and was stacking over the chart title
+         # in the SPLIT_CHART_PERSON Cathy Wood shorts).
+         title_allowed = req.layout.layout not in SPLIT_LAYOUTS
+         if req.title_text and title_allowed:
+             # ``plan_title_drawtext`` returns a full filter fragment (possibly
+             # two chained ``drawtext`` calls) that fits within the output width.
+             # For short titles it is byte-identical to the pre-P2 single-line
+             # form, keeping existing golden tests green while fixing the
+             # "Prediction Markets vs Derivatives" edge-clip report.
+             title_fragment = plan_title_drawtext(req.title_text, out_w=req.width)
+             if title_fragment:
+                 fg = fg.replace(
+                     "[vout]",
+                     f"[v_prepad];[v_prepad]{title_fragment}[vout]",
+                 )
+
+     if req.subtitle_path:
+         subtitle_esc = _escape_filter_path(req.subtitle_path)
+         fonts_dir = _fonts_dir()
+         fontsdir_arg = (
+             f":fontsdir='{_escape_filter_path(str(fonts_dir))}'" if fonts_dir.is_dir() else ""
+         )
+         # ``original_size`` pins libass's PlayResY to the actual output so
+         # ``FontSize`` and ``MarginV`` are interpreted in output pixels. Without
+         # this, libass defaults to PlayResY=288 and then upscales to the real
+         # canvas (1920) -- blowing font sizes and pushing subtitles to the
+         # middle of the frame. ``WrapStyle=0`` enables smart word wrap so long
+         # lines break into readable stacks instead of running off-screen.
+         if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD:
+             force_style = (
+                 f"Fontname={_REFERENCE_CAPTION_FONT_NAME},"
+                 f"FontSize={max(req.subtitle_font_size, 124)},Alignment=2,"
+                 f"MarginV={min(req.subtitle_margin_v, 136)},"
+                 "MarginL=56,MarginR=56,"
+                 "WrapStyle=0,BorderStyle=1,Outline=2,Shadow=0,"
+                 "BackColour=&H00000000&,PrimaryColour=&H00FFFFFF&,"
+                 "Bold=1,Italic=0,Spacing=-1"
+             )
+             subtitle_filter = (
+                 "[v_sub_in];"
+                 f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:"
+                 f"original_size={req.width}x{req.height}:"
+                 f"force_style='{force_style}'[vout]"
+             )
+         elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT:
+             subtitle_filter = (
+                 "[v_sub_in];"
+                 f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:"
+                 f"original_size={req.width}x{req.height}[vout]"
+             )
+         else:
+             force_style = (
+                 "Fontname=Arial,"
+                 f"FontSize={req.subtitle_font_size},Alignment=2,"
+                 f"MarginV={req.subtitle_margin_v},MarginL=60,MarginR=60,"
+                 "WrapStyle=0,BorderStyle=4,"
+                 "BackColour=&H70000000&,PrimaryColour=&H00FFFFFF&,"
+                 "Outline=0,Shadow=0,Bold=1"
+             )
+             subtitle_filter = (
+                 "[v_sub_in];"
+                 f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:"
+                 f"original_size={req.width}x{req.height}:"
+                 f"force_style='{force_style}'[vout]"
+             )
+         fg = fg.replace("[vout]", subtitle_filter)
+
+     start = req.clip.start_time_sec
+     dur = max(0.1, req.clip.duration_sec)
+
+     Path(req.output_path).parent.mkdir(parents=True, exist_ok=True)
+
+     cmd: list[str] = [
+         exe,
+         "-y",
+         "-ss",
+         f"{start:.3f}",
+         "-t",
+         f"{dur:.3f}",
+         "-i",
+         req.source_path,
+         "-filter_complex",
+         fg,
+         "-map",
+         "[vout]",
+         "-c:v",
+         "libx264",
+         "-preset",
+         "veryfast",
+         "-crf",
+         "20",
+     ]
+
+     if include_audio:
+         cmd.extend(["-map", "0:a:0", "-c:a", "aac", "-b:a", "160k"])
+
+     cmd.extend(["-movflags", "+faststart", req.output_path])
+     return cmd
+
+
+ def probe_source_size(source_path: str) -> tuple[int, int]:
+     exe = shutil.which("ffprobe")
+     if not exe:
+         return 1920, 1080
+     out = subprocess.run(
+         [
+             exe,
+             "-v",
+             "error",
+             "-select_streams",
+             "v:0",
+             "-show_entries",
+             "stream=width,height",
+             "-of",
+             "csv=p=0",
+             source_path,
+         ],
+         check=False,
+         capture_output=True,
+         text=True,
+     )
+     try:
+         w, h = out.stdout.strip().split(",")
+         return int(w), int(h)
+     except Exception:
+         return 1920, 1080
+
+
+ def render_clip(req: RenderRequest) -> RenderResult:
+     try:
+         src_w, src_h = probe_source_size(req.source_path) if req.mode != "dry_run" else (1920, 1080)
+     except Exception:
+         src_w, src_h = 1920, 1080
+
+     include_audio = True
+     if req.mode != "dry_run":
+         include_audio = _has_audio_stream(req.source_path)
+         if not include_audio:
+             return RenderResult(
+                 clip_id=req.clip.clip_id,
+                 output_path=req.output_path,
+                 ffmpeg_cmd=[],
+                 success=False,
+                 error="Source media has no detectable audio stream (a:0).",
+             )
+
+     cmd = build_ffmpeg_cmd(req, src_w=src_w, src_h=src_h, include_audio=include_audio)
+
+     if req.mode == "dry_run":
+         return RenderResult(
+             clip_id=req.clip.clip_id,
+             output_path=req.output_path,
+             ffmpeg_cmd=cmd,
+             success=True,
+         )
+     try:
+         subprocess.run(cmd, check=True, capture_output=True, env=_ensure_windows_fontconfig())
+         if include_audio and not _has_audio_stream(req.output_path):
+             return RenderResult(
+                 clip_id=req.clip.clip_id,
+                 output_path=req.output_path,
+                 ffmpeg_cmd=cmd,
+                 success=False,
+                 error="Rendered output is missing audio stream (a:0).",
+             )
+         return RenderResult(
+             clip_id=req.clip.clip_id,
+             output_path=req.output_path,
+             ffmpeg_cmd=cmd,
+             success=True,
+         )
+     except subprocess.CalledProcessError as e:
+         return RenderResult(
+             clip_id=req.clip.clip_id,
+             output_path=req.output_path,
+             ffmpeg_cmd=cmd,
+             success=False,
+             error=e.stderr.decode("utf-8", errors="replace")[-4000:] if e.stderr else str(e),
+         )
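
A minimal dry-run sketch; it loads the example payload shipped at `humeo-core/examples/render_request.json` and assumes that file validates against `RenderRequest` (see `schemas.py` for the authoritative field list):

```python
# Sketch: get the exact ffmpeg command without executing it.
import json

from humeo_core.primitives.compile import render_clip
from humeo_core.schemas import RenderRequest

with open("humeo-core/examples/render_request.json", encoding="utf-8") as f:
    payload = json.load(f)

req = RenderRequest.model_validate(payload).model_copy(update={"mode": "dry_run"})
result = render_clip(req)
print(" ".join(result.ffmpeg_cmd))  # review before spending CPU on a real render
```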
humeo-core/src/humeo_core/primitives/face_detect.py ADDED
@@ -0,0 +1,135 @@
+ """Local face-detection primitive — the MediaPipe path as another ``SceneRegions`` producer.
+
+ Three detection backends share the *same output schema* (``SceneRegions``):
+
+ * ``primitives/classify.py`` — pixel variance heuristic, no model.
+ * ``primitives/face_detect.py`` — MediaPipe face rectangle (this file).
+ * ``primitives/vision.py`` — multimodal LLM + OCR bboxes.
+
+ Because all three emit ``SceneRegions``, the layout planner in
+ ``primitives/vision.py`` (``classify_from_regions`` + ``layout_instruction_from_regions``)
+ works on all of them unchanged. That is the whole point of the primitive
+ boundary — the *detector* is swappable, the *renderer* is fixed.
+
+ MediaPipe is imported lazily so it remains an optional extra.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Callable
+
+ from ..schemas import BoundingBox, Scene, SceneRegions
+
+ logger = logging.getLogger(__name__)
+
+
+ # A bbox loader for any future cloud face API. Takes a keyframe path,
+ # returns a normalized face bbox or ``None``. Same shape as the MediaPipe
+ # wrapper below, which lets tests pass a stub and skip MediaPipe.
+ FaceBBoxFn = Callable[[str], BoundingBox | None]
+
+
+ def detect_face_regions(
+     scenes: list[Scene],
+     face_fn: FaceBBoxFn | None = None,
+     chart_split_threshold: float = 0.65,
+ ) -> list[SceneRegions]:
+     """Populate ``SceneRegions.person_bbox`` (+ ``chart_bbox``) from a face detector.
+
+     The face bbox is treated as the *person bbox*. If the face sits in the
+     right ``(1 - chart_split_threshold)`` of the frame, a *chart bbox* is
+     synthesised over the left region — mirroring the original
+     ``reframe.py`` split heuristic.
+
+     Args:
+         scenes: scenes with ``keyframe_path`` populated.
+         face_fn: pluggable face detector. Defaults to MediaPipe (lazy
+             import) if not supplied. Pass a stub in tests.
+         chart_split_threshold: face x-center above this normalized value
+             triggers a synthetic chart bbox on the left.
+     """
+
+     if face_fn is None:
+         face_fn = _mediapipe_face_bbox
+
+     out: list[SceneRegions] = []
+     for s in scenes:
+         if not s.keyframe_path:
+             out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available"))
+             continue
+         try:
+             face = face_fn(s.keyframe_path)
+         except Exception as e:  # one bad scene should not kill the batch
+             logger.warning("face detector failed on %s: %r", s.keyframe_path, e)
+             out.append(SceneRegions(scene_id=s.scene_id, raw_reason=f"face detector error: {e!r}"))
+             continue
+
+         if face is None:
+             out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no face detected"))
+             continue
+
+         chart = None
+         if face.center_x >= chart_split_threshold:
+             # Face pushed right → assume a chart occupies the left region.
+             chart = BoundingBox(
+                 x1=0.0,
+                 y1=0.0,
+                 x2=min(chart_split_threshold, face.x1),
+                 y2=1.0,
+                 label="chart_inferred",
+                 confidence=max(0.0, face.center_x - chart_split_threshold + 0.5),
+             )
+
+         out.append(
+             SceneRegions(
+                 scene_id=s.scene_id,
+                 person_bbox=face,
+                 chart_bbox=chart,
+                 raw_reason="face detected" + (" + synthetic chart bbox" if chart else ""),
+             )
+         )
+
+     return out
+
+
+ def _mediapipe_face_bbox(keyframe_path: str) -> BoundingBox | None:
+     """Return the largest-confidence face as a ``BoundingBox``, or ``None``.
+
+     Imports MediaPipe + OpenCV lazily so they remain optional dependencies
+     (install ``humeo-core[face]``).
+     """
+
+     try:
+         import cv2  # type: ignore
+         import mediapipe as mp  # type: ignore
+     except ImportError as e:
+         raise RuntimeError(
+             "MediaPipe face detection requires `pip install humeo-core[face]`"
+         ) from e
+
+     img = cv2.imread(keyframe_path)
+     if img is None:
+         return None
+     rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+     with mp.solutions.face_detection.FaceDetection(
+         model_selection=1, min_detection_confidence=0.5
+     ) as detector:
+         results = detector.process(rgb)
+     if not results.detections:
+         return None
+     best = max(results.detections, key=lambda d: d.score[0])
+     box = best.location_data.relative_bounding_box
+     x1 = max(0.0, min(1.0, float(box.xmin)))
+     y1 = max(0.0, min(1.0, float(box.ymin)))
+     x2 = max(x1 + 1e-6, min(1.0, x1 + float(box.width)))
+     y2 = max(y1 + 1e-6, min(1.0, y1 + float(box.height)))
+     return BoundingBox(
+         x1=x1,
+         y1=y1,
+         x2=x2,
+         y2=y2,
+         label="face",
+         confidence=float(best.score[0]),
+     )
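
A minimal sketch of the stub-detector path the tests use (bbox values are illustrative):

```python
# Sketch: no MediaPipe needed. The stub face sits at x-center 0.8, past the
# 0.65 chart_split_threshold, so a chart bbox is synthesised on the left.
from humeo_core.primitives.face_detect import detect_face_regions
from humeo_core.schemas import BoundingBox, Scene

def stub_face(keyframe_path: str) -> BoundingBox | None:
    return BoundingBox(x1=0.7, y1=0.2, x2=0.9, y2=0.6, label="face", confidence=0.95)

scenes = [Scene(scene_id="s0000", start_time=0.0, end_time=3.0, keyframe_path="kf.jpg")]
regions = detect_face_regions(scenes, face_fn=stub_face)
assert regions[0].person_bbox is not None
assert regions[0].chart_bbox is not None  # face pushed right, chart inferred left
```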
humeo-core/src/humeo_core/primitives/ingest.py ADDED
@@ -0,0 +1,187 @@
+ """Landing gear: deterministic, local extraction.
+
+ Everything here can run without a GPU, without an API key, and without the
+ internet (once inputs are present). This follows the HIVE guide's rule
+ "extraction stays local; LLMs only reason".
+
+ Functions:
+     probe_duration — ffprobe wrapper
+     detect_scenes — PySceneDetect (ContentDetector)
+     extract_keyframes — ffmpeg snapshot at each scene midpoint
+     transcribe_audio — faster-whisper (optional dependency)
+     ingest — one-shot convenience runner that returns IngestResult
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import shutil
+ import subprocess
+ from pathlib import Path
+
+ from ..schemas import IngestResult, Scene, TranscriptWord
+
+
+ class IngestError(RuntimeError):
+     pass
+
+
+ def _require(binary: str) -> str:
+     path = shutil.which(binary)
+     if not path:
+         raise IngestError(
+             f"Required binary not on PATH: {binary!r}. Install it or add the path."
+         )
+     return path
+
+
+ def probe_duration(source_path: str) -> float:
+     ffprobe = _require("ffprobe")
+     out = subprocess.run(
+         [
+             ffprobe,
+             "-v",
+             "error",
+             "-show_entries",
+             "format=duration",
+             "-of",
+             "json",
+             source_path,
+         ],
+         check=True,
+         capture_output=True,
+         text=True,
+     )
+     data = json.loads(out.stdout)
+     return float(data["format"]["duration"])
+
+
+ def detect_scenes(
+     source_path: str, threshold: float = 27.0, min_scene_sec: float = 1.0
+ ) -> list[Scene]:
+     """Use PySceneDetect's ContentDetector to split the video into scenes."""
+
+     try:
+         from scenedetect import detect, ContentDetector  # type: ignore
+     except ModuleNotFoundError as e:
+         # scenedetect depends on OpenCV; surface the real missing module.
+         missing = getattr(e, "name", "") or str(e)
+         hint = "pip install 'scenedetect[opencv]'" if "cv2" in missing else "pip install scenedetect"
+         raise IngestError(
+             f"Scene detection unavailable (missing module: {missing}). Install with: {hint}"
+         ) from e
+
+     result = detect(
+         source_path,
+         ContentDetector(threshold=threshold, min_scene_len=int(min_scene_sec * 24)),
+     )
+     scenes: list[Scene] = []
+     for i, (start, end) in enumerate(result):
+         scenes.append(
+             Scene(
+                 scene_id=f"s{i:04d}",
+                 start_time=float(start.get_seconds()),
+                 end_time=float(end.get_seconds()),
+             )
+         )
+     # Guard: if PySceneDetect returns empty (e.g. a single long shot),
+     # fall back to one scene spanning the whole video.
+     if not scenes:
+         duration = probe_duration(source_path)
+         scenes.append(Scene(scene_id="s0000", start_time=0.0, end_time=duration))
+     return scenes
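
Note that `min_scene_len` is handed to PySceneDetect in frames, converted from seconds with a hard-coded ~24 fps assumption. A hypothetical fps-aware variant, shown only to make the unit conversion explicit (not part of this module):

```python
# Hypothetical sketch: thread the real frame rate through instead of
# assuming 24 fps. ContentDetector's min_scene_len is a frame count.
def detect_scene_list_fps_aware(source_path: str, fps: float, min_scene_sec: float = 1.0):
    from scenedetect import ContentDetector, detect

    return detect(source_path, ContentDetector(min_scene_len=int(min_scene_sec * fps)))
```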
+
+
+ def extract_keyframes(
+     source_path: str, scenes: list[Scene], out_dir: str
+ ) -> list[Scene]:
+     """Extract one JPG per scene at its midpoint. Mutates nothing; returns copies."""
+
+     ffmpeg = _require("ffmpeg")
+     Path(out_dir).mkdir(parents=True, exist_ok=True)
+     updated: list[Scene] = []
+     for s in scenes:
+         mid = s.start_time + (s.end_time - s.start_time) / 2.0
+         out_path = os.path.join(out_dir, f"{s.scene_id}.jpg")
+         subprocess.run(
+             [
+                 ffmpeg,
+                 "-y",
+                 "-loglevel",
+                 "error",
+                 "-ss",
+                 f"{mid:.3f}",
+                 "-i",
+                 source_path,
+                 "-frames:v",
+                 "1",
+                 "-q:v",
+                 "3",
+                 out_path,
+             ],
+             check=True,
+         )
+         updated.append(s.model_copy(update={"keyframe_path": out_path}))
+     return updated
+
+
+ def transcribe_audio(
+     source_path: str, model_name: str = "base", language: str | None = None
+ ) -> list[TranscriptWord]:
+     """Word-level transcript via faster-whisper. Optional dependency."""
+
+     try:
+         from faster_whisper import WhisperModel  # type: ignore
+     except ImportError as e:
+         raise IngestError(
+             "faster-whisper is not installed. pip install faster-whisper"
+         ) from e
+
+     model = WhisperModel(model_name, device="auto", compute_type="auto")
+     segments, _info = model.transcribe(source_path, word_timestamps=True, language=language)
+     words: list[TranscriptWord] = []
+     for seg in segments:
+         for w in getattr(seg, "words", []) or []:
+             if w.word is None:
+                 continue
+             words.append(
+                 TranscriptWord(
+                     word=str(w.word).strip(),
+                     start_time=float(w.start or 0.0),
+                     end_time=float(w.end or 0.0),
+                 )
+             )
+     return words
+
+
+ def ingest(
+     source_path: str,
+     work_dir: str,
+     *,
+     with_transcript: bool = False,
+     whisper_model: str = "base",
+ ) -> IngestResult:
+     """Run all extraction stages and return a single ``IngestResult``."""
+
+     if not os.path.exists(source_path):
+         raise IngestError(f"source_path does not exist: {source_path}")
+
+     Path(work_dir).mkdir(parents=True, exist_ok=True)
+     keyframes_dir = os.path.join(work_dir, "keyframes")
+
+     duration = probe_duration(source_path)
+     scenes = detect_scenes(source_path)
+     scenes = extract_keyframes(source_path, scenes, keyframes_dir)
+
+     words: list[TranscriptWord] = []
+     if with_transcript:
+         words = transcribe_audio(source_path, model_name=whisper_model)
+
+     return IngestResult(
+         source_path=os.path.abspath(source_path),
+         duration_sec=duration,
+         scenes=scenes,
+         transcript_words=words,
+         keyframes_dir=keyframes_dir,
+     )
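
End-to-end usage is one call (paths are illustrative; `ffmpeg`/`ffprobe` and PySceneDetect must be installed):

```python
# Sketch: run the whole local extraction pipeline on a source file.
from humeo_core.primitives.ingest import ingest

result = ingest("episode.mp4", work_dir=".humeo_work", with_transcript=False)
print(result.duration_sec, len(result.scenes), result.keyframes_dir)
```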
humeo-core/src/humeo_core/primitives/layouts.py ADDED
@@ -0,0 +1,707 @@
+ """The 9:16 layout thrusters — deterministic crop + compose math.
+
+ First principles: this video format has a hard constraint of **at most two
+ on-screen items** per short (see :class:`humeo_core.schemas.LayoutKind`). That
+ gives exactly five recipes:
+
+ * 1 person alone, tight → ``ZOOM_CALL_CENTER``
+ * 1 person alone, wider → ``SIT_CENTER``
+ * 1 chart + 1 person → ``SPLIT_CHART_PERSON``
+ * 2 persons → ``SPLIT_TWO_PERSONS``
+ * 2 charts → ``SPLIT_TWO_CHARTS``
+
+ Each planner returns a pure ``ffmpeg -filter_complex`` fragment ending in
+ ``[vout]``. The compiler (``compile.py``) glues the fragment to the cut +
+ audio + subtitle chain. Because every planner is a pure function that
+ returns a string, the whole layout system is unit-testable without ever
+ invoking ffmpeg.
+
+ Split layouts share one contract:
+
+ * Output: 9:16 frame split into a **top band** and **bottom band**.
+   Band heights are driven by :attr:`LayoutInstruction.top_band_ratio`.
+   Default is ``0.5`` (even 50/50), matching the user-requested symmetric look.
+ * Source strips for the two items are **complementary** — they partition
+   the source width at a single seam so the two items never overlap and
+   together cover the full frame width.
+ * Each strip is scaled to fill its output band using the "cover"
+   convention (``force_original_aspect_ratio=increase`` + center crop), so
+   the band is fully painted (no letterbox bars, no stretch).
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ from ..schemas import (
+     BoundingBox,
+     FocusStackOrder,
+     LayoutInstruction,
+     LayoutKind,
+     TimedCenterPoint,
+ )
+
+
+ # Source geometry assumption. Most podcast sources are 1920x1080; we still
+ # normalize everything by the actual source size so changing this is safe.
+ DEFAULT_SRC_W = 1920
+ DEFAULT_SRC_H = 1080
+ TRACKING_BLEND_SEC = 0.30
+
+
+ @dataclass(frozen=True)
+ class FilterPlan:
+     """Result of planning a layout.
+
+     ``filtergraph`` is the body of ``-filter_complex`` and ends with
+     ``[vout]`` as the final labelled stream.
+     """
+
+     filtergraph: str
+     out_label: str = "vout"
+
+
+ # ---------------------------------------------------------------------------
+ # Tiny pixel helpers
+ # ---------------------------------------------------------------------------
+
+
+ def _clamp01(v: float) -> float:
+     return max(0.0, min(1.0, v))
+
+
+ def _even(v: int) -> int:
+     """Floor ``v`` to an even integer (ffmpeg ``crop``/``scale`` need even dims)."""
+     return v - (v % 2)
+
+
+ def _bbox_to_crop_pixels(
+     box: BoundingBox, src_w: int, src_h: int
+ ) -> tuple[int, int, int, int]:
+     """Normalized bbox → ``(cw, ch, x, y)`` with even dimensions for ffmpeg."""
+     x1 = int(round(_clamp01(box.x1) * float(src_w)))
+     y1 = int(round(_clamp01(box.y1) * float(src_h)))
+     x2 = int(round(_clamp01(box.x2) * float(src_w)))
+     y2 = int(round(_clamp01(box.y2) * float(src_h)))
+     x1 = max(0, min(src_w - 2, x1))
+     y1 = max(0, min(src_h - 2, y1))
+     x2 = max(x1 + 2, min(src_w, x2))
+     y2 = max(y1 + 2, min(src_h, y2))
+     cw = _even(x2 - x1)
+     ch = _even(y2 - y1)
+     return max(2, cw), max(2, ch), _even(x1), _even(y1)
+
+
+ def _base_crop_size(
+     src_w: int,
+     src_h: int,
+     target_aspect: float,
+ ) -> tuple[int, int]:
+     if src_w / src_h >= target_aspect:
+         base_ch = src_h
+         base_cw = int(round(base_ch * target_aspect))
+     else:
+         base_cw = src_w
+         base_ch = int(round(base_cw / target_aspect))
+     return _even(max(2, base_cw)), _even(max(2, base_ch))
+
+
+ def _crop_box(
+     src_w: int,
+     src_h: int,
+     target_aspect: float,
+     zoom: float,
+     center_x_norm: float,
+     center_y_norm: float = 0.5,
+ ) -> tuple[int, int, int, int]:
+     """Return ``(cw, ch, x, y)`` crop values for a centered aspect-ratio crop.
+
+     ``zoom > 1`` means tighter crop (smaller window around the center). The
+     function always keeps the crop window fully inside the source frame.
+     """
+
+     zoom = max(1.0, zoom)
+     base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect)
+
+     cw = _even(max(2, int(round(base_cw / zoom))))
+     ch = _even(max(2, int(round(base_ch / zoom))))
+
+     cx = int(round(_clamp01(center_x_norm) * src_w))
+     cy = int(round(_clamp01(center_y_norm) * src_h))
+     x = _even(max(0, min(src_w - cw, cx - cw // 2)))
+     y = _even(max(0, min(src_h - ch, cy - ch // 2)))
+     return cw, ch, x, y
+
+
+ def _center_crop_to_9x16(
+     src_w: int, src_h: int, zoom: float, person_x_norm: float
+ ) -> tuple[int, int, int, int]:
+     return _crop_box(src_w, src_h, 9 / 16, zoom, person_x_norm, 0.5)
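
A worked example of the crop math on a 1920x1080 source (expected tuples assume Python's round-half-to-even plus the `_even` flooring above; `_center_crop_to_9x16` is private and exercised here purely for illustration):

```python
# Sketch: (cw, ch, x, y) for a centered subject.
# zoom=1.0: the widest 9:16 window a 16:9 source can cover (608x1080 at x=656).
assert _center_crop_to_9x16(1920, 1080, 1.0, 0.5) == (608, 1080, 656, 0)
# zoom=1.25 (the ZOOM_CALL_CENTER floor): tighter window, re-centered.
assert _center_crop_to_9x16(1920, 1080, 1.25, 0.5) == (486, 864, 716, 108)
```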
+
+
+ def _crop_x_from_center(src_w: int, cw: int, center_x_norm: float) -> int:
+     """Return an even, in-bounds crop x for a normalized horizontal center."""
+     cx = int(round(_clamp01(center_x_norm) * src_w))
+     return _even(max(0, min(src_w - cw, cx - cw // 2)))
+
+
+ def _tracked_value_expr(
+     values: list[tuple[float, float]],
+     *,
+     clamp_min: float | None = None,
+     clamp_max: float | None = None,
+     round_even: bool = False,
+ ) -> str:
+     if not values:
+         raise ValueError("values must not be empty")
+
+     expr = f"{float(values[-1][0]):.3f}"
+     for idx in range(len(values) - 2, -1, -1):
+         v0, t0 = float(values[idx][0]), float(values[idx][1])
+         v1, t1 = float(values[idx + 1][0]), float(values[idx + 1][1])
+         if t1 <= t0:
+             expr = f"if(lt(t\\,{t1:.3f})\\,{v0:.3f}\\,{expr})"
+             continue
+
+         switch_t = (t0 + t1) / 2.0
+         blend_half = TRACKING_BLEND_SEC / 2.0
+         blend_start = max(t0, switch_t - blend_half)
+         blend_end = min(t1, switch_t + blend_half)
+
+         if blend_end <= blend_start:
+             expr = f"if(lt(t\\,{switch_t:.3f})\\,{v0:.3f}\\,{expr})"
+             continue
+
+         blend_expr = (
+             f"{v0:.3f}+({v1 - v0:.3f})*(t-{blend_start:.3f})/({blend_end - blend_start:.3f})"
+         )
+         expr = (
+             f"if(lt(t\\,{blend_start:.3f})\\,{v0:.3f}\\,"
+             f"if(lt(t\\,{blend_end:.3f})\\,{blend_expr}\\,{expr}))"
+         )
+
+     if clamp_min is not None:
+         expr = f"max({clamp_min:.3f}\\,{expr})"
+     if clamp_max is not None:
+         expr = f"min({clamp_max:.3f}\\,{expr})"
+     if round_even:
+         expr = f"floor(({expr})/2)*2"
+     return expr
+
+
+ def _tracked_crop_x_expr(
+     *,
+     src_w: int,
+     crop_w: int,
+     tracking: list[TimedCenterPoint],
+ ) -> str:
+     """Return an ffmpeg expression for a time-varying crop x position.
+
+     We mostly hold each framing until the midpoint between adjacent samples,
+     then blend over a short window. That keeps edited talk footage from
+     drifting for seconds after a cut while still avoiding a one-frame jump
+     in the crop position.
+     """
+     if not tracking:
+         raise ValueError("tracking must not be empty")
+
+     center_points = [
+         (_clamp01(point.x_norm) * src_w, float(point.t_sec))
+         for point in tracking
+     ]
+     center_expr = _tracked_value_expr(
+         center_points,
+         clamp_min=0.0,
+         clamp_max=float(src_w),
+     )
+     max_x = max(0, src_w - crop_w)
+     return f"floor(max(0\\,min({max_x}\\,({center_expr})-{crop_w}/2))/2)*2"
+
+
+ def _tracked_crop_exprs(
+     *,
+     src_w: int,
+     src_h: int,
+     target_aspect: float,
+     default_zoom: float,
+     center_y_norm: float,
+     tracking: list[TimedCenterPoint],
+ ) -> tuple[str, str, str, str]:
+     if not tracking:
+         raise ValueError("tracking must not be empty")
+
+     base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect)
+     width_points: list[tuple[float, float]] = []
+     height_points: list[tuple[float, float]] = []
+     center_points: list[tuple[float, float]] = []
+     for point in tracking:
+         zoom = max(1.0, float(point.zoom if point.zoom is not None else default_zoom))
+         width_points.append((float(_even(max(2, int(round(base_cw / zoom))))), float(point.t_sec)))
+         height_points.append((float(_even(max(2, int(round(base_ch / zoom))))), float(point.t_sec)))
+         center_points.append((_clamp01(point.x_norm) * src_w, float(point.t_sec)))
+
+     w_expr = _tracked_value_expr(
+         width_points,
+         clamp_min=2.0,
+         clamp_max=float(base_cw),
+         round_even=True,
+     )
+     h_expr = _tracked_value_expr(
+         height_points,
+         clamp_min=2.0,
+         clamp_max=float(base_ch),
+         round_even=True,
+     )
+     center_expr = _tracked_value_expr(
+         center_points,
+         clamp_min=0.0,
+         clamp_max=float(src_w),
+     )
+     center_y_px = _clamp01(center_y_norm) * src_h
+     x_expr = f"floor(max(0\\,min({src_w}-out_w\\,({center_expr})-out_w/2))/2)*2"
+     y_expr = f"floor(max(0\\,min({src_h}-out_h\\,{center_y_px:.3f}-out_h/2))/2)*2"
+     return w_expr, h_expr, x_expr, y_expr
+
+
+ # ---------------------------------------------------------------------------
+ # Split helpers — shared by all three split layouts
+ # ---------------------------------------------------------------------------
+
+
+ # Minimum source-strip width for a split, as a fraction of source width.
+ # Prevents a chart/person bbox that hugs one edge from starving the other.
+ _MIN_SPLIT_STRIP_FRAC = 0.2
+ _CHART_STRIP_VERTICAL_PAD_FRAC = 0.12
+
+
+ @dataclass(frozen=True)
+ class _SplitStrip:
+     """A source-frame crop rectangle destined for one output band."""
+
+     cw: int
+     ch: int
+     x: int
+     y: int
+
+     def filter_crop(self, input_label: str, out_w: int, band_h: int, out_label: str) -> str:
+         """Return ``[input]crop=...,scale=...,crop=...,setsar=1[out_label]``.
+
+         Uses the "cover" convention: scale so the band is fully painted, then
+         center-crop any overflow. Bands always get filled — no letterbox bars.
+         """
+         return (
+             f"[{input_label}]crop={self.cw}:{self.ch}:{self.x}:{self.y},"
+             f"scale={out_w}:{band_h}:force_original_aspect_ratio=increase,"
+             f"crop={out_w}:{band_h},setsar=1[{out_label}]"
+         )
+
+
+ def _bbox_strip(
+     box: BoundingBox | None,
+     *,
+     src_w: int,
+     src_h: int,
+     x_start: int,
+     x_end: int,
+ ) -> _SplitStrip:
+     """Build a source crop for one band.
+
+     Horizontal range is fixed by ``[x_start, x_end)`` (from the seam math so
+     strips partition the source width). Vertical range comes from ``box``
+     when available — that's what makes the chart **fill** the output band
+     instead of being squashed inside full-height source context.
+     """
+     x = _even(max(0, min(src_w - 2, x_start)))
+     cw = _even(max(2, min(src_w - x, x_end - x)))
+
+     if box is not None:
+         y1 = int(round(_clamp01(box.y1) * float(src_h)))
+         y2 = int(round(_clamp01(box.y2) * float(src_h)))
+         y = _even(max(0, min(src_h - 2, y1)))
+         ch = _even(max(2, min(src_h - y, y2 - y)))
+     else:
+         y = 0
+         ch = _even(src_h)
+
+     return _SplitStrip(cw=cw, ch=ch, x=x, y=y)
+
+
+ def _chart_strip_with_vertical_pad(
+     strip: _SplitStrip,
+     *,
+     src_h: int,
+     pad_frac: float = _CHART_STRIP_VERTICAL_PAD_FRAC,
+ ) -> _SplitStrip:
+     """Relax chart crops vertically so cover-scaling trims fewer chart edges."""
+
+     pad = _even(max(0, int(round(strip.ch * max(0.0, pad_frac)))))
+     if pad <= 0:
+         return strip
+
+     top = max(0, strip.y - pad)
+     bottom = min(src_h, strip.y + strip.ch + pad)
+     ch = _even(max(2, bottom - top))
+     if ch <= strip.ch:
+         return strip
+     y = _even(max(0, min(src_h - ch, top)))
+     return _SplitStrip(cw=strip.cw, ch=ch, x=strip.x, y=y)
+
+
+ def _compute_seam(
+     *,
+     left_box: BoundingBox | None,
+     right_box: BoundingBox | None,
+     src_w: int,
+     src_h: int,
+     default_fraction: float = 0.5,
+ ) -> int:
+     """Return an even x-coordinate that partitions the source into two strips.
+
+     When both bboxes are known, the seam is the midpoint of the gap/overlap
+     between ``left_box.x2`` and ``right_box.x1``. Falls back to
+     ``default_fraction * src_w`` (0.5 = even) otherwise. The seam is clamped
+     so neither strip is thinner than :data:`_MIN_SPLIT_STRIP_FRAC` of source.
+     """
+     if left_box is not None and right_box is not None:
+         left_cw, _, left_x, _ = _bbox_to_crop_pixels(left_box, src_w, src_h)
+         _, _, right_x, _ = _bbox_to_crop_pixels(right_box, src_w, src_h)
+
+         left_right = left_x + left_cw
+         seam = int(round((left_right + right_x) / 2.0))
+     else:
+         seam = int(round(default_fraction * float(src_w)))
+
+     seam = _even(seam)
+     min_strip = _even(max(2, int(round(src_w * _MIN_SPLIT_STRIP_FRAC))))
+     if min_strip * 2 >= src_w:
+         min_strip = _even(max(2, src_w // 4))
+     return max(min_strip, min(src_w - min_strip, seam))
+
+
+ def _band_heights(out_h: int, top_ratio: float) -> tuple[int, int]:
+     """Return ``(top_h, bot_h)`` even band heights that sum to ``out_h``."""
+     top_h = _even(int(round(out_h * top_ratio)))
+     top_h = max(2, min(out_h - 2, top_h))
+     bot_h = out_h - top_h
+     return top_h, bot_h
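
A worked example of the seam and band defaults (private helpers, exercised here for illustration only):

```python
# Sketch: with no vision bboxes the seam falls back to an even 50/50
# partition of the source width, and top_band_ratio=0.5 yields two even
# 960px bands on a 1080x1920 canvas.
assert _compute_seam(left_box=None, right_box=None, src_w=1920, src_h=1080) == 960
assert _band_heights(1920, 0.5) == (960, 960)
```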
+
389
+
390
+ def _stack_filtergraph(
391
+ *,
392
+ top_strip: _SplitStrip,
393
+ bot_strip: _SplitStrip,
394
+ out_w: int,
395
+ top_h: int,
396
+ bot_h: int,
397
+ ) -> str:
398
+ """Compose the split filter graph: ``[0:v]split=2 → two crops → vstack → [vout]``."""
399
+ top_fg = top_strip.filter_crop("src1", out_w, top_h, "top")
400
+ bot_fg = bot_strip.filter_crop("src2", out_w, bot_h, "bot")
401
+ return (
402
+ f"[0:v]split=2[src1][src2];"
403
+ f"{top_fg};"
404
+ f"{bot_fg};"
405
+ f"[top][bot]vstack=inputs=2[vout]"
406
+ )
407
+
408
+
409
+ # ---------------------------------------------------------------------------
410
+ # Layout: single-subject (centered) — 1 person
411
+ # ---------------------------------------------------------------------------
412
+
413
+
414
+ def plan_zoom_call_center(
415
+ instruction: LayoutInstruction,
416
+ *,
417
+ out_w: int,
418
+ out_h: int,
419
+ src_w: int = DEFAULT_SRC_W,
420
+ src_h: int = DEFAULT_SRC_H,
421
+ ) -> FilterPlan:
422
+ """1 person, tight zoom-call framing. ``zoom`` clamped to ``>= 1.25``."""
423
+ zoom = max(instruction.zoom, 1.25)
424
+ cw, ch, x, y = _center_crop_to_9x16(src_w, src_h, zoom, instruction.person_x_norm)
425
+ if instruction.person_tracking:
426
+ if any(point.zoom is not None for point in instruction.person_tracking):
427
+ w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs(
428
+ src_w=src_w,
429
+ src_h=src_h,
430
+ target_aspect=9 / 16,
431
+ default_zoom=zoom,
432
+ center_y_norm=0.5,
433
+ tracking=instruction.person_tracking,
434
+ )
435
+ fg = (
436
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
437
+ f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr},"
438
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
439
+ )
440
+ else:
441
+ x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking)
442
+ fg = (
443
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
444
+ f"[vsrc]crop={cw}:{ch}:{x_expr}:{y},"
445
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
446
+ )
447
+ else:
448
+ fg = (
449
+ f"[0:v]crop={cw}:{ch}:{x}:{y},"
450
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
451
+ )
452
+ return FilterPlan(filtergraph=fg)
453
+
454
+
455
+ def plan_sit_center(
456
+ instruction: LayoutInstruction,
457
+ *,
458
+ out_w: int,
459
+ out_h: int,
460
+ src_w: int = DEFAULT_SRC_W,
461
+ src_h: int = DEFAULT_SRC_H,
462
+ ) -> FilterPlan:
463
+ """1 person, interview/seated framing. Vertical center biased to ``0.48``
464
+ so faces sit slightly above the 9:16 middle instead of centered on a
465
+ subject's chest.
466
+ """
467
+ zoom = max(instruction.zoom, 1.0)
468
+ cw, ch, x, y = _crop_box(
469
+ src_w, src_h, 9 / 16, zoom, instruction.person_x_norm, 0.48
470
+ )
471
+ if instruction.person_tracking:
472
+ if any(point.zoom is not None for point in instruction.person_tracking):
473
+ w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs(
474
+ src_w=src_w,
475
+ src_h=src_h,
476
+ target_aspect=9 / 16,
477
+ default_zoom=zoom,
478
+ center_y_norm=0.48,
479
+ tracking=instruction.person_tracking,
480
+ )
481
+ fg = (
482
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
483
+ f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr},"
484
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
485
+ )
486
+ else:
487
+ x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking)
488
+ fg = (
489
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
490
+ f"[vsrc]crop={cw}:{ch}:{x_expr}:{y},"
491
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
492
+ )
493
+ else:
494
+ fg = (
495
+ f"[0:v]crop={cw}:{ch}:{x}:{y},"
496
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
497
+ )
498
+ return FilterPlan(filtergraph=fg)
499
+
500
+
501
+ # ---------------------------------------------------------------------------
502
+ # Split layouts — 2 items stacked vertically
503
+ # ---------------------------------------------------------------------------
504
+
505
+
506
+ def plan_split_chart_person(
507
+ instruction: LayoutInstruction,
508
+ *,
509
+ out_w: int,
510
+ out_h: int,
511
+ src_w: int = DEFAULT_SRC_W,
512
+ src_h: int = DEFAULT_SRC_H,
513
+ ) -> FilterPlan:
514
+ """1 chart + 1 person.
515
+
516
+ **Horizontal partition.** Chart occupies the left source strip, person the
517
+ right strip. When both bboxes are set (Gemini vision), the seam sits at
518
+ the midpoint between ``chart.x2`` and ``person.x1`` so the strips are
519
+ complementary (no overlap, no gap). Otherwise the seam defaults to a
520
+ 2/3 | 1/3 split (chart left, person right), matching the Ark-style
521
+ explainer-slide geometry this codebase was originally written against.
522
+
523
+ **Vertical crop.** Each strip's vertical extent comes from the
524
+ corresponding bbox when provided — crucial so the chart **fills** its
525
+ output band instead of being lost inside full-height source context
526
+ (plant, background, lower-third graphics, etc.). Falls back to full
527
+ source height when bboxes are unavailable.
528
+
529
+ **Output bands.** Controlled by :attr:`LayoutInstruction.top_band_ratio`
530
+ (default 0.5 = even 50/50 — the user-requested symmetric look). Focus
531
+ stack order picks chart-on-top (default) vs person-on-top.
532
+ """
533
+
534
+ top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)
535
+
536
+ chart_box = instruction.split_chart_region
537
+ person_box = instruction.split_person_region
538
+
539
+ if chart_box is not None and person_box is not None:
540
+ seam = _compute_seam(
541
+ left_box=chart_box, right_box=person_box, src_w=src_w, src_h=src_h
542
+ )
543
+ chart_start = 0
544
+ else:
545
+ # Historical default: chart = left 2/3, person = right 1/3 (the
546
+ # Ark-style explainer-slide geometry this codebase was originally
547
+ # written against). ``chart_x_norm`` trims the chart strip from its
548
+ # left edge when we have no vision bbox to do it precisely.
549
+ seam = _even(max(2, min(src_w - 2, int(round((2.0 / 3.0) * float(src_w))))))
550
+ trim = int(round(_clamp01(instruction.chart_x_norm) * float(seam)))
551
+ chart_start = _even(max(0, min(seam - 2, trim)))
552
+
553
+ chart_strip = _bbox_strip(
554
+ chart_box, src_w=src_w, src_h=src_h, x_start=chart_start, x_end=seam
555
+ )
556
+ if chart_box is not None:
557
+ chart_strip = _chart_strip_with_vertical_pad(chart_strip, src_h=src_h)
558
+ person_strip = _bbox_strip(
559
+ person_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w
560
+ )
561
+ return _emit_split(
562
+ chart_strip=chart_strip,
563
+ person_strip=person_strip,
564
+ order=instruction.focus_stack_order,
565
+ out_w=out_w,
566
+ top_h=top_h,
567
+ bot_h=bot_h,
568
+ )
569
+
570
+
571
+ def _emit_split(
572
+ *,
573
+ chart_strip: _SplitStrip,
574
+ person_strip: _SplitStrip,
575
+ order: FocusStackOrder,
576
+ out_w: int,
577
+ top_h: int,
578
+ bot_h: int,
579
+ ) -> FilterPlan:
580
+ if order == FocusStackOrder.CHART_THEN_PERSON:
581
+ fg = _stack_filtergraph(
582
+ top_strip=chart_strip,
583
+ bot_strip=person_strip,
584
+ out_w=out_w,
585
+ top_h=top_h,
586
+ bot_h=bot_h,
587
+ )
588
+ else:
589
+ fg = _stack_filtergraph(
590
+ top_strip=person_strip,
591
+ bot_strip=chart_strip,
592
+ out_w=out_w,
593
+ top_h=top_h,
594
+ bot_h=bot_h,
595
+ )
596
+ return FilterPlan(filtergraph=fg)
597
+
598
+
599
+ def plan_split_two_persons(
600
+ instruction: LayoutInstruction,
601
+ *,
602
+ out_w: int,
603
+ out_h: int,
604
+ src_w: int = DEFAULT_SRC_W,
605
+ src_h: int = DEFAULT_SRC_H,
606
+ ) -> FilterPlan:
607
+ """2 persons (interview two-up) stacked vertically.
608
+
609
+ First person = ``split_person_region``, second person = ``split_second_person_region``.
610
+ Seam sits at the midpoint between the two bboxes when both are known;
611
+ otherwise defaults to a centered 50/50 split.
612
+ """
613
+ top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)
614
+
615
+ left_box = instruction.split_person_region
616
+ right_box = instruction.split_second_person_region
617
+
618
+ seam = _compute_seam(
619
+ left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h
620
+ )
621
+
622
+ left_strip = _bbox_strip(
623
+ left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam
624
+ )
625
+ right_strip = _bbox_strip(
626
+ right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w
627
+ )
628
+ fg = _stack_filtergraph(
629
+ top_strip=left_strip,
630
+ bot_strip=right_strip,
631
+ out_w=out_w,
632
+ top_h=top_h,
633
+ bot_h=bot_h,
634
+ )
635
+ return FilterPlan(filtergraph=fg)
636
+
637
+
638
+ def plan_split_two_charts(
639
+ instruction: LayoutInstruction,
640
+ *,
641
+ out_w: int,
642
+ out_h: int,
643
+ src_w: int = DEFAULT_SRC_W,
644
+ src_h: int = DEFAULT_SRC_H,
645
+ ) -> FilterPlan:
646
+ """2 charts stacked vertically.
647
+
648
+ First chart = ``split_chart_region``, second chart = ``split_second_chart_region``.
649
+ Uses the same seam/bbox-y-crop recipe as the other splits, so each chart
650
+ fills its output band instead of being surrounded by source context.
651
+ """
652
+ top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)
653
+
654
+ left_box = instruction.split_chart_region
655
+ right_box = instruction.split_second_chart_region
656
+
657
+ seam = _compute_seam(
658
+ left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h
659
+ )
660
+
661
+ left_strip = _bbox_strip(
662
+ left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam
663
+ )
664
+ if left_box is not None:
665
+ left_strip = _chart_strip_with_vertical_pad(left_strip, src_h=src_h)
666
+ right_strip = _bbox_strip(
667
+ right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w
668
+ )
669
+ if right_box is not None:
670
+ right_strip = _chart_strip_with_vertical_pad(right_strip, src_h=src_h)
671
+ fg = _stack_filtergraph(
672
+ top_strip=left_strip,
673
+ bot_strip=right_strip,
674
+ out_w=out_w,
675
+ top_h=top_h,
676
+ bot_h=bot_h,
677
+ )
678
+ return FilterPlan(filtergraph=fg)
679
+
680
+
681
+ _DISPATCH = {
682
+ LayoutKind.ZOOM_CALL_CENTER: plan_zoom_call_center,
683
+ LayoutKind.SIT_CENTER: plan_sit_center,
684
+ LayoutKind.SPLIT_CHART_PERSON: plan_split_chart_person,
685
+ LayoutKind.SPLIT_TWO_PERSONS: plan_split_two_persons,
686
+ LayoutKind.SPLIT_TWO_CHARTS: plan_split_two_charts,
687
+ }
688
+
689
+
690
+ def plan_layout(
691
+ instruction: LayoutInstruction,
692
+ *,
693
+ out_w: int = 1080,
694
+ out_h: int = 1920,
695
+ src_w: int = DEFAULT_SRC_W,
696
+ src_h: int = DEFAULT_SRC_H,
697
+ ) -> FilterPlan:
698
+ """Dispatch to one of the five thrusters.
699
+
700
+ Exhaustive over :class:`LayoutKind` — adding a new layout requires adding
701
+ a planner above **and** an entry in :data:`_DISPATCH`.
702
+ """
703
+
704
+ fn = _DISPATCH.get(instruction.layout)
705
+ if fn is None:
706
+ raise ValueError(f"Unknown layout: {instruction.layout!r}")
707
+ return fn(instruction, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h)
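A minimal usage sketch of the dispatcher (dimensions match the defaults; the instruction values are illustrative):

    from humeo_core.primitives.layouts import plan_layout
    from humeo_core.schemas import LayoutInstruction, LayoutKind

    instr = LayoutInstruction(
        clip_id="001",
        layout=LayoutKind.SIT_CENTER,
        zoom=1.2,
        person_x_norm=0.55,
    )
    plan = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080)
    print(plan.filtergraph)  # ready to splice into an ffmpeg -filter_complex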
humeo-core/src/humeo_core/primitives/select_clips.py ADDED
@@ -0,0 +1,150 @@
1
+ """Clip selection: pick the strongest 30-60s segments from a long source.
2
+
3
+ Two backends, same contract:
4
+
5
+ * ``select_clips_heuristic`` — greedy word-density scoring. Uses the
6
+ transcript alone; zero model calls. Good baseline when transcript exists.
7
+ * ``select_clips_with_llm`` — pluggable LLM hook. Caller provides a
8
+ ``(prompt_text) -> str`` function that must return strict JSON matching
9
+ the ``ClipPlan`` schema. We re-validate before returning.
10
+
11
+ Both return a ``ClipPlan``.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from typing import Callable
18
+
19
+ from ..schemas import Clip, ClipPlan, TranscriptWord
20
+
21
+
22
+ LLMTextFn = Callable[[str], str]
23
+
24
+
25
+ CLIP_SELECTOR_PROMPT_TEMPLATE = """You are a viral-clip selector for a podcast editor.
26
+ Return ONLY JSON matching this shape:
27
+
28
+ {{
29
+ "source_path": "{source_path}",
30
+ "clips": [
31
+ {{
32
+ "clip_id": "001",
33
+ "topic": "<short topic>",
34
+ "start_time_sec": <float>,
35
+ "end_time_sec": <float>,
36
+ "viral_hook": "<one line>",
37
+ "virality_score": <0..1>,
38
+ "transcript": "<full clip transcript>",
39
+ "suggested_overlay_title": "<<=6 words>"
40
+ }}
41
+ ]
42
+ }}
43
+
44
+ Pick {target_count} clips, each {min_sec}-{max_sec} seconds long, NO overlaps, sorted by virality_score desc.
45
+
46
+ Transcript (word, start, end):
47
+ {transcript}
48
+ """
49
+
50
+
51
+ def _words_in_window(
52
+ words: list[TranscriptWord], start: float, end: float
53
+ ) -> list[TranscriptWord]:
54
+ return [w for w in words if w.start_time >= start and w.end_time <= end]
55
+
56
+
57
+ def select_clips_heuristic(
58
+ source_path: str,
59
+ words: list[TranscriptWord],
60
+ duration_sec: float,
61
+ *,
62
+ target_count: int = 5,
63
+ min_sec: float = 30.0,
64
+ max_sec: float = 60.0,
65
+ step_sec: float = 5.0,
66
+ ) -> ClipPlan:
67
+ """Greedy: slide a window, score by words/sec, take top non-overlapping picks."""
68
+
69
+ if duration_sec <= min_sec or not words:
70
+ # No sensible windowing possible; return one clip of the whole thing.
71
+ end = min(duration_sec, max_sec) if duration_sec > 0 else max_sec
72
+ return ClipPlan(
73
+ source_path=source_path,
74
+ clips=[
75
+ Clip(
76
+ clip_id="001",
77
+ topic="Full source",
78
+ start_time_sec=0.0,
79
+ end_time_sec=max(end, 1.0),
80
+ viral_hook="",
81
+ virality_score=0.5,
82
+ transcript=" ".join(w.word for w in words),
83
+ suggested_overlay_title="Highlight",
84
+ )
85
+ ],
86
+ )
87
+
88
+ candidates: list[tuple[float, float, float, str]] = []
89
+ window = (min_sec + max_sec) / 2.0
90
+ t = 0.0
91
+ while t + window <= duration_sec:
92
+ ws = _words_in_window(words, t, t + window)
93
+ if ws:
94
+ density = len(ws) / window
95
+ text = " ".join(w.word for w in ws)
96
+ candidates.append((density, t, t + window, text))
97
+ t += step_sec
98
+
99
+ candidates.sort(key=lambda c: c[0], reverse=True)
100
+ picked: list[tuple[float, float, float, str]] = []
101
+ for c in candidates:
102
+ if len(picked) >= target_count:
103
+ break
104
+ if all(c[2] <= p[1] or c[1] >= p[2] for p in picked):
105
+ picked.append(c)
106
+ picked.sort(key=lambda c: c[1])
107
+
108
+ clips: list[Clip] = []
109
+ for i, (density, s, e, text) in enumerate(picked, start=1):
110
+ norm = min(1.0, density / 3.0) # ~3 words/sec is dense talking
111
+ clips.append(
112
+ Clip(
113
+ clip_id=f"{i:03d}",
114
+ topic=text.split(".")[0][:60] or f"Clip {i}",
115
+ start_time_sec=round(s, 2),
116
+ end_time_sec=round(e, 2),
117
+ viral_hook=text[:120],
118
+ virality_score=round(norm, 3),
119
+ transcript=text,
120
+ suggested_overlay_title=(text.split(".")[0][:40] or f"Clip {i}"),
121
+ )
122
+ )
123
+ return ClipPlan(source_path=source_path, clips=clips)
124
+
125
+
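A sketch of the heuristic contract with synthetic words (real input comes from the ingest stage; the source path is illustrative):

    from humeo_core.primitives.select_clips import select_clips_heuristic
    from humeo_core.schemas import TranscriptWord

    # ~2.5 words/sec of steady speech for 120 seconds.
    words = [
        TranscriptWord(word=f"w{i}", start_time=i * 0.4, end_time=i * 0.4 + 0.3)
        for i in range(300)
    ]
    plan = select_clips_heuristic("/tmp/src.mp4", words, duration_sec=120.0, target_count=2)
    for clip in plan.clips:
        print(clip.clip_id, clip.start_time_sec, clip.end_time_sec, clip.virality_score)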
126
+ def select_clips_with_llm(
127
+ source_path: str,
128
+ words: list[TranscriptWord],
129
+ *,
130
+ target_count: int,
131
+ min_sec: float,
132
+ max_sec: float,
133
+ text_fn: LLMTextFn,
134
+ ) -> ClipPlan:
135
+ transcript_lines = "\n".join(
136
+ f"{w.word}\t{w.start_time:.2f}\t{w.end_time:.2f}" for w in words
137
+ )
138
+ prompt = CLIP_SELECTOR_PROMPT_TEMPLATE.format(
139
+ source_path=source_path,
140
+ target_count=target_count,
141
+ min_sec=min_sec,
142
+ max_sec=max_sec,
143
+ transcript=transcript_lines,
144
+ )
145
+ raw = text_fn(prompt)
146
+ try:
147
+ data = json.loads(raw)
148
+ except json.JSONDecodeError as e:
149
+ raise ValueError(f"LLM did not return JSON: {e}; raw={raw[:200]!r}") from e
150
+ return ClipPlan.model_validate(data)
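A sketch of the LLM hook with a canned ``text_fn`` (a real caller would wrap its provider SDK here; all values are illustrative):

    import json

    from humeo_core.primitives.select_clips import select_clips_with_llm
    from humeo_core.schemas import TranscriptWord

    def fake_llm(prompt: str) -> str:
        # A real text_fn would send `prompt` to a model. This stub returns
        # strict JSON matching the ClipPlan schema, as the contract requires.
        return json.dumps({
            "source_path": "/tmp/src.mp4",
            "clips": [{
                "clip_id": "001",
                "topic": "Opening hook",
                "start_time_sec": 12.0,
                "end_time_sec": 48.0,
                "viral_hook": "The claim nobody checks",
                "virality_score": 0.9,
                "transcript": "...",
                "suggested_overlay_title": "Nobody Checks This",
            }],
        })

    words = [TranscriptWord(word="hello", start_time=0.0, end_time=0.4)]
    plan = select_clips_with_llm(
        "/tmp/src.mp4", words, target_count=1, min_sec=30.0, max_sec=60.0, text_fn=fake_llm
    )
    print(plan.clips[0].topic)  # Opening hook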
humeo-core/src/humeo_core/primitives/vision.py ADDED
@@ -0,0 +1,210 @@
1
+ """Vision-LLM + OCR primitive — the alt path to per-scene framing decisions.
2
+
3
+ Design (Bryan's "big screen change -> v3 images -> LLM+OCR -> bbox" idea):
4
+
5
+ 1. Scene detection already produces one keyframe per scene (deterministic,
6
+ local, cheap). That is ``primitives/ingest.py::extract_keyframes``.
7
+ 2. For each keyframe, call a pluggable vision LLM with an OCR hint. The
8
+ model returns normalized bboxes for the on-screen roles it cares about
9
+ (``person``, ``chart``) plus any OCR text it reads.
10
+ 3. Fold those bboxes into ``LayoutInstruction`` values so the existing
11
+ layout planner (``primitives/layouts.py``) does the actual ffmpeg math.
12
+
13
+ Why this shape:
14
+
15
+ * **Pluggable**. Caller supplies ``LLMRegionFn``. We never hard-code a
16
+ provider. The same primitive works for Gemini, GPT-4o, internal models,
17
+ tests, or mocks.
18
+ * **Schema-validated**. Raw model output is parsed into ``SceneRegions``
19
+ (Pydantic). Malformed output degrades to ``None`` regions rather than
20
+ crashing or corrupting downstream state.
21
+ * **Separable**. ``detect_regions_with_llm`` is one function. Mapping
22
+ regions to ``LayoutInstruction`` is another. Mapping a ``LayoutKind``
23
+ guess from regions is a third. Each is independently testable.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import json
29
+ from typing import Callable
30
+
31
+ from ..schemas import (
32
+ BoundingBox,
33
+ LayoutInstruction,
34
+ LayoutKind,
35
+ Scene,
36
+ SceneClassification,
37
+ SceneRegions,
38
+ )
39
+
40
+
41
+ LLMRegionFn = Callable[[str, str], str]
42
+ """Signature: (keyframe_path, prompt) -> raw model string (expected JSON).
43
+
44
+ The caller is responsible for any image encoding (base64, multipart, etc.).
45
+ The primitive only passes the path + prompt and re-validates the reply.
46
+ """
47
+
48
+
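The adapter shape, as a sketch (``my_vision_fn`` and the elided provider call are hypothetical; anything satisfying the ``(path, prompt) -> str`` contract works):

    import base64

    def my_vision_fn(keyframe_path: str, prompt: str) -> str:
        with open(keyframe_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode("ascii")
        # ... send image_b64 + prompt to a vision model of your choice ...
        return '{"person_bbox": null, "chart_bbox": null, "ocr_text": "", "reason": "stub"}'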
49
+ REGION_PROMPT = """You are a vision+OCR system for a short-video editor.
50
+ Look at the provided keyframe and return a STRICT JSON object of this shape:
51
+
52
+ {
53
+ "person_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null,
54
+ "chart_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null,
55
+ "ocr_text": "<text visible on screen, empty string if none>",
56
+ "reason": "<= 20 words of rationale"
57
+ }
58
+
59
+ Rules:
60
+ - All bbox coordinates are normalized to the frame (0=left/top, 1=right/bottom).
61
+ - x2 > x1, y2 > y1.
62
+ - Return null for any region that is not present (e.g. a pure talking-head
63
+ scene has no chart).
64
+ - "person_bbox" is the *speaker's* body/head region if visible.
65
+ - "chart_bbox" is any chart, graph, slide, screenshare, or diagram.
66
+ - OCR text should be the readable text on screen (titles, labels, chart
67
+ axis values). Omit subtitle captions.
68
+ - NO markdown, NO prose outside JSON. JSON only.
69
+ """
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Core: detect regions per scene via pluggable LLM
74
+ # ---------------------------------------------------------------------------
75
+
76
+
77
+ def detect_regions_with_llm(
78
+ scenes: list[Scene], vision_fn: LLMRegionFn
79
+ ) -> list[SceneRegions]:
80
+ """Call ``vision_fn`` for each scene's keyframe and return parsed regions.
81
+
82
+ Parse failures degrade to an empty ``SceneRegions`` with ``raw_reason``
83
+ describing the error — never raise — so a single bad scene can't take
84
+ down the whole pipeline.
85
+ """
86
+
87
+ out: list[SceneRegions] = []
88
+ for s in scenes:
89
+ if not s.keyframe_path:
90
+ out.append(
91
+ SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available")
92
+ )
93
+ continue
94
+ raw = vision_fn(s.keyframe_path, REGION_PROMPT)
95
+ out.append(_parse_region_reply(s.scene_id, raw))
96
+ return out
97
+
98
+
99
+ def _parse_region_reply(scene_id: str, raw: str) -> SceneRegions:
100
+ try:
101
+ data = json.loads(raw)
102
+ except json.JSONDecodeError as e:
103
+ return SceneRegions(scene_id=scene_id, raw_reason=f"JSON parse error: {e!r}")
104
+
105
+ def _opt_bbox(value: object) -> BoundingBox | None:
106
+ if not value:
107
+ return None
108
+ try:
109
+ return BoundingBox.model_validate(value)
110
+ except Exception:
111
+ return None
112
+
113
+ return SceneRegions(
114
+ scene_id=scene_id,
115
+ person_bbox=_opt_bbox(data.get("person_bbox")),
116
+ chart_bbox=_opt_bbox(data.get("chart_bbox")),
117
+ ocr_text=str(data.get("ocr_text", ""))[:4000],
118
+ raw_reason=str(data.get("reason", ""))[:400],
119
+ )
120
+
121
+
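For instance, a model that answers in prose instead of JSON degrades cleanly (scene id and path are illustrative):

    from humeo_core.primitives.vision import detect_regions_with_llm
    from humeo_core.schemas import Scene

    scenes = [Scene(scene_id="s0", start_time=0.0, end_time=2.0, keyframe_path="/tmp/kf0.jpg")]

    def chatty_model(keyframe_path: str, prompt: str) -> str:
        return "Sure! Here are the boxes you asked for..."  # not JSON

    regions = detect_regions_with_llm(scenes, chatty_model)
    print(regions[0].person_bbox)  # None
    print(regions[0].raw_reason)   # JSON parse error: ...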
122
+ # ---------------------------------------------------------------------------
123
+ # Derivation: regions -> LayoutKind / LayoutInstruction
124
+ # ---------------------------------------------------------------------------
125
+
126
+
127
+ # Width threshold: if the chart bbox covers this much of the frame width, it
128
+ # is wide enough to treat the scene as a split_chart_person. Tuned for the
129
+ # source videos described in the spec (chart ~2/3 of width).
130
+ _CHART_WIDTH_SPLIT_THRESHOLD = 0.45
131
+
132
+
133
+ def classify_from_regions(regions: SceneRegions) -> SceneClassification:
134
+ """Pick a ``LayoutKind`` for a scene using only its ``SceneRegions``.
135
+
136
+ Priority:
137
+ 1. If ``chart_bbox`` is present and wide, it's ``SPLIT_CHART_PERSON``.
138
+ 2. Else if ``person_bbox`` is present and tight, ``ZOOM_CALL_CENTER``.
139
+ 3. Else default to ``SIT_CENTER`` with low confidence.
140
+
141
+ "Tight" ≈ the person covers more than half the frame width (zoom-call
142
+ webcam framing). "Wide" for a chart ≈ 45% of frame width or more.
143
+ """
144
+
145
+ if regions.chart_bbox and regions.chart_bbox.width >= _CHART_WIDTH_SPLIT_THRESHOLD:
146
+ return SceneClassification(
147
+ scene_id=regions.scene_id,
148
+ layout=LayoutKind.SPLIT_CHART_PERSON,
149
+ confidence=float(min(1.0, 0.5 + regions.chart_bbox.width / 2.0)),
150
+ reason=f"chart bbox covers {regions.chart_bbox.width:.2f} of width",
151
+ )
152
+ if regions.person_bbox and regions.person_bbox.width >= 0.5:
153
+ return SceneClassification(
154
+ scene_id=regions.scene_id,
155
+ layout=LayoutKind.ZOOM_CALL_CENTER,
156
+ confidence=float(min(1.0, 0.5 + regions.person_bbox.width / 2.0)),
157
+ reason=f"person bbox wide ({regions.person_bbox.width:.2f}) — tight framing",
158
+ )
159
+ if regions.person_bbox:
160
+ return SceneClassification(
161
+ scene_id=regions.scene_id,
162
+ layout=LayoutKind.SIT_CENTER,
163
+ confidence=0.7,
164
+ reason="person present, no wide chart, wider framing",
165
+ )
166
+ return SceneClassification(
167
+ scene_id=regions.scene_id,
168
+ layout=LayoutKind.SIT_CENTER,
169
+ confidence=0.3,
170
+ reason=regions.raw_reason or "no regions detected — defaulting to sit_center",
171
+ )
172
+
173
+
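A worked example of the priority order (bbox values are illustrative):

    from humeo_core.primitives.vision import classify_from_regions
    from humeo_core.schemas import BoundingBox, SceneRegions

    regions = SceneRegions(
        scene_id="s0",
        chart_bbox=BoundingBox(x1=0.02, y1=0.10, x2=0.62, y2=0.85),   # width 0.60 >= 0.45
        person_bbox=BoundingBox(x1=0.70, y1=0.20, x2=0.98, y2=0.95),
    )
    c = classify_from_regions(regions)
    print(c.layout)      # LayoutKind.SPLIT_CHART_PERSON
    print(c.confidence)  # min(1.0, 0.5 + 0.60 / 2) = 0.8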
174
+ def layout_instruction_from_regions(
175
+ regions: SceneRegions,
176
+ classification: SceneClassification,
177
+ *,
178
+ clip_id: str | None = None,
179
+ zoom: float = 1.0,
180
+ ) -> LayoutInstruction:
181
+ """Build a ``LayoutInstruction`` whose knobs are populated from bboxes.
182
+
183
+ ``person_x_norm`` uses the person bbox center when available; falls back
184
+ to 0.5 (center). ``chart_x_norm`` uses the chart bbox left edge; falls
185
+ back to 0.0.
186
+ """
187
+
188
+ person_x = regions.person_bbox.center_x if regions.person_bbox else 0.5
189
+ chart_x = regions.chart_bbox.x1 if regions.chart_bbox else 0.0
190
+ return LayoutInstruction(
191
+ clip_id=clip_id or classification.scene_id,
192
+ layout=classification.layout,
193
+ zoom=zoom,
194
+ person_x_norm=person_x,
195
+ chart_x_norm=chart_x,
196
+ )
197
+
198
+
199
+ def classify_scenes_with_vision_llm(
200
+ scenes: list[Scene], vision_fn: LLMRegionFn
201
+ ) -> list[tuple[SceneRegions, SceneClassification]]:
202
+ """One-shot helper: keyframes -> regions -> classifications.
203
+
204
+ Returns ``(regions, classification)`` pairs per scene so the caller can
205
+ keep both artefacts on disk (regions = deep detail, classification =
206
+ what a renderer consumes).
207
+ """
208
+
209
+ regions = detect_regions_with_llm(scenes, vision_fn)
210
+ return [(r, classify_from_regions(r)) for r in regions]
humeo-core/src/humeo_core/schemas.py ADDED
@@ -0,0 +1,518 @@
1
+ """Strict JSON contracts — the "container" of the rocket.
2
+
3
+ Every primitive reads and writes these. No primitive takes or returns free-form
4
+ strings. This is the non-negotiable interface described in the HIVE paper
5
+ guide (section 7): machine-checkable intermediate artifacts at every stage.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from enum import Enum
11
+ from typing import Literal
12
+
13
+ from pydantic import BaseModel, Field, field_validator, model_serializer, model_validator
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Extraction artifacts
18
+ # ---------------------------------------------------------------------------
19
+
20
+
21
+ class Scene(BaseModel):
22
+ """A single shot/scene detected in the source video."""
23
+
24
+ scene_id: str
25
+ start_time: float = Field(ge=0)
26
+ end_time: float = Field(gt=0)
27
+ keyframe_path: str | None = None
28
+
29
+ @field_validator("end_time")
30
+ @classmethod
31
+ def _end_after_start(cls, v: float, info) -> float:
32
+ start = info.data.get("start_time", 0.0)
33
+ if v <= start:
34
+ raise ValueError("end_time must be strictly greater than start_time")
35
+ return v
36
+
37
+ @property
38
+ def duration(self) -> float:
39
+ return self.end_time - self.start_time
40
+
41
+
42
+ class TranscriptWord(BaseModel):
43
+ """One ASR token with times in **seconds on the source video** timeline."""
44
+
45
+ word: str
46
+ start_time: float = Field(ge=0)
47
+ end_time: float = Field(ge=0)
48
+
49
+
50
+ class ClipSubtitleWords(BaseModel):
51
+ """Words for one clip with times in **seconds relative to clip start** (t=0 at cut in-point)."""
52
+
53
+ words: list[TranscriptWord] = Field(default_factory=list)
54
+
55
+
56
+ class FocusStackOrder(str, Enum):
57
+ """Vertical order for split layouts: which item occupies the top vs bottom band.
58
+
59
+ Bands are split by :attr:`LayoutInstruction.top_band_ratio` (default 0.5 = even).
60
+ For ``SPLIT_CHART_PERSON`` this picks chart-on-top vs person-on-top.
61
+ For ``SPLIT_TWO_PERSONS`` / ``SPLIT_TWO_CHARTS`` it has no visible meaning
62
+ (both bands hold the same kind of item); the enum value is retained only
63
+ so a single stacking recipe drives all three split layouts.
64
+ """
65
+
66
+ CHART_THEN_PERSON = "chart_then_person"
67
+ PERSON_THEN_CHART = "person_then_chart"
68
+
69
+
70
+ class RenderTheme(str, Enum):
71
+ """Visual treatment applied by the final renderer."""
72
+
73
+ LEGACY = "legacy"
74
+ REFERENCE_LOWER_THIRD = "reference_lower_third"
75
+ NATIVE_HIGHLIGHT = "native_highlight"
76
+
77
+
78
+ class IngestResult(BaseModel):
79
+ """Everything Stage 1 (deterministic local extraction) produces."""
80
+
81
+ source_path: str
82
+ duration_sec: float
83
+ scenes: list[Scene]
84
+ transcript_words: list[TranscriptWord]
85
+ keyframes_dir: str | None = None
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Layout system — the 5 "thrusters" (max 2 on-screen items per short)
90
+ # ---------------------------------------------------------------------------
91
+
92
+
93
+ class LayoutKind(str, Enum):
94
+ """The 9:16 layouts. A short contains **at most two** on-screen items.
95
+
96
+ An "item" is one of ``person`` (a human speaker) or ``chart`` (slide, graph,
97
+ data visual, screenshare). Five combinations are allowed:
98
+
99
+ - ``ZOOM_CALL_CENTER``: **1 person**, tight webcam/zoom-call framing, centered.
100
+ - ``SIT_CENTER``: **1 person**, interview/seated framing, centered.
101
+ - ``SPLIT_CHART_PERSON``: **1 chart + 1 person** — chart + speaker share the
102
+ source frame. Output stacks them vertically
103
+ (by default ``focus_stack_order`` = chart-on-top).
104
+ - ``SPLIT_TWO_PERSONS``: **2 persons** — two speakers (e.g. interview two-up).
105
+ Output stacks them vertically.
106
+ - ``SPLIT_TWO_CHARTS``: **2 charts** — two charts/slides side-by-side in source.
107
+ Output stacks them vertically.
108
+
109
+ The "max 2 items" constraint is the keep-it-simple rule: every rendered short
110
+ is either one item centered, or two items stacked top/bottom (evenly by default).
111
+ """
112
+
113
+ ZOOM_CALL_CENTER = "zoom_call_center"
114
+ SIT_CENTER = "sit_center"
115
+ SPLIT_CHART_PERSON = "split_chart_person"
116
+ SPLIT_TWO_PERSONS = "split_two_persons"
117
+ SPLIT_TWO_CHARTS = "split_two_charts"
118
+
119
+
120
+ # Layouts that stack two items vertically in the 9:16 output.
121
+ SPLIT_LAYOUTS: frozenset[LayoutKind] = frozenset(
122
+ {
123
+ LayoutKind.SPLIT_CHART_PERSON,
124
+ LayoutKind.SPLIT_TWO_PERSONS,
125
+ LayoutKind.SPLIT_TWO_CHARTS,
126
+ }
127
+ )
128
+
129
+
130
+ class TimedCenterPoint(BaseModel):
131
+ """Speaker x-center at a clip-relative time, used for tracked centering."""
132
+
133
+ t_sec: float = Field(ge=0.0)
134
+ x_norm: float = Field(ge=0.0, le=1.0)
135
+ zoom: float | None = Field(
136
+ default=None,
137
+ gt=0.0,
138
+ le=4.0,
139
+ description=(
140
+ "Optional per-sample crop zoom. When unset, the layout uses the "
141
+ "clip-level ``zoom`` value for that moment."
142
+ ),
143
+ )
144
+
145
+
146
+ class ClipRenderSpan(BaseModel):
147
+ """One kept source-timeline span inside a selected clip."""
148
+
149
+ start_time_sec: float = Field(ge=0.0)
150
+ end_time_sec: float = Field(gt=0.0)
151
+
152
+ @field_validator("end_time_sec")
153
+ @classmethod
154
+ def _end_after_start(cls, v: float, info) -> float:
155
+ start = info.data.get("start_time_sec", 0.0)
156
+ if v <= start:
157
+ raise ValueError("render span end_time_sec must be greater than start_time_sec")
158
+ return v
159
+
160
+ @property
161
+ def duration_sec(self) -> float:
162
+ return self.end_time_sec - self.start_time_sec
163
+
164
+
165
+ class LayoutInstruction(BaseModel):
166
+ """Per-clip decision telling the compiler which layout to apply and how to crop.
167
+
168
+ Every short is described by exactly one of these, keyed by ``clip_id``. Split
169
+ layouts additionally carry up to two normalized bounding boxes (chart/person
170
+ or two-of-a-kind) so the compiler crops source strips that **partition** the
171
+ source width without overlap or gap.
172
+ """
173
+
174
+ clip_id: str
175
+ layout: LayoutKind
176
+ # Optional per-layout knobs. Defaults are sane for a 1920x1080 source.
177
+ zoom: float = Field(default=1.0, gt=0, le=4.0)
178
+ person_x_norm: float = Field(
179
+ default=0.5,
180
+ ge=0.0,
181
+ le=1.0,
182
+ description="Normalized x-center of the human subject in source frame (0=left, 1=right).",
183
+ )
184
+ person_tracking: list[TimedCenterPoint] = Field(
185
+ default_factory=list,
186
+ description=(
187
+ "Optional clip-relative speaker framing samples for moving 9:16 crops. "
188
+ "Each point can shift the x-center and optionally widen/tighten the crop "
189
+ "for that moment. When empty, the compiler uses the static "
190
+ "person_x_norm/zoom settings."
191
+ ),
192
+ )
193
+ chart_x_norm: float = Field(
194
+ default=0.0,
195
+ ge=0.0,
196
+ le=1.0,
197
+ description=(
198
+ "split_chart_person only: left-edge trim of the chart strip, as a fraction of the "
199
+ "left 2/3 pane (0 = use full chart area)."
200
+ ),
201
+ )
202
+ focus_stack_order: FocusStackOrder = Field(
203
+ default=FocusStackOrder.CHART_THEN_PERSON,
204
+ description="For split_chart_person only: chart-on-top vs person-on-top in the 9:16 stack.",
205
+ )
206
+ split_chart_region: BoundingBox | None = Field(
207
+ default=None,
208
+ description=(
209
+ "Optional normalized rect for the chart/slide crop (Gemini vision). "
210
+ "When set with split_person_region, the split layout uses these boxes instead of fixed 2/3|1/3."
211
+ ),
212
+ )
213
+ split_person_region: BoundingBox | None = Field(
214
+ default=None,
215
+ description="Optional normalized rect for the speaker crop (Gemini vision).",
216
+ )
217
+ split_second_chart_region: BoundingBox | None = Field(
218
+ default=None,
219
+ description=(
220
+ "For ``SPLIT_TWO_CHARTS`` only: second chart bbox. The first chart occupies "
221
+ "the top output band, this one occupies the bottom band."
222
+ ),
223
+ )
224
+ split_second_person_region: BoundingBox | None = Field(
225
+ default=None,
226
+ description=(
227
+ "For ``SPLIT_TWO_PERSONS`` only: second speaker bbox. The first person "
228
+ "occupies the top output band, this one occupies the bottom band."
229
+ ),
230
+ )
231
+ top_band_ratio: float = Field(
232
+ default=0.5,
233
+ ge=0.2,
234
+ le=0.8,
235
+ description=(
236
+ "Fraction of 9:16 output height used by the top band for split layouts. "
237
+ "0.5 = EVEN 50/50 split (default — the user-requested symmetric look). "
238
+ "0.6 historically matched the 'chart dominant / person small' look."
239
+ ),
240
+ )
241
+
242
+
243
+ @field_validator("person_tracking")
244
+ @classmethod
245
+ def _tracking_times_non_decreasing(
246
+ cls, points: list[TimedCenterPoint]
247
+ ) -> list[TimedCenterPoint]:
248
+ last_t = -1.0
249
+ for point in points:
250
+ if point.t_sec < last_t:
251
+ raise ValueError("person_tracking times must be non-decreasing")
252
+ last_t = point.t_sec
253
+ return points
254
+
255
+
256
+ class SceneClassification(BaseModel):
257
+ """Result of the classifier: which layout should a given scene use."""
258
+
259
+ scene_id: str
260
+ layout: LayoutKind
261
+ confidence: float = Field(ge=0.0, le=1.0)
262
+ reason: str = ""
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # Vision bounding boxes — the LLM+OCR path (alt to pixel heuristics)
267
+ # ---------------------------------------------------------------------------
268
+
269
+
270
+ class BoundingBox(BaseModel):
271
+ """Normalized [0..1] bounding box in the source frame coordinate space.
272
+
273
+ Normalized coords keep these outputs portable across source resolutions
274
+ and stop the model hallucinating pixel values. ``x2 > x1`` and
275
+ ``y2 > y1`` are enforced.
276
+ """
277
+
278
+ x1: float = Field(ge=0.0, le=1.0)
279
+ y1: float = Field(ge=0.0, le=1.0)
280
+ x2: float = Field(ge=0.0, le=1.0)
281
+ y2: float = Field(ge=0.0, le=1.0)
282
+ label: str = ""
283
+ confidence: float = Field(default=1.0, ge=0.0, le=1.0)
284
+
285
+ @field_validator("x2")
286
+ @classmethod
287
+ def _x2_after_x1(cls, v: float, info) -> float:
288
+ x1 = info.data.get("x1", 0.0)
289
+ if v <= x1:
290
+ raise ValueError("x2 must be > x1")
291
+ return v
292
+
293
+ @field_validator("y2")
294
+ @classmethod
295
+ def _y2_after_y1(cls, v: float, info) -> float:
296
+ y1 = info.data.get("y1", 0.0)
297
+ if v <= y1:
298
+ raise ValueError("y2 must be > y1")
299
+ return v
300
+
301
+ @property
302
+ def center_x(self) -> float:
303
+ return (self.x1 + self.x2) / 2.0
304
+
305
+ @property
306
+ def center_y(self) -> float:
307
+ return (self.y1 + self.y2) / 2.0
308
+
309
+ @property
310
+ def width(self) -> float:
311
+ return self.x2 - self.x1
312
+
313
+
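For example (the validators reject degenerate boxes):

    from pydantic import ValidationError
    from humeo_core.schemas import BoundingBox

    box = BoundingBox(x1=0.25, y1=0.20, x2=0.75, y2=0.90, label="chart")
    print(box.width, box.center_x)  # 0.5 0.5

    try:
        BoundingBox(x1=0.60, y1=0.20, x2=0.50, y2=0.90)  # x2 <= x1
    except ValidationError as err:
        print("rejected:", err.errors()[0]["msg"])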
314
+ class SceneRegions(BaseModel):
315
+ """Vision-LLM output for a single scene keyframe.
316
+
317
+ Flow: detect a scene change locally (cheap) -> extract one keyframe per
318
+ scene -> send that keyframe to a vision LLM with an OCR hint -> get
319
+ normalized bounding boxes for the on-screen roles (``person``,
320
+ ``chart``). Those boxes drive ``person_x_norm`` / ``chart_x_norm`` on a
321
+ ``LayoutInstruction`` without any pixel code running in Python.
322
+ """
323
+
324
+ scene_id: str
325
+ person_bbox: BoundingBox | None = None
326
+ chart_bbox: BoundingBox | None = None
327
+ ocr_text: str = ""
328
+ raw_reason: str = ""
329
+
330
+
331
+ # ---------------------------------------------------------------------------
332
+ # Clip planning
333
+ # ---------------------------------------------------------------------------
334
+
335
+
336
+ class Clip(BaseModel):
337
+ clip_id: str
338
+ topic: str
339
+ start_time_sec: float = Field(ge=0)
340
+ end_time_sec: float = Field(gt=0)
341
+ viral_hook: str = ""
342
+ virality_score: float = Field(default=0.0, ge=0.0, le=1.0)
343
+ transcript: str = ""
344
+ suggested_overlay_title: str = ""
345
+ layout: LayoutKind | None = None
346
+ score_breakdown: dict[str, float] | None = None
347
+ origin: Literal["text", "visual", "both"] = "text"
348
+ visual_notes: str | None = None
349
+ reasoning: str | None = None
350
+
351
+ # Optional LLM metadata (source timeline is start_time_sec / end_time_sec).
352
+ hook_start_sec: float | None = Field(
353
+ default=None,
354
+ description="Seconds from clip in-point where the viral hook begins (0 = clip start).",
355
+ )
356
+ hook_end_sec: float | None = Field(
357
+ default=None,
358
+ description="Seconds from clip in-point where the hook ends (exclusive upper bound).",
359
+ )
360
+ trim_start_sec: float = Field(
361
+ default=0.0,
362
+ ge=0,
363
+ description="Seconds to remove from the start of this segment when exporting.",
364
+ )
365
+ trim_end_sec: float = Field(
366
+ default=0.0,
367
+ ge=0,
368
+ description="Seconds to remove from the end of this segment when exporting.",
369
+ )
370
+ render_spans: list[ClipRenderSpan] = Field(
371
+ default_factory=list,
372
+ description=(
373
+ "Optional ordered source-timeline spans to keep when exporting. "
374
+ "When present, these spans override contiguous trim_start/trim_end export."
375
+ ),
376
+ )
377
+ shorts_title: str = ""
378
+ description: str = ""
379
+ hashtags: list[str] = Field(default_factory=list)
380
+ layout_hint: LayoutKind | None = None
381
+ needs_review: bool = False
382
+ review_reason: str = ""
383
+
384
+ @field_validator("score_breakdown")
385
+ @classmethod
386
+ def _score_breakdown_in_range(
387
+ cls, v: dict[str, float] | None
388
+ ) -> dict[str, float] | None:
389
+ if v is None:
390
+ return None
391
+ cleaned: dict[str, float] = {}
392
+ for axis, score in v.items():
393
+ if score < 0.0:
394
+ raise ValueError(f"score_breakdown[{axis!r}] must be non-negative")
395
+ cleaned[axis] = min(score, 1.0)
396
+ return cleaned
397
+
398
+ @model_validator(mode="after")
399
+ def _timing_consistency(self) -> "Clip":
400
+ if self.end_time_sec <= self.start_time_sec:
401
+ raise ValueError("end_time_sec must be greater than start_time_sec")
402
+ dur = self.end_time_sec - self.start_time_sec
403
+ hs, he = self.hook_start_sec, self.hook_end_sec
404
+ if (hs is None) ^ (he is None):
405
+ raise ValueError("hook_start_sec and hook_end_sec must both be set or both omitted")
406
+ if hs is not None and he is not None:
407
+ if not (0 <= hs < he <= dur):
408
+ raise ValueError(
409
+ "hook window must satisfy 0 <= hook_start_sec < hook_end_sec <= clip duration"
410
+ )
411
+ if self.trim_start_sec + self.trim_end_sec > dur:
412
+ raise ValueError("trim_start_sec + trim_end_sec must not exceed clip duration")
413
+ last_end = None
414
+ for span in self.render_spans:
415
+ if span.start_time_sec < self.start_time_sec - 1e-6:
416
+ raise ValueError("render_spans must stay within the clip start_time_sec")
417
+ if span.end_time_sec > self.end_time_sec + 1e-6:
418
+ raise ValueError("render_spans must stay within the clip end_time_sec")
419
+ if last_end is not None and span.start_time_sec < last_end - 1e-6:
420
+ raise ValueError("render_spans must be ordered and non-overlapping")
421
+ last_end = span.end_time_sec
422
+ return self
423
+
424
+ @model_serializer(mode="wrap")
425
+ def _serialize_without_default_extensions(self, handler):
426
+ data = handler(self)
427
+ if data.get("score_breakdown") is None:
428
+ data.pop("score_breakdown", None)
429
+ if data.get("origin") == "text":
430
+ data.pop("origin", None)
431
+ if data.get("visual_notes") is None:
432
+ data.pop("visual_notes", None)
433
+ if data.get("reasoning") is None:
434
+ data.pop("reasoning", None)
435
+ return data
436
+
437
+ @property
438
+ def duration_sec(self) -> float:
439
+ return self.end_time_sec - self.start_time_sec
440
+
441
+
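A sketch of the timing rules in action (values are illustrative):

    from pydantic import ValidationError
    from humeo_core.schemas import Clip

    clip = Clip(
        clip_id="001", topic="Hook",
        start_time_sec=10.0, end_time_sec=45.0,  # 35 s on the source timeline
        hook_start_sec=0.0, hook_end_sec=5.0,    # clip-relative, fits inside 35 s
    )
    print(clip.duration_sec)  # 35.0

    try:
        Clip(clip_id="002", topic="Bad", start_time_sec=10.0, end_time_sec=45.0,
             hook_start_sec=0.0, hook_end_sec=40.0)  # hook window exceeds duration
    except ValidationError as err:
        print("rejected:", err.errors()[0]["msg"])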
442
+ class ClipPlan(BaseModel):
443
+ """Output of the clip-selection stage — a list of clips + their layouts."""
444
+
445
+ source_path: str
446
+ clips: list[Clip]
447
+
448
+
449
+ class ApprovalResult(BaseModel):
450
+ action: Literal["proceed", "refine", "quit", "accept_all"]
451
+ selected_ids: list[str] | None = None
452
+ steering_note: str | None = None
453
+
454
+
455
+ class RatingFeedback(BaseModel):
456
+ rating: Literal[1, 2, 3]
457
+ issues: list[
458
+ Literal[
459
+ "wrong_moments",
460
+ "bad_cuts",
461
+ "boring",
462
+ "confusing",
463
+ "wrong_layout",
464
+ "length_off",
465
+ "other",
466
+ ]
467
+ ] = Field(default_factory=list)
468
+ free_text: str | None = None
469
+
470
+
471
+ class SessionState(BaseModel):
472
+ source_key: str = ""
473
+ iteration: int = 0
474
+ steering_notes: list[str] = Field(default_factory=list)
475
+ last_rating: RatingFeedback | None = None
476
+ last_selected_ids: list[str] | None = None
477
+
478
+
479
+ # ---------------------------------------------------------------------------
480
+ # Render
481
+ # ---------------------------------------------------------------------------
482
+
483
+
484
+ class RenderRequest(BaseModel):
485
+ source_path: str
486
+ clip: Clip
487
+ layout: LayoutInstruction
488
+ output_path: str
489
+ width: int = 1080
490
+ height: int = 1920
491
+ subtitle_path: str | None = None
492
+ subtitle_font_size: int = Field(
493
+ default=48,
494
+ ge=10,
495
+ le=120,
496
+ description=(
497
+ "Caption font size in **output pixels** (libass is pinned to "
498
+ "``original_size=width x height`` by the compiler, so this is a "
499
+ "true pixel value, not the old PlayResY=288 unit)."
500
+ ),
501
+ )
502
+ subtitle_margin_v: int = Field(
503
+ default=160,
504
+ ge=0,
505
+ le=800,
506
+ description="Vertical caption margin in output pixels (bottom-anchored).",
507
+ )
508
+ title_text: str = ""
509
+ render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT
510
+ mode: Literal["normal", "dry_run"] = "normal"
511
+
512
+
513
+ class RenderResult(BaseModel):
514
+ clip_id: str
515
+ output_path: str
516
+ ffmpeg_cmd: list[str]
517
+ success: bool
518
+ error: str = ""
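A minimal dry-run request, as a sketch (paths are illustrative):

    from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RenderRequest

    req = RenderRequest(
        source_path="/tmp/src.mp4",
        clip=Clip(clip_id="001", topic="Hook", start_time_sec=10.0, end_time_sec=40.0),
        layout=LayoutInstruction(clip_id="001", layout=LayoutKind.SIT_CENTER),
        output_path="/tmp/out.mp4",
        mode="dry_run",  # build the ffmpeg command without executing it
    )
    print(req.model_dump_json()[:80])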
humeo-core/src/humeo_core/server.py ADDED
@@ -0,0 +1,332 @@
1
+ """FastMCP server — the control panel for the reusable rocket.
2
+
3
+ Every primitive is exposed as a single MCP ``tool``. Each tool takes and
4
+ returns strict Pydantic-validated JSON, so an MCP client (Cursor, Claude
5
+ Desktop, etc.) can compose a full long-to-short pipeline without guessing
6
+ any interface.
7
+
8
+ Tools:
9
+
10
+ humeo.ingest — Stage 1 extraction (scenes + keyframes [+ transcript])
11
+ humeo.classify_scenes — Assign one of 5 layouts to each scene (pixel heuristic)
12
+ humeo.classify_scenes_with_vision — Assign layouts using bboxes from a vision LLM + OCR
13
+ humeo.detect_scene_regions — Raw LLM bbox output per scene keyframe (OCR-assisted)
14
+ humeo.select_clips — Pick top clips from a transcript (heuristic)
15
+ humeo.plan_layout — Return the ffmpeg filtergraph for a given layout
16
+ humeo.build_render_cmd — Build the full ffmpeg command (dry-run safe)
17
+ humeo.render_clip — Build + actually run ffmpeg to produce a 9:16 clip
18
+ humeo.list_layouts — List the 5 available layouts (discovery)
19
+
20
+ Resources:
21
+
22
+ humeo://layouts — JSON listing of the 5 layouts + description
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ from typing import Any
29
+
30
+ from mcp.server.fastmcp import FastMCP
31
+
32
+ from .primitives import classify as classify_mod
33
+ from .primitives import compile as compile_mod
34
+ from .primitives import ingest as ingest_mod
35
+ from .primitives import layouts as layouts_mod
36
+ from .primitives import select_clips as select_mod
37
+ from .primitives import vision as vision_mod
38
+ from .schemas import (
39
+ IngestResult,
40
+ LayoutInstruction,
41
+ LayoutKind,
42
+ RenderRequest,
43
+ RenderResult,
44
+ Scene,
45
+ SceneRegions,
46
+ TranscriptWord,
47
+ )
48
+
49
+
50
+ mcp = FastMCP(
51
+ "humeo-core",
52
+ instructions=(
53
+ "Humeo MCP: reusable primitives for turning long videos into 9:16 shorts. "
54
+ "Compose tools in this order: ingest -> classify_scenes -> select_clips -> "
55
+ "plan_layout/build_render_cmd -> render_clip. All IO is strict JSON."
56
+ ),
57
+ )
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Discovery
62
+ # ---------------------------------------------------------------------------
63
+
64
+
65
+ @mcp.tool()
66
+ def list_layouts() -> dict[str, Any]:
67
+ """Return the 5 fixed 9:16 layouts this server supports.
68
+
69
+ Every short shows **at most two** on-screen items (person/chart), which
70
+ gives exactly five recipes. Use this to discover the set of
71
+ :class:`LayoutKind` values before classifying scenes or requesting
72
+ renders.
73
+ """
74
+
75
+ return {
76
+ "layouts": [
77
+ {
78
+ "kind": LayoutKind.ZOOM_CALL_CENTER.value,
79
+ "items": ["person"],
80
+ "description": "1 person, tight zoom-call / webcam framing, centered.",
81
+ },
82
+ {
83
+ "kind": LayoutKind.SIT_CENTER.value,
84
+ "items": ["person"],
85
+ "description": "1 person, interview / seated framing, centered.",
86
+ },
87
+ {
88
+ "kind": LayoutKind.SPLIT_CHART_PERSON.value,
89
+ "items": ["chart", "person"],
90
+ "description": (
91
+ "1 chart + 1 person. Source is partitioned left/right by the chart and "
92
+ "person bboxes (falling back to a 2/3 | 1/3 split); each strip is scaled "
93
+ "to fill its output band. Bands default to an even 50/50 vertical split; "
94
+ "configurable via ``top_band_ratio`` and swappable via ``focus_stack_order``."
95
+ ),
96
+ },
97
+ {
98
+ "kind": LayoutKind.SPLIT_TWO_PERSONS.value,
99
+ "items": ["person", "person"],
100
+ "description": (
101
+ "2 people (interview two-up / panel). Left speaker in the top band, right "
102
+ "speaker in the bottom band; seam sits between the two person bboxes."
103
+ ),
104
+ },
105
+ {
106
+ "kind": LayoutKind.SPLIT_TWO_CHARTS.value,
107
+ "items": ["chart", "chart"],
108
+ "description": (
109
+ "2 charts / slides side-by-side in source. Left chart on top, right chart "
110
+ "on bottom; each is scaled to fill its band."
111
+ ),
112
+ },
113
+ ]
114
+ }
115
+
116
+
117
+ @mcp.resource("humeo://layouts")
118
+ def layouts_resource() -> str:
119
+ return json.dumps(list_layouts(), indent=2)
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Landing gear: ingest
124
+ # ---------------------------------------------------------------------------
125
+
126
+
127
+ @mcp.tool()
128
+ def ingest(
129
+ source_path: str,
130
+ work_dir: str,
131
+ with_transcript: bool = False,
132
+ whisper_model: str = "base",
133
+ ) -> dict[str, Any]:
134
+ """Run deterministic local extraction (scenes + keyframes, optional transcript).
135
+
136
+ Args:
137
+ source_path: absolute path to a local video file.
138
+ work_dir: directory where keyframes/ and temp artifacts will be written.
139
+ with_transcript: if True, run faster-whisper word-level transcription.
140
+ whisper_model: whisper model name (e.g. "tiny", "base", "small").
141
+ """
142
+
143
+ result: IngestResult = ingest_mod.ingest(
144
+ source_path,
145
+ work_dir,
146
+ with_transcript=with_transcript,
147
+ whisper_model=whisper_model,
148
+ )
149
+ return result.model_dump()
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # Pilot: classify scenes
154
+ # ---------------------------------------------------------------------------
155
+
156
+
157
+ @mcp.tool()
158
+ def classify_scenes(scenes: list[dict[str, Any]]) -> dict[str, Any]:
159
+ """Classify each scene into exactly one of the 5 supported layouts.
160
+
161
+ Uses an offline pixel heuristic on each scene's keyframe. Agents that
162
+ want a smarter classifier can post-process or overwrite the result,
163
+ or call ``classify_scenes_with_vision`` with bboxes from a vision LLM.
164
+ """
165
+
166
+ parsed = [Scene.model_validate(s) for s in scenes]
167
+ results = classify_mod.classify_scenes_heuristic(parsed)
168
+ return {"classifications": [r.model_dump() for r in results]}
169
+
170
+
171
+ # ---------------------------------------------------------------------------
172
+ # Pilot (alt path): vision-LLM + OCR bbox classifier
173
+ # ---------------------------------------------------------------------------
174
+
175
+
176
+ @mcp.tool()
177
+ def detect_scene_regions(scenes: list[dict[str, Any]]) -> dict[str, Any]:
178
+ """Return the prompt + per-scene stubs used for LLM+OCR bbox detection.
179
+
180
+ This tool is the *adapter* half of the vision primitive. The MCP server
181
+ itself never calls an LLM — the agent does. So this endpoint returns:
182
+
183
+ 1. the exact ``REGION_PROMPT`` to send along with each keyframe, and
184
+ 2. a list of ``{scene_id, keyframe_path, prompt}`` jobs.
185
+
186
+ The agent runs its own vision model for each job, then feeds the
187
+ resulting JSON back via ``classify_scenes_with_vision``.
188
+ """
189
+
190
+ parsed = [Scene.model_validate(s) for s in scenes]
191
+ return {
192
+ "prompt": vision_mod.REGION_PROMPT,
193
+ "jobs": [
194
+ {
195
+ "scene_id": s.scene_id,
196
+ "keyframe_path": s.keyframe_path,
197
+ "prompt": vision_mod.REGION_PROMPT,
198
+ }
199
+ for s in parsed
200
+ ],
201
+ }
202
+
203
+
204
+ @mcp.tool()
205
+ def classify_scenes_with_vision(regions: list[dict[str, Any]]) -> dict[str, Any]:
206
+ """Classify scenes from already-gathered ``SceneRegions`` bbox records.
207
+
208
+ Input is a list of ``SceneRegions`` JSON dicts (output of the agent's
209
+ vision-LLM pass). Output is a ``{classifications, layout_instructions}``
210
+ pair — the layout kind per scene plus a ready-to-render
211
+ ``LayoutInstruction`` with ``person_x_norm`` / ``chart_x_norm`` already
212
+ populated from the bboxes.
213
+ """
214
+
215
+ parsed_regions = [SceneRegions.model_validate(r) for r in regions]
216
+ classifications = [vision_mod.classify_from_regions(r) for r in parsed_regions]
217
+ instructions = [
218
+ vision_mod.layout_instruction_from_regions(r, c)
219
+ for r, c in zip(parsed_regions, classifications)
220
+ ]
221
+ return {
222
+ "classifications": [c.model_dump() for c in classifications],
223
+ "layout_instructions": [i.model_dump() for i in instructions],
224
+ }
225
+
226
+
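A sketch of the round-trip, calling the tool functions in-process the same way ``layouts_resource`` does above (the vision call in the middle belongs to the agent; bbox values are illustrative):

    jobs = detect_scene_regions([
        {"scene_id": "s0", "start_time": 0.0, "end_time": 2.0,
         "keyframe_path": "/tmp/kf0.jpg"},
    ])
    # The agent runs its own vision model on each job["keyframe_path"] with
    # job["prompt"], collects SceneRegions-shaped JSON, then feeds it back:
    result = classify_scenes_with_vision([
        {"scene_id": "s0",
         "chart_bbox": {"x1": 0.02, "y1": 0.10, "x2": 0.62, "y2": 0.85},
         "person_bbox": {"x1": 0.70, "y1": 0.20, "x2": 0.98, "y2": 0.95},
         "ocr_text": "Q3 revenue", "raw_reason": "chart left, speaker right"},
    ])
    print(result["classifications"][0]["layout"])  # LayoutKind.SPLIT_CHART_PERSON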
227
+ # ---------------------------------------------------------------------------
228
+ # Pilot: select clips
229
+ # ---------------------------------------------------------------------------
230
+
231
+
232
+ @mcp.tool()
233
+ def select_clips(
234
+ source_path: str,
235
+ transcript_words: list[dict[str, Any]],
236
+ duration_sec: float,
237
+ target_count: int = 5,
238
+ min_sec: float = 30.0,
239
+ max_sec: float = 60.0,
240
+ ) -> dict[str, Any]:
241
+ """Heuristically select top clips from a word-level transcript.
242
+
243
+ Scoring is word-density per window. Returns a ``ClipPlan`` with up to
244
+ ``target_count`` non-overlapping clips.
245
+ """
246
+
247
+ words = [TranscriptWord.model_validate(w) for w in transcript_words]
248
+ plan = select_mod.select_clips_heuristic(
249
+ source_path,
250
+ words,
251
+ duration_sec,
252
+ target_count=target_count,
253
+ min_sec=min_sec,
254
+ max_sec=max_sec,
255
+ )
256
+ return plan.model_dump()
257
+
258
+
259
+ # ---------------------------------------------------------------------------
260
+ # Thrusters: plan + render
261
+ # ---------------------------------------------------------------------------
262
+
263
+
264
+ @mcp.tool()
265
+ def plan_layout(
266
+ layout: str,
267
+ out_w: int = 1080,
268
+ out_h: int = 1920,
269
+ src_w: int = 1920,
270
+ src_h: int = 1080,
271
+ zoom: float = 1.0,
272
+ person_x_norm: float = 0.5,
273
+ chart_x_norm: float = 0.0,
274
+ clip_id: str = "preview",
275
+ ) -> dict[str, Any]:
276
+ """Return the ffmpeg filter_complex fragment for one layout.
277
+
278
+ This is the pure, deterministic function underpinning the 5 thrusters.
279
+ No rendering is performed. Useful for agents that want to preview the
280
+ filtergraph or compose it with their own ffmpeg invocation.
281
+ """
282
+
283
+ instr = LayoutInstruction(
284
+ clip_id=clip_id,
285
+ layout=LayoutKind(layout),
286
+ zoom=zoom,
287
+ person_x_norm=person_x_norm,
288
+ chart_x_norm=chart_x_norm,
289
+ )
290
+ fp = layouts_mod.plan_layout(instr, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h)
291
+ return {"filtergraph": fp.filtergraph, "out_label": fp.out_label}
292
+
293
+
294
+ @mcp.tool()
295
+ def build_render_cmd(request: dict[str, Any]) -> dict[str, Any]:
296
+ """Build (but do NOT run) the ffmpeg command for a render request.
297
+
298
+ ``request`` must conform to the ``RenderRequest`` schema. This is a
299
+ dry-run helper so an agent can review the command before executing it.
300
+ """
301
+
302
+ req = RenderRequest.model_validate({**request, "mode": "dry_run"})
303
+ result = compile_mod.render_clip(req)
304
+ return result.model_dump()
305
+
306
+
307
+ @mcp.tool()
308
+ def render_clip(request: dict[str, Any]) -> dict[str, Any]:
309
+ """Render a single 9:16 clip with the specified layout.
310
+
311
+ ``request`` must conform to ``RenderRequest``. If ``request.mode`` is
312
+ ``"dry_run"`` the ffmpeg command is returned without execution.
313
+ """
314
+
315
+ req = RenderRequest.model_validate(request)
316
+ result: RenderResult = compile_mod.render_clip(req)
317
+ return result.model_dump()
318
+
319
+
320
+ # ---------------------------------------------------------------------------
321
+ # Entrypoint
322
+ # ---------------------------------------------------------------------------
323
+
324
+
325
+ def main() -> None:
326
+ """stdio entrypoint for ``humeo-core`` console-script."""
327
+
328
+ mcp.run()
329
+
330
+
331
+ if __name__ == "__main__":
332
+ main()
humeo-core/tests/__init__.py ADDED
File without changes
humeo-core/tests/test_classify.py ADDED
@@ -0,0 +1,39 @@
1
+ import json
2
+
3
+ from humeo_core.primitives.classify import (
4
+ classify_scenes_heuristic,
5
+ classify_scenes_with_llm,
6
+ )
7
+ from humeo_core.schemas import LayoutKind, Scene
8
+
9
+
10
+ def test_heuristic_no_keyframe_defaults_sit_center():
11
+ scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path=None)]
12
+ result = classify_scenes_heuristic(scenes)
13
+ assert len(result) == 1
14
+ assert result[0].scene_id == "s0"
15
+ assert result[0].layout == LayoutKind.SIT_CENTER
16
+
17
+
18
+ def test_llm_classifier_uses_callback_and_validates():
19
+ scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")]
20
+
21
+ def fake_vision(image_path: str, prompt: str) -> str:
22
+ return json.dumps(
23
+ {"layout": "split_chart_person", "confidence": 0.88, "reason": "chart left"}
24
+ )
25
+
26
+ result = classify_scenes_with_llm(scenes, fake_vision)
27
+ assert result[0].layout == LayoutKind.SPLIT_CHART_PERSON
28
+ assert result[0].confidence == 0.88
29
+
30
+
31
+ def test_llm_classifier_parse_error_is_safe():
32
+ scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")]
33
+
34
+ def bad_vision(image_path: str, prompt: str) -> str:
35
+ return "not json"
36
+
37
+ result = classify_scenes_with_llm(scenes, bad_vision)
38
+ assert result[0].layout == LayoutKind.SIT_CENTER
39
+ assert "parse error" in result[0].reason.lower()
humeo-core/tests/test_compile.py ADDED
@@ -0,0 +1,329 @@
1
+ from pathlib import Path
2
+
3
+ from humeo_core.primitives import compile as compile_mod
4
+ from humeo_core.primitives.compile import (
5
+ _ensure_windows_fontconfig,
6
+ build_ffmpeg_cmd,
7
+ plan_title_drawtext,
8
+ )
9
+ from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RenderRequest, RenderTheme
10
+
11
+
12
+ def _req(**overrides):
13
+ c = Clip(clip_id="1", topic="t", start_time_sec=10.0, end_time_sec=40.0)
14
+ li = LayoutInstruction(clip_id="1", layout=LayoutKind.SIT_CENTER)
15
+ data = dict(
16
+ source_path="/tmp/src.mp4",
17
+ clip=c,
18
+ layout=li,
19
+ output_path="/tmp/out.mp4",
20
+ render_theme=RenderTheme.LEGACY,
21
+ mode="dry_run",
22
+ )
23
+ data.update(overrides)
24
+ return RenderRequest(**data)
25
+
26
+
27
+ def test_ffmpeg_cmd_has_ss_duration_filtergraph_output():
28
+ cmd = build_ffmpeg_cmd(_req())
29
+ assert "-ss" in cmd
30
+ assert "-t" in cmd
31
+ assert "-filter_complex" in cmd
32
+ # duration = 30.0
33
+ t_idx = cmd.index("-t")
34
+ assert float(cmd[t_idx + 1]) == 30.0
35
+ ss_idx = cmd.index("-ss")
36
+ assert float(cmd[ss_idx + 1]) == 10.0
37
+ assert cmd[-1] == "/tmp/out.mp4"
38
+
39
+
40
+ def test_title_text_injects_drawtext():
41
+ cmd = build_ffmpeg_cmd(_req(title_text="Hello: world's"))
42
+ fg = cmd[cmd.index("-filter_complex") + 1]
43
+ assert "drawtext" in fg
44
+ # colon should be escaped
45
+ assert "Hello\\:" in fg
46
+ assert "worlds" in fg
47
+ assert "world's" not in fg
48
+ assert "expansion=none" in fg
49
+
50
+
51
+ def test_map_vout_and_primary_audio():
52
+ cmd = build_ffmpeg_cmd(_req())
53
+ assert "[vout]" in cmd
54
+ assert "0:a:0" in cmd
55
+
56
+
57
+ def test_subtitle_style_uses_requested_font_and_margin():
58
+ cmd = build_ffmpeg_cmd(
59
+ _req(subtitle_path="/tmp/clip.srt", subtitle_font_size=18, subtitle_margin_v=64)
60
+ )
61
+ fg = cmd[cmd.index("-filter_complex") + 1]
62
+ assert "subtitles='" in fg
63
+ assert "FontSize=18" in fg
64
+ assert "MarginV=64" in fg
65
+ # Smart word wrap so long captions break into multiple readable lines.
66
+ assert "WrapStyle=0" in fg
67
+
68
+
69
+ def test_subtitle_original_size_pins_libass_to_output_resolution():
70
+ """Without original_size=W x H, libass uses PlayResY=288 and blows up fonts/margins.
71
+
72
+ This is the root cause of the "subtitles floating in the middle of the
73
+ frame / blocked" bug the user reported.
74
+ """
75
+ cmd = build_ffmpeg_cmd(_req(subtitle_path="/tmp/clip.srt"))
76
+ fg = cmd[cmd.index("-filter_complex") + 1]
77
+ assert "original_size=1080x1920" in fg
78
+
79
+
80
+ def test_subtitles_applied_after_crop_and_title():
81
+ """Order: crop/compose -> drawtext title -> subtitles.
82
+
83
+ The pipeline must crop **first**, then draw text on the finished frame.
84
+ """
85
+ cmd = build_ffmpeg_cmd(
86
+ _req(title_text="Hook", subtitle_path="/tmp/clip.srt")
87
+ )
88
+ fg = cmd[cmd.index("-filter_complex") + 1]
89
+ crop_pos = fg.index("[0:v]crop=")
90
+ drawtext_pos = fg.index("drawtext")
91
+ subs_pos = fg.index("subtitles=")
92
+ assert crop_pos < drawtext_pos < subs_pos
93
+
94
+
95
+ def test_build_is_layout_specific():
96
+ c = Clip(clip_id="1", topic="t", start_time_sec=0, end_time_sec=10)
97
+ split_req = _req(
98
+ clip=c,
99
+ layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON),
100
+ )
101
+ cmd = build_ffmpeg_cmd(split_req)
102
+ fg = cmd[cmd.index("-filter_complex") + 1]
103
+ assert "vstack" in fg
104
+
105
+
106
+ def test_title_is_suppressed_on_split_layouts():
107
+ """Split layouts already contain a slide/chart with its own title.
108
+
109
+ Overlaying an additional drawtext title just obscures content -- that's
110
+ what was happening in the Cathy Wood "chart overlaps subject" report.
111
+ """
112
+ for kind in (
113
+ LayoutKind.SPLIT_CHART_PERSON,
114
+ LayoutKind.SPLIT_TWO_PERSONS,
115
+ LayoutKind.SPLIT_TWO_CHARTS,
116
+ ):
117
+ cmd = build_ffmpeg_cmd(
118
+ _req(
119
+ layout=LayoutInstruction(clip_id="1", layout=kind),
120
+ title_text="This should not render",
121
+ )
122
+ )
123
+ fg = cmd[cmd.index("-filter_complex") + 1]
124
+ assert "drawtext" not in fg, f"title leaked into split layout {kind}"
125
+
126
+
127
+ def test_title_is_drawn_on_single_subject_layouts():
128
+ """Titles are still rendered on ZOOM_CALL_CENTER and SIT_CENTER."""
129
+ for kind in (LayoutKind.ZOOM_CALL_CENTER, LayoutKind.SIT_CENTER):
130
+ cmd = build_ffmpeg_cmd(
131
+ _req(
132
+ layout=LayoutInstruction(clip_id="1", layout=kind),
133
+ title_text="Hook title",
134
+ )
135
+ )
136
+ fg = cmd[cmd.index("-filter_complex") + 1]
137
+ assert "drawtext=text='Hook title'" in fg
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Title wrapping / auto-shrink (P2: fixes the "Prediction Markets vs
142
+ # Derivatives" clipped-title bug reported against the Cathy Wood run).
143
+ # ---------------------------------------------------------------------------
144
+
145
+
146
+ def test_plan_title_short_stays_single_line_at_72px():
147
+ """Backward compat: short titles keep the pre-P2 single-drawtext form.
148
+
149
+ Byte-identical output for short titles is important because it keeps
150
+ previously-calibrated visual output unchanged and avoids needless cache
151
+ churn on existing renders.
152
+ """
153
+ frag = plan_title_drawtext("Hook title", out_w=1080)
154
+ assert frag is not None
155
+ assert frag.count("drawtext=") == 1
156
+ assert "fontsize=72" in frag
157
+ assert "y=80" in frag
158
+ assert "drawtext=text='Hook title'" in frag
159
+
160
+
161
+ def test_plan_title_long_wraps_to_two_lines_below_72px():
162
+ """Long titles wrap at the best word boundary and shrink to fit.
163
+
164
+ "Prediction Markets vs Derivatives" is 33 chars — it overflows a 1080px
165
+ canvas at 72px. It must wrap into "Prediction Markets" / "vs Derivatives"
166
+ (balanced halves) at a smaller font.
167
+ """
168
+ frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080)
169
+ assert frag is not None
170
+ assert frag.count("drawtext=") == 2, "long titles must split into two drawtext calls"
171
+ assert "drawtext=text='Prediction Markets'" in frag
172
+ assert "drawtext=text='vs Derivatives'" in frag
173
+ assert "fontsize=72" not in frag, "two-line layout must use a smaller font"
174
+ # Both lines share the same shrunken fontsize.
175
+ import re
176
+
177
+ sizes = re.findall(r"fontsize=(\d+)", frag)
178
+ assert len(sizes) == 2 and sizes[0] == sizes[1]
179
+ assert 44 <= int(sizes[0]) <= 64
180
+
181
+
182
+ def test_plan_title_empty_returns_none():
183
+ assert plan_title_drawtext("", out_w=1080) is None
184
+ assert plan_title_drawtext(" ", out_w=1080) is None
185
+
186
+
187
+ def test_plan_title_single_huge_word_shrinks_instead_of_wrapping():
188
+ """A single word cannot be word-wrapped; it must shrink to fit."""
189
+ frag = plan_title_drawtext("Supercalifragilisticexpialidocious", out_w=1080)
190
+ assert frag is not None
191
+ assert frag.count("drawtext=") == 1 # no wrap possible
192
+ assert "fontsize=72" not in frag
193
+
194
+
195
+ def test_title_uses_arial_font_not_default_serif():
196
+ """Titles must render in Arial (matching the ASS subtitle font), not the
197
+ platform default which is Times New Roman on Windows.
198
+
199
+ Regression test for the "ugly serif title on the finance short" bug.
200
+ Both the single-line and the two-line drawtext variants must carry a
201
+ ``font=Arial`` directive so fontconfig resolves to the same family as
202
+ the subtitle ``Fontname=Arial``.
203
+ """
204
+ short = plan_title_drawtext("Hook title", out_w=1080)
205
+ assert short is not None
206
+ assert "font=Arial" in short or "fontfile='" in short
207
+
208
+ long_frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080)
209
+ assert long_frag is not None
210
+ if "font=Arial" in long_frag:
211
+ assert long_frag.count("font=Arial") == 2
212
+ else:
213
+ assert long_frag.count("fontfile='") == 2
214
+
215
+
216
+ def test_title_font_matches_subtitle_font_family():
217
+ """Title overlay and subtitle captions must read as one typographic
218
+ family. Both routes through ``build_ffmpeg_cmd`` should carry the same
219
+ Arial reference.
220
+ """
221
+ cmd = build_ffmpeg_cmd(
222
+ _req(
223
+ title_text="Hook title",
224
+ subtitle_path="/tmp/clip.ass",
225
+ )
226
+ )
227
+ fg = cmd[cmd.index("-filter_complex") + 1]
228
+ assert "font=Arial" in fg or "fontfile='" in fg
229
+ assert "Fontname=Arial" in fg
230
+
231
+
232
+ def test_long_title_pipes_through_build_ffmpeg_cmd():
233
+ """End-to-end: a long title routed through the full command builder
234
+ produces a valid filtergraph with two drawtext filters and no syntax
235
+ errors ffmpeg would choke on.
236
+ """
237
+ cmd = build_ffmpeg_cmd(_req(title_text="Prediction Markets vs Derivatives"))
238
+ fg = cmd[cmd.index("-filter_complex") + 1]
239
+ assert fg.count("drawtext=") == 2
240
+ assert "[v_prepad]drawtext=text='Prediction Markets'" in fg
241
+ assert "[vout]" in fg
242
+ assert ";;" not in fg # no empty chain links
243
+ assert ",," not in fg # no stray commas
244
+
245
+
246
+ def test_reference_theme_draws_title_and_caption_bars():
247
+ cmd = build_ffmpeg_cmd(
248
+ _req(
249
+ title_text="A Multi-Trillion Dollar Opportunity",
250
+ subtitle_path="/tmp/clip.ass",
251
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
252
+ )
253
+ )
254
+ fg = cmd[cmd.index("-filter_complex") + 1]
255
+ assert "drawbox=x=28:y=32" in fg
256
+ assert "drawbox=x=0:y=" in fg
257
+ assert "Fontname=Source Sans 3" in fg
258
+ assert "Alignment=2" in fg
259
+ assert "Outline=2" in fg
260
+
261
+
262
+ def test_reference_theme_wraps_long_titles_inside_the_title_bar():
263
+ cmd = build_ffmpeg_cmd(
264
+ _req(
265
+ title_text="12% Youth Unemployment? Start a Business With AI",
266
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
267
+ )
268
+ )
269
+ fg = cmd[cmd.index("-filter_complex") + 1]
270
+ assert fg.count("drawtext=") >= 2
271
+ assert "..." not in fg
272
+
273
+
274
+ def test_reference_theme_draws_frosted_caption_ribbon_when_subtitles_exist():
275
+ cmd = build_ffmpeg_cmd(
276
+ _req(
277
+ title_text="Hook title",
278
+ subtitle_path="/tmp/clip.ass",
279
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
280
+ )
281
+ )
282
+ fg = cmd[cmd.index("-filter_complex") + 1]
283
+ assert "drawbox=x=0:y=" in fg
284
+
285
+
286
+ def test_reference_theme_allows_titles_on_split_layouts():
287
+ cmd = build_ffmpeg_cmd(
288
+ _req(
289
+ layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON),
290
+ title_text="Hook title",
291
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
292
+ )
293
+ )
294
+ fg = cmd[cmd.index("-filter_complex") + 1]
295
+ assert "drawtext=" in fg
296
+
297
+
298
+ def test_native_highlight_theme_skips_title_card_and_keeps_ass_styles():
299
+ cmd = build_ffmpeg_cmd(
300
+ _req(
301
+ title_text="This title should not render",
302
+ subtitle_path="/tmp/clip.ass",
303
+ render_theme=RenderTheme.NATIVE_HIGHLIGHT,
304
+ )
305
+ )
306
+ fg = cmd[cmd.index("-filter_complex") + 1]
307
+ assert "drawtext" not in fg
308
+ assert "subtitles='" in fg
309
+ assert "force_style='" not in fg
310
+
311
+
312
+ def test_ensure_windows_fontconfig_is_noop_off_windows():
313
+ env = _ensure_windows_fontconfig()
314
+ assert isinstance(env, dict)
315
+
316
+
317
+ def test_ensure_windows_fontconfig_creates_config(monkeypatch, tmp_path):
318
+ monkeypatch.setattr(compile_mod.os, "name", "nt", raising=False)
319
+ monkeypatch.delenv("FONTCONFIG_FILE", raising=False)
320
+ monkeypatch.setenv("LOCALAPPDATA", str(tmp_path / "localappdata"))
321
+ monkeypatch.setenv("WINDIR", str(tmp_path / "winroot"))
322
+
323
+ env = _ensure_windows_fontconfig()
324
+
325
+ cfg_file = Path(env["FONTCONFIG_FILE"])
326
+ assert cfg_file.is_file()
327
+ text = cfg_file.read_text(encoding="utf-8")
328
+ assert (tmp_path / "winroot" / "Fonts").as_posix() in text
329
+ assert "fontconfig-cache" in text
humeo-core/tests/test_face_detect.py ADDED
@@ -0,0 +1,73 @@
1
+ """Tests for the MediaPipe-backed face detection primitive.
2
+
3
+ Uses a stub ``face_fn`` so MediaPipe itself is not required to run the
4
+ tests — the primitive contract is what we care about: *given* a face
5
+ bbox, does the primitive produce the right ``SceneRegions``.
6
+ """
7
+
8
+ from humeo_core.primitives.face_detect import detect_face_regions
9
+ from humeo_core.schemas import BoundingBox, Scene
10
+
11
+
12
+ def _scene(i: int, kf: str | None = "/tmp/k.jpg") -> Scene:
13
+ return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf)
14
+
15
+
16
+ def test_no_keyframe_returns_raw_reason():
17
+ out = detect_face_regions([_scene(0, kf=None)], face_fn=lambda _p: None)
18
+ assert out[0].person_bbox is None
19
+ assert "no keyframe" in out[0].raw_reason.lower()
20
+
21
+
22
+ def test_no_face_detected_returns_raw_reason():
23
+ out = detect_face_regions([_scene(0)], face_fn=lambda _p: None)
24
+ assert out[0].person_bbox is None
25
+ assert "no face" in out[0].raw_reason.lower()
26
+
27
+
28
+ def test_face_centered_produces_person_only():
29
+ centered = BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.7, label="face", confidence=0.9)
30
+ out = detect_face_regions([_scene(0)], face_fn=lambda _p: centered)
31
+ r = out[0]
32
+ assert r.person_bbox is not None
33
+ assert r.person_bbox.center_x == centered.center_x
34
+ assert r.chart_bbox is None
35
+
36
+
37
+ def test_face_pushed_right_synthesises_chart_bbox():
38
+ # face center x ~ 0.86 -> above default threshold 0.65 -> chart bbox inferred
39
+ face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9, label="face", confidence=0.95)
40
+ out = detect_face_regions([_scene(0)], face_fn=lambda _p: face)
41
+ r = out[0]
42
+ assert r.person_bbox is not None
43
+ assert r.chart_bbox is not None
44
+ assert r.chart_bbox.x1 == 0.0
45
+ assert r.chart_bbox.x2 <= 0.75 # can't overlap the face
46
+ assert r.chart_bbox.x2 <= 0.65 # bounded by threshold too
47
+ assert "synthetic chart" in r.raw_reason
48
+
49
+
50
+ def test_face_detector_exception_is_isolated_per_scene():
51
+ scenes = [_scene(0), _scene(1)]
52
+ calls: list[str] = []
53
+
54
+ def flaky_fn(path: str) -> BoundingBox | None:
55
+ calls.append(path)
56
+ if len(calls) == 1:
57
+ raise RuntimeError("boom")
58
+ return BoundingBox(x1=0.3, y1=0.2, x2=0.7, y2=0.8)
59
+
60
+ out = detect_face_regions(scenes, face_fn=flaky_fn)
61
+ assert out[0].person_bbox is None
62
+ assert "error" in out[0].raw_reason.lower()
63
+ assert out[1].person_bbox is not None
64
+
65
+
66
+ def test_custom_threshold_prevents_false_chart_split():
67
+ face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9)
68
+ out = detect_face_regions(
69
+ [_scene(0)],
70
+ face_fn=lambda _p: face,
71
+ chart_split_threshold=0.95,
72
+ )
73
+ assert out[0].chart_bbox is None
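For reference, a minimal sketch of the per-scene contract these tests exercise: failures are isolated per scene, and a face pushed past the threshold synthesises a chart bbox on the opposite side. Field names mirror `humeo_core.schemas`; the reason strings and the exact clamp are assumed rather than copied from the shipped `detect_face_regions`.

```python
# Illustrative sketch (not the shipped primitive) of the contract above.
from typing import Callable, Optional

from humeo_core.schemas import BoundingBox, Scene, SceneRegions


def detect_face_regions_sketch(
    scenes: list[Scene],
    face_fn: Callable[[str], Optional[BoundingBox]],
    chart_split_threshold: float = 0.65,
) -> list[SceneRegions]:
    out: list[SceneRegions] = []
    for scene in scenes:
        if not scene.keyframe_path:
            out.append(SceneRegions(scene_id=scene.scene_id, raw_reason="no keyframe"))
            continue
        try:
            face = face_fn(scene.keyframe_path)
        except Exception as exc:  # one flaky frame must not sink the batch
            out.append(
                SceneRegions(scene_id=scene.scene_id, raw_reason=f"detector error: {exc}")
            )
            continue
        if face is None:
            out.append(SceneRegions(scene_id=scene.scene_id, raw_reason="no face detected"))
            continue
        chart = None
        reason = "face detected"
        if face.center_x > chart_split_threshold:
            # Face pushed right: infer a chart occupying the left of the frame,
            # bounded both by the face edge and by the threshold itself.
            chart = BoundingBox(
                x1=0.0, y1=0.0, x2=min(face.x1, chart_split_threshold), y2=1.0
            )
            reason = "face right of threshold; synthetic chart bbox on the left"
        out.append(
            SceneRegions(
                scene_id=scene.scene_id, person_bbox=face, chart_bbox=chart, raw_reason=reason
            )
        )
    return out
```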
humeo-core/tests/test_layout_bbox.py ADDED
@@ -0,0 +1,17 @@
1
+ """Split layout uses optional normalized bbox regions (Gemini vision)."""
2
+
3
+ from humeo_core.primitives.layouts import plan_layout
4
+ from humeo_core.schemas import BoundingBox, FocusStackOrder, LayoutInstruction, LayoutKind
5
+
6
+
7
+ def test_split_with_bbox_regions_not_fixed_thirds():
8
+ instr = LayoutInstruction(
9
+ clip_id="c",
10
+ layout=LayoutKind.SPLIT_CHART_PERSON,
11
+ focus_stack_order=FocusStackOrder.CHART_THEN_PERSON,
12
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.64, y2=1.0),
13
+ split_person_region=BoundingBox(x1=0.64, y1=0.0, x2=1.0, y2=1.0),
14
+ )
15
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
16
+ assert "crop=1228:1080:0:0" in fg or "crop=1224:1080:0:0" in fg
17
+ assert "vstack=inputs=2" in fg
humeo-core/tests/test_layouts.py ADDED
@@ -0,0 +1,312 @@
1
+ import re
2
+
3
+ from humeo_core.primitives.layouts import (
4
+ _center_crop_to_9x16,
5
+ _crop_box,
6
+ plan_layout,
7
+ )
8
+ from humeo_core.schemas import (
9
+ BoundingBox,
10
+ FocusStackOrder,
11
+ LayoutInstruction,
12
+ LayoutKind,
13
+ TimedCenterPoint,
14
+ )
15
+
16
+
17
+ def test_crop_box_aspect_exact():
18
+ cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 1.0, 0.5, 0.5)
19
+ # 9:16 inside 1920x1080 -> height-limited: ch=1080, cw ~= 608
20
+ assert ch == 1080
21
+ assert abs(cw / ch - 9 / 16) < 0.01
22
+ assert 0 <= x <= 1920 - cw
23
+ assert y == 0
24
+
25
+
26
+ def test_crop_box_clamps_inside_frame():
27
+ cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 2.0, 0.99, 0.5)
28
+ assert x + cw <= 1920
29
+ assert y + ch <= 1080
30
+
31
+
32
+ def test_crop_box_zoom_tightens():
33
+ cw_small, ch_small, _, _ = _center_crop_to_9x16(1920, 1080, 2.0, 0.5)
34
+ cw_large, ch_large, _, _ = _center_crop_to_9x16(1920, 1080, 1.0, 0.5)
35
+ assert cw_small < cw_large
36
+ assert ch_small < ch_large
37
+
38
+
39
+ def test_even_dimensions():
40
+ cw, ch, x, y = _crop_box(1921, 1081, 9 / 16, 1.3, 0.4, 0.5)
41
+ assert cw % 2 == 0 and ch % 2 == 0
42
+ assert x % 2 == 0 and y % 2 == 0
43
+
44
+
45
+ def _contains(s: str, *subs: str) -> bool:
46
+ return all(sub in s for sub in subs)
47
+
48
+
49
+ def test_zoom_call_layout_filtergraph_shape():
50
+ instr = LayoutInstruction(
51
+ clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.5, person_x_norm=0.5
52
+ )
53
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
54
+ fg = plan.filtergraph
55
+ assert _contains(fg, "[0:v]crop=", "scale=1080:1920", "[vout]")
56
+
57
+
58
+ def test_sit_center_layout_filtergraph_shape():
59
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER)
60
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
61
+ assert "[vout]" in plan.filtergraph
62
+ assert plan.out_label == "vout"
63
+
64
+
65
+ def test_sit_center_tracking_uses_dynamic_crop_expression():
66
+ instr = LayoutInstruction(
67
+ clip_id="c",
68
+ layout=LayoutKind.SIT_CENTER,
69
+ person_tracking=[
70
+ TimedCenterPoint(t_sec=0.0, x_norm=0.2),
71
+ TimedCenterPoint(t_sec=10.0, x_norm=0.8),
72
+ ],
73
+ )
74
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
75
+ assert "setpts=PTS-STARTPTS" in fg
76
+ assert "[vsrc]crop=" in fg
77
+ assert "if(lt(t\\,4.850)" in fg
78
+ assert "*(t-4.850)/(0.300)" in fg
79
+
80
+
81
+ def test_sit_center_tracking_with_zoom_uses_dynamic_crop_window_expressions():
82
+ instr = LayoutInstruction(
83
+ clip_id="c",
84
+ layout=LayoutKind.SIT_CENTER,
85
+ person_tracking=[
86
+ TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.28),
87
+ TimedCenterPoint(t_sec=10.0, x_norm=0.8, zoom=1.0),
88
+ ],
89
+ )
90
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
91
+ assert "setpts=PTS-STARTPTS" in fg
92
+ assert "[vsrc]crop=" in fg
93
+ assert "out_w/2" in fg
94
+ assert "out_h/2" in fg
95
+ assert "floor((min(" in fg
96
+
97
+
98
+ def test_split_layout_contains_vstack():
99
+ instr = LayoutInstruction(
100
+ clip_id="c",
101
+ layout=LayoutKind.SPLIT_CHART_PERSON,
102
+ person_x_norm=0.83,
103
+ chart_x_norm=0.0,
104
+ )
105
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
106
+ fg = plan.filtergraph
107
+ assert _contains(fg, "split=2", "vstack=inputs=2", "[vout]")
108
+ assert "[top]" in fg and "[bot]" in fg
109
+
110
+
111
+ def test_split_layout_person_crop_is_right_third():
112
+ """Chart uses left 2/3; person uses right 1/3 (non-overlapping)."""
113
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
114
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
115
+ # Right third: x=1280, w=640 for 1920-wide source.
116
+ assert "crop=640:1080:1280:0" in fg
117
+
118
+
119
+ def test_split_layout_can_swap_stack_order():
120
+ """PERSON_THEN_CHART puts the right-strip (person) crop into the top band."""
121
+ chart_first = plan_layout(
122
+ LayoutInstruction(
123
+ clip_id="c",
124
+ layout=LayoutKind.SPLIT_CHART_PERSON,
125
+ focus_stack_order=FocusStackOrder.CHART_THEN_PERSON,
126
+ ),
127
+ out_w=1080,
128
+ out_h=1920,
129
+ ).filtergraph
130
+ person_first = plan_layout(
131
+ LayoutInstruction(
132
+ clip_id="c",
133
+ layout=LayoutKind.SPLIT_CHART_PERSON,
134
+ focus_stack_order=FocusStackOrder.PERSON_THEN_CHART,
135
+ ),
136
+ out_w=1080,
137
+ out_h=1920,
138
+ ).filtergraph
139
+
140
+ def top_crop(fg: str) -> str:
141
+ m = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg)
142
+ assert m is not None, fg
143
+ return m.group(1)
144
+
145
+ # chart strip = left 1280px of source (2/3 split seam).
146
+ assert top_crop(chart_first) == "1280:1080:0:0"
147
+ # person strip = right 640px -> x=1280.
148
+ assert top_crop(person_first) == "640:1080:1280:0"
149
+ assert "vstack=inputs=2" in chart_first
150
+ assert "vstack=inputs=2" in person_first
151
+
152
+
153
+ def test_split_layout_person_clamped():
154
+ instr = LayoutInstruction(
155
+ clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, person_x_norm=1.0
156
+ )
157
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
158
+ assert "crop=" in plan.filtergraph # no OOB math crash
159
+
160
+
161
+ def test_plan_layout_dispatch_covers_all_kinds():
162
+ for k in LayoutKind:
163
+ instr = LayoutInstruction(clip_id="c", layout=k)
164
+ plan = plan_layout(instr)
165
+ assert plan.out_label == "vout"
166
+ assert plan.filtergraph.endswith("[vout]")
167
+
168
+
169
+ def test_default_split_is_even_50_50_bands():
170
+ """The user-requested symmetric look: top and bottom bands are equal."""
171
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
172
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
173
+ # Each strip should scale to the same height (half of 1920).
174
+ heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg)
175
+ assert len(heights) == 2
176
+ assert heights[0] == heights[1] == "960", f"expected even 960/960, got {heights}"
177
+
178
+
179
+ def test_top_band_ratio_honored_for_uneven_splits():
180
+ instr = LayoutInstruction(
181
+ clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, top_band_ratio=0.6
182
+ )
183
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
184
+ heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg)
185
+ assert heights == ["1152", "768"], heights
186
+
187
+
188
+ def test_split_seam_is_midpoint_between_bboxes():
189
+ """When both bboxes are provided, strips partition the source -- no overlap, no gap."""
190
+ instr = LayoutInstruction(
191
+ clip_id="c",
192
+ layout=LayoutKind.SPLIT_CHART_PERSON,
193
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.50, y2=1.0),
194
+ split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0),
195
+ )
196
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
197
+ # chart.x2 = 960px, person.x1 = 1056px -> midpoint = 1008 -> even -> 1008.
198
+ # Chart strip: x=0, cw=1008. Person strip: x=1008, cw=912.
199
+ top_crop = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg).group(1)
200
+ bot_crop = re.search(r"\[src2\]crop=(\d+:\d+:\d+:\d+)", fg).group(1)
201
+ assert top_crop == "1008:1080:0:0"
202
+ assert bot_crop == "912:1080:1008:0"
203
+
204
+
205
+ def test_split_uses_bbox_y_for_tight_band_fill():
206
+ """Chart bboxes anchor the crop, with a little extra height for edge safety."""
207
+ instr = LayoutInstruction(
208
+ clip_id="c",
209
+ layout=LayoutKind.SPLIT_CHART_PERSON,
210
+ split_chart_region=BoundingBox(x1=0.0, y1=0.1, x2=0.5, y2=0.7),
211
+ split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0),
212
+ )
213
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
214
+ # Chart bbox y: 0.1..0.7 -> y=108, ch=648, then a modest 12% pad per side.
215
+ assert "crop=1008:804:0:30" in fg
216
+
217
+
218
+ def test_split_chart_person_adds_vertical_pad_to_reduce_chart_side_crop():
219
+ instr = LayoutInstruction(
220
+ clip_id="c",
221
+ layout=LayoutKind.SPLIT_CHART_PERSON,
222
+ split_chart_region=BoundingBox(x1=0.02, y1=0.03, x2=0.58, y2=0.7),
223
+ split_person_region=BoundingBox(x1=0.585, y1=0.0, x2=0.995, y2=0.62),
224
+ top_band_ratio=0.436,
225
+ )
226
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=640, src_h=360).filtergraph
227
+ assert "[src1]crop=372:280:0:0" in fg
228
+
229
+
230
+ def test_split_minimum_strip_width_enforced():
231
+ """If chart/person bboxes are pathological (seam at edge), don't starve a strip."""
232
+ instr = LayoutInstruction(
233
+ clip_id="c",
234
+ layout=LayoutKind.SPLIT_CHART_PERSON,
235
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.05, y2=1.0),
236
+ split_person_region=BoundingBox(x1=0.05, y1=0.0, x2=1.0, y2=1.0),
237
+ )
238
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
239
+ widths = [int(m) for m in re.findall(r"crop=(\d+):\d+:\d+:\d+", fg)]
240
+ # Min strip = 20% of 1920 = 384 px. Neither strip should be narrower.
241
+ assert all(w >= 384 for w in widths), widths
242
+
243
+
244
+ def test_split_two_persons_stacks_two_crops():
245
+ instr = LayoutInstruction(
246
+ clip_id="c",
247
+ layout=LayoutKind.SPLIT_TWO_PERSONS,
248
+ split_person_region=BoundingBox(x1=0.0, y1=0.05, x2=0.5, y2=0.95),
249
+ split_second_person_region=BoundingBox(x1=0.5, y1=0.05, x2=1.0, y2=0.95),
250
+ )
251
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
252
+ assert "split=2" in fg and "vstack=inputs=2" in fg
253
+ # Seam at x=960. bbox y: 0.05..0.95 -> y=54, ch=972 (even).
254
+ assert "[src1]crop=960:972:0:54" in fg
255
+ assert "[src2]crop=960:972:960:54" in fg
256
+
257
+
258
+ def test_split_two_charts_stacks_two_crops():
259
+ instr = LayoutInstruction(
260
+ clip_id="c",
261
+ layout=LayoutKind.SPLIT_TWO_CHARTS,
262
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.5, y2=1.0),
263
+ split_second_chart_region=BoundingBox(x1=0.5, y1=0.0, x2=1.0, y2=1.0),
264
+ )
265
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
266
+ assert "split=2" in fg and "vstack=inputs=2" in fg
267
+ assert "[src1]crop=960:1080:0:0" in fg
268
+ assert "[src2]crop=960:1080:960:0" in fg
269
+
270
+
271
+ def test_split_two_persons_without_bboxes_defaults_to_centered():
272
+ """No bboxes -> centered 50/50 seam, full source height fallback."""
273
+ instr = LayoutInstruction(
274
+ clip_id="c", layout=LayoutKind.SPLIT_TWO_PERSONS
275
+ )
276
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
277
+ assert "[src1]crop=960:1080:0:0" in fg
278
+ assert "[src2]crop=960:1080:960:0" in fg
279
+
280
+
281
+ def test_split_bands_use_cover_scale_plus_center_crop():
282
+ """Each band is painted edge-to-edge -- no letterbox bars."""
283
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
284
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
285
+ assert fg.count("force_original_aspect_ratio=increase") == 2
286
+ assert fg.count("setsar=1") == 2
287
+
288
+
289
+ def test_zoom_tighter_means_smaller_crop_window():
290
+ from humeo_core.primitives.layouts import plan_zoom_call_center
291
+
292
+ wide = plan_zoom_call_center(
293
+ LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.0),
294
+ out_w=1080,
295
+ out_h=1920,
296
+ )
297
+ tight = plan_zoom_call_center(
298
+ LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=2.0),
299
+ out_w=1080,
300
+ out_h=1920,
301
+ )
302
+ # Parse crop=CW:CH:X:Y out of each filtergraph.
303
+ import re
304
+
305
+ def crop(fg: str) -> tuple[int, int]:
306
+ m = re.search(r"crop=(\d+):(\d+):", fg)
307
+ assert m is not None
308
+ return int(m.group(1)), int(m.group(2))
309
+
310
+ wcw, wch = crop(wide.filtergraph)
311
+ tcw, tch = crop(tight.filtergraph)
312
+ assert tcw < wcw and tch < wch
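A back-of-the-envelope sketch of the crop geometry these tests constrain. The real `_crop_box` may round and clamp slightly differently; this version just shows the aspect/zoom/clamp interplay with assumed floor-to-even rounding.

```python
# Illustrative crop-window math: largest window of the requested aspect that
# fits the source, tightened by zoom, snapped even, and clamped in-frame.
def crop_box_sketch(src_w: int, src_h: int, aspect: float,
                    zoom: float, cx_norm: float, cy_norm: float):
    ch = min(src_h, src_w / aspect) / zoom
    cw = ch * aspect
    cw, ch = int(cw) // 2 * 2, int(ch) // 2 * 2   # even dims for yuv420p encoders
    x = int(cx_norm * src_w - cw / 2)
    y = int(cy_norm * src_h - ch / 2)
    x = max(0, min(x, src_w - cw)) // 2 * 2       # clamp inside the frame
    y = max(0, min(y, src_h - ch)) // 2 * 2
    return cw, ch, x, y


# 9:16 inside 1920x1080 is height-limited: ch=1080, cw=606 (even floor of 607.5).
print(crop_box_sketch(1920, 1080, 9 / 16, 1.0, 0.5, 0.5))
```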
humeo-core/tests/test_schemas.py ADDED
@@ -0,0 +1,267 @@
1
+ import pytest
2
+ from pydantic import ValidationError
3
+
4
+ from humeo_core.schemas import (
5
+ ApprovalResult,
6
+ Clip,
7
+ ClipPlan,
8
+ ClipSubtitleWords,
9
+ FocusStackOrder,
10
+ LayoutInstruction,
11
+ LayoutKind,
12
+ RatingFeedback,
13
+ RenderRequest,
14
+ Scene,
15
+ SessionState,
16
+ TimedCenterPoint,
17
+ TranscriptWord,
18
+ )
19
+
20
+
21
+ def test_scene_requires_end_after_start():
22
+ Scene(scene_id="s1", start_time=0.0, end_time=1.0)
23
+ with pytest.raises(ValueError):
24
+ Scene(scene_id="s1", start_time=5.0, end_time=5.0)
25
+ with pytest.raises(ValueError):
26
+ Scene(scene_id="s1", start_time=5.0, end_time=1.0)
27
+
28
+
29
+ def test_layout_instruction_defaults_and_bounds():
30
+ li = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER)
31
+ assert li.zoom == 1.0
32
+ assert 0 <= li.person_x_norm <= 1
33
+ assert li.person_tracking == []
34
+ assert li.focus_stack_order == FocusStackOrder.CHART_THEN_PERSON
35
+ with pytest.raises(ValueError):
36
+ LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, zoom=0.0)
37
+ with pytest.raises(ValueError):
38
+ LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, person_x_norm=2.0)
39
+
40
+
41
+ def test_layout_instruction_accepts_sorted_tracking_points():
42
+ li = LayoutInstruction(
43
+ clip_id="c",
44
+ layout=LayoutKind.SIT_CENTER,
45
+ person_tracking=[
46
+ TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.25),
47
+ TimedCenterPoint(t_sec=5.0, x_norm=0.8, zoom=1.0),
48
+ ],
49
+ )
50
+ assert [point.t_sec for point in li.person_tracking] == [0.0, 5.0]
51
+ assert li.person_tracking[0].zoom == pytest.approx(1.25)
52
+
53
+
54
+ def test_layout_instruction_rejects_unsorted_tracking_points():
55
+ with pytest.raises(ValueError, match="person_tracking times"):
56
+ LayoutInstruction(
57
+ clip_id="c",
58
+ layout=LayoutKind.SIT_CENTER,
59
+ person_tracking=[
60
+ TimedCenterPoint(t_sec=5.0, x_norm=0.8),
61
+ TimedCenterPoint(t_sec=1.0, x_norm=0.2),
62
+ ],
63
+ )
64
+
65
+
66
+ def test_clip_duration():
67
+ c = Clip(
68
+ clip_id="1",
69
+ topic="t",
70
+ start_time_sec=10.0,
71
+ end_time_sec=42.5,
72
+ )
73
+ assert c.duration_sec == pytest.approx(32.5)
74
+
75
+
76
+ def test_clip_hook_relative_to_clip_in_point():
77
+ c = Clip(
78
+ clip_id="1",
79
+ topic="t",
80
+ start_time_sec=100.0,
81
+ end_time_sec=130.0,
82
+ hook_start_sec=0.0,
83
+ hook_end_sec=3.0,
84
+ )
85
+ assert c.hook_end_sec == 3.0
86
+
87
+
88
+ def test_clip_hook_must_be_within_duration():
89
+ with pytest.raises(ValueError, match="hook window"):
90
+ Clip(
91
+ clip_id="1",
92
+ topic="t",
93
+ start_time_sec=0.0,
94
+ end_time_sec=10.0,
95
+ hook_start_sec=0.0,
96
+ hook_end_sec=15.0,
97
+ )
98
+
99
+
100
+ def test_clip_hook_both_or_neither():
101
+ with pytest.raises(ValueError, match="hook_start_sec and hook_end_sec"):
102
+ Clip(
103
+ clip_id="1",
104
+ topic="t",
105
+ start_time_sec=0.0,
106
+ end_time_sec=10.0,
107
+ hook_start_sec=1.0,
108
+ hook_end_sec=None,
109
+ )
110
+
111
+
112
+ def test_clip_trim_cannot_exceed_duration():
113
+ with pytest.raises(ValueError, match="trim"):
114
+ Clip(
115
+ clip_id="1",
116
+ topic="t",
117
+ start_time_sec=0.0,
118
+ end_time_sec=10.0,
119
+ trim_start_sec=6.0,
120
+ trim_end_sec=6.0,
121
+ )
122
+
123
+
124
+ def test_clip_plan_roundtrip():
125
+ plan = ClipPlan(
126
+ source_path="/tmp/x.mp4",
127
+ clips=[
128
+ Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
129
+ ],
130
+ )
131
+ d = plan.model_dump()
132
+ assert ClipPlan.model_validate(d) == plan
133
+
134
+
135
+ def test_clip_roundtrip_with_extended_fields():
136
+ clip = Clip(
137
+ clip_id="1",
138
+ topic="t",
139
+ start_time_sec=0.0,
140
+ end_time_sec=30.0,
141
+ score_breakdown={"message_wow": 0.9, "hook_emotion": 0.7},
142
+ origin="both",
143
+ visual_notes="Speaker leans in.",
144
+ reasoning="Strong explanation and hook.",
145
+ )
146
+
147
+ dumped = clip.model_dump()
148
+
149
+ assert dumped["score_breakdown"] == {"message_wow": 0.9, "hook_emotion": 0.7}
150
+ assert dumped["origin"] == "both"
151
+ assert dumped["visual_notes"] == "Speaker leans in."
152
+ assert dumped["reasoning"] == "Strong explanation and hook."
153
+ assert Clip.model_validate(dumped) == clip
154
+
155
+
156
+ def test_clip_defaults_validate_and_do_not_serialize_new_fields():
157
+ clip = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
158
+
159
+ assert clip.origin == "text"
160
+ assert clip.score_breakdown is None
161
+ assert clip.visual_notes is None
162
+ assert clip.reasoning is None
163
+
164
+ dumped = clip.model_dump()
165
+ assert "score_breakdown" not in dumped
166
+ assert "origin" not in dumped
167
+ assert "visual_notes" not in dumped
168
+ assert "reasoning" not in dumped
169
+ assert Clip.model_validate(dumped) == clip
170
+
171
+
172
+ def test_clip_score_breakdown_validation():
173
+ with pytest.raises(ValidationError):
174
+ Clip(
175
+ clip_id="1",
176
+ topic="t",
177
+ start_time_sec=0.0,
178
+ end_time_sec=30.0,
179
+ score_breakdown={"hook": -0.1},
180
+ )
181
+
182
+ clip = Clip(
183
+ clip_id="1",
184
+ topic="t",
185
+ start_time_sec=0.0,
186
+ end_time_sec=30.0,
187
+ score_breakdown={"hook": 1.2},
188
+ )
189
+ assert clip.score_breakdown == {"hook": 1.0}
190
+
191
+ clip = Clip(
192
+ clip_id="1",
193
+ topic="t",
194
+ start_time_sec=0.0,
195
+ end_time_sec=30.0,
196
+ score_breakdown={},
197
+ )
198
+ assert clip.score_breakdown == {}
199
+
200
+ clip = Clip(
201
+ clip_id="1",
202
+ topic="t",
203
+ start_time_sec=0.0,
204
+ end_time_sec=30.0,
205
+ score_breakdown={"hook": 0.5},
206
+ )
207
+ assert clip.score_breakdown == {"hook": 0.5}
208
+
209
+
210
+ def test_clip_subtitle_words_relative_times():
211
+ w = ClipSubtitleWords(
212
+ words=[TranscriptWord(word="hi", start_time=0.0, end_time=0.2)]
213
+ )
214
+ assert w.words[0].start_time == 0.0
215
+
216
+
217
+ def test_render_request_modes():
218
+ c = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
219
+ li = LayoutInstruction(clip_id="1", layout=LayoutKind.ZOOM_CALL_CENTER)
220
+ req = RenderRequest(
221
+ source_path="/tmp/x.mp4",
222
+ clip=c,
223
+ layout=li,
224
+ output_path="/tmp/out.mp4",
225
+ )
226
+ assert req.mode == "normal"
227
+ req2 = RenderRequest(**{**req.model_dump(), "mode": "dry_run"})
228
+ assert req2.mode == "dry_run"
229
+
230
+
231
+ def test_approval_result_roundtrip():
232
+ result = ApprovalResult(
233
+ action="proceed",
234
+ selected_ids=["001", "003"],
235
+ steering_note="prefer emotional moments",
236
+ )
237
+ assert ApprovalResult.model_validate(result.model_dump()) == result
238
+
239
+
240
+ def test_approval_result_rejects_invalid_action():
241
+ with pytest.raises(ValidationError):
242
+ ApprovalResult(action="invalid")
243
+
244
+
245
+ def test_rating_feedback_roundtrip():
246
+ feedback = RatingFeedback(
247
+ rating=2,
248
+ issues=["wrong_moments", "other"],
249
+ free_text="needs more context",
250
+ )
251
+ assert RatingFeedback.model_validate(feedback.model_dump()) == feedback
252
+
253
+
254
+ def test_rating_feedback_rejects_invalid_rating():
255
+ with pytest.raises(ValidationError):
256
+ RatingFeedback(rating=4)
257
+
258
+
259
+ def test_session_state_roundtrip():
260
+ state = SessionState(
261
+ source_key="youtube:PdVv_vLkUgk",
262
+ iteration=3,
263
+ steering_notes=["be punchier"],
264
+ last_rating=RatingFeedback(rating=3),
265
+ last_selected_ids=["001", "002"],
266
+ )
267
+ assert SessionState.model_validate(state.model_dump()) == state
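The hook-window rules asserted above follow a standard Pydantic v2 `model_validator` pattern. A stripped-down illustration, with field names borrowed from `Clip` but everything else simplified and not taken from the actual schema module:

```python
# Minimal sketch of the "both-or-neither" hook validation, assuming hook times
# are relative to the clip in-point (as the tests above establish).
from pydantic import BaseModel, model_validator


class MiniClip(BaseModel):
    start_time_sec: float
    end_time_sec: float
    hook_start_sec: float | None = None
    hook_end_sec: float | None = None

    @model_validator(mode="after")
    def _check_hook(self) -> "MiniClip":
        has_start = self.hook_start_sec is not None
        has_end = self.hook_end_sec is not None
        if has_start != has_end:
            raise ValueError("hook_start_sec and hook_end_sec must be set together")
        if has_start and not (
            0 <= self.hook_start_sec < self.hook_end_sec
            <= self.end_time_sec - self.start_time_sec
        ):
            raise ValueError("hook window must lie within the clip duration")
        return self
```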
humeo-core/tests/test_select_clips.py ADDED
@@ -0,0 +1,49 @@
1
+ from humeo_core.primitives.select_clips import select_clips_heuristic
2
+ from humeo_core.schemas import TranscriptWord
3
+
4
+
5
+ def _words(start: float, end: float, n: int) -> list[TranscriptWord]:
6
+ step = (end - start) / max(1, n)
7
+ return [
8
+ TranscriptWord(word=f"w{i}", start_time=start + i * step, end_time=start + (i + 1) * step)
9
+ for i in range(n)
10
+ ]
11
+
12
+
13
+ def test_no_transcript_returns_single_clip():
14
+ plan = select_clips_heuristic("/tmp/x.mp4", [], duration_sec=600.0)
15
+ assert len(plan.clips) == 1
16
+
17
+
18
+ def test_prefers_dense_windows():
19
+ # dense between 30-90, sparse elsewhere
20
+ dense = _words(30.0, 90.0, 240) # 4 words/sec
21
+ sparse_before = _words(0.0, 30.0, 6)
22
+ sparse_after = _words(90.0, 600.0, 30)
23
+ words = sparse_before + dense + sparse_after
24
+ plan = select_clips_heuristic(
25
+ "/tmp/x.mp4", words, duration_sec=600.0, target_count=1, min_sec=30, max_sec=60
26
+ )
27
+ assert len(plan.clips) == 1
28
+ c = plan.clips[0]
29
+ assert 30 <= c.start_time_sec <= 90
30
+ assert c.end_time_sec <= 120
31
+
32
+
33
+ def test_no_overlap_when_multiple_picked():
34
+ dense_a = _words(30.0, 90.0, 240)
35
+ dense_b = _words(200.0, 260.0, 240)
36
+ words = dense_a + dense_b
37
+ plan = select_clips_heuristic(
38
+ "/tmp/x.mp4",
39
+ words,
40
+ duration_sec=400.0,
41
+ target_count=3,
42
+ min_sec=30,
43
+ max_sec=60,
44
+ )
45
+ # Should pick both dense regions without overlap.
46
+ assert len(plan.clips) >= 2
47
+ starts_ends = sorted((c.start_time_sec, c.end_time_sec) for c in plan.clips)
48
+ for (s1, e1), (s2, e2) in zip(starts_ends, starts_ends[1:]):
49
+ assert e1 <= s2
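One plausible shape for the density heuristic these tests constrain: score sliding windows by word count, then greedily pick non-overlapping winners. The window and step sizes here are assumed; the shipped `select_clips_heuristic` may pad and weight differently.

```python
# Sketch of dense-window selection with a greedy non-overlap pass.
from humeo_core.schemas import TranscriptWord


def pick_dense_windows(words: list[TranscriptWord], duration_sec: float,
                       target_count: int, win_sec: float = 45.0,
                       step_sec: float = 5.0) -> list[tuple[float, float]]:
    starts = [
        i * step_sec
        for i in range(int(max(0.0, duration_sec - win_sec) / step_sec) + 1)
    ]
    # Score each candidate window by how many words start inside it.
    scored = sorted(
        ((sum(1 for w in words if s <= w.start_time < s + win_sec), s) for s in starts),
        reverse=True,
    )
    picked: list[tuple[float, float]] = []
    for _score, s in scored:
        window = (s, s + win_sec)
        if all(window[1] <= ps or window[0] >= pe for ps, pe in picked):
            picked.append(window)  # keeps the no-overlap invariant the test checks
        if len(picked) == target_count:
            break
    return sorted(picked)
```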
humeo-core/tests/test_server_tools.py ADDED
@@ -0,0 +1,93 @@
1
+ """Exercise the MCP server tools as plain Python callables.
2
+
3
+ FastMCP tools are registered on the server instance, but the underlying
4
+ functions are ordinary Python functions decorated with ``@mcp.tool()``.
5
+ We import the module and invoke those functions directly to verify the
6
+ end-to-end wiring (schemas validated, dispatch correct, JSON-serializable).
7
+ """
8
+
9
+ import humeo_core.server as srv
10
+ from humeo_core.schemas import LayoutKind
11
+
12
+
13
+ def test_list_layouts_lists_all_kinds():
14
+ result = srv.list_layouts()
15
+ kinds = {layout["kind"] for layout in result["layouts"]}
16
+ assert kinds == {k.value for k in LayoutKind}
17
+
18
+
19
+ def test_plan_layout_tool_returns_filtergraph():
20
+ for k in LayoutKind:
21
+ out = srv.plan_layout(layout=k.value)
22
+ assert out["out_label"] == "vout"
23
+ assert "[vout]" in out["filtergraph"]
24
+
25
+
26
+ def test_build_render_cmd_dry_run():
27
+ req = {
28
+ "source_path": "/tmp/src.mp4",
29
+ "clip": {
30
+ "clip_id": "1",
31
+ "topic": "t",
32
+ "start_time_sec": 0.0,
33
+ "end_time_sec": 30.0,
34
+ },
35
+ "layout": {"clip_id": "1", "layout": LayoutKind.SIT_CENTER.value},
36
+ "output_path": "/tmp/out.mp4",
37
+ }
38
+ out = srv.build_render_cmd(request=req)
39
+ assert out["success"] is True
40
+ assert out["output_path"] == "/tmp/out.mp4"
41
+ assert any("-filter_complex" == part for part in out["ffmpeg_cmd"])
42
+
43
+
44
+ def test_select_clips_tool_happy_path():
45
+ words = [
46
+ {"word": f"w{i}", "start_time": float(i), "end_time": float(i) + 0.5}
47
+ for i in range(120)
48
+ ]
49
+ plan = srv.select_clips(
50
+ source_path="/tmp/x.mp4",
51
+ transcript_words=words,
52
+ duration_sec=120.0,
53
+ target_count=2,
54
+ min_sec=30.0,
55
+ max_sec=60.0,
56
+ )
57
+ assert plan["source_path"] == "/tmp/x.mp4"
58
+ assert 1 <= len(plan["clips"]) <= 2
59
+
60
+
61
+ def test_classify_scenes_tool_no_keyframes():
62
+ scenes = [{"scene_id": "s0", "start_time": 0.0, "end_time": 5.0}]
63
+ out = srv.classify_scenes(scenes=scenes)
64
+ assert out["classifications"][0]["scene_id"] == "s0"
65
+ assert out["classifications"][0]["layout"] in {k.value for k in LayoutKind}
66
+
67
+
68
+ def test_detect_scene_regions_returns_jobs_and_prompt():
69
+ scenes = [
70
+ {"scene_id": "s0", "start_time": 0.0, "end_time": 5.0, "keyframe_path": "/tmp/k0.jpg"},
71
+ {"scene_id": "s1", "start_time": 5.0, "end_time": 10.0, "keyframe_path": "/tmp/k1.jpg"},
72
+ ]
73
+ out = srv.detect_scene_regions(scenes=scenes)
74
+ assert "STRICT JSON" in out["prompt"]
75
+ assert len(out["jobs"]) == 2
76
+ assert out["jobs"][0]["scene_id"] == "s0"
77
+ assert out["jobs"][0]["keyframe_path"] == "/tmp/k0.jpg"
78
+
79
+
80
+ def test_classify_scenes_with_vision_derives_instructions():
81
+ regions = [
82
+ {
83
+ "scene_id": "s0",
84
+ "chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 0.66, "y2": 1.0},
85
+ "person_bbox": {"x1": 0.72, "y1": 0.1, "x2": 0.99, "y2": 0.95},
86
+ "ocr_text": "CPI YoY",
87
+ }
88
+ ]
89
+ out = srv.classify_scenes_with_vision(regions=regions)
90
+ assert out["classifications"][0]["layout"] == LayoutKind.SPLIT_CHART_PERSON.value
91
+ instr = out["layout_instructions"][0]
92
+ assert instr["chart_x_norm"] == 0.0
93
+ assert 0.8 < instr["person_x_norm"] < 0.9
humeo-core/tests/test_vision.py ADDED
@@ -0,0 +1,228 @@
1
+ """Tests for the scene-change + vision-LLM + OCR bbox primitive.
2
+
3
+ Covers:
4
+ * happy path: well-formed JSON -> populated ``SceneRegions``.
5
+ * bad JSON: degrade to empty regions + raw_reason, never raise.
6
+ * bad bbox: one malformed bbox does not take down the whole scene record.
7
+ * classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT.
8
+ * layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come
9
+ from the bboxes when present, defaults when not.
10
+ """
11
+
12
+ import json
13
+
14
+ import pytest
15
+
16
+ from humeo_core.primitives.vision import (
17
+ _CHART_WIDTH_SPLIT_THRESHOLD,
18
+ classify_from_regions,
19
+ classify_scenes_with_vision_llm,
20
+ detect_regions_with_llm,
21
+ layout_instruction_from_regions,
22
+ )
23
+ from humeo_core.schemas import (
24
+ BoundingBox,
25
+ LayoutKind,
26
+ Scene,
27
+ SceneClassification,
28
+ SceneRegions,
29
+ )
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Schema
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
+ def test_bounding_box_requires_x2_gt_x1():
38
+ BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2)
39
+ with pytest.raises(ValueError):
40
+ BoundingBox(x1=0.2, y1=0.1, x2=0.1, y2=0.2)
41
+ with pytest.raises(ValueError):
42
+ BoundingBox(x1=0.1, y1=0.2, x2=0.2, y2=0.1)
43
+
44
+
45
+ def test_bounding_box_center_and_width():
46
+ b = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9)
47
+ assert b.center_x == pytest.approx(0.4)
48
+ assert b.center_y == pytest.approx(0.65)
49
+ assert b.width == pytest.approx(0.4)
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # detect_regions_with_llm
54
+ # ---------------------------------------------------------------------------
55
+
56
+
57
+ def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene:
58
+ return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf)
59
+
60
+
61
+ def test_detect_regions_happy_path():
62
+ scenes = [_scene(0)]
63
+
64
+ def vision_fn(_img: str, _prompt: str) -> str:
65
+ return json.dumps(
66
+ {
67
+ "person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9},
68
+ "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8},
69
+ "ocr_text": "Inflation YoY",
70
+ "reason": "explainer layout",
71
+ }
72
+ )
73
+
74
+ out = detect_regions_with_llm(scenes, vision_fn)
75
+ assert len(out) == 1
76
+ r = out[0]
77
+ assert r.scene_id == "s0"
78
+ assert r.person_bbox and r.person_bbox.center_x > 0.8
79
+ assert r.chart_bbox and r.chart_bbox.width > 0.6
80
+ assert "Inflation" in r.ocr_text
81
+
82
+
83
+ def test_detect_regions_bad_json_is_safe():
84
+ scenes = [_scene(0)]
85
+
86
+ def vision_fn(*_a) -> str:
87
+ return "not json"
88
+
89
+ out = detect_regions_with_llm(scenes, vision_fn)
90
+ assert out[0].person_bbox is None
91
+ assert out[0].chart_bbox is None
92
+ assert "parse error" in out[0].raw_reason.lower()
93
+
94
+
95
+ def test_detect_regions_missing_keyframe_is_safe():
96
+ scenes = [_scene(0, kf=None)]
97
+
98
+ def vision_fn(*_a) -> str: # pragma: no cover - should not be called
99
+ raise AssertionError("vision_fn must not be called without a keyframe")
100
+
101
+ out = detect_regions_with_llm(scenes, vision_fn)
102
+ assert out[0].person_bbox is None
103
+ assert "no keyframe" in out[0].raw_reason.lower()
104
+
105
+
106
+ def test_detect_regions_bad_bbox_degrades_gracefully():
107
+ scenes = [_scene(0)]
108
+
109
+ def vision_fn(*_a) -> str:
110
+ return json.dumps(
111
+ {
112
+ "person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9},
113
+ "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95},
114
+ "ocr_text": "",
115
+ "reason": "person bbox inverted",
116
+ }
117
+ )
118
+
119
+ out = detect_regions_with_llm(scenes, vision_fn)
120
+ assert out[0].person_bbox is None
121
+ assert out[0].chart_bbox is not None
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # classify_from_regions
126
+ # ---------------------------------------------------------------------------
127
+
128
+
129
+ def test_classify_wide_chart_is_split():
130
+ r = SceneRegions(
131
+ scene_id="s0",
132
+ chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0),
133
+ person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95),
134
+ )
135
+ c = classify_from_regions(r)
136
+ assert c.layout == LayoutKind.SPLIT_CHART_PERSON
137
+ assert c.confidence > 0.5
138
+
139
+
140
+ def test_classify_narrow_chart_not_split():
141
+ r = SceneRegions(
142
+ scene_id="s0",
143
+ chart_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4),
144
+ person_bbox=BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95),
145
+ )
146
+ c = classify_from_regions(r)
147
+ # chart width (0.1) is below the split threshold -> not split
148
+ assert c.layout != LayoutKind.SPLIT_CHART_PERSON
149
+
150
+
151
+ def test_classify_wide_person_is_zoom_call():
152
+ r = SceneRegions(
153
+ scene_id="s0",
154
+ person_bbox=BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98),
155
+ )
156
+ c = classify_from_regions(r)
157
+ assert c.layout == LayoutKind.ZOOM_CALL_CENTER
158
+
159
+
160
+ def test_classify_small_person_is_sit_center():
161
+ r = SceneRegions(
162
+ scene_id="s0",
163
+ person_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8),
164
+ )
165
+ c = classify_from_regions(r)
166
+ assert c.layout == LayoutKind.SIT_CENTER
167
+
168
+
169
+ def test_classify_nothing_detected_defaults_sit_center_low_conf():
170
+ r = SceneRegions(scene_id="s0", raw_reason="model returned null")
171
+ c = classify_from_regions(r)
172
+ assert c.layout == LayoutKind.SIT_CENTER
173
+ assert c.confidence <= 0.5
174
+
175
+
176
+ def test_chart_threshold_is_exported():
177
+ # guard against the tuning constant silently being removed
178
+ assert 0.0 < _CHART_WIDTH_SPLIT_THRESHOLD < 1.0
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # layout_instruction_from_regions
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
+ def test_layout_instruction_from_regions_split():
187
+ r = SceneRegions(
188
+ scene_id="s0",
189
+ chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0),
190
+ person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95),
191
+ )
192
+ c = classify_from_regions(r)
193
+ instr = layout_instruction_from_regions(r, c)
194
+ assert instr.layout == LayoutKind.SPLIT_CHART_PERSON
195
+ # person_x_norm = center of (0.72, 0.99) = 0.855
196
+ assert instr.person_x_norm == pytest.approx(0.855, rel=1e-3)
197
+ # chart_x_norm = left edge = 0.0
198
+ assert instr.chart_x_norm == pytest.approx(0.0)
199
+
200
+
201
+ def test_layout_instruction_defaults_when_no_regions():
202
+ r = SceneRegions(scene_id="s0")
203
+ c = SceneClassification(
204
+ scene_id="s0", layout=LayoutKind.SIT_CENTER, confidence=0.3, reason="default"
205
+ )
206
+ instr = layout_instruction_from_regions(r, c)
207
+ assert instr.person_x_norm == 0.5
208
+ assert instr.chart_x_norm == 0.0
209
+
210
+
211
+ def test_classify_scenes_with_vision_llm_returns_pairs():
212
+ scenes = [_scene(0)]
213
+
214
+ def vision_fn(*_a) -> str:
215
+ return json.dumps(
216
+ {
217
+ "person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95},
218
+ "chart_bbox": None,
219
+ "ocr_text": "",
220
+ "reason": "solo subject",
221
+ }
222
+ )
223
+
224
+ pairs = classify_scenes_with_vision_llm(scenes, vision_fn)
225
+ assert len(pairs) == 1
226
+ regions, classification = pairs[0]
227
+ assert regions.person_bbox is not None
228
+ assert classification.layout == LayoutKind.ZOOM_CALL_CENTER
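A decision-tree sketch matching the classifications asserted above. The threshold constants are illustrative stand-ins for the exported tuning values (`_CHART_WIDTH_SPLIT_THRESHOLD` and whatever governs the zoom cutoff), not the real numbers:

```python
# Illustrative dispatch: wide chart + person -> SPLIT; wide person -> ZOOM;
# otherwise SIT_CENTER, at low confidence when nothing was detected.
from humeo_core.schemas import LayoutKind, SceneClassification, SceneRegions

CHART_WIDTH_SPLIT = 0.6   # assumed stand-in for _CHART_WIDTH_SPLIT_THRESHOLD
PERSON_WIDTH_ZOOM = 0.7   # assumed "person fills most of the frame" cutoff


def classify_sketch(r: SceneRegions) -> SceneClassification:
    if r.chart_bbox and r.person_bbox and r.chart_bbox.width >= CHART_WIDTH_SPLIT:
        return SceneClassification(
            scene_id=r.scene_id, layout=LayoutKind.SPLIT_CHART_PERSON,
            confidence=0.8, reason="wide chart + person",
        )
    if r.person_bbox and r.person_bbox.width >= PERSON_WIDTH_ZOOM:
        return SceneClassification(
            scene_id=r.scene_id, layout=LayoutKind.ZOOM_CALL_CENTER,
            confidence=0.7, reason="person dominates the frame",
        )
    conf = 0.6 if r.person_bbox else 0.3  # nothing detected -> low-confidence default
    return SceneClassification(
        scene_id=r.scene_id, layout=LayoutKind.SIT_CENTER,
        confidence=conf, reason="default seated framing",
    )
```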
pyproject.toml ADDED
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "humeo"
7
+ version = "0.1.0"
8
+ description = "Automated podcast-to-shorts pipeline"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "yt-dlp>=2024.0",
13
+ "fastapi>=0.115",
14
+ "openai>=1.0",
15
+ "google-genai>=1.0",
16
+ "httpx>=0.28",
17
+ "jinja2>=3.1",
18
+ "numpy>=1.24",
19
+ "Pillow>=10.0",
20
+ "python-dotenv>=1.0",
21
+ "replicate>=0.34.2",
22
+ "tqdm>=4.60",
23
+ "python-multipart>=0.0.9",
24
+ "uvicorn[standard]>=0.30",
25
+ "humeo-core",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest-asyncio>=0.23",
31
+ "ruff",
32
+ "pytest",
33
+ ]
34
+ whisper = [
35
+ "whisperx @ git+https://github.com/m-bain/whisperX.git",
36
+ ]
37
+
38
+ [tool.uv.sources]
39
+ humeo-core = { path = "humeo-core", editable = true }
40
+
41
+ [project.scripts]
42
+ humeo = "humeo.cli:main"
43
+
44
+ [tool.setuptools.packages.find]
45
+ where = ["src"]
46
+
47
+ [tool.setuptools.package-data]
48
+ humeo = ["prompts/*.jinja2"]
49
+
50
+ [tool.pytest.ini_options]
51
+ testpaths = ["tests", "humeo-core/tests"]
52
+ addopts = "-ra -q"
53
+
54
+ [tool.ruff]
55
+ line-length = 100
56
+ target-version = "py310"
src/humeo.egg-info/PKG-INFO ADDED
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: humeo
3
+ Version: 0.1.0
4
+ Summary: Automated podcast-to-shorts pipeline
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: yt-dlp>=2024.0
9
+ Requires-Dist: openai>=1.0
10
+ Requires-Dist: google-genai>=1.0
11
+ Requires-Dist: httpx>=0.28
12
+ Requires-Dist: jinja2>=3.1
13
+ Requires-Dist: numpy>=1.24
14
+ Requires-Dist: Pillow>=10.0
15
+ Requires-Dist: python-dotenv>=1.0
16
+ Requires-Dist: replicate>=0.34.2
17
+ Requires-Dist: tqdm>=4.60
18
+ Requires-Dist: humeo-core
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
21
+ Requires-Dist: ruff; extra == "dev"
22
+ Requires-Dist: pytest; extra == "dev"
23
+ Provides-Extra: whisper
24
+ Requires-Dist: whisperx @ git+https://github.com/m-bain/whisperX.git ; extra == "whisper"
25
+ Dynamic: license-file
26
+
27
+ ---
28
+ title: Humeo
29
+ sdk: docker
30
+ app_port: 7860
31
+ ---
32
+
33
+ # Humeo
34
+
35
+ Current default preset:
36
+
37
+ - `native_highlight` captions
38
+ - OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages
39
+ - Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available
40
+ - ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set
41
+
42
+ Long podcast or interview → vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render.
43
+
44
+ **Architecture (static HTML, GitHub Pages):**
45
+ [https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html)
46
+
47
+ ## Hugging Face Space
48
+
49
+ This repo includes a Hugging Face Docker Space entrypoint in `app.py`.
50
+
51
+ - Upload one local MP4
52
+ - Watch live pipeline logs and stage progress
53
+ - Download rendered `short_*.mp4` clips from the UI
54
+
55
+ Required Space secrets:
56
+
57
+ - `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
58
+ - `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
59
+
60
+ The Docker image pins `HUMEO_TRANSCRIBE_PROVIDER=openai` for the Space demo.
61
+
62
+ ## Repo layout
63
+
64
+ | Path | Role |
65
+ |------|------|
66
+ | `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters |
67
+ | `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server |
68
+
69
+ ## Pipeline (actual order)
70
+
71
+ ```text
72
+ YouTube URL
73
+ → ingest (source.mp4, transcript.json)
74
+ → clip selection (Gemini → clips.json)
75
+ → hook detection (Gemini → hooks.json)
76
+ → content pruning (Gemini → prune.json)
77
+ → keyframes + layout vision (Gemini vision → layout_vision.json)
78
+ → ASS subtitles + humeo-core ffmpeg render → short_<id>.mp4
79
+ ```
80
+
81
+ Details: **`docs/PIPELINE.md`**.
82
+
83
+ ## Five layouts
84
+
85
+ A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**).
86
+
87
+ ## Requirements
88
+
89
+ - **Python** ≥ 3.10
90
+ - **`uv`** — install: [astral.sh/uv](https://docs.astral.sh/uv/)
91
+ - **`ffmpeg`** — on `PATH` for extract/render
92
+ - **API keys** — see **`docs/ENVIRONMENT.md`**
93
+ - `GOOGLE_API_KEY` or `GEMINI_API_KEY` — preferred for Gemini stages
94
+ - `OPENROUTER_API_KEY` — supported fallback for those same Gemini-like stages when Google keys are unavailable
95
+ - `OPENAI_API_KEY` — if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`)
96
+
97
+ Copy **`.env.example`** → **`.env`** (never commit `.env`).
98
+
99
+ ## Install
100
+
101
+ ```bash
102
+ uv venv
103
+ uv sync
104
+ ```
105
+
106
+ Optional local WhisperX (heavy; Windows often uses OpenAI API instead):
107
+
108
+ ```bash
109
+ uv sync --extra whisper
110
+ ```
111
+
112
+ ## Run
113
+
114
+ ```bash
115
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
116
+ humeo --long-to-shorts "C:\path\to\video.mp4"
117
+ ```
118
+
119
+ Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**).
120
+
121
+ ## CLI guide (all flags)
122
+
123
+ Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`.
124
+
125
+ ### Required
126
+
127
+ | Flag | Meaning |
128
+ |------|---------|
129
+ | `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). |
130
+
131
+ ### Paths and cache behavior
132
+
133
+ | Flag | Meaning |
134
+ |------|---------|
135
+ | `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). |
136
+ | `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). |
137
+ | `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. |
138
+ | `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). |
139
+ | `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. |
140
+
141
+ ### Model selection and stage forcing
142
+
143
+ | Flag | Meaning |
144
+ |------|---------|
145
+ | `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). |
146
+ | `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). |
147
+ | `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. |
148
+ | `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. |
149
+ | `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. |
150
+ | `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. |
151
+ | `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). |
152
+
153
+ ### Pruning and subtitles
154
+
155
+ | Flag | Meaning |
156
+ |------|---------|
157
+ | `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). |
158
+ | `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). |
159
+ | `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). |
160
+ | `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). |
161
+ | `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). |
162
+
163
+ ### Logging
164
+
165
+ | Flag | Meaning |
166
+ |------|---------|
167
+ | `--verbose`, `-v` | Enable debug logging. |
168
+
169
+ ### Common command recipes
170
+
171
+ ```bash
172
+ # Basic run
173
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
174
+
175
+ # Local MP4
176
+ humeo --long-to-shorts "C:\path\to\video.mp4"
177
+
178
+ # Full fresh run for debugging / prompt tuning
179
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose
180
+
181
+ # Re-run only clip selection after prompt edits
182
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection
183
+
184
+ # Keep intermediates in a fixed local folder
185
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work
186
+
187
+ # Compare different prune levels on same source
188
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative
189
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive
190
+ ```
191
+
192
+ ## Documentation
193
+
194
+ | Doc | Purpose |
195
+ |-----|---------|
196
+ | **`docs/README.md`** | Index of all files under `docs/` |
197
+ | **`docs/STUDY_ORDER.md`** | Read order for onboarding |
198
+ | **`docs/PIPELINE.md`** | Stages, caches, JSON contracts |
199
+ | **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout |
200
+ | **`docs/SHARING.md`** | How to share logs/docs/video without bloating git |
201
+ | **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example |
202
+ | **`docs/full_run_output.txt`** | Example full run log (text) |
203
+ | **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping §9 |
204
+ | **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap |
205
+ | **`docs/TODO.md`** | Backlog |
206
+ | **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) |
207
+ | **`docs/SOLUTIONS.md`** | Design rationale |
208
+ | **`TERMINOLOGY.md`** | Glossary |
209
+
210
+ ## Tests
211
+
212
+ ```bash
213
+ uv sync --extra dev
214
+ uv run pytest
215
+ ```
216
+
217
+ ## Sharing outputs
218
+
219
+ `output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**.
220
+
221
+ ## License
222
+
223
+ See **`LICENSE`** (root) and **`humeo-core/LICENSE`**.
src/humeo.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,58 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/humeo/__init__.py
5
+ src/humeo/best_of.py
6
+ src/humeo/cli.py
7
+ src/humeo/clip_assembly.py
8
+ src/humeo/clip_selection_cache.py
9
+ src/humeo/clip_selector.py
10
+ src/humeo/config.py
11
+ src/humeo/content_pruning.py
12
+ src/humeo/cutter.py
13
+ src/humeo/env.py
14
+ src/humeo/gemini_generate.py
15
+ src/humeo/hook_detector.py
16
+ src/humeo/hook_library.py
17
+ src/humeo/ingest.py
18
+ src/humeo/interactive.py
19
+ src/humeo/layout_vision.py
20
+ src/humeo/pipeline.py
21
+ src/humeo/prompt_loader.py
22
+ src/humeo/reframe_ffmpeg.py
23
+ src/humeo/render_window.py
24
+ src/humeo/session_state.py
25
+ src/humeo/transcript_align.py
26
+ src/humeo/video_cache.py
27
+ src/humeo.egg-info/PKG-INFO
28
+ src/humeo.egg-info/SOURCES.txt
29
+ src/humeo.egg-info/dependency_links.txt
30
+ src/humeo.egg-info/entry_points.txt
31
+ src/humeo.egg-info/requires.txt
32
+ src/humeo.egg-info/top_level.txt
33
+ src/humeo/prompts/clip_selection_system.jinja2
34
+ src/humeo/prompts/clip_selection_user.jinja2
35
+ src/humeo/prompts/content_pruning_system.jinja2
36
+ src/humeo/prompts/hook_detection_system.jinja2
37
+ tests/test_ass_subtitles.py
38
+ tests/test_best_of.py
39
+ tests/test_clip_assembly.py
40
+ tests/test_clip_ranking.py
41
+ tests/test_clip_selection_cache.py
42
+ tests/test_clip_selector.py
43
+ tests/test_content_pruning.py
44
+ tests/test_cutter_native_highlight.py
45
+ tests/test_gemini_generate.py
46
+ tests/test_hook_detector.py
47
+ tests/test_hook_library.py
48
+ tests/test_ingest_openai_chunks.py
49
+ tests/test_interactive.py
50
+ tests/test_layout_vision_unit.py
51
+ tests/test_pipeline_interactive.py
52
+ tests/test_pipeline_quality_gate.py
53
+ tests/test_prompt_loader.py
54
+ tests/test_reframe_ffmpeg.py
55
+ tests/test_render_window.py
56
+ tests/test_session_state.py
57
+ tests/test_transcript_align.py
58
+ tests/test_video_cache.py
src/humeo.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
src/humeo.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ humeo = humeo.cli:main
src/humeo.egg-info/requires.txt ADDED
@@ -0,0 +1,19 @@
1
+ yt-dlp>=2024.0
2
+ openai>=1.0
3
+ google-genai>=1.0
4
+ httpx>=0.28
5
+ jinja2>=3.1
6
+ numpy>=1.24
7
+ Pillow>=10.0
8
+ python-dotenv>=1.0
9
+ replicate>=0.34.2
10
+ tqdm>=4.60
11
+ humeo-core
12
+
13
+ [dev]
14
+ pytest-asyncio>=0.23
15
+ ruff
16
+ pytest
17
+
18
+ [whisper]
19
+ whisperx @ git+https://github.com/m-bain/whisperX.git