moonlantern1 committed
Commit eda316b · verified · 1 parent: 0bc3d15

Deploy ClipForge Docker Space

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .dockerignore +15 -0
  2. .gitattributes +1 -0
  3. Dockerfile +21 -0
  4. LICENSE +21 -0
  5. README.md +199 -10
  6. app.py +808 -0
  7. humeo-core/.gitignore +9 -0
  8. humeo-core/LICENSE +21 -0
  9. humeo-core/README.md +165 -0
  10. humeo-core/docs/ARCHITECTURE.md +128 -0
  11. humeo-core/docs/MCP_USAGE.md +100 -0
  12. humeo-core/examples/render_request.json +23 -0
  13. humeo-core/pyproject.toml +46 -0
  14. humeo-core/src/humeo_core.egg-info/PKG-INFO +197 -0
  15. humeo-core/src/humeo_core.egg-info/SOURCES.txt +33 -0
  16. humeo-core/src/humeo_core.egg-info/dependency_links.txt +1 -0
  17. humeo-core/src/humeo_core.egg-info/entry_points.txt +3 -0
  18. humeo-core/src/humeo_core.egg-info/requires.txt +21 -0
  19. humeo-core/src/humeo_core.egg-info/top_level.txt +1 -0
  20. humeo-core/src/humeo_core/__init__.py +49 -0
  21. humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf +0 -0
  22. humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt +93 -0
  23. humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt +93 -0
  24. humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf +3 -0
  25. humeo-core/src/humeo_core/primitives/__init__.py +1 -0
  26. humeo-core/src/humeo_core/primitives/classify.py +232 -0
  27. humeo-core/src/humeo_core/primitives/compile.py +602 -0
  28. humeo-core/src/humeo_core/primitives/face_detect.py +135 -0
  29. humeo-core/src/humeo_core/primitives/ingest.py +187 -0
  30. humeo-core/src/humeo_core/primitives/layouts.py +707 -0
  31. humeo-core/src/humeo_core/primitives/select_clips.py +150 -0
  32. humeo-core/src/humeo_core/primitives/vision.py +210 -0
  33. humeo-core/src/humeo_core/schemas.py +518 -0
  34. humeo-core/src/humeo_core/server.py +332 -0
  35. humeo-core/tests/__init__.py +0 -0
  36. humeo-core/tests/test_classify.py +39 -0
  37. humeo-core/tests/test_compile.py +329 -0
  38. humeo-core/tests/test_face_detect.py +73 -0
  39. humeo-core/tests/test_layout_bbox.py +17 -0
  40. humeo-core/tests/test_layouts.py +312 -0
  41. humeo-core/tests/test_schemas.py +267 -0
  42. humeo-core/tests/test_select_clips.py +49 -0
  43. humeo-core/tests/test_server_tools.py +93 -0
  44. humeo-core/tests/test_vision.py +228 -0
  45. pyproject.toml +56 -0
  46. src/humeo.egg-info/PKG-INFO +223 -0
  47. src/humeo.egg-info/SOURCES.txt +58 -0
  48. src/humeo.egg-info/dependency_links.txt +1 -0
  49. src/humeo.egg-info/entry_points.txt +2 -0
  50. src/humeo.egg-info/requires.txt +19 -0
.dockerignore ADDED
@@ -0,0 +1,15 @@
+ .git
+ .env
+ .env.*
+ !.env.example
+ .venv
+ __pycache__
+ .pytest_cache
+ .humeo_*
+ .tmp_review_frames
+ .tmp_review_frames_ticketc
+ output
+ output*
+ *.log
+ *.zip
+ *.pyc
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf filter=lfs diff=lfs merge=lfs -text
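Note: the rule above keeps the new font binary in Git LFS, so a plain clone checks out only a small pointer stub until `git lfs pull` runs. A minimal sketch for spotting an un-smudged pointer; the pointer prefix is part of the public LFS spec, and this helper is illustrative, not something the repo ships:

```python
from pathlib import Path

# A Git LFS pointer stub is a tiny text file that starts with this line.
LFS_POINTER_PREFIX = b"version https://git-lfs.github.com/spec/v1"


def is_lfs_pointer(path: Path) -> bool:
    """True if the checked-out file is an un-smudged Git LFS pointer stub."""
    with path.open("rb") as fh:
        return fh.read(len(LFS_POINTER_PREFIX)) == LFS_POINTER_PREFIX


# The font below is the file covered by the .gitattributes rule added above;
# without `git lfs pull`, a fresh clone sees only the small pointer file.
font = Path("humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf")
print("pointer stub" if is_lfs_pointer(font) else "real binary")
```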
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ FROM python:3.12-slim-bookworm
+
+ ENV PYTHONUNBUFFERED=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PORT=7860
+
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y ffmpeg && \
+     rm -rf /var/lib/apt/lists/*
+
+ COPY . /app
+
+ RUN pip install --upgrade pip && \
+     pip install ./humeo-core && \
+     pip install .
+
+ EXPOSE 7860
+
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 NotABot
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,199 @@
- ---
- title: Clipforge
- emoji: 🏆
- colorFrom: blue
- colorTo: gray
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: ClipForge
+ sdk: docker
+ app_port: 7860
+ ---
+
+ # ClipForge
+
+ Current default preset:
+
+ - `native_highlight` captions
+ - OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages
+ - Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available
+ - ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set
+
+ Long podcast or interview → vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render.
+
+ **Architecture (static HTML, GitHub Pages):**
+ [https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html)
+
+ ## Hugging Face Space
+
+ This repo includes a Hugging Face Docker Space entrypoint in `app.py` with the ClipForge upload/link UI.
+
+ - Paste a YouTube/video URL or upload one local video file
+ - Watch live pipeline progress in the ClipForge UI
+ - Preview and download rendered `short_*.mp4` clips from the UI
+ - Regenerate from the same source with a steering prompt
+
+ Required Space secrets:
+
+ - `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
+ - `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
+
+ If `HUMEO_TRANSCRIBE_PROVIDER` is not set, the Space uses ElevenLabs when
+ `ELEVENLABS_API_KEY` exists, otherwise OpenAI Whisper.
+
+ ## Repo layout
+
+ | Path | Role |
+ |------|------|
+ | `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters |
+ | `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server |
+
+ ## Pipeline (actual order)
+
+ ```text
+ YouTube URL
+   → ingest (source.mp4, transcript.json)
+   → clip selection (Gemini → clips.json)
+   → hook detection (Gemini → hooks.json)
+   → content pruning (Gemini → prune.json)
+   → keyframes + layout vision (Gemini vision → layout_vision.json)
+   → ASS subtitles + humeo-core ffmpeg render → short_<id>.mp4
+ ```
+
+ Details: **`docs/PIPELINE.md`**.
+
+ ## Five layouts
+
+ A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**).
+
+ ## Requirements
+
+ - **Python** ≥ 3.10
+ - **`uv`** — install: [astral.sh/uv](https://docs.astral.sh/uv/)
+ - **`ffmpeg`** — on `PATH` for extract/render
+ - **API keys** — see **`docs/ENVIRONMENT.md`**
+   - `GOOGLE_API_KEY` or `GEMINI_API_KEY` — preferred for Gemini stages
+   - `OPENROUTER_API_KEY` — supported fallback for those same Gemini-like stages when Google keys are unavailable
+   - `OPENAI_API_KEY` — if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`)
+
+ Copy **`.env.example`** → **`.env`** (never commit `.env`).
+
+ ## Install
+
+ ```bash
+ uv venv
+ uv sync
+ ```
+
+ Optional local WhisperX (heavy; Windows often uses OpenAI API instead):
+
+ ```bash
+ uv sync --extra whisper
+ ```
+
+ ## Run
+
+ ```bash
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
+ humeo --long-to-shorts "C:\path\to\video.mp4"
+ ```
+
+ Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**).
+
+ ## CLI guide (all flags)
+
+ Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`.
+
+ ### Required
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). |
+
+ ### Paths and cache behavior
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). |
+ | `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). |
+ | `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. |
+ | `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). |
+ | `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. |
+
+ ### Model selection and stage forcing
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). |
+ | `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). |
+ | `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. |
+ | `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. |
+ | `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. |
+ | `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. |
+ | `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). |
+
+ ### Pruning and subtitles
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). |
+ | `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). |
+ | `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). |
+ | `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). |
+ | `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). |
+
+ ### Logging
+
+ | Flag | Meaning |
+ |------|---------|
+ | `--verbose`, `-v` | Enable debug logging. |
+
+ ### Common command recipes
+
+ ```bash
+ # Basic run
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
+
+ # Local MP4
+ humeo --long-to-shorts "C:\path\to\video.mp4"
+
+ # Full fresh run for debugging / prompt tuning
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose
+
+ # Re-run only clip selection after prompt edits
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection
+
+ # Keep intermediates in a fixed local folder
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work
+
+ # Compare different prune levels on same source
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive
+ ```
+
+ ## Documentation
+
+ | Doc | Purpose |
+ |-----|---------|
+ | **`docs/README.md`** | Index of all files under `docs/` |
+ | **`docs/STUDY_ORDER.md`** | Read order for onboarding |
+ | **`docs/PIPELINE.md`** | Stages, caches, JSON contracts |
+ | **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout |
+ | **`docs/SHARING.md`** | How to share logs/docs/video without bloating git |
+ | **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example |
+ | **`docs/full_run_output.txt`** | Example full run log (text) |
+ | **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping §9 |
+ | **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap |
+ | **`docs/TODO.md`** | Backlog |
+ | **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) |
+ | **`docs/SOLUTIONS.md`** | Design rationale |
+ | **`TERMINOLOGY.md`** | Glossary |
+
+ ## Tests
+
+ ```bash
+ uv sync --extra dev
+ uv run pytest
+ ```
+
+ ## Sharing outputs
+
+ `output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**.
+
+ ## License
+
+ See **`LICENSE`** (root) and **`humeo-core/LICENSE`**.
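For orientation, the pipeline diagram in the README above names one JSON artifact per stage. A small inspection sketch, assuming those files land directly in a fixed `--work-dir` and final renders default to `./output` per the flag table; per-video cache layouts can differ, and the JSON schemas live in `docs/PIPELINE.md`, not assumed here:

```python
from pathlib import Path

# Stage artifact names come from the pipeline diagram above; their schemas
# are documented in docs/PIPELINE.md and are not assumed here.
STAGE_ARTIFACTS = [
    "transcript.json",
    "clips.json",
    "hooks.json",
    "prune.json",
    "layout_vision.json",
]


def summarize_run(work_dir: str = ".humeo_work", output_dir: str = "output") -> None:
    """Print which intermediate artifacts and rendered shorts a run has produced."""
    root = Path(work_dir)
    for name in STAGE_ARTIFACTS:
        path = root / name
        if path.exists():
            print(f"{name:<20} {path.stat().st_size / 1024:8.1f} KB")
        else:
            print(f"{name:<20} missing (stage not reached, or cached per-video)")
    # Final renders default to ./output per the --output flag above.
    for clip in sorted(Path(output_dir).glob("short_*.mp4")):
        print(f"rendered: {clip.name}")


if __name__ == "__main__":
    summarize_run()
```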
app.py ADDED
@@ -0,0 +1,808 @@
+ from __future__ import annotations
+
+ import html
+ import json
+ import logging
+ import os
+ import queue
+ import re
+ import shutil
+ import subprocess
+ import sys
+ import tempfile
+ import threading
+ import time
+ import traceback
+ import uuid
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Annotated
+
+
+ def _bootstrap_local_paths() -> None:
+     repo_root = Path(__file__).resolve().parent
+     for candidate in (repo_root / "src", repo_root / "humeo-core" / "src"):
+         candidate_str = str(candidate)
+         if candidate.is_dir() and candidate_str not in sys.path:
+             sys.path.insert(0, candidate_str)
+
+
+ _bootstrap_local_paths()
+ if not (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip():
+     os.environ["HUMEO_TRANSCRIBE_PROVIDER"] = (
+         "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai"
+     )
+
+ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+ from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
+
+ from humeo.config import PipelineConfig
+ from humeo.pipeline import run_pipeline
+
+
+ APP_TITLE = "ClipForge"
+ LOG_FORMAT = "%(asctime)s | %(levelname)-7s | %(name)s | %(message)s"
+ MAX_LOG_LINES = 700
+ LLM_KEY_NAMES = ("GOOGLE_API_KEY", "GEMINI_API_KEY", "OPENROUTER_API_KEY")
+
+
+ class QueueLogHandler(logging.Handler):
+     def __init__(self, sink: queue.Queue[str]):
+         super().__init__()
+         self._sink = sink
+
+     def emit(self, record: logging.LogRecord) -> None:
+         try:
+             self._sink.put_nowait(self.format(record))
+         except Exception:
+             pass
+
+
+ @dataclass
+ class ClipFile:
+     name: str
+     url: str
+     duration: str
+
+
+ @dataclass
+ class Job:
+     id: str
+     run_root: Path
+     output_dir: Path
+     work_dir: Path
+     source: str
+     source_path: Path | None = None
+     steering_note: str | None = None
+     status: str = "Queued"
+     nav_status: str = "Processing..."
+     error: str | None = None
+     done: bool = False
+     created_at: float = field(default_factory=time.time)
+     logs: list[str] = field(default_factory=list)
+     clips: dict[str, ClipFile] = field(default_factory=dict)
+     steps: list[dict[str, object]] = field(
+         default_factory=lambda: [
+             {"name": "Uploading video", "pct": 100, "state": "done"},
+             {"name": "Generating transcript", "pct": 5, "state": "active"},
+             {"name": "Choosing short clips", "pct": 0, "state": "pending"},
+             {"name": "Producing clips", "pct": 0, "state": "pending"},
+             {"name": "Adding subtitles & light edits", "pct": 0, "state": "pending"},
+         ]
+     )
+
+
+ JOBS: dict[str, Job] = {}
+ JOBS_LOCK = threading.Lock()
+
+
+ def _append_log(job: Job, line: str) -> None:
+     job.logs.append(line)
+     if len(job.logs) > MAX_LOG_LINES:
+         job.logs = job.logs[-MAX_LOG_LINES:]
+
+
+ def _set_step(job: Job, idx: int, pct: int, state: str = "active") -> None:
+     for step_idx, step in enumerate(job.steps):
+         if step_idx < idx:
+             step["pct"] = 100
+             step["state"] = "done"
+         elif step_idx == idx:
+             step["pct"] = max(int(step.get("pct", 0)), min(100, pct))
+             step["state"] = state
+         elif step.get("state") != "done":
+             step["state"] = "pending"
+
+
+ def _update_stage_from_log(job: Job, line: str) -> None:
+     if "STAGE 1: INGESTION" in line:
+         job.status = "Generating transcript"
+         _set_step(job, 1, 15)
+     elif "Transcribing" in line:
+         job.status = "Generating transcript"
+         _set_step(job, 1, 45)
+     elif "Transcript already exists" in line or "Transcription complete" in line:
+         _set_step(job, 1, 90)
+     elif "STAGE 2: CLIP SELECTION" in line:
+         job.status = "Choosing short clips"
+         _set_step(job, 2, 20)
+     elif "STAGE 2.25: HOOK DETECTION" in line:
+         job.status = "Finding hooks"
+         _set_step(job, 2, 55)
+     elif "STAGE 2.5: CONTENT PRUNING" in line:
+         job.status = "Tightening clip windows"
+         _set_step(job, 2, 78)
+     elif "STAGE 2.75: CLIP ASSEMBLY" in line:
+         job.status = "Assembling clips"
+         _set_step(job, 3, 18)
+     elif "STAGE 3: CLIP LAYOUTS" in line:
+         job.status = "Choosing layout"
+         _set_step(job, 3, 38)
+     elif "STAGE 4: RENDER" in line:
+         job.status = "Producing clips"
+         _set_step(job, 3, 62)
+     elif "reframe_clip_ffmpeg" in line:
+         _set_step(job, 4, min(90, 20 + len(job.clips) * 12))
+     elif "RENDER QA" in line or "Render QA summary" in line:
+         job.status = "Checking clips"
+         _set_step(job, 4, 82)
+     elif "PIPELINE COMPLETE" in line:
+         job.status = "Complete"
+         job.nav_status = "Done"
+         for step in job.steps:
+             step["pct"] = 100
+             step["state"] = "done"
+
+
+ def _install_log_handler(message_queue: queue.Queue[str]) -> tuple[logging.Handler, int, dict[str, int]]:
+     handler = QueueLogHandler(message_queue)
+     handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt="%H:%M:%S"))
+
+     root_logger = logging.getLogger()
+     previous_level = root_logger.level
+     root_logger.addHandler(handler)
+     root_logger.setLevel(logging.INFO)
+
+     previous_logger_levels: dict[str, int] = {}
+     for logger_name in ("urllib3", "httpx", "httpcore"):
+         logger = logging.getLogger(logger_name)
+         previous_logger_levels[logger_name] = logger.level
+         logger.setLevel(logging.WARNING)
+
+     return handler, previous_level, previous_logger_levels
+
+
+ def _remove_log_handler(
+     handler: logging.Handler,
+     previous_root_level: int,
+     previous_logger_levels: dict[str, int],
+ ) -> None:
+     root_logger = logging.getLogger()
+     root_logger.removeHandler(handler)
+     root_logger.setLevel(previous_root_level)
+     for logger_name, level in previous_logger_levels.items():
+         logging.getLogger(logger_name).setLevel(level)
+
+
+ def _duration_label(path: Path) -> str:
+     try:
+         result = subprocess.run(
+             [
+                 "ffprobe",
+                 "-v",
+                 "error",
+                 "-show_entries",
+                 "format=duration",
+                 "-of",
+                 "default=noprint_wrappers=1:nokey=1",
+                 str(path),
+             ],
+             check=True,
+             capture_output=True,
+             text=True,
+             timeout=15,
+         )
+         total = max(0, int(round(float(result.stdout.strip()))))
+     except Exception:
+         total = 0
+     return f"{total // 60}:{total % 60:02d}" if total else "0:00"
+
+
+ def _publish_files(job: Job) -> None:
+     for path in sorted(job.output_dir.glob("short_*.mp4")):
+         if path.name not in job.clips and path.is_file():
+             job.clips[path.name] = ClipFile(
+                 name=path.name,
+                 url=f"/api/jobs/{job.id}/files/{path.name}",
+                 duration=_duration_label(path),
+             )
+
+
+ def _validate_credentials() -> None:
+     if not any((os.environ.get(name) or "").strip() for name in LLM_KEY_NAMES):
+         raise HTTPException(
+             status_code=400,
+             detail="Missing LLM secret. Set GOOGLE_API_KEY, GEMINI_API_KEY, or OPENROUTER_API_KEY in the Space secrets.",
+         )
+
+     provider = (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip().lower()
+     if provider in {"", "auto"}:
+         provider = "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai"
+     if provider == "elevenlabs" and not (os.environ.get("ELEVENLABS_API_KEY") or "").strip():
+         raise HTTPException(status_code=400, detail="Missing ELEVENLABS_API_KEY Space secret.")
+     if provider in {"openai", "api"} and not (os.environ.get("OPENAI_API_KEY") or "").strip():
+         raise HTTPException(status_code=400, detail="Missing OPENAI_API_KEY Space secret.")
+
+
+ def _safe_url(value: str | None) -> str | None:
+     value = (value or "").strip()
+     if not value:
+         return None
+     if not re.match(r"^https?://", value, flags=re.I):
+         raise HTTPException(status_code=400, detail="Paste a valid http(s) video URL.")
+     return value
+
+
+ def _snapshot(job: Job) -> dict[str, object]:
+     return {
+         "id": job.id,
+         "status": job.status,
+         "nav_status": job.nav_status,
+         "done": job.done,
+         "error": job.error,
+         "logs": "\n".join(job.logs[-MAX_LOG_LINES:]),
+         "steps": job.steps,
+         "clips": [clip.__dict__ for clip in job.clips.values()],
+     }
+
+
+ def _run_job(job_id: str) -> None:
+     with JOBS_LOCK:
+         job = JOBS[job_id]
+     message_queue: queue.Queue[str] = queue.Queue()
+     handler, previous_root_level, previous_logger_levels = _install_log_handler(message_queue)
+
+     def drain_queue() -> None:
+         with JOBS_LOCK:
+             local_job = JOBS[job_id]
+             while True:
+                 try:
+                     line = message_queue.get_nowait()
+                 except queue.Empty:
+                     break
+                 _append_log(local_job, line)
+                 _update_stage_from_log(local_job, line)
+             _publish_files(local_job)
+
+     try:
+         with JOBS_LOCK:
+             _append_log(job, f"Prepared source: {job.source}")
+             _append_log(job, f"Run id: {job.id}")
+             _set_step(job, 1, 8)
+
+         config = PipelineConfig(
+             source=job.source,
+             youtube_url=job.source,
+             output_dir=job.output_dir,
+             work_dir=job.work_dir,
+             use_video_cache=False,
+             clean_run=True,
+             interactive=False,
+             prune_level="balanced",
+             overwrite_outputs=True,
+             steering_notes=[job.steering_note] if job.steering_note else [],
+         )
+
+         worker_error: str | None = None
+         outputs: list[Path] = []
+
+         def pipeline_worker() -> None:
+             nonlocal outputs, worker_error
+             try:
+                 outputs = run_pipeline(config)
+             except Exception as exc:
+                 worker_error = str(exc)
+                 for line in traceback.format_exc().splitlines():
+                     if line.strip():
+                         message_queue.put_nowait(line)
+
+         thread = threading.Thread(target=pipeline_worker, daemon=True)
+         thread.start()
+         while thread.is_alive():
+             drain_queue()
+             time.sleep(0.35)
+         drain_queue()
+
+         with JOBS_LOCK:
+             local_job = JOBS[job_id]
+             for output in outputs:
+                 if Path(output).exists():
+                     local_job.clips[Path(output).name] = ClipFile(
+                         name=Path(output).name,
+                         url=f"/api/jobs/{job_id}/files/{Path(output).name}",
+                         duration=_duration_label(Path(output)),
+                     )
+             if worker_error:
+                 local_job.error = worker_error
+                 local_job.status = f"Failed: {worker_error}"
+                 local_job.nav_status = "Failed"
+             else:
+                 local_job.status = "Complete" if local_job.clips else "Complete - no clips generated"
+                 local_job.nav_status = "Done"
+                 for step in local_job.steps:
+                     step["pct"] = 100
+                     step["state"] = "done"
+             local_job.done = True
+     finally:
+         _remove_log_handler(handler, previous_root_level, previous_logger_levels)
+
+
+ async def _stage_upload(uploaded_file: UploadFile, run_root: Path) -> Path:
+     suffix = Path(uploaded_file.filename or "input.mp4").suffix or ".mp4"
+     staged_path = run_root / f"input{suffix}"
+     with staged_path.open("wb") as handle:
+         while chunk := await uploaded_file.read(1024 * 1024):
+             handle.write(chunk)
+     return staged_path
+
+
+ app = FastAPI(title=APP_TITLE)
+
+
+ @app.get("/", response_class=HTMLResponse)
+ def index() -> str:
+     return INDEX_HTML
+
+
+ @app.post("/api/jobs")
+ async def create_job(
+     video_url: Annotated[str | None, Form()] = None,
+     regen_prompt: Annotated[str | None, Form()] = None,
+     source_job_id: Annotated[str | None, Form()] = None,
+     file: Annotated[UploadFile | None, File()] = None,
+ ) -> JSONResponse:
+     _validate_credentials()
+     job_id = uuid.uuid4().hex[:12]
+     run_root = Path(tempfile.mkdtemp(prefix=f"clipforge-{job_id}-"))
+     work_dir = run_root / "work"
+     output_dir = run_root / "output"
+     work_dir.mkdir(parents=True, exist_ok=True)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     source_path: Path | None = None
+     source = _safe_url(video_url)
+     source_job_id = (source_job_id or "").strip()
+     if source_job_id:
+         with JOBS_LOCK:
+             previous = JOBS.get(source_job_id)
+         if previous is None:
+             raise HTTPException(status_code=404, detail="Previous job not found for regeneration.")
+         if previous.source_path and previous.source_path.exists():
+             source_path = run_root / previous.source_path.name
+             shutil.copy2(previous.source_path, source_path)
+             source = str(source_path)
+         else:
+             source = previous.source
+     elif file is not None:
+         source_path = await _stage_upload(file, run_root)
+         source = str(source_path)
+
+     if not source:
+         raise HTTPException(status_code=400, detail="Upload a video file or paste a video URL first.")
+
+     job = Job(
+         id=job_id,
+         run_root=run_root,
+         output_dir=output_dir,
+         work_dir=work_dir,
+         source=source,
+         source_path=source_path,
+         steering_note=(regen_prompt or "").strip() or None,
+     )
+     with JOBS_LOCK:
+         JOBS[job_id] = job
+
+     threading.Thread(target=_run_job, args=(job_id,), daemon=True).start()
+     return JSONResponse(_snapshot(job))
+
+
+ @app.get("/api/jobs/{job_id}")
+ def get_job(job_id: str) -> JSONResponse:
+     with JOBS_LOCK:
+         job = JOBS.get(job_id)
+         if job is None:
+             raise HTTPException(status_code=404, detail="Job not found.")
+         _publish_files(job)
+         return JSONResponse(_snapshot(job))
+
+
+ @app.get("/api/jobs/{job_id}/files/{filename}")
+ def get_job_file(job_id: str, filename: str) -> FileResponse:
+     with JOBS_LOCK:
+         job = JOBS.get(job_id)
+         if job is None:
+             raise HTTPException(status_code=404, detail="Job not found.")
+         path = (job.output_dir / Path(filename).name).resolve(strict=False)
+         if job.output_dir.resolve(strict=False) not in path.parents or not path.is_file():
+             raise HTTPException(status_code=404, detail="File not found.")
+         return FileResponse(path, media_type="video/mp4", filename=path.name)
+
+
+ @app.get("/health")
+ def health() -> dict[str, str]:
+     return {"ok": "true"}
+
+
+ INDEX_HTML = r"""<!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>ClipForge - Video to Clips</title>
+   <link rel="preconnect" href="https://fonts.googleapis.com">
+   <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:ital,wght@0,300;0,400;0,500;0,600;1,300;1,400&family=DM+Sans:wght@300;400;500&display=swap" rel="stylesheet">
+   <style>
+     :root {
+       --cream: #F7F2E9; --champagne: #EDE3CC; --champagne-deep: #D9C9A6;
+       --gold: #B8924A; --gold-light: #D4AA6A; --ink: #2A1F0E;
+       --ink-soft: #5C4A2E; --ink-muted: #9A8560; --white: #FDFAF4;
+       --surface: #F0E9D8; --border: #DDD0B3; --success: #6B8C5A;
+       --radius: 12px; --radius-lg: 20px;
+     }
+     * { margin: 0; padding: 0; box-sizing: border-box; }
+     body { font-family: 'DM Sans', sans-serif; background: var(--cream); color: var(--ink); min-height: 100vh; overflow-x: hidden; }
+     nav { display: flex; align-items: center; justify-content: space-between; padding: 20px 32px; border-bottom: 1px solid var(--border); background: var(--white); position: sticky; top: 0; z-index: 100; }
+     .logo { font-family: 'Cormorant Garamond', serif; font-size: 1.6rem; font-weight: 600; color: var(--ink); letter-spacing: 0.02em; }
+     .logo span { color: var(--gold); }
+     .screen { display: none; animation: fadeIn 0.5s ease; }
+     .screen.active { display: block; }
+     @keyframes fadeIn { from { opacity: 0; transform: translateY(8px); } to { opacity: 1; transform: translateY(0); } }
+     #screen-input { display: flex; flex-direction: column; align-items: center; justify-content: center; min-height: calc(100vh - 65px); padding: 40px 20px; text-align: center; }
+     .eyebrow { font-size: 0.75rem; letter-spacing: 0.18em; text-transform: uppercase; color: var(--gold); font-weight: 500; margin-bottom: 16px; }
+     .hero-title { font-family: 'Cormorant Garamond', serif; font-size: clamp(2rem, 5vw, 3.6rem); font-weight: 500; line-height: 1.15; color: var(--ink); max-width: 620px; margin-bottom: 12px; }
+     .hero-title em { font-style: italic; color: var(--gold); }
+     .hero-sub { font-size: 0.95rem; color: var(--ink-muted); margin-bottom: 48px; font-weight: 300; }
+     .input-card { background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 36px; width: 100%; max-width: 520px; box-shadow: 0 8px 32px rgba(42,31,14,0.07); }
+     .mode-tabs { display: flex; background: var(--surface); border-radius: 10px; padding: 4px; margin-bottom: 28px; gap: 4px; }
+     .mode-tab { flex: 1; padding: 10px 0; border: none; background: transparent; border-radius: 8px; font-family: 'DM Sans', sans-serif; font-size: 0.85rem; font-weight: 500; color: var(--ink-muted); cursor: pointer; transition: all 0.2s; }
+     .mode-tab.active { background: var(--white); color: var(--ink); box-shadow: 0 2px 8px rgba(42,31,14,0.1); }
+     .input-section { display: none; } .input-section.active { display: block; }
+     .input-label { font-size: 0.78rem; letter-spacing: 0.08em; text-transform: uppercase; color: var(--ink-muted); margin-bottom: 8px; display: block; font-weight: 500; text-align:left; }
+     .yt-input { width: 100%; padding: 14px 16px; border: 1.5px solid var(--border); border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.9rem; background: var(--cream); color: var(--ink); outline: none; transition: border-color 0.2s; }
+     .yt-input:focus { border-color: var(--gold); } .yt-input::placeholder { color: var(--ink-muted); }
+     .upload-zone { border: 2px dashed var(--champagne-deep); border-radius: var(--radius); padding: 36px 20px; text-align: center; cursor: pointer; transition: all 0.2s; background: var(--cream); }
+     .upload-zone:hover, .upload-zone.dragover { border-color: var(--gold); background: var(--champagne); }
+     .upload-icon { width: 44px; height: 44px; background: var(--champagne); border-radius: 50%; display: flex; align-items: center; justify-content: center; margin: 0 auto 12px; font-size: 1.2rem; }
+     .upload-text { font-size: 0.9rem; color: var(--ink-soft); font-weight: 400; }
+     .upload-sub { font-size: 0.78rem; color: var(--ink-muted); margin-top: 4px; }
+     .convert-btn { width: 100%; margin-top: 28px; padding: 16px; background: var(--ink); color: var(--cream); border: none; border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.95rem; font-weight: 500; cursor: pointer; letter-spacing: 0.03em; transition: all 0.2s; position: relative; overflow: hidden; }
+     .convert-btn:hover { background: var(--ink-soft); transform: translateY(-1px); box-shadow: 0 6px 20px rgba(42,31,14,0.2); } .convert-btn:active { transform: translateY(0); }
+     .convert-btn:disabled { opacity: .65; cursor: progress; transform:none; }
+     #screen-processing { max-width: 780px; margin: 0 auto; padding: 48px 20px 80px; }
+     .processing-header { text-align: center; margin-bottom: 40px; }
+     .processing-title { font-family: 'Cormorant Garamond', serif; font-size: 2rem; font-weight: 500; color: var(--ink); margin-bottom: 6px; }
+     .processing-sub { font-size: 0.88rem; color: var(--ink-muted); font-weight: 300; }
+     .pipeline { background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 28px; box-shadow: 0 4px 20px rgba(42,31,14,0.06); margin-bottom: 32px; }
+     .pipeline-step { display: flex; align-items: flex-start; gap: 16px; padding: 16px 0; border-bottom: 1px solid var(--champagne); opacity: 0.4; transition: opacity 0.4s; }
+     .pipeline-step:last-child { border-bottom: none; } .pipeline-step.active, .pipeline-step.done { opacity: 1; }
+     .step-icon { width: 36px; height: 36px; flex-shrink: 0; background: var(--surface); border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 1rem; transition: all 0.4s; border: 1.5px solid var(--border); }
+     .pipeline-step.active .step-icon { background: var(--champagne); border-color: var(--gold); }
+     .pipeline-step.done .step-icon { background: var(--gold); border-color: var(--gold); color: white; font-size: 0.85rem; }
+     .step-content { flex: 1; padding-top: 4px; }
+     .step-name { font-size: 0.9rem; font-weight: 500; color: var(--ink); margin-bottom: 8px; display: flex; align-items: center; justify-content: space-between; }
+     .step-pct { font-size: 0.8rem; color: var(--gold); font-weight: 500; }
+     .progress-track { height: 6px; background: var(--surface); border-radius: 99px; overflow: hidden; }
+     .progress-fill { height: 100%; border-radius: 99px; background: linear-gradient(90deg, var(--gold-light), var(--gold)); width: 0%; transition: width 0.25s ease; }
+     .pipeline-step.done .progress-fill { width: 100%; background: var(--gold); }
+     .tips-section { margin-bottom: 40px; }
+     .tips-label { font-size: 0.72rem; letter-spacing: 0.14em; text-transform: uppercase; color: var(--ink-muted); margin-bottom: 12px; font-weight: 500; }
+     .tip-card { background: var(--champagne); border-radius: var(--radius); padding: 14px 18px; font-size: 0.85rem; color: var(--ink-soft); display: flex; align-items: flex-start; gap: 10px; margin-bottom: 8px; line-height: 1.5; }
+     .tip-dot { color: var(--gold); margin-top: 2px; flex-shrink: 0; }
+     .clips-section { margin-top: 8px; }
+     .clips-title { font-family: 'Cormorant Garamond', serif; font-size: 1.4rem; font-weight: 500; color: var(--ink); margin-bottom: 6px; }
+     .clips-sub { font-size: 0.82rem; color: var(--ink-muted); margin-bottom: 20px; font-weight: 300; }
+     .clips-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 16px; }
+     .clip-card { border-radius: var(--radius); overflow: hidden; cursor: pointer; background: var(--white); border: 1px solid var(--border); box-shadow: 0 2px 10px rgba(42,31,14,0.06); transition: all 0.2s; animation: clipAppear 0.5s ease both; }
+     .clip-card:hover { transform: translateY(-3px); box-shadow: 0 8px 24px rgba(42,31,14,0.13); }
+     @keyframes clipAppear { from { opacity: 0; transform: scale(0.9) translateY(10px); } to { opacity: 1; transform: scale(1) translateY(0); } }
+     .clip-thumb { aspect-ratio: 9/16; display: flex; align-items: center; justify-content: center; position: relative; overflow: hidden; }
+     .clip-play { width: 44px; height: 44px; background: rgba(255,255,255,0.88); border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 1.1rem; z-index: 2; box-shadow: 0 2px 12px rgba(0,0,0,0.2); transition: transform 0.2s; }
+     .clip-card:hover .clip-play { transform: scale(1.1); }
+     .clip-meta { padding: 10px 12px; } .clip-num { font-size: 0.72rem; color: var(--ink-muted); text-transform: uppercase; letter-spacing: 0.08em; font-weight: 500; }
+     .clip-dur { font-size: 0.82rem; color: var(--ink); font-weight: 400; margin-top: 2px; }
+     .clip-download { margin-top: 8px; display:inline-block; font-size:.74rem; color:var(--gold); text-decoration:none; }
+     .regen-section { margin-top: 56px; background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 32px; display: none; animation: fadeIn 0.5s ease; box-shadow: 0 4px 20px rgba(42,31,14,0.06); }
+     .regen-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 500; margin-bottom: 6px; }
+     .regen-sub { font-size: 0.85rem; color: var(--ink-muted); margin-bottom: 20px; font-weight: 300; }
+     .regen-textarea { width: 100%; min-height: 100px; padding: 14px 16px; border: 1.5px solid var(--border); border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.9rem; background: var(--cream); color: var(--ink); outline: none; resize: vertical; transition: border-color 0.2s; line-height: 1.6; margin-bottom: 14px; }
+     .regen-textarea:focus { border-color: var(--gold); } .regen-textarea::placeholder { color: var(--ink-muted); }
+     .regen-row { display: flex; gap: 10px; align-items: center; flex-wrap: wrap; }
+     .chip { padding: 7px 14px; background: var(--champagne); border: 1px solid var(--border); border-radius: 99px; font-size: 0.78rem; color: var(--ink-soft); cursor: pointer; transition: all 0.15s; font-weight: 400; white-space: nowrap; }
+     .chip:hover { background: var(--champagne-deep); color: var(--ink); border-color: var(--gold); }
+     .regen-btn { margin-left: auto; padding: 12px 24px; background: var(--ink); color: var(--cream); border: none; border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.88rem; font-weight: 500; cursor: pointer; transition: all 0.2s; white-space: nowrap; }
+     .regen-btn:hover { background: var(--ink-soft); }
+     .modal-overlay { display: none; position: fixed; inset: 0; background: rgba(42,31,14,0.65); backdrop-filter: blur(6px); z-index: 500; align-items: center; justify-content: center; padding: 20px; animation: fadeIn 0.25s ease; }
+     .modal-overlay.open { display: flex; }
+     .modal-box { background: var(--white); border-radius: var(--radius-lg); width: 100%; max-width: 390px; overflow: hidden; box-shadow: 0 24px 64px rgba(42,31,14,0.25); animation: slideUp 0.3s ease; }
+     @keyframes slideUp { from { opacity: 0; transform: translateY(20px) scale(0.97); } to { opacity: 1; transform: translateY(0) scale(1); } }
+     .modal-video { aspect-ratio: 9/16; max-height: 70vh; display: flex; align-items: center; justify-content: center; position: relative; background:var(--ink); }
+     .modal-video video { width:100%; height:100%; object-fit:contain; background:#000; }
+     .modal-footer { padding: 16px 20px; border-top: 1px solid var(--border); display: flex; align-items: center; justify-content: space-between; gap:12px; }
+     .modal-clip-label { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 500; }
+     .modal-actions { display:flex; align-items:center; gap:8px; }
+     .modal-close, .modal-download { padding: 8px 14px; background: var(--surface); border: 1px solid var(--border); border-radius: 8px; font-family: 'DM Sans', sans-serif; font-size: 0.82rem; cursor: pointer; transition: all 0.15s; color:var(--ink); text-decoration:none; }
+     .modal-close:hover, .modal-download:hover { background: var(--champagne); }
+     .log-panel { display:none; margin-top:24px; background:var(--ink); color:var(--cream); border-radius:12px; padding:14px; font:12px/1.45 ui-monospace, SFMono-Regular, Consolas, monospace; white-space:pre-wrap; max-height:240px; overflow:auto; text-align:left; }
+     @media (max-width: 600px) { nav { padding: 16px 20px; } .input-card { padding: 24px 20px; } #screen-processing { padding: 32px 16px 60px; } .pipeline { padding: 20px 16px; } .clips-grid { grid-template-columns: repeat(2, 1fr); gap: 10px; } .regen-section { padding: 22px 18px; } .regen-btn { width: 100%; margin-left: 0; } .regen-row { flex-direction: column; align-items: flex-start; } }
+     .thumb-1 { background: linear-gradient(135deg, #D4A96A 0%, #8B5E3C 100%); } .thumb-2 { background: linear-gradient(135deg, #7A9E8A 0%, #3D6650 100%); }
+     .thumb-3 { background: linear-gradient(135deg, #9E8A7A 0%, #5C3E2E 100%); } .thumb-4 { background: linear-gradient(135deg, #8A7A9E 0%, #4A3866 100%); }
+     .thumb-5 { background: linear-gradient(135deg, #9E9A7A 0%, #5C5820 100%); } .thumb-6 { background: linear-gradient(135deg, #C4856A 0%, #7A3020 100%); }
+     .thumb-7 { background: linear-gradient(135deg, #7AABBE 0%, #2A5A6E 100%); } .thumb-8 { background: linear-gradient(135deg, #9EAA7A 0%, #4A5E20 100%); }
+     .thumb-9 { background: linear-gradient(135deg, #AA7A9E 0%, #5E2060 100%); } .thumb-0 { background: linear-gradient(135deg, #D4C36A 0%, #8B7820 100%); }
+     .spin { display: inline-block; width: 14px; height: 14px; border: 2px solid var(--border); border-top-color: var(--gold); border-radius: 50%; animation: spin 0.8s linear infinite; }
+     @keyframes spin { to { transform: rotate(360deg); } }
+   </style>
+ </head>
+ <body>
+   <nav>
+     <div class="logo">Clip<span>Forge</span></div>
+     <div style="font-size:0.8rem;color:var(--ink-muted);font-weight:300;display:none" id="nav-status">Processing...</div>
+   </nav>
+   <div class="screen active" id="screen-input">
+     <div style="display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:calc(100vh - 65px);padding:40px 20px;text-align:center;">
+       <div class="eyebrow">AI Video Editor</div>
+       <h1 class="hero-title">Convert your long video to <em>short clips</em> for social media</h1>
+       <p class="hero-sub">Paste a link or upload a file - we handle the rest</p>
+       <div class="input-card">
+         <div class="mode-tabs">
+           <button class="mode-tab active" onclick="switchMode('yt')">Link</button>
+           <button class="mode-tab" onclick="switchMode('upload')">Upload File</button>
+         </div>
+         <div class="input-section active" id="mode-yt">
+           <label class="input-label">Video URL</label>
+           <input class="yt-input" type="text" placeholder="https://youtube.com/watch?v=..." id="yt-url">
+         </div>
+         <div class="input-section" id="mode-upload">
+           <input type="file" id="file-input" accept="video/mp4,video/quicktime,video/*" hidden>
+           <div class="upload-zone" id="upload-zone" onclick="openUpload()">
+             <div class="upload-icon">File</div>
+             <div class="upload-text">Click to browse or drag & drop</div>
+             <div class="upload-sub">MP4, MOV, AVI - up to your Space limit</div>
+           </div>
+         </div>
+         <button class="convert-btn" id="convert-btn" onclick="startProcessing()">Convert to Clips -></button>
+       </div>
+     </div>
+   </div>
+   <div class="screen" id="screen-processing">
+     <div class="processing-header">
+       <div class="eyebrow">Working on it</div>
+       <h2 class="processing-title">Your clips are being crafted</h2>
+       <p class="processing-sub" id="processing-sub">Sit back - long videos can take a little while</p>
+     </div>
+     <div class="pipeline" id="pipeline">
+       <div class="pipeline-step" id="step-0"><div class="step-icon">Up</div><div class="step-content"><div class="step-name">Uploading video <span class="step-pct" id="pct-0">0%</span></div><div class="progress-track"><div class="progress-fill" id="fill-0"></div></div></div></div>
+       <div class="pipeline-step" id="step-1"><div class="step-icon">Text</div><div class="step-content"><div class="step-name">Generating transcript <span class="step-pct" id="pct-1"></span></div><div class="progress-track"><div class="progress-fill" id="fill-1"></div></div></div></div>
+       <div class="pipeline-step" id="step-2"><div class="step-icon">Cut</div><div class="step-content"><div class="step-name">Choosing short clips <span class="step-pct" id="pct-2"></span></div><div class="progress-track"><div class="progress-fill" id="fill-2"></div></div></div></div>
+       <div class="pipeline-step" id="step-3"><div class="step-icon">Film</div><div class="step-content"><div class="step-name">Producing clips <span class="step-pct" id="pct-3"></span></div><div class="progress-track"><div class="progress-fill" id="fill-3"></div></div></div></div>
+       <div class="pipeline-step" id="step-4"><div class="step-icon">Edit</div><div class="step-content"><div class="step-name">Adding subtitles &amp; light edits <span class="step-pct" id="pct-4"></span></div><div class="progress-track"><div class="progress-fill" id="fill-4"></div></div></div></div>
+     </div>
+     <div class="tips-section" id="tips-section">
+       <div class="tips-label">Tips while you wait</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> Clips are automatically trimmed around the strongest hook.</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> The system can pick centered speaker or split presentation layout per clip.</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> Word-by-word subtitles are added by default.</div>
+       <div class="tip-card"><span class="tip-dot">◆</span> You can regenerate with different instructions after the first batch.</div>
+     </div>
+     <div class="clips-section" id="clips-section" style="display:none">
+       <div class="clips-title">Your clips</div>
+       <p class="clips-sub" id="clips-sub-text">Tap any clip to preview</p>
+       <div class="clips-grid" id="clips-grid"></div>
+     </div>
+     <div class="regen-section" id="regen-section">
+       <div class="regen-title">Produce a different set</div>
+       <p class="regen-sub">Describe what you're looking for and we'll re-cut your video</p>
+       <textarea class="regen-textarea" placeholder="e.g. Focus on the funniest moments, keep clips under 30 seconds, add a text hook at the start..." id="regen-prompt"></textarea>
+       <div class="regen-row">
+         <span class="chip" onclick="setChip('Highlight key insights')">Key insights</span>
+         <span class="chip" onclick="setChip('Funny & entertaining moments')">Funny moments</span>
+         <span class="chip" onclick="setChip('Emotional or inspiring clips')">Emotional</span>
+         <span class="chip" onclick="setChip('Fast-paced, high energy edits')">High energy</span>
+         <button class="regen-btn" onclick="triggerRegen()">Regenerate Clips -></button>
+       </div>
+     </div>
+     <pre class="log-panel" id="log-panel"></pre>
+   </div>
+   <div class="modal-overlay" id="modal" onclick="closeModal(event)">
+     <div class="modal-box">
+       <div class="modal-video" id="modal-video"><div class="clip-play" style="width:56px;height:56px;font-size:1.4rem;background:rgba(255,255,255,0.9)">▶</div></div>
+       <div class="modal-footer">
+         <div class="modal-clip-label" id="modal-label">Clip 1</div>
+         <div class="modal-actions"><a class="modal-download" id="modal-download" href="#" download>Download</a><button class="modal-close" onclick="document.getElementById('modal').classList.remove('open')">Close</button></div>
+       </div>
+     </div>
+   </div>
+   <script>
+     let currentMode = 'yt';
+     let selectedFile = null;
+     let currentJobId = null;
+     let renderedClips = [];
+     const iconLabels = ['Up','Text','Cut','Film','Edit'];
+
+     function switchMode(m) {
+       currentMode = m;
+       document.querySelectorAll('.mode-tab').forEach((t,i) => t.classList.toggle('active', (i===0 && m==='yt') || (i===1 && m==='upload')));
+       document.getElementById('mode-yt').classList.toggle('active', m==='yt');
+       document.getElementById('mode-upload').classList.toggle('active', m==='upload');
+     }
+
+     function openUpload() { document.getElementById('file-input').click(); }
+
+     function setSelectedFile(file) {
+       selectedFile = file;
+       const zone = document.getElementById('upload-zone');
+       zone.innerHTML = `<div class="upload-icon">OK</div><div class="upload-text" style="color:var(--gold)">File selected: ${escapeHtml(file.name)}</div><div class="upload-sub">Ready to convert</div>`;
+     }
+
+     const uploadZone = document.getElementById('upload-zone');
+     document.getElementById('file-input').addEventListener('change', e => { if (e.target.files[0]) setSelectedFile(e.target.files[0]); });
+     uploadZone.addEventListener('dragover', e => { e.preventDefault(); uploadZone.classList.add('dragover'); });
+     uploadZone.addEventListener('dragleave', () => uploadZone.classList.remove('dragover'));
+     uploadZone.addEventListener('drop', e => { e.preventDefault(); uploadZone.classList.remove('dragover'); if (e.dataTransfer.files[0]) setSelectedFile(e.dataTransfer.files[0]); });
+
+     function escapeHtml(s) {
+       return String(s).replace(/[&<>"']/g, c => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#039;'}[c]));
+     }
+
+     async function createJob(extraPrompt = '') {
+       const form = new FormData();
+       if (extraPrompt && currentJobId) {
+         form.append('source_job_id', currentJobId);
+         form.append('regen_prompt', extraPrompt);
+       } else if (currentMode === 'upload') {
+         if (!selectedFile) throw new Error('Choose a video file first.');
+         form.append('file', selectedFile);
+       } else {
+         const url = document.getElementById('yt-url').value.trim();
+         if (!url) throw new Error('Paste a video URL first.');
+         form.append('video_url', url);
+       }
+       const res = await fetch('/api/jobs', { method: 'POST', body: form });
+       const data = await res.json();
+       if (!res.ok) throw new Error(data.detail || 'Could not start job.');
+       return data;
+     }
+
+     async function startProcessing() {
+       const btn = document.getElementById('convert-btn');
+       try {
+         btn.disabled = true;
+         btn.textContent = 'Starting...';
+         const job = await createJob();
+         currentJobId = job.id;
+         renderedClips = [];
+         document.getElementById('clips-grid').innerHTML = '';
+         document.getElementById('screen-input').classList.remove('active');
+         document.getElementById('screen-processing').classList.add('active');
+         document.getElementById('nav-status').style.display = 'block';
+         syncJob(job);
+         pollJob(job.id);
+       } catch (err) {
+         alert(err.message || err);
+       } finally {
+         btn.disabled = false;
+         btn.textContent = 'Convert to Clips ->';
+       }
+     }
+
+     async function pollJob(id) {
+       let done = false;
+       while (!done && currentJobId === id) {
+         await new Promise(r => setTimeout(r, 1400));
+         const res = await fetch(`/api/jobs/${id}`);
+         const job = await res.json();
+         syncJob(job);
+         done = job.done;
+       }
+     }
+
+     function syncJob(job) {
+       document.getElementById('nav-status').textContent = job.nav_status || 'Processing...';
+       document.getElementById('processing-sub').textContent = job.error ? job.error : job.status;
+       document.getElementById('log-panel').textContent = job.logs || '';
+       (job.steps || []).forEach((step, i) => {
+         const el = document.getElementById(`step-${i}`);
+         const fill = document.getElementById(`fill-${i}`);
+         const pct = document.getElementById(`pct-${i}`);
+         el.classList.toggle('active', step.state === 'active');
+         el.classList.toggle('done', step.state === 'done');
+         el.querySelector('.step-icon').innerHTML = step.state === 'done' ? '✓' : (step.state === 'active' ? '<span class="spin"></span>' : iconLabels[i]);
+         fill.style.width = `${step.pct || 0}%`;
+         pct.textContent = step.pct ? `${Math.floor(step.pct)}%` : '';
+       });
+       (job.clips || []).forEach((clip, idx) => {
+         if (!renderedClips.some(c => c.name === clip.name)) {
+           renderedClips.push(clip);
+           addClip(renderedClips.length - 1);
+         }
+       });
+       if (renderedClips.length) {
+         document.getElementById('clips-section').style.display = 'block';
+         document.getElementById('clips-sub-text').textContent = job.done
+           ? `All ${renderedClips.length} clip${renderedClips.length > 1 ? 's' : ''} ready - tap to preview`
+           : `${renderedClips.length} clip${renderedClips.length > 1 ? 's' : ''} ready - more coming...`;
+       }
+       if (job.done) {
+         document.getElementById('regen-section').style.display = 'block';
+         if (job.error) document.getElementById('log-panel').style.display = 'block';
+       }
+     }
+
+     function addClip(idx) {
+       const clip = renderedClips[idx];
+       const grid = document.getElementById('clips-grid');
+       const card = document.createElement('div');
+       card.className = 'clip-card';
+       card.innerHTML = `<div class="clip-thumb thumb-${idx % 10}"><div class="clip-play">▶</div></div><div class="clip-meta"><div class="clip-num">Clip ${idx + 1}</div><div class="clip-dur">${clip.duration || '0:00'}</div><a class="clip-download" href="${clip.url}" download onclick="event.stopPropagation()">Download</a></div>`;
+       card.onclick = () => openModal(idx);
+       grid.appendChild(card);
+     }
+
+     function openModal(idx) {
+       const clip = renderedClips[idx];
+       const modal = document.getElementById('modal');
+       const video = document.getElementById('modal-video');
+       video.className = 'modal-video';
+       video.innerHTML = `<video src="${clip.url}" controls autoplay playsinline></video>`;
+       document.getElementById('modal-label').textContent = `Clip ${idx + 1}`;
+       document.getElementById('modal-download').href = clip.url;
+       modal.classList.add('open');
+     }
+
+     function closeModal(e) {
+       if (e.target === document.getElementById('modal')) {
+         document.getElementById('modal').classList.remove('open');
+         document.getElementById('modal-video').innerHTML = '';
+       }
+     }
+
+     function setChip(text) {
+       const ta = document.getElementById('regen-prompt');
+       ta.value = text;
+       ta.focus();
+     }
+
+     async function triggerRegen() {
+       const prompt = document.getElementById('regen-prompt').value.trim();
+       if (!prompt) { document.getElementById('regen-prompt').focus(); return; }
+       if (!currentJobId) { alert('Run a video first.'); return; }
+       renderedClips = [];
+       document.getElementById('clips-grid').innerHTML = '';
+       document.getElementById('clips-section').style.display = 'none';
+       document.getElementById('regen-section').style.display = 'none';
+       document.getElementById('nav-status').textContent = 'Regenerating...';
+       document.querySelectorAll('.pipeline-step').forEach((s, i) => {
+         s.classList.remove('active', 'done');
+         s.querySelector('.step-icon').innerHTML = iconLabels[i];
+         document.getElementById(`fill-${i}`).style.width = '0%';
+         document.getElementById(`pct-${i}`).textContent = '';
+       });
+       window.scrollTo({ top: 0, behavior: 'smooth' });
+       try {
+         const job = await createJob(prompt);
+         currentJobId = job.id;
+         syncJob(job);
+         pollJob(job.id);
+       } catch (err) {
+         alert(err.message || err);
+       }
+     }
+   </script>
+ </body>
+ </html>"""
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "7860")))
humeo-core/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ .pytest_cache/
6
+ build/
7
+ dist/
8
+ .venv/
9
+ .env
humeo-core/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 NotABot
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
humeo-core/README.md ADDED
@@ -0,0 +1,165 @@
1
+ # humeo-core
2
+
3
+ **Reusable-rocket MCP server for long-video → 9:16 shorts.**
4
+
5
+ First-principles design, from the HIVE paper + Bryan's rocket analogy:
6
+ we don't build doors and windows (general subject-tracker UI, retraining
7
+ models). We build the **container** (schemas), **landing gear** (deterministic
8
+ local extraction), and **five thrusters** (the five 9:16 layouts this video
9
+ format actually uses). Everything else is pluggable.
10
+
11
+ ## The rocket, in one picture
12
+
13
+ ```
14
+ ┌──────────────────────────────────────────┐
15
+ │ Control panel (MCP tools) │ <- any MCP client
16
+ └───────────────────┬──────────────────────┘
17
+ │ strict JSON
18
+ ┌────────────────┬───────────┼────────────────┬─────────────────┐
19
+ ▼ ▼ ▼ ▼ ▼
20
+ ingest classify_scenes select_clips plan_layout render_clip
21
+ (scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile,
22
+ keyframes + classifier) heuristic + pure filter dry-run safe)
23
+ transcript) LLM-ready) math)
24
+
25
+
26
+ ┌────────────────────┐
27
+ │ LayoutKind │
28
+ │ ──────────────── │
29
+ │ zoom_call_center │
30
+ │ sit_center │
31
+ │ split_chart_person│
32
+ │ split_two_persons │
33
+ │ split_two_charts │
34
+ └────────────────────┘
35
+ ```
36
+
37
+ Only the classifier and clip-selector have optional LLM hooks; everything
38
+ else is deterministic, local, and cheap.
39
+
40
+ ## Why five layouts? (the "max 2 items" rule)
41
+
42
+ The hard constraint for this format: **a short shows at most two on-screen
43
+ items** — where an "item" is a `person` (a human speaker) or a `chart`
44
+ (slide, graph, data visual, screenshare). That gives exactly five recipes:
45
+
46
+ 1. **`zoom_call_center`** — 1 person, tight zoom-call / webcam framing.
47
+ 2. **`sit_center`** — 1 person, interview / seated framing.
48
+ 3. **`split_chart_person`** — 1 chart + 1 person, stacked vertically
49
+ (default: **even 50/50** top/bottom, chart on top).
50
+ 4. **`split_two_persons`** — 2 speakers, stacked vertically.
51
+ 5. **`split_two_charts`** — 2 charts, stacked vertically.
52
+
53
+ Because the geometry is bounded, we do NOT need a general subject-tracker
54
+ ML model or a drag-to-highlight UI. We need five small, correct pieces of
55
+ crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py`
56
+ is.
57
+
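+ A minimal sketch of the enum side of that rule (assuming `LayoutKind` is the
+ string enum exported by `humeo_core`, as the JSON-contract list below
+ suggests):
+ 
+ ```python
+ from humeo_core import LayoutKind
+ 
+ # One enum value per recipe above.
+ for kind in LayoutKind:
+     print(kind.value)
+ # zoom_call_center, sit_center, split_chart_person,
+ # split_two_persons, split_two_charts
+ ```
+ 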
58
+ See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms
59
+ used across these docs (subject, crop, band, seam, bbox, layout, etc.).
60
+
61
+ ## Install
62
+
63
+ ```bash
64
+ uv venv
65
+ uv sync
66
+ ```
67
+
68
+ External requirements: `ffmpeg` and `ffprobe` on PATH.
69
+
70
+ `scenedetect` requires OpenCV. Install `opencv-python-headless` or
71
+ `opencv-python` alongside `scenedetect`.
72
+
73
+ ## Use it as an MCP server
74
+
75
+ ```bash
76
+ humeo-core # stdio transport (primary console script)
77
+ # humeo-mcp # same entrypoint — kept so existing MCP configs keep working
78
+ ```
79
+
80
+ Example Cursor/Claude Desktop config:
81
+
82
+ ```json
83
+ {
84
+ "mcpServers": {
85
+ "humeo": { "command": "humeo-core" }
86
+ }
87
+ }
88
+ ```
89
+
90
+ Tools exposed:
91
+
92
+ | Tool | Purpose |
93
+ | --------------------------------- | --------------------------------------------------------------------------- |
94
+ | `list_layouts` | Enumerate the 5 supported layouts. |
95
+ | `ingest` | Scene detection + keyframe extraction (+ optional transcript). |
96
+ | `classify_scenes` | Pixel-heuristic per-scene layout classification. |
97
+ | `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). |
98
+ | `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. |
99
+ | `select_clips` | Heuristic clip picker over a word-level transcript. |
100
+ | `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. |
101
+ | `build_render_cmd` | Build the ffmpeg command (no execution) — review before spend. |
102
+ | `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. |
103
+
104
+ Resource: `humeo://layouts` (JSON listing of the 5 layouts).
105
+
106
+ ### Three interchangeable region detectors
107
+
108
+ All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used:
109
+
110
+ ```
111
+ classify.py (pixel variance, no ML)
112
+ face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg
113
+ vision.py (multimodal LLM + OCR bboxes)
114
+ ```
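+ 
+ A sketch of that shared contract (field names from the JSON-contract list
+ below; the values are illustrative, not real detector output):
+ 
+ ```python
+ from humeo_core import BoundingBox, SceneRegions
+ 
+ regions = SceneRegions(
+     scene_id="scene_003",
+     person_bbox=BoundingBox(x1=0.55, y1=0.05, x2=1.00, y2=1.00,
+                             label="person", confidence=0.9),
+     chart_bbox=BoundingBox(x1=0.00, y1=0.10, x2=0.52, y2=0.95,
+                            label="chart", confidence=0.8),
+     ocr_text="",
+     raw_reason="illustrative example",
+ )
+ ```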
115
+
116
+ ## JSON contracts (non-negotiable)
117
+
118
+ All tools take and return Pydantic-validated JSON. The contracts live in
119
+ [`src/humeo_core/schemas.py`](src/humeo_core/schemas.py):
120
+
121
+ - `Scene` `{scene_id, start_time, end_time, keyframe_path?}`
122
+ - `TranscriptWord` `{word, start_time, end_time}`
123
+ - `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}`
124
+ - `SceneClassification` `{scene_id, layout, confidence, reason}`
125
+ - `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized)
126
+ - `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}`
127
+ - `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}`
128
+ - `ClipPlan` `{source_path, clips[]}`
129
+ - `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}`
130
+ - `RenderRequest` / `RenderResult`
131
+
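+ A minimal sketch of what "non-negotiable" means in practice (pydantic v2
+ `model_validate`; the omitted `LayoutInstruction` fields are assumed to have
+ defaults, as the examples in `docs/MCP_USAGE.md` suggest):
+ 
+ ```python
+ from humeo_core import LayoutInstruction
+ 
+ raw = {"clip_id": "001", "layout": "split_chart_person", "top_band_ratio": 0.5}
+ instr = LayoutInstruction.model_validate(raw)   # raises on malformed input
+ print(instr.model_dump_json())
+ ```
+ 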
132
+ ## First-principles decisions (what we intentionally did NOT build)
133
+
134
+ - **No giant subject-tracker ML.** The video format has 5 fixed layouts
135
+ (with a hard "max 2 items" rule); pixel-level tracking is not needed.
136
+ - **No drag-and-highlight UI.** An MCP tool is a better "UI" for an
137
+ agent-first workflow. If a human wants to override, they pass a
138
+ `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` /
139
+ `zoom`.
140
+ - **No end-to-end video→video model.** The HIVE paper's core insight is
141
+ that decomposed orchestration beats monolithic generation. We reify
142
+ that insight as nine small, composable tools.
143
+
144
+ ## Extending the pilot
145
+
146
+ - Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`.
147
+ - Plug a real reasoning model into `select_clips_with_llm(text_fn)`.
148
+ - Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)`
149
+ to get per-scene bboxes + OCR text, then feed the results back through
150
+ `classify_scenes_with_vision`. This is the scene-change → v3 images →
151
+ LLM+OCR → bbox path; see `../docs/SOLUTIONS.md §4` for rationale.
152
+ - All enforce strict JSON outputs, so bad model output can't corrupt
153
+ downstream stages.
154
+
155
+ ## Testing
156
+
157
+ ```bash
158
+ python -m pytest
159
+ ```
160
+
161
+ See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale.
162
+
163
+ ## License
164
+
165
+ MIT
humeo-core/docs/ARCHITECTURE.md ADDED
@@ -0,0 +1,128 @@
1
+ # Architecture — Reusable Rocket
2
+
3
+ > *"We don't need to build the door or windows — just a container with landing
4
+ > gear and thrusters that move in different directions."*
5
+ > — Bryan
6
+
7
+ That analogy maps exactly onto this MCP:
8
+
9
+ | Rocket part | Codebase | Purpose |
10
+ | --------------- | ---------------------------------------------------------------- | ----------------------------------------------------------------------- |
11
+ | Container | `src/humeo_core/schemas.py` | Strict JSON contracts every stage reads/writes. |
12
+ | Landing gear | `src/humeo_core/primitives/ingest.py` | Deterministic local extraction (scenes, keyframes, transcript). |
13
+ | Thrusters (×5) | `src/humeo_core/primitives/layouts.py` | Five fixed 9:16 crop/compose recipes (max 2 on-screen items). |
14
+ | Pilot | `primitives/classify.py` + `primitives/select_clips.py` | Heuristic + LLM-ready decision makers. |
15
+ | Compiler | `src/humeo_core/primitives/compile.py` | Deterministic ffmpeg assembly. |
16
+ | Control panel | `src/humeo_core/server.py` | MCP tool surface exposing every primitive to agents and clients. |
18
+
19
+ ## First-principles reasoning
20
+
21
+ The HIVE paper's core insight is that good short-video editing requires
22
+ **staged reasoning with strict intermediate artifacts**, not a single
23
+ giant model call. Three consequences flow from that:
24
+
25
+ 1. **Extraction must be local and deterministic.** No model call should
26
+ ever touch raw video bytes. `ingest.py` runs ffprobe + PySceneDetect
27
+ + ffmpeg + (optional) faster-whisper. Everything it emits is JSON or
28
+ a file path.
29
+
30
+ 2. **Reasoning must be decomposed into narrow sub-tasks.** Classifying a
31
+ scene's layout is a completely different task from selecting a viral
32
+ clip. Each has its own schema, its own prompt, its own validation.
33
+ This is why `primitives/` is five files instead of one.
34
+
35
+ 3. **Every model call must emit schema-validated JSON.** Free-form model
36
+ output is not allowed to enter the pipeline. `classify_scenes_with_llm`
37
+ and `select_clips_with_llm` both `model_validate(...)` the raw output
38
+ before returning; parse failures degrade gracefully to `SIT_CENTER` +
39
+ low confidence, not crashes.
40
+
41
+ ## Why only five layouts?
42
+
43
+ The hard rule for this format: **a short shows at most two on-screen
44
+ items**, where an "item" is a `person` or a `chart`. That gives exactly
45
+ five recipes — all implemented as pure functions from
46
+ `LayoutInstruction` to an ffmpeg filtergraph string in `layouts.py`:
47
+
48
+ | Layout | Items | Recipe |
49
+ | ---------------------- | --------------- | --------------------------------------------- |
50
+ | `zoom_call_center` | 1 person | tight centered 9:16 crop (zoom ≥ 1.25). |
51
+ | `sit_center` | 1 person | wider centered 9:16 crop. |
52
+ | `split_chart_person` | 1 chart + 1 person | source partitioned L/R by bboxes, stacked. |
53
+ | `split_two_persons` | 2 persons | L/R speakers, stacked top/bottom. |
54
+ | `split_two_charts` | 2 charts | L/R charts, stacked top/bottom. |
55
+
56
+ A general subject-tracker ML model is orders of magnitude more expensive
57
+ and less reliable than five hand-written crop recipes. If a new geometry
58
+ ever shows up in future source videos, adding a sixth thruster is
59
+ strictly additive: write a new `plan_*` function, add it to `_DISPATCH`,
60
+ add an enum variant. No existing code has to change.
61
+
62
+ ## 9:16 layout math
63
+
64
+ Source is assumed 16:9 (1920×1080 by default, but probed per-clip).
65
+ Target is 1080×1920. For each layout:
66
+
67
+ ### `zoom_call_center` and `sit_center`
68
+
69
+ Standard centered aspect-ratio crop to 9:16, then scale to 1080×1920:
70
+
71
+ ```
72
+ crop=cw:ch:x:y,scale=1080:1920:flags=lanczos,setsar=1[vout]
73
+ ```
74
+
75
+ `cw`, `ch` are the largest 9:16 window that fits in the source, divided
76
+ by `zoom`. `x`, `y` center the window on `person_x_norm` / 0.5.
77
+ Dimensions are rounded to even values so libx264 is happy. The window is
78
+ clamped inside the source so a high `person_x_norm` never crops outside.
79
+
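+ A pure-Python sketch of that window math (it mirrors the prose above; the
+ exact rounding/clamping order in `layouts.py` may differ):
+ 
+ ```python
+ def crop_window(src_w: int, src_h: int, zoom: float, person_x_norm: float):
+     # Largest 9:16 window that fits a 16:9 source, shrunk by zoom,
+     # rounded to even dimensions (libx264), then clamped inside the frame.
+     ch = int(src_h / zoom) // 2 * 2
+     cw = int(src_h * 9 / 16 / zoom) // 2 * 2
+     x = int(min(max(person_x_norm * src_w - cw / 2, 0), src_w - cw))
+     y = int(min(max(0.5 * src_h - ch / 2, 0), src_h - ch))
+     return cw, ch, x, y
+ 
+ # 1920x1080 source, zoom=1.25, subject at x=0.83 -> (486, 864, 1350, 108)
+ print(crop_window(1920, 1080, 1.25, 0.83))
+ ```
+ 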
80
+ ### Split layouts (`split_chart_person`, `split_two_persons`, `split_two_charts`)
81
+
82
+ All three splits share one recipe — only the items differ:
83
+
84
+ 1. **Horizontal partition.** The source is cut at a single vertical seam
85
+ so the two source strips are **complementary** (no overlap, no gap).
86
+ When both bboxes are set (Gemini vision), the seam is the midpoint
87
+ between `left.x2` and `right.x1`. Otherwise the seam defaults to
88
+ either an even 50/50 (two-of-a-kind splits) or a 2/3 | 1/3 split
89
+ (legacy `split_chart_person` fallback).
90
+ 2. **Vertical crop.** Each strip's vertical extent comes from the
91
+ corresponding bbox when provided, so each item **fills** its output
92
+ band instead of being lost in full-height source context.
93
+ 3. **Cover-scale to the band.** Each strip is scaled with
94
+ `force_original_aspect_ratio=increase` + center-cropped to the band
95
+ dimensions. Bands are always fully painted; no letterbox bars.
96
+ 4. **Stack.** Two branches produced by `split=2` are `vstack`-ed into
97
+ the final 1080×1920.
98
+
99
+ **Band heights** are controlled by `LayoutInstruction.top_band_ratio`,
100
+ which defaults to **0.5** (even 50/50 — the symmetric look Bryan asked
101
+ for after the uneven Cathie Wood shorts). Legacy 60/40 is still reachable
102
+ by setting `top_band_ratio=0.6`.
103
+
104
+ **Stack order** (for `split_chart_person`) is controlled by
105
+ `focus_stack_order`: chart-on-top (default) or person-on-top.
106
+
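+ A sketch of the seam and band arithmetic above (step 1 plus the band-height
+ rule; `layouts.py` remains the source of truth):
+ 
+ ```python
+ def seam_and_bands(left_x2=None, right_x1=None, top_band_ratio=0.5, out_h=1920):
+     if left_x2 is not None and right_x1 is not None:
+         seam = (left_x2 + right_x1) / 2   # midpoint between the two bboxes
+     else:
+         seam = 0.5   # even fallback (2/3 | 1/3 for legacy split_chart_person)
+     top_h = int(out_h * top_band_ratio) // 2 * 2   # even for libx264
+     return seam, top_h, out_h - top_h
+ 
+ # Vision bboxes ending at x=0.52 / starting at x=0.55 -> seam 0.535, 960/960
+ print(seam_and_bands(0.52, 0.55))
+ ```
+ 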
107
+ ## Extensibility story
108
+
109
+ - **Smarter classifier:** implement `LLMVisionFn` with any multimodal
110
+ model and pass it to `classify_scenes_with_llm`. The fallback heuristic
111
+ stays available for offline runs and tests.
112
+ - **Smarter clip selector:** same pattern, `LLMTextFn` → `select_clips_with_llm`.
113
+ - **New layout:** add a `plan_*` planner, register in `_DISPATCH`, add a
114
+ `LayoutKind` variant. Tests in `test_layouts.py` automatically iterate
115
+ over all `LayoutKind`s, so the dispatch coverage test will catch a
116
+ missing registration immediately.
117
+
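+ What "strictly additive" looks like in practice — a hypothetical sixth
+ thruster (the `plan_*` signature and `_DISPATCH` registration shape are
+ assumptions; only the base recipe string is real ffmpeg filter syntax):
+ 
+ ```python
+ def plan_pip_overlay(instruction) -> str:
+     # Hypothetical layout: start from the centered 9:16 base crop; a real
+     # sixth layout would add its own compose steps before emitting [vout].
+     return (
+         "crop=ih*9/16:ih:(iw-ih*9/16)/2:0,"
+         "scale=1080:1920:flags=lanczos,setsar=1[vout]"
+     )
+ 
+ # layouts.py: _DISPATCH[LayoutKind.PIP_OVERLAY] = plan_pip_overlay
+ # schemas.py: add PIP_OVERLAY = "pip_overlay" to LayoutKind
+ ```
+ 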
118
+ ## What we intentionally did NOT build
119
+
120
+ - Drag-and-highlight subject-selector UI.
121
+ - A general ML subject-tracker.
122
+ - A monolithic video-in-video-out model.
123
+ - Any network calls in the core library. The MCP server is stdio-only;
124
+ the CLI runs fully offline.
125
+
126
+ This keeps the rocket **reusable**: the same primitives power the MCP
127
+ server, the CLI, a Python library, and (soon) a web UI if that's ever
128
+ warranted.
humeo-core/docs/MCP_USAGE.md ADDED
@@ -0,0 +1,100 @@
1
+ # Using humeo-core from an MCP client
2
+
3
+ The installed console command is **`humeo-core`**. For backward compatibility,
4
+ **`humeo-mcp`** is also registered (same entrypoint); either name works as
+ the `"command"` value, provided the install's scripts are on `PATH`.
6
+
7
+ ## 1. Add to your client
8
+
9
+ `claude_desktop_config.json` or `.cursor/mcp.json`:
10
+
11
+ ```json
12
+ {
13
+ "mcpServers": {
14
+ "humeo": {
15
+ "command": "humeo-core"
16
+ }
17
+ }
18
+ }
19
+ ```
20
+
21
+ ## 2. A typical agent plan
22
+
23
+ ```
24
+ → humeo.list_layouts()
25
+ # discover the 5 layouts (max 2 on-screen items per short)
26
+
27
+ → humeo.ingest(source_path="/abs/long.mp4", work_dir="/abs/work", with_transcript=true)
28
+ # IngestResult: scenes[], keyframes, transcript_words[]
29
+
30
+ → humeo.classify_scenes(scenes=<IngestResult.scenes>)
31
+ # SceneClassification[] — one layout per scene
32
+
33
+ → humeo.select_clips(
34
+ source_path=..., transcript_words=..., duration_sec=...,
35
+ target_count=5, min_sec=30, max_sec=60
36
+ )
37
+ # ClipPlan — top non-overlapping clips
38
+
39
+ # For each clip, pick the layout of the scene its midpoint falls in,
40
+ # build a LayoutInstruction, and:
41
+
42
+ → humeo.build_render_cmd(request={...})
43
+ # dry-run: returns the exact ffmpeg argv, no execution
44
+
45
+ → humeo.render_clip(request={..., "mode": "normal"})
46
+ # actually renders the 9:16 MP4
47
+ ```
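+ 
+ The same plan can be driven programmatically — a sketch using the official
+ `mcp` Python SDK (already a dependency via `mcp[cli]`; the tool names are
+ real, the argument payloads abbreviated):
+ 
+ ```python
+ import asyncio
+ 
+ from mcp import ClientSession, StdioServerParameters
+ from mcp.client.stdio import stdio_client
+ 
+ async def main() -> None:
+     server = StdioServerParameters(command="humeo-core")
+     async with stdio_client(server) as (read, write):
+         async with ClientSession(read, write) as session:
+             await session.initialize()
+             layouts = await session.call_tool("list_layouts", arguments={})
+             print(layouts)
+ 
+ asyncio.run(main())
+ ```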
48
+
49
+ ## 3. Strict JSON all the way
50
+
51
+ Every request/response is validated against the schemas in
52
+ [`schemas.py`](../src/humeo_core/schemas.py). Invalid input is rejected
53
+ *before* ffmpeg is touched, so a confused agent can't accidentally
54
+ rm-rf your disk or burn GPU hours.
55
+
56
+ ## 4. Override knobs
57
+
58
+ `LayoutInstruction` accepts:
59
+
60
+ - `zoom`, `person_x_norm`, `chart_x_norm` — single-subject knobs.
61
+ - `split_chart_region`, `split_person_region`,
62
+ `split_second_chart_region`, `split_second_person_region` —
63
+ normalized bboxes that drive split-layout cropping.
64
+ - `top_band_ratio` — fraction of output height used by the top band
65
+ (default 0.5 = even 50/50, the symmetric look).
66
+ - `focus_stack_order` — for `split_chart_person`, chart-on-top vs
67
+ person-on-top.
68
+
69
+ Example: chart + person with a precise bbox crop and an even split.
70
+
71
+ ```json
72
+ {
73
+ "clip_id": "001",
74
+ "layout": "split_chart_person",
75
+ "split_chart_region": {"x1": 0.00, "y1": 0.10, "x2": 0.52, "y2": 0.95},
76
+ "split_person_region": {"x1": 0.55, "y1": 0.05, "x2": 1.00, "y2": 1.00},
77
+ "top_band_ratio": 0.5,
78
+ "focus_stack_order": "chart_then_person"
79
+ }
80
+ ```
81
+
82
+ Example: two-speaker interview.
83
+
84
+ ```json
85
+ {
86
+ "clip_id": "002",
87
+ "layout": "split_two_persons",
88
+ "split_person_region": {"x1": 0.02, "y1": 0.05, "x2": 0.48, "y2": 0.95},
89
+ "split_second_person_region": {"x1": 0.52, "y1": 0.05, "x2": 0.98, "y2": 0.95}
90
+ }
91
+ ```
92
+
93
+ ## 5. When to stay in dry-run
94
+
95
+ - You want to show an approval UI before spending CPU.
96
+ - You want to diff the planned ffmpeg commands against a previous run.
97
+ - You're building tests.
98
+
99
+ `mode="dry_run"` is always safe, never writes output, and returns the
100
+ exact argv list.
humeo-core/examples/render_request.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "source_path": "/absolute/path/to/long.mp4",
3
+ "clip": {
4
+ "clip_id": "001",
5
+ "topic": "Prediction Market Explosion",
6
+ "start_time_sec": 289.0,
7
+ "end_time_sec": 331.5,
8
+ "viral_hook": "Prediction markets could explode to $5 trillion.",
9
+ "virality_score": 0.94,
10
+ "transcript": "Full text for subtitle generation...",
11
+ "suggested_overlay_title": "$5T Prediction Markets"
12
+ },
13
+ "layout": {
14
+ "clip_id": "001",
15
+ "layout": "split_chart_person",
16
+ "zoom": 1.0,
17
+ "person_x_norm": 0.83,
18
+ "chart_x_norm": 0.0
19
+ },
20
+ "output_path": "/absolute/path/to/out/clip_001.mp4",
21
+ "title_text": "$5T Prediction Markets",
22
+ "mode": "dry_run"
23
+ }
humeo-core/pyproject.toml ADDED
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "humeo-core"
7
+ version = "0.1.0"
8
+ description = "Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Humeo" }]
13
+ keywords = ["mcp", "video", "shorts", "ffmpeg", "editing", "humeo", "hive"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.10",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ ]
20
+ dependencies = [
21
+ "mcp[cli]>=1.2.0",
22
+ "pydantic>=2.0",
23
+ "scenedetect>=0.6",
24
+ ]
25
+
26
+ [project.optional-dependencies]
27
+ transcribe = ["faster-whisper>=1.0"]
28
+ download = ["yt-dlp>=2024.0"]
29
+ face = ["mediapipe>=0.10", "opencv-python>=4.8"]
30
+ vision = ["Pillow>=10.0"]
31
+ dev = ["pytest>=7", "pytest-asyncio>=0.23", "Pillow>=10.0"]
32
+
33
+ [project.scripts]
34
+ humeo-core = "humeo_core.server:main"
35
+ # Backward-compatible entry point (same module); existing MCP configs may still call `humeo-mcp`.
36
+ humeo-mcp = "humeo_core.server:main"
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+
41
+ [tool.setuptools.package-data]
42
+ humeo_core = ["assets/fonts/*"]
43
+
44
+ [tool.pytest.ini_options]
45
+ testpaths = ["tests"]
46
+ addopts = "-ra -q"
humeo-core/src/humeo_core.egg-info/PKG-INFO ADDED
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: humeo-core
3
+ Version: 0.1.0
4
+ Summary: Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints).
5
+ Author: Humeo
6
+ License: MIT
7
+ Keywords: mcp,video,shorts,ffmpeg,editing,humeo,hive
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: mcp[cli]>=1.2.0
16
+ Requires-Dist: pydantic>=2.0
17
+ Requires-Dist: scenedetect>=0.6
18
+ Provides-Extra: transcribe
19
+ Requires-Dist: faster-whisper>=1.0; extra == "transcribe"
20
+ Provides-Extra: download
21
+ Requires-Dist: yt-dlp>=2024.0; extra == "download"
22
+ Provides-Extra: face
23
+ Requires-Dist: mediapipe>=0.10; extra == "face"
24
+ Requires-Dist: opencv-python>=4.8; extra == "face"
25
+ Provides-Extra: vision
26
+ Requires-Dist: Pillow>=10.0; extra == "vision"
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=7; extra == "dev"
29
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
30
+ Requires-Dist: Pillow>=10.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # humeo-core
34
+
35
+ **Reusable-rocket MCP server for long-video → 9:16 shorts.**
36
+
37
+ First-principles design, from the HIVE paper + Bryan's rocket analogy:
38
+ we don't build doors and windows (general subject-tracker UI, retraining
39
+ models). We build the **container** (schemas), **landing gear** (deterministic
40
+ local extraction), and **five thrusters** (the five 9:16 layouts this video
41
+ format actually uses). Everything else is pluggable.
42
+
43
+ ## The rocket, in one picture
44
+
45
+ ```
46
+ ┌──────────────────────────────────────────┐
47
+ │ Control panel (MCP tools) │ <- any MCP client
48
+ └───────────────────┬──────────────────────┘
49
+ │ strict JSON
50
+ ┌────────────────┬───────────┼────────────────┬─────────────────┐
51
+ ▼ ▼ ▼ ▼ ▼
52
+ ingest classify_scenes select_clips plan_layout render_clip
53
+ (scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile,
54
+ keyframes + classifier) heuristic + pure filter dry-run safe)
55
+ transcript) LLM-ready) math)
56
+
57
+
58
+ ┌────────────────────┐
59
+ │ LayoutKind │
60
+ │ ──────────────── │
61
+ │ zoom_call_center │
62
+ │ sit_center │
63
+ │ split_chart_person│
64
+ │ split_two_persons │
65
+ │ split_two_charts │
66
+ └────────────────────┘
67
+ ```
68
+
69
+ Only the classifier and clip-selector have optional LLM hooks; everything
70
+ else is deterministic, local, and cheap.
71
+
72
+ ## Why five layouts? (the "max 2 items" rule)
73
+
74
+ The hard constraint for this format: **a short shows at most two on-screen
75
+ items** — where an "item" is a `person` (a human speaker) or a `chart`
76
+ (slide, graph, data visual, screenshare). That gives exactly five recipes:
77
+
78
+ 1. **`zoom_call_center`** — 1 person, tight zoom-call / webcam framing.
79
+ 2. **`sit_center`** — 1 person, interview / seated framing.
80
+ 3. **`split_chart_person`** — 1 chart + 1 person, stacked vertically
81
+ (default: **even 50/50** top/bottom, chart on top).
82
+ 4. **`split_two_persons`** — 2 speakers, stacked vertically.
83
+ 5. **`split_two_charts`** — 2 charts, stacked vertically.
84
+
85
+ Because the geometry is bounded, we do NOT need a general subject-tracker
86
+ ML model or a drag-to-highlight UI. We need five small, correct pieces of
87
+ crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py`
88
+ is.
89
+
90
+ See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms
91
+ used across these docs (subject, crop, band, seam, bbox, layout, etc.).
92
+
93
+ ## Install
94
+
95
+ ```bash
96
+ uv venv
97
+ uv sync
98
+ ```
99
+
100
+ External requirements: `ffmpeg` and `ffprobe` on PATH.
101
+
102
+ `scenedetect` requires OpenCV. Install `opencv-python-headless` or
103
+ `opencv-python` alongside `scenedetect`.
104
+
105
+ ## Use it as an MCP server
106
+
107
+ ```bash
108
+ humeo-core # stdio transport (primary console script)
109
+ # humeo-mcp # same entrypoint — kept so existing MCP configs keep working
110
+ ```
111
+
112
+ Example Cursor/Claude Desktop config:
113
+
114
+ ```json
115
+ {
116
+ "mcpServers": {
117
+ "humeo": { "command": "humeo-core" }
118
+ }
119
+ }
120
+ ```
121
+
122
+ Tools exposed:
123
+
124
+ | Tool | Purpose |
125
+ | --------------------------------- | --------------------------------------------------------------------------- |
126
+ | `list_layouts` | Enumerate the 5 supported layouts. |
127
+ | `ingest` | Scene detection + keyframe extraction (+ optional transcript). |
128
+ | `classify_scenes` | Pixel-heuristic per-scene layout classification. |
129
+ | `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). |
130
+ | `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. |
131
+ | `select_clips` | Heuristic clip picker over a word-level transcript. |
132
+ | `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. |
133
+ | `build_render_cmd` | Build the ffmpeg command (no execution) — review before spend. |
134
+ | `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. |
135
+
136
+ Resource: `humeo://layouts` (JSON listing of the 5 layouts).
137
+
138
+ ### Three interchangeable region detectors
139
+
140
+ All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used:
141
+
142
+ ```
143
+ classify.py (pixel variance, no ML)
144
+ face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg
145
+ vision.py (multimodal LLM + OCR bboxes)
146
+ ```
147
+
148
+ ## JSON contracts (non-negotiable)
149
+
150
+ All tools take and return Pydantic-validated JSON. The contracts live in
151
+ [`src/humeo_core/schemas.py`](src/humeo_core/schemas.py):
152
+
153
+ - `Scene` `{scene_id, start_time, end_time, keyframe_path?}`
154
+ - `TranscriptWord` `{word, start_time, end_time}`
155
+ - `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}`
156
+ - `SceneClassification` `{scene_id, layout, confidence, reason}`
157
+ - `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized)
158
+ - `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}`
159
+ - `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}`
160
+ - `ClipPlan` `{source_path, clips[]}`
161
+ - `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}`
162
+ - `RenderRequest` / `RenderResult`
163
+
164
+ ## First-principles decisions (what we intentionally did NOT build)
165
+
166
+ - **No giant subject-tracker ML.** The video format has 5 fixed layouts
167
+ (with a hard "max 2 items" rule); pixel-level tracking is not needed.
168
+ - **No drag-and-highlight UI.** An MCP tool is a better "UI" for an
169
+ agent-first workflow. If a human wants to override, they pass a
170
+ `LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` /
171
+ `zoom`.
172
+ - **No end-to-end video→video model.** The HIVE paper's core insight is
173
+ that decomposed orchestration beats monolithic generation. We reify
174
+ that insight as nine small, composable tools.
175
+
176
+ ## Extending the pilot
177
+
178
+ - Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`.
179
+ - Plug a real reasoning model into `select_clips_with_llm(text_fn)`.
180
+ - Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)`
181
+ to get per-scene bboxes + OCR text, then feed the results back through
182
+ `classify_scenes_with_vision`. This is the scene-change → v3 images →
183
+ LLM+OCR → bbox path; see `../docs/SOLUTIONS.md §4` for rationale.
184
+ - All enforce strict JSON outputs, so bad model output can't corrupt
185
+ downstream stages.
186
+
187
+ ## Testing
188
+
189
+ ```bash
190
+ python -m pytest
191
+ ```
192
+
193
+ See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale.
194
+
195
+ ## License
196
+
197
+ MIT
humeo-core/src/humeo_core.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,33 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/humeo_core/__init__.py
5
+ src/humeo_core/schemas.py
6
+ src/humeo_core/server.py
7
+ src/humeo_core.egg-info/PKG-INFO
8
+ src/humeo_core.egg-info/SOURCES.txt
9
+ src/humeo_core.egg-info/dependency_links.txt
10
+ src/humeo_core.egg-info/entry_points.txt
11
+ src/humeo_core.egg-info/requires.txt
12
+ src/humeo_core.egg-info/top_level.txt
13
+ src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf
14
+ src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt
15
+ src/humeo_core/assets/fonts/SourceSans3-OFL.txt
16
+ src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf
17
+ src/humeo_core/primitives/__init__.py
18
+ src/humeo_core/primitives/classify.py
19
+ src/humeo_core/primitives/compile.py
20
+ src/humeo_core/primitives/face_detect.py
21
+ src/humeo_core/primitives/ingest.py
22
+ src/humeo_core/primitives/layouts.py
23
+ src/humeo_core/primitives/select_clips.py
24
+ src/humeo_core/primitives/vision.py
25
+ tests/test_classify.py
26
+ tests/test_compile.py
27
+ tests/test_face_detect.py
28
+ tests/test_layout_bbox.py
29
+ tests/test_layouts.py
30
+ tests/test_schemas.py
31
+ tests/test_select_clips.py
32
+ tests/test_server_tools.py
33
+ tests/test_vision.py
humeo-core/src/humeo_core.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
humeo-core/src/humeo_core.egg-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ humeo-core = humeo_core.server:main
3
+ humeo-mcp = humeo_core.server:main
humeo-core/src/humeo_core.egg-info/requires.txt ADDED
@@ -0,0 +1,21 @@
1
+ mcp[cli]>=1.2.0
2
+ pydantic>=2.0
3
+ scenedetect>=0.6
4
+
5
+ [dev]
6
+ pytest>=7
7
+ pytest-asyncio>=0.23
8
+ Pillow>=10.0
9
+
10
+ [download]
11
+ yt-dlp>=2024.0
12
+
13
+ [face]
14
+ mediapipe>=0.10
15
+ opencv-python>=4.8
16
+
17
+ [transcribe]
18
+ faster-whisper>=1.0
19
+
20
+ [vision]
21
+ Pillow>=10.0
humeo-core/src/humeo_core.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ humeo_core
humeo-core/src/humeo_core/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ """humeo-core: reusable-rocket MCP primitives for long-video-to-shorts editing.
2
+
3
+ First-principles design (rocket analogy):
4
+ Container -> schemas.py (strict JSON contracts)
5
+ Landing gear -> primitives/ingest.py, primitives/compile.py (deterministic local)
6
+ Thrusters -> primitives/layouts.py (5 fixed 9:16 layouts, max 2 items)
7
+ Pilot -> primitives/classify.py, primitives/select_clips.py (heuristic, LLM-ready)
8
+ Control panel -> server.py (FastMCP tools that expose all primitives)
9
+ """
10
+
11
+ from .schemas import (
12
+ BoundingBox,
13
+ Clip,
14
+ ClipPlan,
15
+ ClipRenderSpan,
16
+ ClipSubtitleWords,
17
+ FocusStackOrder,
18
+ IngestResult,
19
+ LayoutInstruction,
20
+ LayoutKind,
21
+ RenderRequest,
22
+ RenderResult,
23
+ RenderTheme,
24
+ Scene,
25
+ SceneClassification,
26
+ SceneRegions,
27
+ TranscriptWord,
28
+ )
29
+
30
+ __all__ = [
31
+ "BoundingBox",
32
+ "Clip",
33
+ "ClipPlan",
34
+ "ClipRenderSpan",
35
+ "ClipSubtitleWords",
36
+ "FocusStackOrder",
37
+ "IngestResult",
38
+ "LayoutInstruction",
39
+ "LayoutKind",
40
+ "RenderRequest",
41
+ "RenderResult",
42
+ "RenderTheme",
43
+ "Scene",
44
+ "SceneClassification",
45
+ "SceneRegions",
46
+ "TranscriptWord",
47
+ ]
48
+
49
+ __version__ = "0.1.0"
humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf ADDED
Binary file (95.1 kB).
humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt ADDED
@@ -0,0 +1,93 @@
1
+ Copyright 2020 The League Spartan Project Authors (https://github.com/theleagueof/league-spartan)
2
+
3
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
4
+ This license is copied below, and is also available with a FAQ at:
5
+ https://scripts.sil.org/OFL
6
+
7
+
8
+ -----------------------------------------------------------
9
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
10
+ -----------------------------------------------------------
11
+
12
+ PREAMBLE
13
+ The goals of the Open Font License (OFL) are to stimulate worldwide
14
+ development of collaborative font projects, to support the font creation
15
+ efforts of academic and linguistic communities, and to provide a free and
16
+ open framework in which fonts may be shared and improved in partnership
17
+ with others.
18
+
19
+ The OFL allows the licensed fonts to be used, studied, modified and
20
+ redistributed freely as long as they are not sold by themselves. The
21
+ fonts, including any derivative works, can be bundled, embedded,
22
+ redistributed and/or sold with any software provided that any reserved
23
+ names are not used by derivative works. The fonts and derivatives,
24
+ however, cannot be released under any other type of license. The
25
+ requirement for fonts to remain under this license does not apply
26
+ to any document created using the fonts or their derivatives.
27
+
28
+ DEFINITIONS
29
+ "Font Software" refers to the set of files released by the Copyright
30
+ Holder(s) under this license and clearly marked as such. This may
31
+ include source files, build scripts and documentation.
32
+
33
+ "Reserved Font Name" refers to any names specified as such after the
34
+ copyright statement(s).
35
+
36
+ "Original Version" refers to the collection of Font Software components as
37
+ distributed by the Copyright Holder(s).
38
+
39
+ "Modified Version" refers to any derivative made by adding to, deleting,
40
+ or substituting -- in part or in whole -- any of the components of the
41
+ Original Version, by changing formats or by porting the Font Software to a
42
+ new environment.
43
+
44
+ "Author" refers to any designer, engineer, programmer, technical
45
+ writer or other person who contributed to the Font Software.
46
+
47
+ PERMISSION & CONDITIONS
48
+ Permission is hereby granted, free of charge, to any person obtaining
49
+ a copy of the Font Software, to use, study, copy, merge, embed, modify,
50
+ redistribute, and sell modified and unmodified copies of the Font
51
+ Software, subject to the following conditions:
52
+
53
+ 1) Neither the Font Software nor any of its individual components,
54
+ in Original or Modified Versions, may be sold by itself.
55
+
56
+ 2) Original or Modified Versions of the Font Software may be bundled,
57
+ redistributed and/or sold with any software, provided that each copy
58
+ contains the above copyright notice and this license. These can be
59
+ included either as stand-alone text files, human-readable headers or
60
+ in the appropriate machine-readable metadata fields within text or
61
+ binary files as long as those fields can be easily viewed by the user.
62
+
63
+ 3) No Modified Version of the Font Software may use the Reserved Font
64
+ Name(s) unless explicit written permission is granted by the corresponding
65
+ Copyright Holder. This restriction only applies to the primary font name as
66
+ presented to the users.
67
+
68
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
69
+ Software shall not be used to promote, endorse or advertise any
70
+ Modified Version, except to acknowledge the contribution(s) of the
71
+ Copyright Holder(s) and the Author(s) or with their explicit written
72
+ permission.
73
+
74
+ 5) The Font Software, modified or unmodified, in part or in whole,
75
+ must be distributed entirely under this license, and must not be
76
+ distributed under any other license. The requirement for fonts to
77
+ remain under this license does not apply to any document created
78
+ using the Font Software.
79
+
80
+ TERMINATION
81
+ This license becomes null and void if any of the above conditions are
82
+ not met.
83
+
84
+ DISCLAIMER
85
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
87
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
88
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
89
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
91
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
93
+ OTHER DEALINGS IN THE FONT SOFTWARE.
humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt ADDED
@@ -0,0 +1,93 @@
1
+ Copyright 2010-2020 Adobe (http://www.adobe.com/), with Reserved Font Name 'Source'. All Rights Reserved. Source is a trademark of Adobe in the United States and/or other countries.
2
+
3
+ This Font Software is licensed under the SIL Open Font License, Version 1.1.
4
+
5
+ This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL
6
+
7
+
8
+ -----------------------------------------------------------
9
+ SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
10
+ -----------------------------------------------------------
11
+
12
+ PREAMBLE
13
+ The goals of the Open Font License (OFL) are to stimulate worldwide
14
+ development of collaborative font projects, to support the font creation
15
+ efforts of academic and linguistic communities, and to provide a free and
16
+ open framework in which fonts may be shared and improved in partnership
17
+ with others.
18
+
19
+ The OFL allows the licensed fonts to be used, studied, modified and
20
+ redistributed freely as long as they are not sold by themselves. The
21
+ fonts, including any derivative works, can be bundled, embedded,
22
+ redistributed and/or sold with any software provided that any reserved
23
+ names are not used by derivative works. The fonts and derivatives,
24
+ however, cannot be released under any other type of license. The
25
+ requirement for fonts to remain under this license does not apply
26
+ to any document created using the fonts or their derivatives.
27
+
28
+ DEFINITIONS
29
+ "Font Software" refers to the set of files released by the Copyright
30
+ Holder(s) under this license and clearly marked as such. This may
31
+ include source files, build scripts and documentation.
32
+
33
+ "Reserved Font Name" refers to any names specified as such after the
34
+ copyright statement(s).
35
+
36
+ "Original Version" refers to the collection of Font Software components as
37
+ distributed by the Copyright Holder(s).
38
+
39
+ "Modified Version" refers to any derivative made by adding to, deleting,
40
+ or substituting -- in part or in whole -- any of the components of the
41
+ Original Version, by changing formats or by porting the Font Software to a
42
+ new environment.
43
+
44
+ "Author" refers to any designer, engineer, programmer, technical
45
+ writer or other person who contributed to the Font Software.
46
+
47
+ PERMISSION & CONDITIONS
48
+ Permission is hereby granted, free of charge, to any person obtaining
49
+ a copy of the Font Software, to use, study, copy, merge, embed, modify,
50
+ redistribute, and sell modified and unmodified copies of the Font
51
+ Software, subject to the following conditions:
52
+
53
+ 1) Neither the Font Software nor any of its individual components,
54
+ in Original or Modified Versions, may be sold by itself.
55
+
56
+ 2) Original or Modified Versions of the Font Software may be bundled,
57
+ redistributed and/or sold with any software, provided that each copy
58
+ contains the above copyright notice and this license. These can be
59
+ included either as stand-alone text files, human-readable headers or
60
+ in the appropriate machine-readable metadata fields within text or
61
+ binary files as long as those fields can be easily viewed by the user.
62
+
63
+ 3) No Modified Version of the Font Software may use the Reserved Font
64
+ Name(s) unless explicit written permission is granted by the corresponding
65
+ Copyright Holder. This restriction only applies to the primary font name as
66
+ presented to the users.
67
+
68
+ 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
69
+ Software shall not be used to promote, endorse or advertise any
70
+ Modified Version, except to acknowledge the contribution(s) of the
71
+ Copyright Holder(s) and the Author(s) or with their explicit written
72
+ permission.
73
+
74
+ 5) The Font Software, modified or unmodified, in part or in whole,
75
+ must be distributed entirely under this license, and must not be
76
+ distributed under any other license. The requirement for fonts to
77
+ remain under this license does not apply to any document created
78
+ using the Font Software.
79
+
80
+ TERMINATION
81
+ This license becomes null and void if any of the above conditions are
82
+ not met.
83
+
84
+ DISCLAIMER
85
+ THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
86
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
87
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
88
+ OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
89
+ COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
90
+ INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
91
+ DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
92
+ FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
93
+ OTHER DEALINGS IN THE FONT SOFTWARE.
humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39e3ab05ccd7cb94907c31005bb5bec1d5432f0b096a2b782976e217a540eb6c
3
+ size 395372
humeo-core/src/humeo_core/primitives/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Primitives: deterministic, composable building blocks of the rocket."""
humeo-core/src/humeo_core/primitives/classify.py ADDED
@@ -0,0 +1,232 @@
1
+ """Scene classifier: assigns one of the 5 layouts to each scene.
2
+
3
+ Two backends share the same contract:
4
+
5
+ * ``classify_scenes_heuristic`` — no model call. Uses keyframe pixel analysis
+ (column-brightness variance over a down-sampled grayscale keyframe; no face
+ detection, no ML) to guess which of the 5 layouts fits best. Fully offline, deterministic.
8
+ Note: the heuristic only picks between ``SIT_CENTER`` / ``ZOOM_CALL_CENTER`` /
9
+ ``SPLIT_CHART_PERSON``; the two-of-a-kind splits (``SPLIT_TWO_PERSONS`` /
10
+ ``SPLIT_TWO_CHARTS``) are only selectable by the vision-LLM backend.
11
+ * ``classify_scenes_with_llm`` — pluggable LLM hook. Takes a callable
12
+ ``(image_path, prompt) -> str`` so the caller (MCP client or test) can
13
+ wire up whatever multimodal model they want. Enforces strict JSON output.
14
+
15
+ Even without a model, the heuristic is good enough for many real inputs and
16
+ keeps the whole pipeline runnable with zero external dependencies.
17
+ """
18
+
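+ # Example usage (a sketch; assumes an ``IngestResult`` named ``result`` from
+ # ingest.py):
+ #
+ #     from humeo_core.primitives.classify import classify_scenes_heuristic
+ #     labels = classify_scenes_heuristic(result.scenes)
+ #     for c in labels:
+ #         print(c.scene_id, c.layout.value, round(c.confidence, 2), c.reason)
+ 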
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import struct
24
+ from typing import Callable, Iterable
25
+
26
+ from ..schemas import LayoutKind, Scene, SceneClassification
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Tiny PNG/JPEG reader → down-sampled grayscale column profile
31
+ # ---------------------------------------------------------------------------
32
+ # We intentionally avoid a hard dependency on Pillow. If Pillow is available
33
+ # we use it; otherwise the classifier degrades to a safe low-confidence
+ # default (``_png_dims`` below stays as a dependency-free dimension reader).
35
+
36
+
37
+ def _load_grayscale(path: str) -> tuple[list[list[int]], int, int] | None:
38
+ try:
39
+ from PIL import Image # type: ignore
40
+
41
+ img = Image.open(path).convert("L")
42
+ w, h = img.size
43
+ # Down-sample to at most 128 cols x 72 rows for cheap analysis.
44
+ tw = min(128, w)
45
+ th = min(72, h)
46
+ img = img.resize((tw, th))
47
+ px = list(img.getdata())
48
+ grid = [px[i * tw : (i + 1) * tw] for i in range(th)]
49
+ return grid, tw, th
50
+ except Exception:
51
+ return None
52
+
53
+
54
+ def _png_dims(path: str) -> tuple[int, int] | None:
55
+ try:
56
+ with open(path, "rb") as f:
57
+ head = f.read(24)
58
+ if head[:8] != b"\x89PNG\r\n\x1a\n":
59
+ return None
60
+ w, h = struct.unpack(">II", head[16:24])
61
+ return int(w), int(h)
62
+ except Exception:
63
+ return None
64
+
65
+
66
+ def _column_profile(grid: list[list[int]]) -> list[float]:
67
+ if not grid:
68
+ return []
69
+ h = len(grid)
70
+ w = len(grid[0])
71
+ out: list[float] = []
72
+ for x in range(w):
73
+ s = 0
74
+ for y in range(h):
75
+ s += grid[y][x]
76
+ out.append(s / h)
77
+ return out
78
+
79
+
80
+ def _variance(values: Iterable[float]) -> float:
81
+ vs = list(values)
82
+ if not vs:
83
+ return 0.0
84
+ m = sum(vs) / len(vs)
85
+ return sum((v - m) ** 2 for v in vs) / len(vs)
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Heuristic classifier
90
+ # ---------------------------------------------------------------------------
91
+
92
+
93
+ def _classify_one_heuristic(keyframe_path: str | None) -> SceneClassification:
94
+ if not keyframe_path or not os.path.exists(keyframe_path):
95
+ return SceneClassification(
96
+ scene_id="?",
97
+ layout=LayoutKind.SIT_CENTER,
98
+ confidence=0.3,
99
+ reason="no keyframe available — defaulting to SIT_CENTER",
100
+ )
101
+
102
+ gs = _load_grayscale(keyframe_path)
103
+ if gs is None:
104
+ # Can't read pixels: still return a safe default with low confidence.
105
+ return SceneClassification(
106
+ scene_id="?",
107
+ layout=LayoutKind.SIT_CENTER,
108
+ confidence=0.25,
109
+ reason="PIL unavailable or image unreadable — defaulting to SIT_CENTER",
110
+ )
111
+
112
+ grid, w, h = gs
113
+ cols = _column_profile(grid)
114
+
115
+ def _split_contrast(left: list[float], right: list[float]) -> float:
116
+ lm = sum(left) / max(1, len(left))
117
+ rm = sum(right) / max(1, len(right))
118
+ lv = _variance(left)
119
+ rv = _variance(right)
120
+ between = (lm - rm) ** 2
121
+ within = (lv + rv) / 2.0 + 1e-6
122
+ return between / within
123
+
124
+ # Left/right halves — good for symmetric two-up scenes.
125
+ mid = max(1, w // 2)
126
+ split_halves = _split_contrast(cols[:mid], cols[mid:])
127
+
128
+ # Left 2/3 vs right 1/3 — matches explainer slides (chart + talking head).
129
+ t = max(1, w // 3)
130
+ left_two_thirds = cols[: 2 * t]
131
+ right_one_third = cols[2 * t :]
132
+ split_thirds = _split_contrast(left_two_thirds, right_one_third)
133
+
134
+ split_score = max(split_halves, split_thirds)
135
+ # Overall column variance: low variance → flat composition (zoom call).
136
+ overall_var = _variance(cols)
137
+
138
+ # Threshold tuned on Ark-style 2/3 chart + 1/3 speaker; "thirds" score catches
139
+ # layouts where half-vs-half contrast was too weak (e.g. clip 005 vs 004).
140
+ if split_score > 20.0:
141
+ return SceneClassification(
142
+ scene_id="?",
143
+ layout=LayoutKind.SPLIT_CHART_PERSON,
144
+ confidence=min(0.95, 0.5 + split_score / 200.0),
145
+ reason=(
146
+ f"chart/person contrast (halves={split_halves:.1f}, "
147
+ f"thirds={split_thirds:.1f} → max={split_score:.1f})"
148
+ ),
149
+ )
150
+ if overall_var < 100.0:
151
+ return SceneClassification(
152
+ scene_id="?",
153
+ layout=LayoutKind.ZOOM_CALL_CENTER,
154
+ confidence=0.7,
155
+ reason=f"low column variance ({overall_var:.1f}) — flat centered framing",
156
+ )
157
+ return SceneClassification(
158
+ scene_id="?",
159
+ layout=LayoutKind.SIT_CENTER,
160
+ confidence=0.6,
161
+ reason=f"moderate composition (score={split_score:.1f}, var={overall_var:.1f})",
162
+ )
163
+
164
+
165
+ def classify_scenes_heuristic(scenes: list[Scene]) -> list[SceneClassification]:
166
+ out: list[SceneClassification] = []
167
+ for s in scenes:
168
+ r = _classify_one_heuristic(s.keyframe_path)
169
+ out.append(r.model_copy(update={"scene_id": s.scene_id}))
170
+ return out
171
+
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # LLM-backed classifier (caller provides the model hook)
175
+ # ---------------------------------------------------------------------------
176
+
177
+
178
+ LLMVisionFn = Callable[[str, str], str]
179
+ """Signature: (image_path, prompt) -> raw model string (expected JSON)."""
180
+
181
+
182
+ CLASSIFIER_PROMPT = """You are a scene layout classifier for a short-video editor.
183
+ Return ONLY a JSON object of the form:
184
+ {"layout": "<one of: zoom_call_center | sit_center | split_chart_person>",
185
+ "confidence": <0..1 float>,
186
+ "reason": "<=15 words"}
187
+
188
+ Layout definitions:
189
+ - zoom_call_center: one person on a video call (webcam grid / talking head tight crop), subject centered.
190
+ - sit_center: one person sitting in frame, subject centered, wider framing than a zoom call.
191
+ - split_chart_person: an explainer scene with a chart/graphic on the LEFT (~2/3 of frame) and a person on the RIGHT (~1/3).
192
+
193
+ Pick the single best match. No prose, no markdown, JSON only.
194
+ """
195
+
196
+
197
+ def classify_scenes_with_llm(
198
+ scenes: list[Scene], vision_fn: LLMVisionFn
199
+ ) -> list[SceneClassification]:
200
+ out: list[SceneClassification] = []
201
+ for s in scenes:
202
+ if not s.keyframe_path:
203
+ out.append(
204
+ SceneClassification(
205
+ scene_id=s.scene_id,
206
+ layout=LayoutKind.SIT_CENTER,
207
+ confidence=0.2,
208
+ reason="no keyframe",
209
+ )
210
+ )
211
+ continue
212
+ raw = vision_fn(s.keyframe_path, CLASSIFIER_PROMPT)
213
+ try:
214
+ data = json.loads(raw)
215
+ out.append(
216
+ SceneClassification(
217
+ scene_id=s.scene_id,
218
+ layout=LayoutKind(data["layout"]),
219
+ confidence=float(data.get("confidence", 0.5)),
220
+ reason=str(data.get("reason", ""))[:200],
221
+ )
222
+ )
223
+ except Exception as e:
224
+ out.append(
225
+ SceneClassification(
226
+ scene_id=s.scene_id,
227
+ layout=LayoutKind.SIT_CENTER,
228
+ confidence=0.25,
229
+ reason=f"LLM parse error: {e!r}",
230
+ )
231
+ )
232
+ return out
humeo-core/src/humeo_core/primitives/compile.py ADDED
@@ -0,0 +1,602 @@
+ """Compiler: assemble a final 9:16 clip from source + clip + layout instruction.
+
+ Builds the ffmpeg invocation, optionally runs it. Keeping ``dry_run`` as a
+ first-class mode means the MCP server can return the exact command without
+ executing — ideal for an agent that wants to review before spending CPU.
+
+ Rendering order is fixed and intentional:
+
+ 1. **Cut + crop/compose.** ``plan_layout`` produces the base filtergraph
+    that takes the source, applies the layout-specific crops, and emits a
+    labelled ``[vout]`` at the exact output resolution (e.g. 1080x1920).
+ 2. **Overlay title** (``drawtext``) — skipped for split layouts because
+    the source itself already has a slide/chart title and an extra overlay
+    just obscures content.
+ 3. **Subtitles.** ``subtitles`` filter runs **last** so text is drawn over
+    the finished composition, not the source. ``original_size`` is pinned
+    to the output resolution so libass coordinate math (MarginV, FontSize)
+    is in *output pixels*, not libass's default PlayResY=288 — which was
+    the bug behind the "subtitles blocked / floating in the middle" look.
+ 4. **Mux** with the source audio stream (``0:a:0``).
+ """
+
+ from __future__ import annotations
+
+ import os
+ import shutil
+ import subprocess
+ import tempfile
+ from pathlib import Path
+
+ from ..schemas import RenderRequest, RenderResult, RenderTheme, SPLIT_LAYOUTS
+ from .layouts import plan_layout
+
+
+ def _ensure_ffmpeg() -> str:
+     exe = shutil.which("ffmpeg")
+     if not exe:
+         raise RuntimeError("ffmpeg not found on PATH")
+     return exe
+
+
+ def _ensure_windows_fontconfig() -> dict[str, str]:
+     """Return subprocess env with a minimal fontconfig setup on Windows.
+
+     Some Windows FFmpeg builds ship libass + fontconfig but do not bundle a
+     default fontconfig config, which makes subtitle rendering fail with:
+
+     ``Fontconfig error: Cannot load default config file: No such file: (null)``
+
+     We generate a tiny config that points fontconfig at ``C:/Windows/Fonts`` and
+     a writable cache dir under ``%LOCALAPPDATA%/humeo``. Non-Windows platforms
+     pass through the existing environment unchanged.
+     """
+     env = os.environ.copy()
+     if os.name != "nt":
+         return env
+     if env.get("FONTCONFIG_FILE"):
+         return env
+
+     local_appdata = Path(
+         env.get("LOCALAPPDATA", str(Path(tempfile.gettempdir()) / "humeo-local"))
+     )
+     cfg_dir = local_appdata / "humeo" / "fontconfig"
+     cache_dir = local_appdata / "humeo" / "fontconfig-cache"
+     cfg_dir.mkdir(parents=True, exist_ok=True)
+     cache_dir.mkdir(parents=True, exist_ok=True)
+
+     cfg_file = cfg_dir / "fonts.conf"
+     windows_fonts = Path(env.get("WINDIR", r"C:\Windows")) / "Fonts"
+     if not cfg_file.exists():
+         cfg_file.write_text(
+             "\n".join(
+                 [
+                     '<?xml version="1.0"?>',
+                     "<fontconfig>",
+                     f"  <dir>{windows_fonts.as_posix()}</dir>",
+                     f"  <cachedir>{cache_dir.as_posix()}</cachedir>",
+                     "</fontconfig>",
+                     "",
+                 ]
+             ),
+             encoding="utf-8",
+         )
+
+     env["FONTCONFIG_PATH"] = str(cfg_dir)
+     env["FONTCONFIG_FILE"] = str(cfg_file)
+     return env
+
+
+ def _escape_drawtext(text: str) -> str:
+     # drawtext quoting is brittle across ffmpeg builds. Keep it simple:
+     # collapse whitespace, drop apostrophes, and escape the characters
+     # that are still significant to the filter parser.
+     safe = " ".join(text.split()).replace("'", "")
+     return safe.replace("\\", "\\\\").replace(":", "\\:")
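
For reference, a quick check of the escaping behaviour (the input string is hypothetical):

```python
# Sketch: apostrophes are dropped, whitespace collapsed, and colons escaped
# for the ffmpeg filter parser.
assert _escape_drawtext("It's  9:16") == "Its 9\\:16"
```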
+
+
+ # ---------------------------------------------------------------------------
+ # Title overlay planning
+ # ---------------------------------------------------------------------------
+ #
+ # ffmpeg ``drawtext`` does not wrap text by itself; whatever you hand it is
+ # emitted as a single line. With a fixed 72px font and no width budget, the
+ # "Prediction Markets vs Derivatives" title on a 1080px canvas would spill
+ # past both edges and show up clipped (the user reported exactly this bug).
+ #
+ # The helpers below plan a title layout BEFORE it hits drawtext:
+ #
+ # 1. Short titles (fit at 72px single line): emit the existing single
+ #    ``drawtext`` call unchanged so golden tests and previously-calibrated
+ #    visuals stay byte-for-byte identical.
+ # 2. Long titles: split at the best word boundary into two balanced lines and
+ #    emit two stacked ``drawtext`` filters at a slightly smaller font
+ #    (60px / 52px / 44px, auto-shrinking until both lines fit).
+ # 3. Single-word titles that still overflow: shrink the single line until it
+ #    fits, then hard-truncate with an ellipsis as a last resort.
+ #
+ # The character-width estimate is deliberately conservative (0.55 * fontsize)
+ # so mixed-case prose with wide letters like W/M still clears the margin.
+ # Calibrated visually against Arial Bold on 1080p output.
+
+ _TITLE_PRIMARY_SIZE = 72  # Current "hero" title size; preserved for short titles.
+ _TITLE_MIN_SIZE = 44  # Readability floor at 1080x1920 output.
+ _TITLE_MARGIN_PX = 60  # Horizontal safe-area on each side.
+ _TITLE_Y_TOP = 80  # Pixel offset of the top title baseline (matches pre-P2 look).
+ _TITLE_CHAR_WIDTH_RATIO = 0.55
+ _TITLE_LINE_SPACING_RATIO = 1.3
+
+ # Keep the overlay font explicit. Without a ``font=`` directive, drawtext
+ # falls back to fontconfig's "Sans", which resolves to a serif (Times New
+ # Roman) on default Windows installs — the "ugly serif title" bug reported
+ # against v1. Arial matches the ASS subtitle ``Fontname`` below so the
+ # title and captions read as a single typographic family. Keep this in
+ # sync with the ``Fontname=Arial`` in the subtitle filter if it ever
+ # changes.
+ _TITLE_FONT_NAME = "Arial"
+ _REFERENCE_TITLE_FONT_NAME = "League Spartan"
+ _REFERENCE_CAPTION_FONT_NAME = "Source Sans 3"
+ _REFERENCE_TITLE_BAR_X = 28
+ _REFERENCE_TITLE_BAR_Y = 32
+ _REFERENCE_TITLE_BAR_W = 1024
+ _REFERENCE_TITLE_BAR_H = 148
+ _REFERENCE_TITLE_TEXT_X = 72
+ _REFERENCE_TITLE_TEXT_Y = 54
+ _REFERENCE_TITLE_SIZE = 64
+ _REFERENCE_CAPTION_BAR_X = 0
+ _REFERENCE_CAPTION_BAR_W = 1080
+ _REFERENCE_CAPTION_BAR_H = 120
+ _REFERENCE_CAPTION_TEXT_MARGIN_L = 92
+ _REFERENCE_CAPTION_TEXT_MARGIN_R = 92
+
+
+ def _fonts_dir() -> Path:
+     return Path(__file__).resolve().parents[1] / "assets" / "fonts"
+
+
+ def _bundled_font_path(filename: str) -> Path | None:
+     path = _fonts_dir() / filename
+     return path if path.is_file() else None
+
+
+ def _title_char_px(size_px: int) -> float:
+     return size_px * _TITLE_CHAR_WIDTH_RATIO
+
+
+ def _title_fits(text: str, size_px: int, usable_w: int) -> bool:
+     return int(len(text) * _title_char_px(size_px)) <= usable_w
+
+
+ def _wrap_title_two_lines(text: str) -> tuple[str, str]:
+     """Split ``text`` at the word boundary that most balances the two halves.
+
+     Returns ``(line1, line2)``. If ``text`` has fewer than two words, returns
+     ``(text, "")`` and the caller should fall back to single-line shrinking.
+     """
+     words = text.split()
+     if len(words) < 2:
+         return text, ""
+     best_idx = 1
+     best_delta = 10**9
+     for i in range(1, len(words)):
+         left = " ".join(words[:i])
+         right = " ".join(words[i:])
+         delta = abs(len(left) - len(right))
+         if delta < best_delta:
+             best_delta = delta
+             best_idx = i
+     return " ".join(words[:best_idx]), " ".join(words[best_idx:])
+
+
+ def _drawtext_font_arg() -> str:
+     """Return a drawtext font selector that is stable on the current platform."""
+     if os.name == "nt":
+         arial = Path(os.environ.get("WINDIR", r"C:\Windows")) / "Fonts" / "arial.ttf"
+         if arial.is_file():
+             return f"fontfile='{_escape_filter_path(str(arial))}'"
+     return f"font={_TITLE_FONT_NAME}"
+
+
+ def _reference_title_font_arg() -> str:
+     bundled = _bundled_font_path("LeagueSpartan-Bold-static.ttf") or _bundled_font_path(
+         "LeagueSpartan-Bold.ttf"
+     )
+     if bundled is not None:
+         return f"fontfile='{_escape_filter_path(str(bundled))}'"
+     return f"font={_REFERENCE_TITLE_FONT_NAME}"
+
+
+ def _drawtext_single(text: str, size: int, y: int) -> str:
+     esc = _escape_drawtext(text)
+     return (
+         f"drawtext=text='{esc}':"
+         "expansion=none:"
+         f"{_drawtext_font_arg()}:"
+         f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:"
+         f"x=(w-text_w)/2:y={y}"
+     )
+
+
+ def _drawtext_two(line1: str, line2: str, size: int, y_top: int) -> str:
+     """Two drawtext filters chained by comma — one ffmpeg filter chain, two lines."""
+     esc1 = _escape_drawtext(line1)
+     esc2 = _escape_drawtext(line2)
+     y_bottom = y_top + int(round(size * _TITLE_LINE_SPACING_RATIO))
+     return (
+         f"drawtext=text='{esc1}':"
+         "expansion=none:"
+         f"{_drawtext_font_arg()}:"
+         f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:"
+         f"x=(w-text_w)/2:y={y_top},"
+         f"drawtext=text='{esc2}':"
+         "expansion=none:"
+         f"{_drawtext_font_arg()}:"
+         f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:"
+         f"x=(w-text_w)/2:y={y_bottom}"
+     )
+
+
+ def plan_title_drawtext(title_text: str, out_w: int = 1080) -> str | None:
+     """Return the ``drawtext`` filter fragment for ``title_text`` or None to skip.
+
+     The returned string is intended to be spliced into the main filtergraph
+     between the ``[v_prepad]`` and ``[vout]`` labels by
+     :func:`build_ffmpeg_cmd`. It does NOT include those labels itself.
+
+     Backward compatibility: when the title fits on one line at the original
+     72px size, the output is identical to the pre-P2 single-``drawtext``
+     form (same x/y/fontsize/borderw), so golden ffmpeg tests stay green.
+     """
+     text = " ".join((title_text or "").split())
+     if not text:
+         return None
+     usable_w = max(1, out_w - 2 * _TITLE_MARGIN_PX)
+
+     if _title_fits(text, _TITLE_PRIMARY_SIZE, usable_w):
+         return _drawtext_single(text, _TITLE_PRIMARY_SIZE, _TITLE_Y_TOP)
+
+     line1, line2 = _wrap_title_two_lines(text)
+     if line2:
+         for size in (60, 52, _TITLE_MIN_SIZE):
+             if _title_fits(line1, size, usable_w) and _title_fits(line2, size, usable_w):
+                 return _drawtext_two(line1, line2, size, _TITLE_Y_TOP)
+
+     for size in (64, 56, 52, _TITLE_MIN_SIZE):
+         if _title_fits(text, size, usable_w):
+             return _drawtext_single(text, size, _TITLE_Y_TOP)
+
+     max_chars = max(4, int(usable_w / _title_char_px(_TITLE_MIN_SIZE)))
+     truncated = text[: max_chars - 1].rstrip() + "..."
+     return _drawtext_single(truncated, _TITLE_MIN_SIZE, _TITLE_Y_TOP)
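
A quick sanity check of the planner (assertions assume the default constants above):

```python
# Sketch: a 33-character title cannot fit at 72px inside the 960px budget,
# so the planner balances it into "Prediction Markets" / "vs Derivatives"
# and emits two chained drawtext calls at the next size down (60px).
frag = plan_title_drawtext("Prediction Markets vs Derivatives")
assert frag is not None
assert frag.count("drawtext=") == 2 and "fontsize=60" in frag
```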
+
+
+ def _reference_title_fragment(title_text: str, out_w: int = 1080) -> str:
+     bar_w = min(_REFERENCE_TITLE_BAR_W, max(320, out_w - 2 * _REFERENCE_TITLE_BAR_X))
+     accent_w = 16
+     title = " ".join((title_text or "").split())
+     usable_w = max(220, bar_w - (_REFERENCE_TITLE_TEXT_X - _REFERENCE_TITLE_BAR_X) - 30)
+     text_filters: list[str] = []
+     if title:
+         if _title_fits(title, _REFERENCE_TITLE_SIZE, usable_w):
+             esc = _escape_drawtext(title)
+             text_filters.append(
+                 f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:"
+                 f"fontcolor=white:fontsize={_REFERENCE_TITLE_SIZE}:"
+                 "borderw=1.2:bordercolor=0x101010@0.18:"
+                 f"x={_REFERENCE_TITLE_TEXT_X}:"
+                 f"y={_REFERENCE_TITLE_TEXT_Y}"
+             )
+         else:
+             line1, line2 = _wrap_title_two_lines(title)
+             two_line_size = 54
+             while (
+                 line2
+                 and two_line_size > 42
+                 and not (
+                     _title_fits(line1, two_line_size, usable_w)
+                     and _title_fits(line2, two_line_size, usable_w)
+                 )
+             ):
+                 two_line_size -= 2
+             if line2 and _title_fits(line1, two_line_size, usable_w) and _title_fits(line2, two_line_size, usable_w):
+                 y_top = 36
+                 y_bottom = y_top + int(round(two_line_size * 1.08))
+                 for line, y in ((line1, y_top), (line2, y_bottom)):
+                     esc = _escape_drawtext(line)
+                     text_filters.append(
+                         f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:"
+                         f"fontcolor=white:fontsize={two_line_size}:"
+                         "borderw=1.2:bordercolor=0x101010@0.18:"
+                         f"x={_REFERENCE_TITLE_TEXT_X}:y={y}"
+                     )
+             else:
+                 size = _REFERENCE_TITLE_SIZE
+                 while title and not _title_fits(title, size, usable_w) and size > 38:
+                     size -= 2
+                 if title and not _title_fits(title, size, usable_w):
+                     max_chars = max(8, int(usable_w / _title_char_px(size)))
+                     title = title[: max_chars - 1].rstrip() + "..."
+                 esc = _escape_drawtext(title)
+                 text_filters.append(
+                     f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:"
+                     f"fontcolor=white:fontsize={size}:"
+                     "borderw=1.2:bordercolor=0x101010@0.18:"
+                     f"x={_REFERENCE_TITLE_TEXT_X}:"
+                     f"y={_REFERENCE_TITLE_TEXT_Y}"
+                 )
+     text_filter = f",{','.join(text_filters)}" if text_filters else ""
+     return (
+         f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:"
+         f"w={bar_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x1F1F1F@0.84:t=fill,"
+         f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:"
+         f"w={accent_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x2A2453@0.98:t=fill"
+         f"{text_filter}"
+     )
+
+
+ def _reference_caption_bar_fragment(
+     *,
+     out_w: int = 1080,
+     out_h: int = 1920,
+     margin_v: int = 166,
+     font_size: int = 38,
+ ) -> str:
+     bar_w = min(_REFERENCE_CAPTION_BAR_W, max(320, out_w - 2 * _REFERENCE_CAPTION_BAR_X))
+     bar_h = max(_REFERENCE_CAPTION_BAR_H, int(round(font_size * 2.05)))
+     bar_y = max(
+         _REFERENCE_TITLE_BAR_Y + _REFERENCE_TITLE_BAR_H + 36,
+         out_h - max(40, margin_v) - bar_h,
+     )
+     return (
+         f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:"
+         f"w={bar_w}:h={bar_h}:color=0x6570E6@1.0:t=fill,"
+         f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:"
+         f"w={bar_w}:h=3:color=0xE4E7FF@0.14:t=fill"
+     )
+
+
+ def _escape_filter_path(path: str) -> str:
+     return path.replace("\\", "/").replace(":", "\\:").replace("'", "\\'")
+
+
+ def _has_audio_stream(media_path: str) -> bool:
+     probe = shutil.which("ffprobe")
+     if not probe:
+         return False
+     out = subprocess.run(
+         [
+             probe,
+             "-v",
+             "error",
+             "-select_streams",
+             "a:0",
+             "-show_entries",
+             "stream=codec_type",
+             "-of",
+             "csv=p=0",
+             media_path,
+         ],
+         check=False,
+         capture_output=True,
+         text=True,
+     )
+     return out.returncode == 0 and "audio" in (out.stdout or "").lower()
+
+
+ def build_ffmpeg_cmd(
+     req: RenderRequest,
+     *,
+     src_w: int = 1920,
+     src_h: int = 1080,
+     include_audio: bool = True,
+ ) -> list[str]:
+     exe = _ensure_ffmpeg() if req.mode != "dry_run" else "ffmpeg"
+
+     plan = plan_layout(
+         req.layout, out_w=req.width, out_h=req.height, src_w=src_w, src_h=src_h
+     )
+     fg = plan.filtergraph
+
+     if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD:
+         chrome_parts = [
+             _reference_title_fragment(req.title_text, out_w=req.width),
+             _reference_caption_bar_fragment(
+                 out_w=req.width,
+                 out_h=req.height,
+                 margin_v=min(req.subtitle_margin_v, 136),
+                 font_size=max(req.subtitle_font_size, 124),
+             )
+             if req.subtitle_path
+             else "",
+         ]
+         fg = fg.replace(
+             "[vout]",
+             f"[v_prepad];[v_prepad]{','.join(part for part in chrome_parts if part)}[vout]",
+         )
+     elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT:
+         # The native-highlight theme mirrors the reference short in
+         # videoplayback (12): no separate top title card, just centered
+         # floating captions with per-word highlight timing.
+         pass
+     else:
+         # Skip the drawtext title overlay on split layouts: the top band already
+         # shows a slide/chart with its own baked-in title, so adding an overlay
+         # on top of that is pure noise (and was stacking over the chart title
+         # in the SPLIT_CHART_PERSON Cathy Wood shorts).
+         title_allowed = req.layout.layout not in SPLIT_LAYOUTS
+         if req.title_text and title_allowed:
+             # ``plan_title_drawtext`` returns a full filter fragment (possibly
+             # two chained ``drawtext`` calls) that fits within the output width.
+             # For short titles it is byte-identical to the pre-P2 single-line
+             # form, keeping existing golden tests green while fixing the
+             # "Prediction Markets vs Derivatives" edge-clip report.
+             title_fragment = plan_title_drawtext(req.title_text, out_w=req.width)
+             if title_fragment:
+                 fg = fg.replace(
+                     "[vout]",
+                     f"[v_prepad];[v_prepad]{title_fragment}[vout]",
+                 )
+
+     if req.subtitle_path:
+         subtitle_esc = _escape_filter_path(req.subtitle_path)
+         fonts_dir = _fonts_dir()
+         fontsdir_arg = (
+             f":fontsdir='{_escape_filter_path(str(fonts_dir))}'" if fonts_dir.is_dir() else ""
+         )
+         # ``original_size`` pins libass's PlayResY to the actual output so
+         # ``FontSize`` and ``MarginV`` are interpreted in output pixels. Without
+         # this, libass defaults to PlayResY=288 and then upscales to the real
+         # canvas (1920) -- blowing font sizes and pushing subtitles to the
+         # middle of the frame. ``WrapStyle=0`` enables smart word wrap so long
+         # lines break into readable stacks instead of running off-screen.
+         if req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD:
+             force_style = (
+                 f"Fontname={_REFERENCE_CAPTION_FONT_NAME},"
+                 f"FontSize={max(req.subtitle_font_size, 124)},Alignment=2,"
+                 f"MarginV={min(req.subtitle_margin_v, 136)},"
+                 "MarginL=56,MarginR=56,"
+                 "WrapStyle=0,BorderStyle=1,Outline=2,Shadow=0,"
+                 "BackColour=&H00000000&,PrimaryColour=&H00FFFFFF&,"
+                 "Bold=1,Italic=0,Spacing=-1"
+             )
+             subtitle_filter = (
+                 "[v_sub_in];"
+                 f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:"
+                 f"original_size={req.width}x{req.height}:"
+                 f"force_style='{force_style}'[vout]"
+             )
+         elif req.render_theme == RenderTheme.NATIVE_HIGHLIGHT:
+             subtitle_filter = (
+                 "[v_sub_in];"
+                 f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:"
+                 f"original_size={req.width}x{req.height}[vout]"
+             )
+         else:
+             force_style = (
+                 "Fontname=Arial,"
+                 f"FontSize={req.subtitle_font_size},Alignment=2,"
+                 f"MarginV={req.subtitle_margin_v},MarginL=60,MarginR=60,"
+                 "WrapStyle=0,BorderStyle=4,"
+                 "BackColour=&H70000000&,PrimaryColour=&H00FFFFFF&,"
+                 "Outline=0,Shadow=0,Bold=1"
+             )
+             subtitle_filter = (
+                 "[v_sub_in];"
+                 f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:"
+                 f"original_size={req.width}x{req.height}:"
+                 f"force_style='{force_style}'[vout]"
+             )
+         fg = fg.replace("[vout]", subtitle_filter)
+
+     start = req.clip.start_time_sec
+     dur = max(0.1, req.clip.duration_sec)
+
+     Path(req.output_path).parent.mkdir(parents=True, exist_ok=True)
+
+     cmd: list[str] = [
+         exe,
+         "-y",
+         "-ss",
+         f"{start:.3f}",
+         "-t",
+         f"{dur:.3f}",
+         "-i",
+         req.source_path,
+         "-filter_complex",
+         fg,
+         "-map",
+         "[vout]",
+         "-c:v",
+         "libx264",
+         "-preset",
+         "veryfast",
+         "-crf",
+         "20",
+     ]
+
+     if include_audio:
+         cmd.extend(["-map", "0:a:0", "-c:a", "aac", "-b:a", "160k"])
+
+     cmd.extend(["-movflags", "+faststart", req.output_path])
+     return cmd
+
+
+ def probe_source_size(source_path: str) -> tuple[int, int]:
+     exe = shutil.which("ffprobe")
+     if not exe:
+         return 1920, 1080
+     out = subprocess.run(
+         [
+             exe,
+             "-v",
+             "error",
+             "-select_streams",
+             "v:0",
+             "-show_entries",
+             "stream=width,height",
+             "-of",
+             "csv=p=0",
+             source_path,
+         ],
+         check=False,
+         capture_output=True,
+         text=True,
+     )
+     try:
+         w, h = out.stdout.strip().split(",")
+         return int(w), int(h)
+     except Exception:
+         return 1920, 1080
+
+
+ def render_clip(req: RenderRequest) -> RenderResult:
+     try:
+         src_w, src_h = probe_source_size(req.source_path) if req.mode != "dry_run" else (1920, 1080)
+     except Exception:
+         src_w, src_h = 1920, 1080
+
+     include_audio = True
+     if req.mode != "dry_run":
+         include_audio = _has_audio_stream(req.source_path)
+         if not include_audio:
+             return RenderResult(
+                 clip_id=req.clip.clip_id,
+                 output_path=req.output_path,
+                 ffmpeg_cmd=[],
+                 success=False,
+                 error="Source media has no detectable audio stream (a:0).",
+             )
+
+     cmd = build_ffmpeg_cmd(req, src_w=src_w, src_h=src_h, include_audio=include_audio)
+
+     if req.mode == "dry_run":
+         return RenderResult(
+             clip_id=req.clip.clip_id,
+             output_path=req.output_path,
+             ffmpeg_cmd=cmd,
+             success=True,
+         )
+     try:
+         subprocess.run(cmd, check=True, capture_output=True, env=_ensure_windows_fontconfig())
+         if include_audio and not _has_audio_stream(req.output_path):
+             return RenderResult(
+                 clip_id=req.clip.clip_id,
+                 output_path=req.output_path,
+                 ffmpeg_cmd=cmd,
+                 success=False,
+                 error="Rendered output is missing audio stream (a:0).",
+             )
+         return RenderResult(
+             clip_id=req.clip.clip_id,
+             output_path=req.output_path,
+             ffmpeg_cmd=cmd,
+             success=True,
+         )
+     except subprocess.CalledProcessError as e:
+         return RenderResult(
+             clip_id=req.clip.clip_id,
+             output_path=req.output_path,
+             ffmpeg_cmd=cmd,
+             success=False,
+             error=e.stderr.decode("utf-8", errors="replace")[-4000:] if e.stderr else str(e),
+         )
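
A minimal dry-run sketch; it loads the example payload shipped at `humeo-core/examples/render_request.json` and assumes that file validates against `RenderRequest` (see `schemas.py` for the authoritative field list):

```python
# Sketch: get the exact ffmpeg command without executing it.
import json

from humeo_core.primitives.compile import render_clip
from humeo_core.schemas import RenderRequest

with open("humeo-core/examples/render_request.json", encoding="utf-8") as f:
    payload = json.load(f)

req = RenderRequest.model_validate(payload).model_copy(update={"mode": "dry_run"})
result = render_clip(req)
print(" ".join(result.ffmpeg_cmd))  # review before spending CPU on a real render
```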
humeo-core/src/humeo_core/primitives/face_detect.py ADDED
@@ -0,0 +1,135 @@
+ """Local face-detection primitive — the MediaPipe path as another ``SceneRegions`` producer.
+
+ Three detection backends share the *same output schema* (``SceneRegions``):
+
+ * ``primitives/classify.py`` — pixel variance heuristic, no model.
+ * ``primitives/face_detect.py`` — MediaPipe face rectangle (this file).
+ * ``primitives/vision.py`` — multimodal LLM + OCR bboxes.
+
+ Because all three emit ``SceneRegions``, the layout planner in
+ ``primitives/vision.py`` (``classify_from_regions`` + ``layout_instruction_from_regions``)
+ works on all of them unchanged. That is the whole point of the primitive
+ boundary — the *detector* is swappable, the *renderer* is fixed.
+
+ MediaPipe is imported lazily so it remains an optional extra.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Callable
+
+ from ..schemas import BoundingBox, Scene, SceneRegions
+
+ logger = logging.getLogger(__name__)
+
+
+ # A bbox loader for any future cloud face API. Takes a keyframe path,
+ # returns a normalized face bbox or ``None``. Same shape as the MediaPipe
+ # wrapper below, which lets tests pass a stub and skip MediaPipe.
+ FaceBBoxFn = Callable[[str], BoundingBox | None]
+
+
+ def detect_face_regions(
+     scenes: list[Scene],
+     face_fn: FaceBBoxFn | None = None,
+     chart_split_threshold: float = 0.65,
+ ) -> list[SceneRegions]:
+     """Populate ``SceneRegions.person_bbox`` (+ ``chart_bbox``) from a face detector.
+
+     The face bbox is treated as the *person bbox*. If the face sits in the
+     right ``(1 - chart_split_threshold)`` of the frame, a *chart bbox* is
+     synthesised over the left region — mirroring the original
+     ``reframe.py`` split heuristic.
+
+     Args:
+         scenes: scenes with ``keyframe_path`` populated.
+         face_fn: pluggable face detector. Defaults to MediaPipe (lazy
+             import) if not supplied. Pass a stub in tests.
+         chart_split_threshold: face x-center above this normalized value
+             triggers a synthetic chart bbox on the left.
+     """
+
+     if face_fn is None:
+         face_fn = _mediapipe_face_bbox
+
+     out: list[SceneRegions] = []
+     for s in scenes:
+         if not s.keyframe_path:
+             out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available"))
+             continue
+         try:
+             face = face_fn(s.keyframe_path)
+         except Exception as e:  # one bad scene should not kill the batch
+             logger.warning("face detector failed on %s: %r", s.keyframe_path, e)
+             out.append(SceneRegions(scene_id=s.scene_id, raw_reason=f"face detector error: {e!r}"))
+             continue
+
+         if face is None:
+             out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no face detected"))
+             continue
+
+         chart = None
+         if face.center_x >= chart_split_threshold:
+             # Face pushed right → assume a chart occupies the left region.
+             chart = BoundingBox(
+                 x1=0.0,
+                 y1=0.0,
+                 x2=min(chart_split_threshold, face.x1),
+                 y2=1.0,
+                 label="chart_inferred",
+                 confidence=max(0.0, face.center_x - chart_split_threshold + 0.5),
+             )
+
+         out.append(
+             SceneRegions(
+                 scene_id=s.scene_id,
+                 person_bbox=face,
+                 chart_bbox=chart,
+                 raw_reason="face detected" + (" + synthetic chart bbox" if chart else ""),
+             )
+         )
+
+     return out
+
+
+ def _mediapipe_face_bbox(keyframe_path: str) -> BoundingBox | None:
+     """Return the largest-confidence face as a ``BoundingBox``, or ``None``.
+
+     Imports MediaPipe + OpenCV lazily so they remain optional dependencies
+     (install ``humeo-core[face]``).
+     """
+
+     try:
+         import cv2  # type: ignore
+         import mediapipe as mp  # type: ignore
+     except ImportError as e:
+         raise RuntimeError(
+             "MediaPipe face detection requires `pip install humeo-core[face]`"
+         ) from e
+
+     img = cv2.imread(keyframe_path)
+     if img is None:
+         return None
+     rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+     with mp.solutions.face_detection.FaceDetection(
+         model_selection=1, min_detection_confidence=0.5
+     ) as detector:
+         results = detector.process(rgb)
+     if not results.detections:
+         return None
+     best = max(results.detections, key=lambda d: d.score[0])
+     box = best.location_data.relative_bounding_box
+     x1 = max(0.0, min(1.0, float(box.xmin)))
+     y1 = max(0.0, min(1.0, float(box.ymin)))
+     x2 = max(x1 + 1e-6, min(1.0, x1 + float(box.width)))
+     y2 = max(y1 + 1e-6, min(1.0, y1 + float(box.height)))
+     return BoundingBox(
+         x1=x1,
+         y1=y1,
+         x2=x2,
+         y2=y2,
+         label="face",
+         confidence=float(best.score[0]),
+     )
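
A minimal sketch of the stub-detector path the tests use (bbox values are illustrative):

```python
# Sketch: no MediaPipe needed. The stub face sits at x-center 0.8, past the
# 0.65 chart_split_threshold, so a chart bbox is synthesised on the left.
from humeo_core.primitives.face_detect import detect_face_regions
from humeo_core.schemas import BoundingBox, Scene

def stub_face(keyframe_path: str) -> BoundingBox | None:
    return BoundingBox(x1=0.7, y1=0.2, x2=0.9, y2=0.6, label="face", confidence=0.95)

scenes = [Scene(scene_id="s0000", start_time=0.0, end_time=3.0, keyframe_path="kf.jpg")]
regions = detect_face_regions(scenes, face_fn=stub_face)
assert regions[0].person_bbox is not None
assert regions[0].chart_bbox is not None  # face pushed right, chart inferred left
```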
humeo-core/src/humeo_core/primitives/ingest.py ADDED
@@ -0,0 +1,187 @@
+ """Landing gear: deterministic, local extraction.
+
+ Everything here can run without a GPU, without an API key, and without the
+ internet (once inputs are present). This follows the HIVE guide's rule
+ "extraction stays local; LLMs only reason".
+
+ Functions:
+     probe_duration — ffprobe wrapper
+     detect_scenes — PySceneDetect (ContentDetector)
+     extract_keyframes — ffmpeg snapshot at each scene midpoint
+     transcribe_audio — faster-whisper (optional dependency)
+     ingest — one-shot convenience runner that returns IngestResult
+ """
+
+ from __future__ import annotations
+
+ import json
+ import os
+ import shutil
+ import subprocess
+ from pathlib import Path
+
+ from ..schemas import IngestResult, Scene, TranscriptWord
+
+
+ class IngestError(RuntimeError):
+     pass
+
+
+ def _require(binary: str) -> str:
+     path = shutil.which(binary)
+     if not path:
+         raise IngestError(
+             f"Required binary not on PATH: {binary!r}. Install it or add the path."
+         )
+     return path
+
+
+ def probe_duration(source_path: str) -> float:
+     ffprobe = _require("ffprobe")
+     out = subprocess.run(
+         [
+             ffprobe,
+             "-v",
+             "error",
+             "-show_entries",
+             "format=duration",
+             "-of",
+             "json",
+             source_path,
+         ],
+         check=True,
+         capture_output=True,
+         text=True,
+     )
+     data = json.loads(out.stdout)
+     return float(data["format"]["duration"])
+
+
+ def detect_scenes(
+     source_path: str, threshold: float = 27.0, min_scene_sec: float = 1.0
+ ) -> list[Scene]:
+     """Use PySceneDetect's ContentDetector to split the video into scenes."""
+
+     try:
+         from scenedetect import detect, ContentDetector  # type: ignore
+     except ModuleNotFoundError as e:
+         # scenedetect depends on OpenCV; surface the real missing module.
+         missing = getattr(e, "name", "") or str(e)
+         hint = "pip install 'scenedetect[opencv]'" if "cv2" in missing else "pip install scenedetect"
+         raise IngestError(
+             f"Scene detection unavailable (missing module: {missing}). Install with: {hint}"
+         ) from e
+
+     result = detect(
+         source_path,
+         ContentDetector(threshold=threshold, min_scene_len=int(min_scene_sec * 24)),
+     )
+     scenes: list[Scene] = []
+     for i, (start, end) in enumerate(result):
+         scenes.append(
+             Scene(
+                 scene_id=f"s{i:04d}",
+                 start_time=float(start.get_seconds()),
+                 end_time=float(end.get_seconds()),
+             )
+         )
+     # Guard: if PySceneDetect returns empty (e.g. a single long shot),
+     # fall back to one scene spanning the whole video.
+     if not scenes:
+         duration = probe_duration(source_path)
+         scenes.append(Scene(scene_id="s0000", start_time=0.0, end_time=duration))
+     return scenes
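
Note that `min_scene_len` is handed to PySceneDetect in frames, converted from seconds with a hard-coded ~24 fps assumption. A hypothetical fps-aware variant, shown only to make the unit conversion explicit (not part of this module):

```python
# Hypothetical sketch: thread the real frame rate through instead of
# assuming 24 fps. ContentDetector's min_scene_len is a frame count.
def detect_scene_list_fps_aware(source_path: str, fps: float, min_scene_sec: float = 1.0):
    from scenedetect import ContentDetector, detect

    return detect(source_path, ContentDetector(min_scene_len=int(min_scene_sec * fps)))
```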
+
+
+ def extract_keyframes(
+     source_path: str, scenes: list[Scene], out_dir: str
+ ) -> list[Scene]:
+     """Extract one JPG per scene at its midpoint. Mutates nothing; returns copies."""
+
+     ffmpeg = _require("ffmpeg")
+     Path(out_dir).mkdir(parents=True, exist_ok=True)
+     updated: list[Scene] = []
+     for s in scenes:
+         mid = s.start_time + (s.end_time - s.start_time) / 2.0
+         out_path = os.path.join(out_dir, f"{s.scene_id}.jpg")
+         subprocess.run(
+             [
+                 ffmpeg,
+                 "-y",
+                 "-loglevel",
+                 "error",
+                 "-ss",
+                 f"{mid:.3f}",
+                 "-i",
+                 source_path,
+                 "-frames:v",
+                 "1",
+                 "-q:v",
+                 "3",
+                 out_path,
+             ],
+             check=True,
+         )
+         updated.append(s.model_copy(update={"keyframe_path": out_path}))
+     return updated
+
+
+ def transcribe_audio(
+     source_path: str, model_name: str = "base", language: str | None = None
+ ) -> list[TranscriptWord]:
+     """Word-level transcript via faster-whisper. Optional dependency."""
+
+     try:
+         from faster_whisper import WhisperModel  # type: ignore
+     except ImportError as e:
+         raise IngestError(
+             "faster-whisper is not installed. pip install faster-whisper"
+         ) from e
+
+     model = WhisperModel(model_name, device="auto", compute_type="auto")
+     segments, _info = model.transcribe(source_path, word_timestamps=True, language=language)
+     words: list[TranscriptWord] = []
+     for seg in segments:
+         for w in getattr(seg, "words", []) or []:
+             if w.word is None:
+                 continue
+             words.append(
+                 TranscriptWord(
+                     word=str(w.word).strip(),
+                     start_time=float(w.start or 0.0),
+                     end_time=float(w.end or 0.0),
+                 )
+             )
+     return words
+
+
+ def ingest(
+     source_path: str,
+     work_dir: str,
+     *,
+     with_transcript: bool = False,
+     whisper_model: str = "base",
+ ) -> IngestResult:
+     """Run all extraction stages and return a single ``IngestResult``."""
+
+     if not os.path.exists(source_path):
+         raise IngestError(f"source_path does not exist: {source_path}")
+
+     Path(work_dir).mkdir(parents=True, exist_ok=True)
+     keyframes_dir = os.path.join(work_dir, "keyframes")
+
+     duration = probe_duration(source_path)
+     scenes = detect_scenes(source_path)
+     scenes = extract_keyframes(source_path, scenes, keyframes_dir)
+
+     words: list[TranscriptWord] = []
+     if with_transcript:
+         words = transcribe_audio(source_path, model_name=whisper_model)
+
+     return IngestResult(
+         source_path=os.path.abspath(source_path),
+         duration_sec=duration,
+         scenes=scenes,
+         transcript_words=words,
+         keyframes_dir=keyframes_dir,
+     )
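
End-to-end usage is one call (paths are illustrative; `ffmpeg`/`ffprobe` and PySceneDetect must be installed):

```python
# Sketch: run the whole local extraction pipeline on a source file.
from humeo_core.primitives.ingest import ingest

result = ingest("episode.mp4", work_dir=".humeo_work", with_transcript=False)
print(result.duration_sec, len(result.scenes), result.keyframes_dir)
```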
humeo-core/src/humeo_core/primitives/layouts.py ADDED
@@ -0,0 +1,707 @@
+ """The 9:16 layout thrusters — deterministic crop + compose math.
+
+ First principles: this video format has a hard constraint of **at most two
+ on-screen items** per short (see :class:`humeo_core.schemas.LayoutKind`). That
+ gives exactly five recipes:
+
+ * 1 person alone, tight → ``ZOOM_CALL_CENTER``
+ * 1 person alone, wider → ``SIT_CENTER``
+ * 1 chart + 1 person → ``SPLIT_CHART_PERSON``
+ * 2 persons → ``SPLIT_TWO_PERSONS``
+ * 2 charts → ``SPLIT_TWO_CHARTS``
+
+ Each planner returns a pure ``ffmpeg -filter_complex`` fragment ending in
+ ``[vout]``. The compiler (``compile.py``) glues the fragment to the cut +
+ audio + subtitle chain. Because every planner is a pure function that
+ returns a string, the whole layout system is unit-testable without ever
+ invoking ffmpeg.
+
+ Split layouts share one contract:
+
+ * Output: 9:16 frame split into a **top band** and **bottom band**.
+   Band heights are driven by :attr:`LayoutInstruction.top_band_ratio`.
+   Default is ``0.5`` (even 50/50), matching the user-requested symmetric look.
+ * Source strips for the two items are **complementary** — they partition
+   the source width at a single seam so the two items never overlap and
+   together cover the full frame width.
+ * Each strip is scaled to fill its output band using the "cover"
+   convention (``force_original_aspect_ratio=increase`` + center crop), so
+   the band is fully painted (no letterbox bars, no stretch).
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ from ..schemas import (
+     BoundingBox,
+     FocusStackOrder,
+     LayoutInstruction,
+     LayoutKind,
+     TimedCenterPoint,
+ )
+
+
+ # Source geometry assumption. Most podcast sources are 1920x1080; we still
+ # normalize everything by the actual source size so changing this is safe.
+ DEFAULT_SRC_W = 1920
+ DEFAULT_SRC_H = 1080
+ TRACKING_BLEND_SEC = 0.30
+
+
+ @dataclass(frozen=True)
+ class FilterPlan:
+     """Result of planning a layout.
+
+     ``filtergraph`` is the body of ``-filter_complex`` and ends with
+     ``[vout]`` as the final labelled stream.
+     """
+
+     filtergraph: str
+     out_label: str = "vout"
+
+
+ # ---------------------------------------------------------------------------
+ # Tiny pixel helpers
+ # ---------------------------------------------------------------------------
+
+
+ def _clamp01(v: float) -> float:
+     return max(0.0, min(1.0, v))
+
+
+ def _even(v: int) -> int:
+     """Floor ``v`` to an even integer (ffmpeg ``crop``/``scale`` need even dims)."""
+     return v - (v % 2)
+
+
+ def _bbox_to_crop_pixels(
+     box: BoundingBox, src_w: int, src_h: int
+ ) -> tuple[int, int, int, int]:
+     """Normalized bbox → ``(cw, ch, x, y)`` with even dimensions for ffmpeg."""
+     x1 = int(round(_clamp01(box.x1) * float(src_w)))
+     y1 = int(round(_clamp01(box.y1) * float(src_h)))
+     x2 = int(round(_clamp01(box.x2) * float(src_w)))
+     y2 = int(round(_clamp01(box.y2) * float(src_h)))
+     x1 = max(0, min(src_w - 2, x1))
+     y1 = max(0, min(src_h - 2, y1))
+     x2 = max(x1 + 2, min(src_w, x2))
+     y2 = max(y1 + 2, min(src_h, y2))
+     cw = _even(x2 - x1)
+     ch = _even(y2 - y1)
+     return max(2, cw), max(2, ch), _even(x1), _even(y1)
+
+
+ def _base_crop_size(
+     src_w: int,
+     src_h: int,
+     target_aspect: float,
+ ) -> tuple[int, int]:
+     if src_w / src_h >= target_aspect:
+         base_ch = src_h
+         base_cw = int(round(base_ch * target_aspect))
+     else:
+         base_cw = src_w
+         base_ch = int(round(base_cw / target_aspect))
+     return _even(max(2, base_cw)), _even(max(2, base_ch))
+
+
+ def _crop_box(
+     src_w: int,
+     src_h: int,
+     target_aspect: float,
+     zoom: float,
+     center_x_norm: float,
+     center_y_norm: float = 0.5,
+ ) -> tuple[int, int, int, int]:
+     """Return ``(cw, ch, x, y)`` crop values for a centered aspect-ratio crop.
+
+     ``zoom > 1`` means tighter crop (smaller window around the center). The
+     function always keeps the crop window fully inside the source frame.
+     """
+
+     zoom = max(1.0, zoom)
+     base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect)
+
+     cw = _even(max(2, int(round(base_cw / zoom))))
+     ch = _even(max(2, int(round(base_ch / zoom))))
+
+     cx = int(round(_clamp01(center_x_norm) * src_w))
+     cy = int(round(_clamp01(center_y_norm) * src_h))
+     x = _even(max(0, min(src_w - cw, cx - cw // 2)))
+     y = _even(max(0, min(src_h - ch, cy - ch // 2)))
+     return cw, ch, x, y
+
+
+ def _center_crop_to_9x16(
+     src_w: int, src_h: int, zoom: float, person_x_norm: float
+ ) -> tuple[int, int, int, int]:
+     return _crop_box(src_w, src_h, 9 / 16, zoom, person_x_norm, 0.5)
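
A worked example of the crop math on a 1920x1080 source (expected tuples assume Python's round-half-to-even plus the `_even` flooring above; `_center_crop_to_9x16` is private and exercised here purely for illustration):

```python
# Sketch: (cw, ch, x, y) for a centered subject.
# zoom=1.0: the widest 9:16 window a 16:9 source can cover (608x1080 at x=656).
assert _center_crop_to_9x16(1920, 1080, 1.0, 0.5) == (608, 1080, 656, 0)
# zoom=1.25 (the ZOOM_CALL_CENTER floor): tighter window, re-centered.
assert _center_crop_to_9x16(1920, 1080, 1.25, 0.5) == (486, 864, 716, 108)
```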
+
+
+ def _crop_x_from_center(src_w: int, cw: int, center_x_norm: float) -> int:
+     """Return an even, in-bounds crop x for a normalized horizontal center."""
+     cx = int(round(_clamp01(center_x_norm) * src_w))
+     return _even(max(0, min(src_w - cw, cx - cw // 2)))
+
+
+ def _tracked_value_expr(
+     values: list[tuple[float, float]],
+     *,
+     clamp_min: float | None = None,
+     clamp_max: float | None = None,
+     round_even: bool = False,
+ ) -> str:
+     if not values:
+         raise ValueError("values must not be empty")
+
+     expr = f"{float(values[-1][0]):.3f}"
+     for idx in range(len(values) - 2, -1, -1):
+         v0, t0 = float(values[idx][0]), float(values[idx][1])
+         v1, t1 = float(values[idx + 1][0]), float(values[idx + 1][1])
+         if t1 <= t0:
+             expr = f"if(lt(t\\,{t1:.3f})\\,{v0:.3f}\\,{expr})"
+             continue
+
+         switch_t = (t0 + t1) / 2.0
+         blend_half = TRACKING_BLEND_SEC / 2.0
+         blend_start = max(t0, switch_t - blend_half)
+         blend_end = min(t1, switch_t + blend_half)
+
+         if blend_end <= blend_start:
+             expr = f"if(lt(t\\,{switch_t:.3f})\\,{v0:.3f}\\,{expr})"
+             continue
+
+         blend_expr = (
+             f"{v0:.3f}+({v1 - v0:.3f})*(t-{blend_start:.3f})/({blend_end - blend_start:.3f})"
+         )
+         expr = (
+             f"if(lt(t\\,{blend_start:.3f})\\,{v0:.3f}\\,"
+             f"if(lt(t\\,{blend_end:.3f})\\,{blend_expr}\\,{expr}))"
+         )
+
+     if clamp_min is not None:
+         expr = f"max({clamp_min:.3f}\\,{expr})"
+     if clamp_max is not None:
+         expr = f"min({clamp_max:.3f}\\,{expr})"
+     if round_even:
+         expr = f"floor(({expr})/2)*2"
+     return expr
+
+
+ def _tracked_crop_x_expr(
+     *,
+     src_w: int,
+     crop_w: int,
+     tracking: list[TimedCenterPoint],
+ ) -> str:
+     """Return an ffmpeg expression for a time-varying crop x position.
+
+     We mostly hold each framing until the midpoint between adjacent samples,
+     then blend over a short window. That keeps edited talk footage from
+     drifting for seconds after a cut while still avoiding a one-frame jump
+     in the crop position.
+     """
+     if not tracking:
+         raise ValueError("tracking must not be empty")
+
+     center_points = [
+         (_clamp01(point.x_norm) * src_w, float(point.t_sec))
+         for point in tracking
+     ]
+     center_expr = _tracked_value_expr(
+         center_points,
+         clamp_min=0.0,
+         clamp_max=float(src_w),
+     )
+     max_x = max(0, src_w - crop_w)
+     return f"floor(max(0\\,min({max_x}\\,({center_expr})-{crop_w}/2))/2)*2"
+
+
+ def _tracked_crop_exprs(
+     *,
+     src_w: int,
+     src_h: int,
+     target_aspect: float,
+     default_zoom: float,
+     center_y_norm: float,
+     tracking: list[TimedCenterPoint],
+ ) -> tuple[str, str, str, str]:
+     if not tracking:
+         raise ValueError("tracking must not be empty")
+
+     base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect)
+     width_points: list[tuple[float, float]] = []
+     height_points: list[tuple[float, float]] = []
+     center_points: list[tuple[float, float]] = []
+     for point in tracking:
+         zoom = max(1.0, float(point.zoom if point.zoom is not None else default_zoom))
+         width_points.append((float(_even(max(2, int(round(base_cw / zoom))))), float(point.t_sec)))
+         height_points.append((float(_even(max(2, int(round(base_ch / zoom))))), float(point.t_sec)))
+         center_points.append((_clamp01(point.x_norm) * src_w, float(point.t_sec)))
+
+     w_expr = _tracked_value_expr(
+         width_points,
+         clamp_min=2.0,
+         clamp_max=float(base_cw),
+         round_even=True,
+     )
+     h_expr = _tracked_value_expr(
+         height_points,
+         clamp_min=2.0,
+         clamp_max=float(base_ch),
+         round_even=True,
+     )
+     center_expr = _tracked_value_expr(
+         center_points,
+         clamp_min=0.0,
+         clamp_max=float(src_w),
+     )
+     center_y_px = _clamp01(center_y_norm) * src_h
+     x_expr = f"floor(max(0\\,min({src_w}-out_w\\,({center_expr})-out_w/2))/2)*2"
+     y_expr = f"floor(max(0\\,min({src_h}-out_h\\,{center_y_px:.3f}-out_h/2))/2)*2"
+     return w_expr, h_expr, x_expr, y_expr
+
+
+ # ---------------------------------------------------------------------------
+ # Split helpers — shared by all three split layouts
+ # ---------------------------------------------------------------------------
+
+
+ # Minimum source-strip width for a split, as a fraction of source width.
+ # Prevents a chart/person bbox that hugs one edge from starving the other.
+ _MIN_SPLIT_STRIP_FRAC = 0.2
+ _CHART_STRIP_VERTICAL_PAD_FRAC = 0.12
+
+
+ @dataclass(frozen=True)
+ class _SplitStrip:
+     """A source-frame crop rectangle destined for one output band."""
+
+     cw: int
+     ch: int
+     x: int
+     y: int
+
+     def filter_crop(self, input_label: str, out_w: int, band_h: int, out_label: str) -> str:
+         """Return ``[input]crop=...,scale=...,crop=...,setsar=1[out_label]``.
+
+         Uses the "cover" convention: scale so the band is fully painted, then
+         center-crop any overflow. Bands always get filled — no letterbox bars.
+         """
+         return (
+             f"[{input_label}]crop={self.cw}:{self.ch}:{self.x}:{self.y},"
+             f"scale={out_w}:{band_h}:force_original_aspect_ratio=increase,"
+             f"crop={out_w}:{band_h},setsar=1[{out_label}]"
+         )
+
+
+ def _bbox_strip(
+     box: BoundingBox | None,
+     *,
+     src_w: int,
+     src_h: int,
+     x_start: int,
+     x_end: int,
+ ) -> _SplitStrip:
+     """Build a source crop for one band.
+
+     Horizontal range is fixed by ``[x_start, x_end)`` (from the seam math so
+     strips partition the source width). Vertical range comes from ``box``
+     when available — that's what makes the chart **fill** the output band
+     instead of being squashed inside full-height source context.
+     """
+     x = _even(max(0, min(src_w - 2, x_start)))
+     cw = _even(max(2, min(src_w - x, x_end - x)))
+
+     if box is not None:
+         y1 = int(round(_clamp01(box.y1) * float(src_h)))
+         y2 = int(round(_clamp01(box.y2) * float(src_h)))
+         y = _even(max(0, min(src_h - 2, y1)))
+         ch = _even(max(2, min(src_h - y, y2 - y)))
+     else:
+         y = 0
+         ch = _even(src_h)
+
+     return _SplitStrip(cw=cw, ch=ch, x=x, y=y)
+
+
+ def _chart_strip_with_vertical_pad(
+     strip: _SplitStrip,
+     *,
+     src_h: int,
+     pad_frac: float = _CHART_STRIP_VERTICAL_PAD_FRAC,
+ ) -> _SplitStrip:
+     """Relax chart crops vertically so cover-scaling trims fewer chart edges."""
+
+     pad = _even(max(0, int(round(strip.ch * max(0.0, pad_frac)))))
+     if pad <= 0:
+         return strip
+
+     top = max(0, strip.y - pad)
+     bottom = min(src_h, strip.y + strip.ch + pad)
+     ch = _even(max(2, bottom - top))
+     if ch <= strip.ch:
+         return strip
+     y = _even(max(0, min(src_h - ch, top)))
+     return _SplitStrip(cw=strip.cw, ch=ch, x=strip.x, y=y)
+
+
+ def _compute_seam(
+     *,
+     left_box: BoundingBox | None,
+     right_box: BoundingBox | None,
+     src_w: int,
+     src_h: int,
+     default_fraction: float = 0.5,
+ ) -> int:
+     """Return an even x-coordinate that partitions the source into two strips.
+
+     When both bboxes are known, the seam is the midpoint of the gap/overlap
+     between ``left_box.x2`` and ``right_box.x1``. Falls back to
+     ``default_fraction * src_w`` (0.5 = even) otherwise. The seam is clamped
+     so neither strip is thinner than :data:`_MIN_SPLIT_STRIP_FRAC` of source.
+     """
+     if left_box is not None and right_box is not None:
+         left_cw, _, left_x, _ = _bbox_to_crop_pixels(left_box, src_w, src_h)
+         _, _, right_x, _ = _bbox_to_crop_pixels(right_box, src_w, src_h)
+
+         left_right = left_x + left_cw
+         seam = int(round((left_right + right_x) / 2.0))
+     else:
+         seam = int(round(default_fraction * float(src_w)))
+
+     seam = _even(seam)
+     min_strip = _even(max(2, int(round(src_w * _MIN_SPLIT_STRIP_FRAC))))
+     if min_strip * 2 >= src_w:
+         min_strip = _even(max(2, src_w // 4))
+     return max(min_strip, min(src_w - min_strip, seam))
+
+
+ def _band_heights(out_h: int, top_ratio: float) -> tuple[int, int]:
+     """Return ``(top_h, bot_h)`` even band heights that sum to ``out_h``."""
+     top_h = _even(int(round(out_h * top_ratio)))
+     top_h = max(2, min(out_h - 2, top_h))
+     bot_h = out_h - top_h
+     return top_h, bot_h
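
A worked example of the seam and band defaults (private helpers, exercised here for illustration only):

```python
# Sketch: with no vision bboxes the seam falls back to an even 50/50
# partition of the source width, and top_band_ratio=0.5 yields two even
# 960px bands on a 1080x1920 canvas.
assert _compute_seam(left_box=None, right_box=None, src_w=1920, src_h=1080) == 960
assert _band_heights(1920, 0.5) == (960, 960)
```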
+
389
+
390
+ def _stack_filtergraph(
391
+ *,
392
+ top_strip: _SplitStrip,
393
+ bot_strip: _SplitStrip,
394
+ out_w: int,
395
+ top_h: int,
396
+ bot_h: int,
397
+ ) -> str:
398
+ """Compose the split filter graph: ``[0:v]split=2 → two crops → vstack → [vout]``."""
399
+ top_fg = top_strip.filter_crop("src1", out_w, top_h, "top")
400
+ bot_fg = bot_strip.filter_crop("src2", out_w, bot_h, "bot")
401
+ return (
402
+ f"[0:v]split=2[src1][src2];"
403
+ f"{top_fg};"
404
+ f"{bot_fg};"
405
+ f"[top][bot]vstack=inputs=2[vout]"
406
+ )
407
+
408
+
409
+ # ---------------------------------------------------------------------------
410
+ # Layout: single-subject (centered) — 1 person
411
+ # ---------------------------------------------------------------------------
412
+
413
+
414
+ def plan_zoom_call_center(
415
+ instruction: LayoutInstruction,
416
+ *,
417
+ out_w: int,
418
+ out_h: int,
419
+ src_w: int = DEFAULT_SRC_W,
420
+ src_h: int = DEFAULT_SRC_H,
421
+ ) -> FilterPlan:
422
+ """1 person, tight zoom-call framing. ``zoom`` clamped to ``>= 1.25``."""
423
+ zoom = max(instruction.zoom, 1.25)
424
+ cw, ch, x, y = _center_crop_to_9x16(src_w, src_h, zoom, instruction.person_x_norm)
425
+ if instruction.person_tracking:
426
+ if any(point.zoom is not None for point in instruction.person_tracking):
427
+ w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs(
428
+ src_w=src_w,
429
+ src_h=src_h,
430
+ target_aspect=9 / 16,
431
+ default_zoom=zoom,
432
+ center_y_norm=0.5,
433
+ tracking=instruction.person_tracking,
434
+ )
435
+ fg = (
436
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
437
+ f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr},"
438
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
439
+ )
440
+ else:
441
+ x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking)
442
+ fg = (
443
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
444
+ f"[vsrc]crop={cw}:{ch}:{x_expr}:{y},"
445
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
446
+ )
447
+ else:
448
+ fg = (
449
+ f"[0:v]crop={cw}:{ch}:{x}:{y},"
450
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
451
+ )
452
+ return FilterPlan(filtergraph=fg)
453
+
454
+
455
+ def plan_sit_center(
456
+ instruction: LayoutInstruction,
457
+ *,
458
+ out_w: int,
459
+ out_h: int,
460
+ src_w: int = DEFAULT_SRC_W,
461
+ src_h: int = DEFAULT_SRC_H,
462
+ ) -> FilterPlan:
463
+ """1 person, interview/seated framing. Vertical center biased to ``0.48``
464
+ so faces sit slightly above the 9:16 middle instead of centered on a
465
+ subject's chest.
466
+ """
467
+ zoom = max(instruction.zoom, 1.0)
468
+ cw, ch, x, y = _crop_box(
469
+ src_w, src_h, 9 / 16, zoom, instruction.person_x_norm, 0.48
470
+ )
471
+ if instruction.person_tracking:
472
+ if any(point.zoom is not None for point in instruction.person_tracking):
473
+ w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs(
474
+ src_w=src_w,
475
+ src_h=src_h,
476
+ target_aspect=9 / 16,
477
+ default_zoom=zoom,
478
+ center_y_norm=0.48,
479
+ tracking=instruction.person_tracking,
480
+ )
481
+ fg = (
482
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
483
+ f"[vsrc]crop={w_expr}:{h_expr}:{x_expr}:{y_expr},"
484
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
485
+ )
486
+ else:
487
+ x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=instruction.person_tracking)
488
+ fg = (
489
+ f"[0:v]setpts=PTS-STARTPTS[vsrc];"
490
+ f"[vsrc]crop={cw}:{ch}:{x_expr}:{y},"
491
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
492
+ )
493
+ else:
494
+ fg = (
495
+ f"[0:v]crop={cw}:{ch}:{x}:{y},"
496
+ f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
497
+ )
498
+ return FilterPlan(filtergraph=fg)
499
+
500
+
501
+ # ---------------------------------------------------------------------------
502
+ # Split layouts — 2 items stacked vertically
503
+ # ---------------------------------------------------------------------------
504
+
505
+
506
+ def plan_split_chart_person(
507
+ instruction: LayoutInstruction,
508
+ *,
509
+ out_w: int,
510
+ out_h: int,
511
+ src_w: int = DEFAULT_SRC_W,
512
+ src_h: int = DEFAULT_SRC_H,
513
+ ) -> FilterPlan:
514
+ """1 chart + 1 person.
515
+
516
+ **Horizontal partition.** Chart occupies the left source strip, person the
517
+ right strip. When both bboxes are set (Gemini vision), the seam sits at
518
+ the midpoint between ``chart.x2`` and ``person.x1`` so the strips are
519
+ complementary (no overlap, no gap). Otherwise the seam defaults to a
520
+ 2/3 | 1/3 split (chart left, person right), matching the Ark-style
521
+ explainer-slide geometry this codebase was originally written against.
522
+
523
+ **Vertical crop.** Each strip's vertical extent comes from the
524
+ corresponding bbox when provided — crucial so the chart **fills** its
525
+ output band instead of being lost inside full-height source context
526
+ (plant, background, lower-third graphics, etc.). Falls back to full
527
+ source height when bboxes are unavailable.
528
+
529
+ **Output bands.** Controlled by :attr:`LayoutInstruction.top_band_ratio`
530
+ (default 0.5 = even 50/50 — the user-requested symmetric look). Focus
531
+ stack order picks chart-on-top (default) vs person-on-top.
532
+ """
533
+
534
+ top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)
535
+
536
+ chart_box = instruction.split_chart_region
537
+ person_box = instruction.split_person_region
538
+
539
+ if chart_box is not None and person_box is not None:
540
+ seam = _compute_seam(
541
+ left_box=chart_box, right_box=person_box, src_w=src_w, src_h=src_h
542
+ )
543
+ chart_start = 0
544
+ else:
545
+ # Historical default: chart = left 2/3, person = right 1/3 (the
546
+ # Ark-style explainer-slide geometry this codebase was originally
547
+ # written against). ``chart_x_norm`` trims the chart strip from its
548
+ # left edge when we have no vision bbox to do it precisely.
549
+ seam = _even(max(2, min(src_w - 2, int(round((2.0 / 3.0) * float(src_w))))))
550
+ trim = int(round(_clamp01(instruction.chart_x_norm) * float(seam)))
551
+ chart_start = _even(max(0, min(seam - 2, trim)))
552
+
553
+ chart_strip = _bbox_strip(
554
+ chart_box, src_w=src_w, src_h=src_h, x_start=chart_start, x_end=seam
555
+ )
556
+ if chart_box is not None:
557
+ chart_strip = _chart_strip_with_vertical_pad(chart_strip, src_h=src_h)
558
+ person_strip = _bbox_strip(
559
+ person_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w
560
+ )
561
+ return _emit_split(
562
+ chart_strip=chart_strip,
563
+ person_strip=person_strip,
564
+ order=instruction.focus_stack_order,
565
+ out_w=out_w,
566
+ top_h=top_h,
567
+ bot_h=bot_h,
568
+ )
569
+
570
+
571
+ def _emit_split(
572
+ *,
573
+ chart_strip: _SplitStrip,
574
+ person_strip: _SplitStrip,
575
+ order: FocusStackOrder,
576
+ out_w: int,
577
+ top_h: int,
578
+ bot_h: int,
579
+ ) -> FilterPlan:
580
+ if order == FocusStackOrder.CHART_THEN_PERSON:
581
+ fg = _stack_filtergraph(
582
+ top_strip=chart_strip,
583
+ bot_strip=person_strip,
584
+ out_w=out_w,
585
+ top_h=top_h,
586
+ bot_h=bot_h,
587
+ )
588
+ else:
589
+ fg = _stack_filtergraph(
590
+ top_strip=person_strip,
591
+ bot_strip=chart_strip,
592
+ out_w=out_w,
593
+ top_h=top_h,
594
+ bot_h=bot_h,
595
+ )
596
+ return FilterPlan(filtergraph=fg)
597
+
598
+
599
+ def plan_split_two_persons(
600
+ instruction: LayoutInstruction,
601
+ *,
602
+ out_w: int,
603
+ out_h: int,
604
+ src_w: int = DEFAULT_SRC_W,
605
+ src_h: int = DEFAULT_SRC_H,
606
+ ) -> FilterPlan:
607
+ """2 persons (interview two-up) stacked vertically.
608
+
609
+ First person = ``split_person_region``, second person = ``split_second_person_region``.
610
+ Seam sits at the midpoint between the two bboxes when both are known;
611
+ otherwise defaults to a centered 50/50 split.
612
+ """
613
+ top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)
614
+
615
+ left_box = instruction.split_person_region
616
+ right_box = instruction.split_second_person_region
617
+
618
+ seam = _compute_seam(
619
+ left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h
620
+ )
621
+
622
+ left_strip = _bbox_strip(
623
+ left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam
624
+ )
625
+ right_strip = _bbox_strip(
626
+ right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w
627
+ )
628
+ fg = _stack_filtergraph(
629
+ top_strip=left_strip,
630
+ bot_strip=right_strip,
631
+ out_w=out_w,
632
+ top_h=top_h,
633
+ bot_h=bot_h,
634
+ )
635
+ return FilterPlan(filtergraph=fg)
636
+
637
+
638
+ def plan_split_two_charts(
639
+ instruction: LayoutInstruction,
640
+ *,
641
+ out_w: int,
642
+ out_h: int,
643
+ src_w: int = DEFAULT_SRC_W,
644
+ src_h: int = DEFAULT_SRC_H,
645
+ ) -> FilterPlan:
646
+ """2 charts stacked vertically.
647
+
648
+ First chart = ``split_chart_region``, second chart = ``split_second_chart_region``.
649
+ Uses the same seam/bbox-y-crop recipe as the other splits, so each chart
650
+ fills its output band instead of being surrounded by source context.
651
+ """
652
+ top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)
653
+
654
+ left_box = instruction.split_chart_region
655
+ right_box = instruction.split_second_chart_region
656
+
657
+ seam = _compute_seam(
658
+ left_box=left_box, right_box=right_box, src_w=src_w, src_h=src_h
659
+ )
660
+
661
+ left_strip = _bbox_strip(
662
+ left_box, src_w=src_w, src_h=src_h, x_start=0, x_end=seam
663
+ )
664
+ if left_box is not None:
665
+ left_strip = _chart_strip_with_vertical_pad(left_strip, src_h=src_h)
666
+ right_strip = _bbox_strip(
667
+ right_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w
668
+ )
669
+ if right_box is not None:
670
+ right_strip = _chart_strip_with_vertical_pad(right_strip, src_h=src_h)
671
+ fg = _stack_filtergraph(
672
+ top_strip=left_strip,
673
+ bot_strip=right_strip,
674
+ out_w=out_w,
675
+ top_h=top_h,
676
+ bot_h=bot_h,
677
+ )
678
+ return FilterPlan(filtergraph=fg)
679
+
680
+
681
+ _DISPATCH = {
682
+ LayoutKind.ZOOM_CALL_CENTER: plan_zoom_call_center,
683
+ LayoutKind.SIT_CENTER: plan_sit_center,
684
+ LayoutKind.SPLIT_CHART_PERSON: plan_split_chart_person,
685
+ LayoutKind.SPLIT_TWO_PERSONS: plan_split_two_persons,
686
+ LayoutKind.SPLIT_TWO_CHARTS: plan_split_two_charts,
687
+ }
688
+
689
+
690
+ def plan_layout(
691
+ instruction: LayoutInstruction,
692
+ *,
693
+ out_w: int = 1080,
694
+ out_h: int = 1920,
695
+ src_w: int = DEFAULT_SRC_W,
696
+ src_h: int = DEFAULT_SRC_H,
697
+ ) -> FilterPlan:
698
+ """Dispatch to one of the five thrusters.
699
+
700
+ Exhaustive over :class:`LayoutKind` — adding a new layout requires adding
701
+ a planner above **and** an entry in :data:`_DISPATCH`.
702
+ """
703
+
704
+ fn = _DISPATCH.get(instruction.layout)
705
+ if fn is None:
706
+ raise ValueError(f"Unknown layout: {instruction.layout!r}")
707
+ return fn(instruction, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h)
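A minimal usage sketch of the dispatcher (dimensions match the defaults; the instruction values are illustrative):

    from humeo_core.primitives.layouts import plan_layout
    from humeo_core.schemas import LayoutInstruction, LayoutKind

    instr = LayoutInstruction(
        clip_id="001",
        layout=LayoutKind.SIT_CENTER,
        zoom=1.2,
        person_x_norm=0.55,
    )
    plan = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080)
    print(plan.filtergraph)  # ready to splice into an ffmpeg -filter_complex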
humeo-core/src/humeo_core/primitives/select_clips.py ADDED
@@ -0,0 +1,150 @@
1
+ """Clip selection: pick the strongest 30-60s segments from a long source.
2
+
3
+ Two backends, same contract:
4
+
5
+ * ``select_clips_heuristic`` — greedy word-density scoring. Uses the
6
+ transcript alone; zero model calls. Good baseline when transcript exists.
7
+ * ``select_clips_with_llm`` — pluggable LLM hook. Caller provides a
8
+ ``(prompt_text) -> str`` function that must return strict JSON matching
9
+ the ``ClipPlan`` schema. We re-validate before returning.
10
+
11
+ Both return a ``ClipPlan``.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ from typing import Callable
18
+
19
+ from ..schemas import Clip, ClipPlan, TranscriptWord
20
+
21
+
22
+ LLMTextFn = Callable[[str], str]
23
+
24
+
25
+ CLIP_SELECTOR_PROMPT_TEMPLATE = """You are a viral-clip selector for a podcast editor.
26
+ Return ONLY JSON matching this shape:
27
+
28
+ {{
29
+ "source_path": "{source_path}",
30
+ "clips": [
31
+ {{
32
+ "clip_id": "001",
33
+ "topic": "<short topic>",
34
+ "start_time_sec": <float>,
35
+ "end_time_sec": <float>,
36
+ "viral_hook": "<one line>",
37
+ "virality_score": <0..1>,
38
+ "transcript": "<full clip transcript>",
39
+ "suggested_overlay_title": "<<=6 words>"
40
+ }}
41
+ ]
42
+ }}
43
+
44
+ Pick {target_count} clips, each {min_sec}-{max_sec} seconds long, NO overlaps, sorted by virality_score desc.
45
+
46
+ Transcript (word, start, end):
47
+ {transcript}
48
+ """
49
+
50
+
51
+ def _words_in_window(
52
+ words: list[TranscriptWord], start: float, end: float
53
+ ) -> list[TranscriptWord]:
54
+ return [w for w in words if w.start_time >= start and w.end_time <= end]
55
+
56
+
57
+ def select_clips_heuristic(
58
+ source_path: str,
59
+ words: list[TranscriptWord],
60
+ duration_sec: float,
61
+ *,
62
+ target_count: int = 5,
63
+ min_sec: float = 30.0,
64
+ max_sec: float = 60.0,
65
+ step_sec: float = 5.0,
66
+ ) -> ClipPlan:
67
+ """Greedy: slide a window, score by words/sec, take top non-overlapping picks."""
68
+
69
+ if duration_sec <= min_sec or not words:
70
+ # No sensible windowing possible; return one clip of the whole thing.
71
+ end = min(duration_sec, max_sec) if duration_sec > 0 else max_sec
72
+ return ClipPlan(
73
+ source_path=source_path,
74
+ clips=[
75
+ Clip(
76
+ clip_id="001",
77
+ topic="Full source",
78
+ start_time_sec=0.0,
79
+ end_time_sec=max(end, 1.0),
80
+ viral_hook="",
81
+ virality_score=0.5,
82
+ transcript=" ".join(w.word for w in words),
83
+ suggested_overlay_title="Highlight",
84
+ )
85
+ ],
86
+ )
87
+
88
+ candidates: list[tuple[float, float, float, str]] = []
89
+ window = (min_sec + max_sec) / 2.0
90
+ t = 0.0
91
+ while t + window <= duration_sec:
92
+ ws = _words_in_window(words, t, t + window)
93
+ if ws:
94
+ density = len(ws) / window
95
+ text = " ".join(w.word for w in ws)
96
+ candidates.append((density, t, t + window, text))
97
+ t += step_sec
98
+
99
+ candidates.sort(key=lambda c: c[0], reverse=True)
100
+ picked: list[tuple[float, float, float, str]] = []
101
+ for c in candidates:
102
+ if len(picked) >= target_count:
103
+ break
104
+ if all(c[2] <= p[1] or c[1] >= p[2] for p in picked):
105
+ picked.append(c)
106
+ picked.sort(key=lambda c: c[1])
107
+
108
+ clips: list[Clip] = []
109
+ for i, (density, s, e, text) in enumerate(picked, start=1):
110
+ norm = min(1.0, density / 3.0) # ~3 words/sec is dense talking
111
+ clips.append(
112
+ Clip(
113
+ clip_id=f"{i:03d}",
114
+ topic=text.split(".")[0][:60] or f"Clip {i}",
115
+ start_time_sec=round(s, 2),
116
+ end_time_sec=round(e, 2),
117
+ viral_hook=text[:120],
118
+ virality_score=round(norm, 3),
119
+ transcript=text,
120
+ suggested_overlay_title=(text.split(".")[0][:40] or f"Clip {i}"),
121
+ )
122
+ )
123
+ return ClipPlan(source_path=source_path, clips=clips)
124
+
125
+
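A sketch of the heuristic contract with synthetic words (real input comes from the ingest stage; the source path is illustrative):

    from humeo_core.primitives.select_clips import select_clips_heuristic
    from humeo_core.schemas import TranscriptWord

    # ~2.5 words/sec of steady speech for 120 seconds.
    words = [
        TranscriptWord(word=f"w{i}", start_time=i * 0.4, end_time=i * 0.4 + 0.3)
        for i in range(300)
    ]
    plan = select_clips_heuristic("/tmp/src.mp4", words, duration_sec=120.0, target_count=2)
    for clip in plan.clips:
        print(clip.clip_id, clip.start_time_sec, clip.end_time_sec, clip.virality_score)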
126
+ def select_clips_with_llm(
127
+ source_path: str,
128
+ words: list[TranscriptWord],
129
+ *,
130
+ target_count: int,
131
+ min_sec: float,
132
+ max_sec: float,
133
+ text_fn: LLMTextFn,
134
+ ) -> ClipPlan:
135
+ transcript_lines = "\n".join(
136
+ f"{w.word}\t{w.start_time:.2f}\t{w.end_time:.2f}" for w in words
137
+ )
138
+ prompt = CLIP_SELECTOR_PROMPT_TEMPLATE.format(
139
+ source_path=source_path,
140
+ target_count=target_count,
141
+ min_sec=min_sec,
142
+ max_sec=max_sec,
143
+ transcript=transcript_lines,
144
+ )
145
+ raw = text_fn(prompt)
146
+ try:
147
+ data = json.loads(raw)
148
+ except json.JSONDecodeError as e:
149
+ raise ValueError(f"LLM did not return JSON: {e}; raw={raw[:200]!r}") from e
150
+ return ClipPlan.model_validate(data)
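A sketch of the LLM hook with a canned ``text_fn`` (a real caller would wrap its provider SDK here; all values are illustrative):

    import json

    from humeo_core.primitives.select_clips import select_clips_with_llm
    from humeo_core.schemas import TranscriptWord

    def fake_llm(prompt: str) -> str:
        # A real text_fn would send `prompt` to a model. This stub returns
        # strict JSON matching the ClipPlan schema, as the contract requires.
        return json.dumps({
            "source_path": "/tmp/src.mp4",
            "clips": [{
                "clip_id": "001",
                "topic": "Opening hook",
                "start_time_sec": 12.0,
                "end_time_sec": 48.0,
                "viral_hook": "The claim nobody checks",
                "virality_score": 0.9,
                "transcript": "...",
                "suggested_overlay_title": "Nobody Checks This",
            }],
        })

    words = [TranscriptWord(word="hello", start_time=0.0, end_time=0.4)]
    plan = select_clips_with_llm(
        "/tmp/src.mp4", words, target_count=1, min_sec=30.0, max_sec=60.0, text_fn=fake_llm
    )
    print(plan.clips[0].topic)  # Opening hook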
humeo-core/src/humeo_core/primitives/vision.py ADDED
@@ -0,0 +1,210 @@
1
+ """Vision-LLM + OCR primitive — the alt path to per-scene framing decisions.
2
+
3
+ Design (Bryan's "big screen change -> v3 images -> LLM+OCR -> bbox" idea):
4
+
5
+ 1. Scene detection already produces one keyframe per scene (deterministic,
6
+ local, cheap). That is ``primitives/ingest.py::extract_keyframes``.
7
+ 2. For each keyframe, call a pluggable vision LLM with an OCR hint. The
8
+ model returns normalized bboxes for the on-screen roles it cares about
9
+ (``person``, ``chart``) plus any OCR text it reads.
10
+ 3. Fold those bboxes into ``LayoutInstruction`` values so the existing
11
+ layout planner (``primitives/layouts.py``) does the actual ffmpeg math.
12
+
13
+ Why this shape:
14
+
15
+ * **Pluggable**. Caller supplies ``LLMRegionFn``. We never hard-code a
16
+ provider. The same primitive works for Gemini, GPT-4o, internal models,
17
+ tests, or mocks.
18
+ * **Schema-validated**. Raw model output is parsed into ``SceneRegions``
19
+ (Pydantic). Malformed output degrades to ``None`` regions rather than
20
+ crashing or corrupting downstream state.
21
+ * **Separable**. ``detect_regions_with_llm`` is one function. Mapping
22
+ regions to ``LayoutInstruction`` is another. Mapping a ``LayoutKind``
23
+ guess from regions is a third. Each is independently testable.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import json
29
+ from typing import Callable
30
+
31
+ from ..schemas import (
32
+ BoundingBox,
33
+ LayoutInstruction,
34
+ LayoutKind,
35
+ Scene,
36
+ SceneClassification,
37
+ SceneRegions,
38
+ )
39
+
40
+
41
+ LLMRegionFn = Callable[[str, str], str]
42
+ """Signature: (keyframe_path, prompt) -> raw model string (expected JSON).
43
+
44
+ The caller is responsible for any image encoding (base64, multipart, etc.).
45
+ The primitive only passes the path + prompt and re-validates the reply.
46
+ """
47
+
48
+
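The adapter shape, as a sketch (``my_vision_fn`` and the elided provider call are hypothetical; anything satisfying the ``(path, prompt) -> str`` contract works):

    import base64

    def my_vision_fn(keyframe_path: str, prompt: str) -> str:
        with open(keyframe_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode("ascii")
        # ... send image_b64 + prompt to a vision model of your choice ...
        return '{"person_bbox": null, "chart_bbox": null, "ocr_text": "", "reason": "stub"}'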
49
+ REGION_PROMPT = """You are a vision+OCR system for a short-video editor.
50
+ Look at the provided keyframe and return a STRICT JSON object of this shape:
51
+
52
+ {
53
+ "person_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null,
54
+ "chart_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null,
55
+ "ocr_text": "<text visible on screen, empty string if none>",
56
+ "reason": "<= 20 words of rationale"
57
+ }
58
+
59
+ Rules:
60
+ - All bbox coordinates are normalized to the frame (0=left/top, 1=right/bottom).
61
+ - x2 > x1, y2 > y1.
62
+ - Return null for any region that is not present (e.g. a pure talking-head
63
+ scene has no chart).
64
+ - "person_bbox" is the *speaker's* body/head region if visible.
65
+ - "chart_bbox" is any chart, graph, slide, screenshare, or diagram.
66
+ - OCR text should be the readable text on screen (titles, labels, chart
67
+ axis values). Omit subtitle captions.
68
+ - NO markdown, NO prose outside JSON. JSON only.
69
+ """
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Core: detect regions per scene via pluggable LLM
74
+ # ---------------------------------------------------------------------------
75
+
76
+
77
+ def detect_regions_with_llm(
78
+ scenes: list[Scene], vision_fn: LLMRegionFn
79
+ ) -> list[SceneRegions]:
80
+ """Call ``vision_fn`` for each scene's keyframe and return parsed regions.
81
+
82
+ Parse failures degrade to an empty ``SceneRegions`` with ``raw_reason``
83
+ describing the error — never raise — so a single bad scene can't take
84
+ down the whole pipeline.
85
+ """
86
+
87
+ out: list[SceneRegions] = []
88
+ for s in scenes:
89
+ if not s.keyframe_path:
90
+ out.append(
91
+ SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available")
92
+ )
93
+ continue
94
+ raw = vision_fn(s.keyframe_path, REGION_PROMPT)
95
+ out.append(_parse_region_reply(s.scene_id, raw))
96
+ return out
97
+
98
+
99
+ def _parse_region_reply(scene_id: str, raw: str) -> SceneRegions:
100
+ try:
101
+ data = json.loads(raw)
102
+ except json.JSONDecodeError as e:
103
+ return SceneRegions(scene_id=scene_id, raw_reason=f"JSON parse error: {e!r}")
104
+
105
+ def _opt_bbox(value: object) -> BoundingBox | None:
106
+ if not value:
107
+ return None
108
+ try:
109
+ return BoundingBox.model_validate(value)
110
+ except Exception:
111
+ return None
112
+
113
+ return SceneRegions(
114
+ scene_id=scene_id,
115
+ person_bbox=_opt_bbox(data.get("person_bbox")),
116
+ chart_bbox=_opt_bbox(data.get("chart_bbox")),
117
+ ocr_text=str(data.get("ocr_text", ""))[:4000],
118
+ raw_reason=str(data.get("reason", ""))[:400],
119
+ )
120
+
121
+
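For instance, a model that answers in prose instead of JSON degrades cleanly (scene id and path are illustrative):

    from humeo_core.primitives.vision import detect_regions_with_llm
    from humeo_core.schemas import Scene

    scenes = [Scene(scene_id="s0", start_time=0.0, end_time=2.0, keyframe_path="/tmp/kf0.jpg")]

    def chatty_model(keyframe_path: str, prompt: str) -> str:
        return "Sure! Here are the boxes you asked for..."  # not JSON

    regions = detect_regions_with_llm(scenes, chatty_model)
    print(regions[0].person_bbox)  # None
    print(regions[0].raw_reason)   # JSON parse error: ...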
122
+ # ---------------------------------------------------------------------------
123
+ # Derivation: regions -> LayoutKind / LayoutInstruction
124
+ # ---------------------------------------------------------------------------
125
+
126
+
127
+ # Width threshold: if the chart bbox covers this much of the frame width, it
128
+ # is wide enough to treat the scene as a split_chart_person. Tuned for the
129
+ # source videos described in the spec (chart ~2/3 of width).
130
+ _CHART_WIDTH_SPLIT_THRESHOLD = 0.45
131
+
132
+
133
+ def classify_from_regions(regions: SceneRegions) -> SceneClassification:
134
+ """Pick a ``LayoutKind`` for a scene using only its ``SceneRegions``.
135
+
136
+ Priority:
137
+ 1. If ``chart_bbox`` is present and wide, it's ``SPLIT_CHART_PERSON``.
138
+ 2. Else if ``person_bbox`` is present and tight, ``ZOOM_CALL_CENTER``.
139
+ 3. Else default to ``SIT_CENTER`` with low confidence.
140
+
141
+ "Tight" ≈ the person covers more than half the frame width (zoom-call
142
+ webcam framing). "Wide" for a chart ≈ 45% of frame width or more.
143
+ """
144
+
145
+ if regions.chart_bbox and regions.chart_bbox.width >= _CHART_WIDTH_SPLIT_THRESHOLD:
146
+ return SceneClassification(
147
+ scene_id=regions.scene_id,
148
+ layout=LayoutKind.SPLIT_CHART_PERSON,
149
+ confidence=float(min(1.0, 0.5 + regions.chart_bbox.width / 2.0)),
150
+ reason=f"chart bbox covers {regions.chart_bbox.width:.2f} of width",
151
+ )
152
+ if regions.person_bbox and regions.person_bbox.width >= 0.5:
153
+ return SceneClassification(
154
+ scene_id=regions.scene_id,
155
+ layout=LayoutKind.ZOOM_CALL_CENTER,
156
+ confidence=float(min(1.0, 0.5 + regions.person_bbox.width / 2.0)),
157
+ reason=f"person bbox wide ({regions.person_bbox.width:.2f}) — tight framing",
158
+ )
159
+ if regions.person_bbox:
160
+ return SceneClassification(
161
+ scene_id=regions.scene_id,
162
+ layout=LayoutKind.SIT_CENTER,
163
+ confidence=0.7,
164
+ reason="person present, no wide chart, wider framing",
165
+ )
166
+ return SceneClassification(
167
+ scene_id=regions.scene_id,
168
+ layout=LayoutKind.SIT_CENTER,
169
+ confidence=0.3,
170
+ reason=regions.raw_reason or "no regions detected — defaulting to sit_center",
171
+ )
172
+
173
+
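A worked example of the priority order (bbox values are illustrative):

    from humeo_core.primitives.vision import classify_from_regions
    from humeo_core.schemas import BoundingBox, SceneRegions

    regions = SceneRegions(
        scene_id="s0",
        chart_bbox=BoundingBox(x1=0.02, y1=0.10, x2=0.62, y2=0.85),   # width 0.60 >= 0.45
        person_bbox=BoundingBox(x1=0.70, y1=0.20, x2=0.98, y2=0.95),
    )
    c = classify_from_regions(regions)
    print(c.layout)      # LayoutKind.SPLIT_CHART_PERSON
    print(c.confidence)  # min(1.0, 0.5 + 0.60 / 2) = 0.8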
174
+ def layout_instruction_from_regions(
175
+ regions: SceneRegions,
176
+ classification: SceneClassification,
177
+ *,
178
+ clip_id: str | None = None,
179
+ zoom: float = 1.0,
180
+ ) -> LayoutInstruction:
181
+ """Build a ``LayoutInstruction`` whose knobs are populated from bboxes.
182
+
183
+ ``person_x_norm`` uses the person bbox center when available; falls back
184
+ to 0.5 (center). ``chart_x_norm`` uses the chart bbox left edge; falls
185
+ back to 0.0.
186
+ """
187
+
188
+ person_x = regions.person_bbox.center_x if regions.person_bbox else 0.5
189
+ chart_x = regions.chart_bbox.x1 if regions.chart_bbox else 0.0
190
+ return LayoutInstruction(
191
+ clip_id=clip_id or classification.scene_id,
192
+ layout=classification.layout,
193
+ zoom=zoom,
194
+ person_x_norm=person_x,
195
+ chart_x_norm=chart_x,
196
+ )
197
+
198
+
199
+ def classify_scenes_with_vision_llm(
200
+ scenes: list[Scene], vision_fn: LLMRegionFn
201
+ ) -> list[tuple[SceneRegions, SceneClassification]]:
202
+ """One-shot helper: keyframes -> regions -> classifications.
203
+
204
+ Returns ``(regions, classification)`` pairs per scene so the caller can
205
+ keep both artefacts on disk (regions = deep detail, classification =
206
+ what a renderer consumes).
207
+ """
208
+
209
+ regions = detect_regions_with_llm(scenes, vision_fn)
210
+ return [(r, classify_from_regions(r)) for r in regions]
humeo-core/src/humeo_core/schemas.py ADDED
@@ -0,0 +1,518 @@
1
+ """Strict JSON contracts — the "container" of the rocket.
2
+
3
+ Every primitive reads and writes these. No primitive takes or returns free-form
4
+ strings. This is the non-negotiable interface described in the HIVE paper
5
+ guide (section 7): machine-checkable intermediate artifacts at every stage.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from enum import Enum
11
+ from typing import Literal
12
+
13
+ from pydantic import BaseModel, Field, field_validator, model_serializer, model_validator
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Extraction artifacts
18
+ # ---------------------------------------------------------------------------
19
+
20
+
21
+ class Scene(BaseModel):
22
+ """A single shot/scene detected in the source video."""
23
+
24
+ scene_id: str
25
+ start_time: float = Field(ge=0)
26
+ end_time: float = Field(gt=0)
27
+ keyframe_path: str | None = None
28
+
29
+ @field_validator("end_time")
30
+ @classmethod
31
+ def _end_after_start(cls, v: float, info) -> float:
32
+ start = info.data.get("start_time", 0.0)
33
+ if v <= start:
34
+ raise ValueError("end_time must be strictly greater than start_time")
35
+ return v
36
+
37
+ @property
38
+ def duration(self) -> float:
39
+ return self.end_time - self.start_time
40
+
41
+
42
+ class TranscriptWord(BaseModel):
43
+ """One ASR token with times in **seconds on the source video** timeline."""
44
+
45
+ word: str
46
+ start_time: float = Field(ge=0)
47
+ end_time: float = Field(ge=0)
48
+
49
+
50
+ class ClipSubtitleWords(BaseModel):
51
+ """Words for one clip with times in **seconds relative to clip start** (t=0 at cut in-point)."""
52
+
53
+ words: list[TranscriptWord] = Field(default_factory=list)
54
+
55
+
56
+ class FocusStackOrder(str, Enum):
57
+ """Vertical order for split layouts: which item occupies the top vs bottom band.
58
+
59
+ Bands are split by :attr:`LayoutInstruction.top_band_ratio` (default 0.5 = even).
60
+ For ``SPLIT_CHART_PERSON`` this picks chart-on-top vs person-on-top.
61
+ For ``SPLIT_TWO_PERSONS`` / ``SPLIT_TWO_CHARTS`` it has no visible meaning
62
+ (both bands hold the same kind of item); the enum value is retained only
63
+ so a single stacking recipe drives all three split layouts.
64
+ """
65
+
66
+ CHART_THEN_PERSON = "chart_then_person"
67
+ PERSON_THEN_CHART = "person_then_chart"
68
+
69
+
70
+ class RenderTheme(str, Enum):
71
+ """Visual treatment applied by the final renderer."""
72
+
73
+ LEGACY = "legacy"
74
+ REFERENCE_LOWER_THIRD = "reference_lower_third"
75
+ NATIVE_HIGHLIGHT = "native_highlight"
76
+
77
+
78
+ class IngestResult(BaseModel):
79
+ """Everything Stage 1 (deterministic local extraction) produces."""
80
+
81
+ source_path: str
82
+ duration_sec: float
83
+ scenes: list[Scene]
84
+ transcript_words: list[TranscriptWord]
85
+ keyframes_dir: str | None = None
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Layout system — the 5 "thrusters" (max 2 on-screen items per short)
90
+ # ---------------------------------------------------------------------------
91
+
92
+
93
+ class LayoutKind(str, Enum):
94
+ """The 9:16 layouts. A short contains **at most two** on-screen items.
95
+
96
+ An "item" is one of ``person`` (a human speaker) or ``chart`` (slide, graph,
97
+ data visual, screenshare). Five combinations are allowed:
98
+
99
+ - ``ZOOM_CALL_CENTER``: **1 person**, tight webcam/zoom-call framing, centered.
100
+ - ``SIT_CENTER``: **1 person**, interview/seated framing, centered.
101
+ - ``SPLIT_CHART_PERSON``: **1 chart + 1 person** — chart + speaker share the
102
+ source frame. Output stacks them vertically
103
+ (by default ``focus_stack_order`` = chart-on-top).
104
+ - ``SPLIT_TWO_PERSONS``: **2 persons** — two speakers (e.g. interview two-up).
105
+ Output stacks them vertically.
106
+ - ``SPLIT_TWO_CHARTS``: **2 charts** — two charts/slides side-by-side in source.
107
+ Output stacks them vertically.
108
+
109
+ The "max 2 items" constraint is the keep-it-simple rule: every rendered short
110
+ is either one item centered, or two items stacked top/bottom (evenly by default).
111
+ """
112
+
113
+ ZOOM_CALL_CENTER = "zoom_call_center"
114
+ SIT_CENTER = "sit_center"
115
+ SPLIT_CHART_PERSON = "split_chart_person"
116
+ SPLIT_TWO_PERSONS = "split_two_persons"
117
+ SPLIT_TWO_CHARTS = "split_two_charts"
118
+
119
+
120
+ # Layouts that stack two items vertically in the 9:16 output.
121
+ SPLIT_LAYOUTS: frozenset[LayoutKind] = frozenset(
122
+ {
123
+ LayoutKind.SPLIT_CHART_PERSON,
124
+ LayoutKind.SPLIT_TWO_PERSONS,
125
+ LayoutKind.SPLIT_TWO_CHARTS,
126
+ }
127
+ )
128
+
129
+
130
+ class TimedCenterPoint(BaseModel):
131
+ """Speaker x-center at a clip-relative time, used for tracked centering."""
132
+
133
+ t_sec: float = Field(ge=0.0)
134
+ x_norm: float = Field(ge=0.0, le=1.0)
135
+ zoom: float | None = Field(
136
+ default=None,
137
+ gt=0.0,
138
+ le=4.0,
139
+ description=(
140
+ "Optional per-sample crop zoom. When unset, the layout uses the "
141
+ "clip-level ``zoom`` value for that moment."
142
+ ),
143
+ )
144
+
145
+
146
+ class ClipRenderSpan(BaseModel):
147
+ """One kept source-timeline span inside a selected clip."""
148
+
149
+ start_time_sec: float = Field(ge=0.0)
150
+ end_time_sec: float = Field(gt=0.0)
151
+
152
+ @field_validator("end_time_sec")
153
+ @classmethod
154
+ def _end_after_start(cls, v: float, info) -> float:
155
+ start = info.data.get("start_time_sec", 0.0)
156
+ if v <= start:
157
+ raise ValueError("render span end_time_sec must be greater than start_time_sec")
158
+ return v
159
+
160
+ @property
161
+ def duration_sec(self) -> float:
162
+ return self.end_time_sec - self.start_time_sec
163
+
164
+
165
+ class LayoutInstruction(BaseModel):
166
+ """Per-clip decision telling the compiler which layout to apply and how to crop.
167
+
168
+ Every short is described by exactly one of these, keyed by ``clip_id``. Split
169
+ layouts additionally carry up to two normalized bounding boxes (chart/person
170
+ or two-of-a-kind) so the compiler crops source strips that **partition** the
171
+ source width without overlap or gap.
172
+ """
173
+
174
+ clip_id: str
175
+ layout: LayoutKind
176
+ # Optional per-layout knobs. Defaults are sane for a 1920x1080 source.
177
+ zoom: float = Field(default=1.0, gt=0, le=4.0)
178
+ person_x_norm: float = Field(
179
+ default=0.5,
180
+ ge=0.0,
181
+ le=1.0,
182
+ description="Normalized x-center of the human subject in source frame (0=left, 1=right).",
183
+ )
184
+ person_tracking: list[TimedCenterPoint] = Field(
185
+ default_factory=list,
186
+ description=(
187
+ "Optional clip-relative speaker framing samples for moving 9:16 crops. "
188
+ "Each point can shift the x-center and optionally widen/tighten the crop "
189
+ "for that moment. When empty, the compiler uses the static "
190
+ "person_x_norm/zoom settings."
191
+ ),
192
+ )
193
+ chart_x_norm: float = Field(
194
+ default=0.0,
195
+ ge=0.0,
196
+ le=1.0,
197
+ description=(
198
+ "split_chart_person only: left-edge trim of the chart strip, as a fraction of the "
199
+ "left 2/3 pane (0 = use full chart area)."
200
+ ),
201
+ )
202
+ focus_stack_order: FocusStackOrder = Field(
203
+ default=FocusStackOrder.CHART_THEN_PERSON,
204
+ description="For split_chart_person only: chart-on-top vs person-on-top in the 9:16 stack.",
205
+ )
206
+ split_chart_region: BoundingBox | None = Field(
207
+ default=None,
208
+ description=(
209
+ "Optional normalized rect for the chart/slide crop (Gemini vision). "
210
+ "When set with split_person_region, the split layout uses these boxes instead of fixed 2/3|1/3."
211
+ ),
212
+ )
213
+ split_person_region: BoundingBox | None = Field(
214
+ default=None,
215
+ description="Optional normalized rect for the speaker crop (Gemini vision).",
216
+ )
217
+ split_second_chart_region: BoundingBox | None = Field(
218
+ default=None,
219
+ description=(
220
+ "For ``SPLIT_TWO_CHARTS`` only: second chart bbox. The first chart occupies "
221
+ "the top output band, this one occupies the bottom band."
222
+ ),
223
+ )
224
+ split_second_person_region: BoundingBox | None = Field(
225
+ default=None,
226
+ description=(
227
+ "For ``SPLIT_TWO_PERSONS`` only: second speaker bbox. The first person "
228
+ "occupies the top output band, this one occupies the bottom band."
229
+ ),
230
+ )
231
+ top_band_ratio: float = Field(
232
+ default=0.5,
233
+ ge=0.2,
234
+ le=0.8,
235
+ description=(
236
+ "Fraction of 9:16 output height used by the top band for split layouts. "
237
+ "0.5 = EVEN 50/50 split (default — the user-requested symmetric look). "
238
+ "0.6 historically matched the 'chart dominant / person small' look."
239
+ ),
240
+ )
241
+
242
+
243
+ @field_validator("person_tracking")
244
+ @classmethod
245
+ def _tracking_times_non_decreasing(
246
+ cls, points: list[TimedCenterPoint]
247
+ ) -> list[TimedCenterPoint]:
248
+ last_t = -1.0
249
+ for point in points:
250
+ if point.t_sec < last_t:
251
+ raise ValueError("person_tracking times must be non-decreasing")
252
+ last_t = point.t_sec
253
+ return points
254
+
255
+
256
+ class SceneClassification(BaseModel):
257
+ """Result of the classifier: which layout should a given scene use."""
258
+
259
+ scene_id: str
260
+ layout: LayoutKind
261
+ confidence: float = Field(ge=0.0, le=1.0)
262
+ reason: str = ""
263
+
264
+
265
+ # ---------------------------------------------------------------------------
266
+ # Vision bounding boxes — the LLM+OCR path (alt to pixel heuristics)
267
+ # ---------------------------------------------------------------------------
268
+
269
+
270
+ class BoundingBox(BaseModel):
271
+ """Normalized [0..1] bounding box in the source frame coordinate space.
272
+
273
+ Normalized coords keep these outputs portable across source resolutions
274
+ and stop the model hallucinating pixel values. ``x2 > x1`` and
275
+ ``y2 > y1`` are enforced.
276
+ """
277
+
278
+ x1: float = Field(ge=0.0, le=1.0)
279
+ y1: float = Field(ge=0.0, le=1.0)
280
+ x2: float = Field(ge=0.0, le=1.0)
281
+ y2: float = Field(ge=0.0, le=1.0)
282
+ label: str = ""
283
+ confidence: float = Field(default=1.0, ge=0.0, le=1.0)
284
+
285
+ @field_validator("x2")
286
+ @classmethod
287
+ def _x2_after_x1(cls, v: float, info) -> float:
288
+ x1 = info.data.get("x1", 0.0)
289
+ if v <= x1:
290
+ raise ValueError("x2 must be > x1")
291
+ return v
292
+
293
+ @field_validator("y2")
294
+ @classmethod
295
+ def _y2_after_y1(cls, v: float, info) -> float:
296
+ y1 = info.data.get("y1", 0.0)
297
+ if v <= y1:
298
+ raise ValueError("y2 must be > y1")
299
+ return v
300
+
301
+ @property
302
+ def center_x(self) -> float:
303
+ return (self.x1 + self.x2) / 2.0
304
+
305
+ @property
306
+ def center_y(self) -> float:
307
+ return (self.y1 + self.y2) / 2.0
308
+
309
+ @property
310
+ def width(self) -> float:
311
+ return self.x2 - self.x1
312
+
313
+
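For example (the validators reject degenerate boxes):

    from pydantic import ValidationError
    from humeo_core.schemas import BoundingBox

    box = BoundingBox(x1=0.25, y1=0.20, x2=0.75, y2=0.90, label="chart")
    print(box.width, box.center_x)  # 0.5 0.5

    try:
        BoundingBox(x1=0.60, y1=0.20, x2=0.50, y2=0.90)  # x2 <= x1
    except ValidationError as err:
        print("rejected:", err.errors()[0]["msg"])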
314
+ class SceneRegions(BaseModel):
315
+ """Vision-LLM output for a single scene keyframe.
316
+
317
+ Flow: detect a scene change locally (cheap) -> extract one keyframe per
318
+ scene -> send that keyframe to a vision LLM with an OCR hint -> get
319
+ normalized bounding boxes for the on-screen roles (``person``,
320
+ ``chart``). Those boxes drive ``person_x_norm`` / ``chart_x_norm`` on a
321
+ ``LayoutInstruction`` without any pixel code running in Python.
322
+ """
323
+
324
+ scene_id: str
325
+ person_bbox: BoundingBox | None = None
326
+ chart_bbox: BoundingBox | None = None
327
+ ocr_text: str = ""
328
+ raw_reason: str = ""
329
+
330
+
331
+ # ---------------------------------------------------------------------------
332
+ # Clip planning
333
+ # ---------------------------------------------------------------------------
334
+
335
+
336
+ class Clip(BaseModel):
337
+ clip_id: str
338
+ topic: str
339
+ start_time_sec: float = Field(ge=0)
340
+ end_time_sec: float = Field(gt=0)
341
+ viral_hook: str = ""
342
+ virality_score: float = Field(default=0.0, ge=0.0, le=1.0)
343
+ transcript: str = ""
344
+ suggested_overlay_title: str = ""
345
+ layout: LayoutKind | None = None
346
+ score_breakdown: dict[str, float] | None = None
347
+ origin: Literal["text", "visual", "both"] = "text"
348
+ visual_notes: str | None = None
349
+ reasoning: str | None = None
350
+
351
+ # Optional LLM metadata (source timeline is start_time_sec / end_time_sec).
352
+ hook_start_sec: float | None = Field(
353
+ default=None,
354
+ description="Seconds from clip in-point where the viral hook begins (0 = clip start).",
355
+ )
356
+ hook_end_sec: float | None = Field(
357
+ default=None,
358
+ description="Seconds from clip in-point where the hook ends (exclusive upper bound).",
359
+ )
360
+ trim_start_sec: float = Field(
361
+ default=0.0,
362
+ ge=0,
363
+ description="Seconds to remove from the start of this segment when exporting.",
364
+ )
365
+ trim_end_sec: float = Field(
366
+ default=0.0,
367
+ ge=0,
368
+ description="Seconds to remove from the end of this segment when exporting.",
369
+ )
370
+ render_spans: list[ClipRenderSpan] = Field(
371
+ default_factory=list,
372
+ description=(
373
+ "Optional ordered source-timeline spans to keep when exporting. "
374
+ "When present, these spans override contiguous trim_start/trim_end export."
375
+ ),
376
+ )
377
+ shorts_title: str = ""
378
+ description: str = ""
379
+ hashtags: list[str] = Field(default_factory=list)
380
+ layout_hint: LayoutKind | None = None
381
+ needs_review: bool = False
382
+ review_reason: str = ""
383
+
384
+ @field_validator("score_breakdown")
385
+ @classmethod
386
+ def _score_breakdown_in_range(
387
+ cls, v: dict[str, float] | None
388
+ ) -> dict[str, float] | None:
389
+ if v is None:
390
+ return None
391
+ cleaned: dict[str, float] = {}
392
+ for axis, score in v.items():
393
+ if score < 0.0:
394
+ raise ValueError(f"score_breakdown[{axis!r}] must be non-negative")
395
+ cleaned[axis] = min(score, 1.0)
396
+ return cleaned
397
+
398
+ @model_validator(mode="after")
399
+ def _timing_consistency(self) -> "Clip":
400
+ if self.end_time_sec <= self.start_time_sec:
401
+ raise ValueError("end_time_sec must be greater than start_time_sec")
402
+ dur = self.end_time_sec - self.start_time_sec
403
+ hs, he = self.hook_start_sec, self.hook_end_sec
404
+ if (hs is None) ^ (he is None):
405
+ raise ValueError("hook_start_sec and hook_end_sec must both be set or both omitted")
406
+ if hs is not None and he is not None:
407
+ if not (0 <= hs < he <= dur):
408
+ raise ValueError(
409
+ "hook window must satisfy 0 <= hook_start_sec < hook_end_sec <= clip duration"
410
+ )
411
+ if self.trim_start_sec + self.trim_end_sec > dur:
412
+ raise ValueError("trim_start_sec + trim_end_sec must not exceed clip duration")
413
+ last_end = None
414
+ for span in self.render_spans:
415
+ if span.start_time_sec < self.start_time_sec - 1e-6:
416
+ raise ValueError("render_spans must stay within the clip start_time_sec")
417
+ if span.end_time_sec > self.end_time_sec + 1e-6:
418
+ raise ValueError("render_spans must stay within the clip end_time_sec")
419
+ if last_end is not None and span.start_time_sec < last_end - 1e-6:
420
+ raise ValueError("render_spans must be ordered and non-overlapping")
421
+ last_end = span.end_time_sec
422
+ return self
423
+
424
+ @model_serializer(mode="wrap")
425
+ def _serialize_without_default_extensions(self, handler):
426
+ data = handler(self)
427
+ if data.get("score_breakdown") is None:
428
+ data.pop("score_breakdown", None)
429
+ if data.get("origin") == "text":
430
+ data.pop("origin", None)
431
+ if data.get("visual_notes") is None:
432
+ data.pop("visual_notes", None)
433
+ if data.get("reasoning") is None:
434
+ data.pop("reasoning", None)
435
+ return data
436
+
437
+ @property
438
+ def duration_sec(self) -> float:
439
+ return self.end_time_sec - self.start_time_sec
440
+
441
+
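A sketch of the timing rules in action (values are illustrative):

    from pydantic import ValidationError
    from humeo_core.schemas import Clip

    clip = Clip(
        clip_id="001", topic="Hook",
        start_time_sec=10.0, end_time_sec=45.0,  # 35 s on the source timeline
        hook_start_sec=0.0, hook_end_sec=5.0,    # clip-relative, fits inside 35 s
    )
    print(clip.duration_sec)  # 35.0

    try:
        Clip(clip_id="002", topic="Bad", start_time_sec=10.0, end_time_sec=45.0,
             hook_start_sec=0.0, hook_end_sec=40.0)  # hook window exceeds duration
    except ValidationError as err:
        print("rejected:", err.errors()[0]["msg"])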
442
+ class ClipPlan(BaseModel):
443
+ """Output of the clip-selection stage — a list of clips + their layouts."""
444
+
445
+ source_path: str
446
+ clips: list[Clip]
447
+
448
+
449
+ class ApprovalResult(BaseModel):
450
+ action: Literal["proceed", "refine", "quit", "accept_all"]
451
+ selected_ids: list[str] | None = None
452
+ steering_note: str | None = None
453
+
454
+
455
+ class RatingFeedback(BaseModel):
456
+ rating: Literal[1, 2, 3]
457
+ issues: list[
458
+ Literal[
459
+ "wrong_moments",
460
+ "bad_cuts",
461
+ "boring",
462
+ "confusing",
463
+ "wrong_layout",
464
+ "length_off",
465
+ "other",
466
+ ]
467
+ ] = Field(default_factory=list)
468
+ free_text: str | None = None
469
+
470
+
471
+ class SessionState(BaseModel):
472
+ source_key: str = ""
473
+ iteration: int = 0
474
+ steering_notes: list[str] = Field(default_factory=list)
475
+ last_rating: RatingFeedback | None = None
476
+ last_selected_ids: list[str] | None = None
477
+
478
+
479
+ # ---------------------------------------------------------------------------
480
+ # Render
481
+ # ---------------------------------------------------------------------------
482
+
483
+
484
+ class RenderRequest(BaseModel):
485
+ source_path: str
486
+ clip: Clip
487
+ layout: LayoutInstruction
488
+ output_path: str
489
+ width: int = 1080
490
+ height: int = 1920
491
+ subtitle_path: str | None = None
492
+ subtitle_font_size: int = Field(
493
+ default=48,
494
+ ge=10,
495
+ le=120,
496
+ description=(
497
+ "Caption font size in **output pixels** (libass is pinned to "
498
+ "``original_size=width x height`` by the compiler, so this is a "
499
+ "true pixel value, not the old PlayResY=288 unit)."
500
+ ),
501
+ )
502
+ subtitle_margin_v: int = Field(
503
+ default=160,
504
+ ge=0,
505
+ le=800,
506
+ description="Vertical caption margin in output pixels (bottom-anchored).",
507
+ )
508
+ title_text: str = ""
509
+ render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT
510
+ mode: Literal["normal", "dry_run"] = "normal"
511
+
512
+
513
+ class RenderResult(BaseModel):
514
+ clip_id: str
515
+ output_path: str
516
+ ffmpeg_cmd: list[str]
517
+ success: bool
518
+ error: str = ""
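A minimal dry-run request, as a sketch (paths are illustrative):

    from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RenderRequest

    req = RenderRequest(
        source_path="/tmp/src.mp4",
        clip=Clip(clip_id="001", topic="Hook", start_time_sec=10.0, end_time_sec=40.0),
        layout=LayoutInstruction(clip_id="001", layout=LayoutKind.SIT_CENTER),
        output_path="/tmp/out.mp4",
        mode="dry_run",  # build the ffmpeg command without executing it
    )
    print(req.model_dump_json()[:80])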
humeo-core/src/humeo_core/server.py ADDED
@@ -0,0 +1,332 @@
1
+ """FastMCP server — the control panel for the reusable rocket.
2
+
3
+ Every primitive is exposed as a single MCP ``tool``. Each tool takes and
4
+ returns strict Pydantic-validated JSON, so an MCP client (Cursor, Claude
5
+ Desktop, etc.) can compose a full long-to-short pipeline without guessing
6
+ any interface.
7
+
8
+ Tools:
9
+
10
+ humeo.ingest — Stage 1 extraction (scenes + keyframes [+ transcript])
11
+ humeo.classify_scenes — Assign one of 5 layouts to each scene (pixel heuristic)
12
+ humeo.classify_scenes_with_vision — Assign layouts using bboxes from a vision LLM + OCR
13
+ humeo.detect_scene_regions — Raw LLM bbox output per scene keyframe (OCR-assisted)
14
+ humeo.select_clips — Pick top clips from a transcript (heuristic)
15
+ humeo.plan_layout — Return the ffmpeg filtergraph for a given layout
16
+ humeo.build_render_cmd — Build the full ffmpeg command (dry-run safe)
17
+ humeo.render_clip — Build + actually run ffmpeg to produce a 9:16 clip
18
+ humeo.list_layouts — List the 5 available layouts (discovery)
19
+
20
+ Resources:
21
+
22
+ humeo://layouts — JSON listing of the 5 layouts + description
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ from typing import Any
29
+
30
+ from mcp.server.fastmcp import FastMCP
31
+
32
+ from .primitives import classify as classify_mod
33
+ from .primitives import compile as compile_mod
34
+ from .primitives import ingest as ingest_mod
35
+ from .primitives import layouts as layouts_mod
36
+ from .primitives import select_clips as select_mod
37
+ from .primitives import vision as vision_mod
38
+ from .schemas import (
39
+ IngestResult,
40
+ LayoutInstruction,
41
+ LayoutKind,
42
+ RenderRequest,
43
+ RenderResult,
44
+ Scene,
45
+ SceneRegions,
46
+ TranscriptWord,
47
+ )
48
+
49
+
50
+ mcp = FastMCP(
51
+ "humeo-core",
52
+ instructions=(
53
+ "Humeo MCP: reusable primitives for turning long videos into 9:16 shorts. "
54
+ "Compose tools in this order: ingest -> classify_scenes -> select_clips -> "
55
+ "plan_layout/build_render_cmd -> render_clip. All IO is strict JSON."
56
+ ),
57
+ )
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Discovery
62
+ # ---------------------------------------------------------------------------
63
+
64
+
65
+ @mcp.tool()
66
+ def list_layouts() -> dict[str, Any]:
67
+ """Return the 5 fixed 9:16 layouts this server supports.
68
+
69
+ Every short shows **at most two** on-screen items (person/chart), which
70
+ gives exactly five recipes. Use this to discover the set of
71
+ :class:`LayoutKind` values before classifying scenes or requesting
72
+ renders.
73
+ """
74
+
75
+ return {
76
+ "layouts": [
77
+ {
78
+ "kind": LayoutKind.ZOOM_CALL_CENTER.value,
79
+ "items": ["person"],
80
+ "description": "1 person, tight zoom-call / webcam framing, centered.",
81
+ },
82
+ {
83
+ "kind": LayoutKind.SIT_CENTER.value,
84
+ "items": ["person"],
85
+ "description": "1 person, interview / seated framing, centered.",
86
+ },
87
+ {
88
+ "kind": LayoutKind.SPLIT_CHART_PERSON.value,
89
+ "items": ["chart", "person"],
90
+ "description": (
91
+ "1 chart + 1 person. Source is partitioned left/right by the chart and "
92
+ "person bboxes (falling back to a 2/3 | 1/3 split); each strip is scaled "
93
+ "to fill its output band. Bands default to an even 50/50 vertical split; "
94
+ "configurable via ``top_band_ratio`` and swappable via ``focus_stack_order``."
95
+ ),
96
+ },
97
+ {
98
+ "kind": LayoutKind.SPLIT_TWO_PERSONS.value,
99
+ "items": ["person", "person"],
100
+ "description": (
101
+ "2 people (interview two-up / panel). Left speaker in the top band, right "
102
+ "speaker in the bottom band; seam sits between the two person bboxes."
103
+ ),
104
+ },
105
+ {
106
+ "kind": LayoutKind.SPLIT_TWO_CHARTS.value,
107
+ "items": ["chart", "chart"],
108
+ "description": (
109
+ "2 charts / slides side-by-side in source. Left chart on top, right chart "
110
+ "on bottom; each is scaled to fill its band."
111
+ ),
112
+ },
113
+ ]
114
+ }
115
+
116
+
117
+ @mcp.resource("humeo://layouts")
118
+ def layouts_resource() -> str:
119
+ return json.dumps(list_layouts(), indent=2)
120
+
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # Landing gear: ingest
124
+ # ---------------------------------------------------------------------------
125
+
126
+
127
+ @mcp.tool()
128
+ def ingest(
129
+ source_path: str,
130
+ work_dir: str,
131
+ with_transcript: bool = False,
132
+ whisper_model: str = "base",
133
+ ) -> dict[str, Any]:
134
+ """Run deterministic local extraction (scenes + keyframes, optional transcript).
135
+
136
+ Args:
137
+ source_path: absolute path to a local video file.
138
+ work_dir: directory where keyframes/ and temp artifacts will be written.
139
+ with_transcript: if True, run faster-whisper word-level transcription.
140
+ whisper_model: whisper model name (e.g. "tiny", "base", "small").
141
+ """
142
+
143
+ result: IngestResult = ingest_mod.ingest(
144
+ source_path,
145
+ work_dir,
146
+ with_transcript=with_transcript,
147
+ whisper_model=whisper_model,
148
+ )
149
+ return result.model_dump()
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # Pilot: classify scenes
154
+ # ---------------------------------------------------------------------------
155
+
156
+
157
+ @mcp.tool()
158
+ def classify_scenes(scenes: list[dict[str, Any]]) -> dict[str, Any]:
159
+ """Classify each scene into exactly one of the 5 supported layouts.
160
+
161
+ Uses an offline pixel heuristic on each scene's keyframe. Agents that
162
+ want a smarter classifier can post-process or overwrite the result,
163
+ or call ``classify_scenes_with_vision`` with bboxes from a vision LLM.
164
+ """
165
+
166
+ parsed = [Scene.model_validate(s) for s in scenes]
167
+ results = classify_mod.classify_scenes_heuristic(parsed)
168
+ return {"classifications": [r.model_dump() for r in results]}
169
+
170
+
171
+ # ---------------------------------------------------------------------------
172
+ # Pilot (alt path): vision-LLM + OCR bbox classifier
173
+ # ---------------------------------------------------------------------------
174
+
175
+
176
+ @mcp.tool()
177
+ def detect_scene_regions(scenes: list[dict[str, Any]]) -> dict[str, Any]:
178
+ """Return the prompt + per-scene stubs used for LLM+OCR bbox detection.
179
+
180
+ This tool is the *adapter* half of the vision primitive. The MCP server
181
+ itself never calls an LLM — the agent does. So this endpoint returns:
182
+
183
+ 1. the exact ``REGION_PROMPT`` to send along with each keyframe, and
184
+ 2. a list of ``{scene_id, keyframe_path, prompt}`` jobs.
185
+
186
+ The agent runs its own vision model for each job, then feeds the
187
+ resulting JSON back via ``classify_scenes_with_vision``.
188
+ """
189
+
190
+ parsed = [Scene.model_validate(s) for s in scenes]
191
+ return {
192
+ "prompt": vision_mod.REGION_PROMPT,
193
+ "jobs": [
194
+ {
195
+ "scene_id": s.scene_id,
196
+ "keyframe_path": s.keyframe_path,
197
+ "prompt": vision_mod.REGION_PROMPT,
198
+ }
199
+ for s in parsed
200
+ ],
201
+ }
202
+
203
+
204
+ @mcp.tool()
205
+ def classify_scenes_with_vision(regions: list[dict[str, Any]]) -> dict[str, Any]:
206
+ """Classify scenes from already-gathered ``SceneRegions`` bbox records.
207
+
208
+ Input is a list of ``SceneRegions`` JSON dicts (output of the agent's
209
+ vision-LLM pass). Output is a ``{classifications, layout_instructions}``
210
+ pair — the layout kind per scene plus a ready-to-render
211
+ ``LayoutInstruction`` with ``person_x_norm`` / ``chart_x_norm`` already
212
+ populated from the bboxes.
213
+ """
214
+
215
+ parsed_regions = [SceneRegions.model_validate(r) for r in regions]
216
+ classifications = [vision_mod.classify_from_regions(r) for r in parsed_regions]
217
+ instructions = [
218
+ vision_mod.layout_instruction_from_regions(r, c)
219
+ for r, c in zip(parsed_regions, classifications)
220
+ ]
221
+ return {
222
+ "classifications": [c.model_dump() for c in classifications],
223
+ "layout_instructions": [i.model_dump() for i in instructions],
224
+ }
225
+
226
+
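A sketch of the round-trip, calling the tool functions in-process the same way ``layouts_resource`` does above (the vision call in the middle belongs to the agent; bbox values are illustrative):

    jobs = detect_scene_regions([
        {"scene_id": "s0", "start_time": 0.0, "end_time": 2.0,
         "keyframe_path": "/tmp/kf0.jpg"},
    ])
    # The agent runs its own vision model on each job["keyframe_path"] with
    # job["prompt"], collects SceneRegions-shaped JSON, then feeds it back:
    result = classify_scenes_with_vision([
        {"scene_id": "s0",
         "chart_bbox": {"x1": 0.02, "y1": 0.10, "x2": 0.62, "y2": 0.85},
         "person_bbox": {"x1": 0.70, "y1": 0.20, "x2": 0.98, "y2": 0.95},
         "ocr_text": "Q3 revenue", "raw_reason": "chart left, speaker right"},
    ])
    print(result["classifications"][0]["layout"])  # LayoutKind.SPLIT_CHART_PERSON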
227
+ # ---------------------------------------------------------------------------
228
+ # Pilot: select clips
229
+ # ---------------------------------------------------------------------------
230
+
231
+
232
+ @mcp.tool()
233
+ def select_clips(
234
+ source_path: str,
235
+ transcript_words: list[dict[str, Any]],
236
+ duration_sec: float,
237
+ target_count: int = 5,
238
+ min_sec: float = 30.0,
239
+ max_sec: float = 60.0,
240
+ ) -> dict[str, Any]:
241
+ """Heuristically select top clips from a word-level transcript.
242
+
243
+ Scoring is word-density per window. Returns a ``ClipPlan`` with up to
244
+ ``target_count`` non-overlapping clips.
245
+ """
246
+
247
+ words = [TranscriptWord.model_validate(w) for w in transcript_words]
248
+ plan = select_mod.select_clips_heuristic(
249
+ source_path,
250
+ words,
251
+ duration_sec,
252
+ target_count=target_count,
253
+ min_sec=min_sec,
254
+ max_sec=max_sec,
255
+ )
256
+ return plan.model_dump()
257
+
258
+
259
+ # ---------------------------------------------------------------------------
260
+ # Thrusters: plan + render
261
+ # ---------------------------------------------------------------------------
262
+
263
+
264
+ @mcp.tool()
265
+ def plan_layout(
266
+ layout: str,
267
+ out_w: int = 1080,
268
+ out_h: int = 1920,
269
+ src_w: int = 1920,
270
+ src_h: int = 1080,
271
+ zoom: float = 1.0,
272
+ person_x_norm: float = 0.5,
273
+ chart_x_norm: float = 0.0,
274
+ clip_id: str = "preview",
275
+ ) -> dict[str, Any]:
276
+ """Return the ffmpeg filter_complex fragment for one layout.
277
+
278
+ This is the pure, deterministic function underpinning the 5 thrusters.
279
+ No rendering is performed. Useful for agents that want to preview the
280
+ filtergraph or compose it with their own ffmpeg invocation.
281
+ """
282
+
283
+ instr = LayoutInstruction(
284
+ clip_id=clip_id,
285
+ layout=LayoutKind(layout),
286
+ zoom=zoom,
287
+ person_x_norm=person_x_norm,
288
+ chart_x_norm=chart_x_norm,
289
+ )
290
+ fp = layouts_mod.plan_layout(instr, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h)
291
+ return {"filtergraph": fp.filtergraph, "out_label": fp.out_label}
292
+
293
+
294
+ @mcp.tool()
295
+ def build_render_cmd(request: dict[str, Any]) -> dict[str, Any]:
296
+ """Build (but do NOT run) the ffmpeg command for a render request.
297
+
298
+ ``request`` must conform to the ``RenderRequest`` schema. This is a
299
+ dry-run helper so an agent can review the command before executing it.
300
+ """
301
+
302
+ req = RenderRequest.model_validate({**request, "mode": "dry_run"})
303
+ result = compile_mod.render_clip(req)
304
+ return result.model_dump()
305
+
306
+
307
+ @mcp.tool()
308
+ def render_clip(request: dict[str, Any]) -> dict[str, Any]:
309
+ """Render a single 9:16 clip with the specified layout.
310
+
311
+ ``request`` must conform to ``RenderRequest``. If ``request.mode`` is
312
+ ``"dry_run"`` the ffmpeg command is returned without execution.
313
+ """
314
+
315
+ req = RenderRequest.model_validate(request)
316
+ result: RenderResult = compile_mod.render_clip(req)
317
+ return result.model_dump()
318
+
319
+
320
+ # ---------------------------------------------------------------------------
321
+ # Entrypoint
322
+ # ---------------------------------------------------------------------------
323
+
324
+
325
+ def main() -> None:
326
+ """stdio entrypoint for ``humeo-core`` console-script."""
327
+
328
+ mcp.run()
329
+
330
+
331
+ if __name__ == "__main__":
332
+ main()
humeo-core/tests/__init__.py ADDED
File without changes
humeo-core/tests/test_classify.py ADDED
@@ -0,0 +1,39 @@
1
+ import json
2
+
3
+ from humeo_core.primitives.classify import (
4
+ classify_scenes_heuristic,
5
+ classify_scenes_with_llm,
6
+ )
7
+ from humeo_core.schemas import LayoutKind, Scene
8
+
9
+
10
+ def test_heuristic_no_keyframe_defaults_sit_center():
11
+ scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path=None)]
12
+ result = classify_scenes_heuristic(scenes)
13
+ assert len(result) == 1
14
+ assert result[0].scene_id == "s0"
15
+ assert result[0].layout == LayoutKind.SIT_CENTER
16
+
17
+
18
+ def test_llm_classifier_uses_callback_and_validates():
19
+ scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")]
20
+
21
+ def fake_vision(image_path: str, prompt: str) -> str:
22
+ return json.dumps(
23
+ {"layout": "split_chart_person", "confidence": 0.88, "reason": "chart left"}
24
+ )
25
+
26
+ result = classify_scenes_with_llm(scenes, fake_vision)
27
+ assert result[0].layout == LayoutKind.SPLIT_CHART_PERSON
28
+ assert result[0].confidence == 0.88
29
+
30
+
31
+ def test_llm_classifier_parse_error_is_safe():
32
+ scenes = [Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")]
33
+
34
+ def bad_vision(image_path: str, prompt: str) -> str:
35
+ return "not json"
36
+
37
+ result = classify_scenes_with_llm(scenes, bad_vision)
38
+ assert result[0].layout == LayoutKind.SIT_CENTER
39
+ assert "parse error" in result[0].reason.lower()
humeo-core/tests/test_compile.py ADDED
@@ -0,0 +1,329 @@
1
+ from pathlib import Path
2
+
3
+ from humeo_core.primitives import compile as compile_mod
4
+ from humeo_core.primitives.compile import (
5
+ _ensure_windows_fontconfig,
6
+ build_ffmpeg_cmd,
7
+ plan_title_drawtext,
8
+ )
9
+ from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RenderRequest, RenderTheme
10
+
11
+
12
+ def _req(**overrides):
13
+ c = Clip(clip_id="1", topic="t", start_time_sec=10.0, end_time_sec=40.0)
14
+ li = LayoutInstruction(clip_id="1", layout=LayoutKind.SIT_CENTER)
15
+ data = dict(
16
+ source_path="/tmp/src.mp4",
17
+ clip=c,
18
+ layout=li,
19
+ output_path="/tmp/out.mp4",
20
+ render_theme=RenderTheme.LEGACY,
21
+ mode="dry_run",
22
+ )
23
+ data.update(overrides)
24
+ return RenderRequest(**data)
25
+
26
+
27
+ def test_ffmpeg_cmd_has_ss_duration_filtergraph_output():
28
+ cmd = build_ffmpeg_cmd(_req())
29
+ assert "-ss" in cmd
30
+ assert "-t" in cmd
31
+ assert "-filter_complex" in cmd
32
+ # duration = 30.0
33
+ t_idx = cmd.index("-t")
34
+ assert float(cmd[t_idx + 1]) == 30.0
35
+ ss_idx = cmd.index("-ss")
36
+ assert float(cmd[ss_idx + 1]) == 10.0
37
+ assert cmd[-1] == "/tmp/out.mp4"
38
+
39
+
40
+ def test_title_text_injects_drawtext():
41
+ cmd = build_ffmpeg_cmd(_req(title_text="Hello: world's"))
42
+ fg = cmd[cmd.index("-filter_complex") + 1]
43
+ assert "drawtext" in fg
44
+ # colon should be escaped
45
+ assert "Hello\\:" in fg
46
+ assert "worlds" in fg
47
+ assert "world's" not in fg
48
+ assert "expansion=none" in fg
49
+
50
+
51
+ def test_map_vout_and_primary_audio():
52
+ cmd = build_ffmpeg_cmd(_req())
53
+ assert "[vout]" in cmd
54
+ assert "0:a:0" in cmd
55
+
56
+
57
+ def test_subtitle_style_uses_requested_font_and_margin():
58
+ cmd = build_ffmpeg_cmd(
59
+ _req(subtitle_path="/tmp/clip.srt", subtitle_font_size=18, subtitle_margin_v=64)
60
+ )
61
+ fg = cmd[cmd.index("-filter_complex") + 1]
62
+ assert "subtitles='" in fg
63
+ assert "FontSize=18" in fg
64
+ assert "MarginV=64" in fg
65
+ # Smart word wrap so long captions break into multiple readable lines.
66
+ assert "WrapStyle=0" in fg
67
+
68
+
69
+ def test_subtitle_original_size_pins_libass_to_output_resolution():
70
+ """Without original_size=W x H, libass uses PlayResY=288 and blows up fonts/margins.
71
+
72
+ This is the root cause of the "subtitles floating in the middle of the
73
+ frame / blocked" bug the user reported.
74
+ """
75
+ cmd = build_ffmpeg_cmd(_req(subtitle_path="/tmp/clip.srt"))
76
+ fg = cmd[cmd.index("-filter_complex") + 1]
77
+ assert "original_size=1080x1920" in fg
78
+
79
+
80
+ def test_subtitles_applied_after_crop_and_title():
81
+ """Order: crop/compose -> drawtext title -> subtitles.
82
+
83
+ The pipeline must crop **first**, then draw text on the finished frame.
84
+ """
85
+ cmd = build_ffmpeg_cmd(
86
+ _req(title_text="Hook", subtitle_path="/tmp/clip.srt")
87
+ )
88
+ fg = cmd[cmd.index("-filter_complex") + 1]
89
+ crop_pos = fg.index("[0:v]crop=")
90
+ drawtext_pos = fg.index("drawtext")
91
+ subs_pos = fg.index("subtitles=")
92
+ assert crop_pos < drawtext_pos < subs_pos
93
+
94
+
95
+ def test_build_is_layout_specific():
96
+ c = Clip(clip_id="1", topic="t", start_time_sec=0, end_time_sec=10)
97
+ split_req = _req(
98
+ clip=c,
99
+ layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON),
100
+ )
101
+ cmd = build_ffmpeg_cmd(split_req)
102
+ fg = cmd[cmd.index("-filter_complex") + 1]
103
+ assert "vstack" in fg
104
+
105
+
106
+ def test_title_is_suppressed_on_split_layouts():
107
+ """Split layouts already contain a slide/chart with its own title.
108
+
109
+ Overlaying an additional drawtext title just obscures content -- that's
110
+ what was happening in the Cathy Wood "chart overlaps subject" report.
111
+ """
112
+ for kind in (
113
+ LayoutKind.SPLIT_CHART_PERSON,
114
+ LayoutKind.SPLIT_TWO_PERSONS,
115
+ LayoutKind.SPLIT_TWO_CHARTS,
116
+ ):
117
+ cmd = build_ffmpeg_cmd(
118
+ _req(
119
+ layout=LayoutInstruction(clip_id="1", layout=kind),
120
+ title_text="This should not render",
121
+ )
122
+ )
123
+ fg = cmd[cmd.index("-filter_complex") + 1]
124
+ assert "drawtext" not in fg, f"title leaked into split layout {kind}"
125
+
126
+
127
+ def test_title_is_drawn_on_single_subject_layouts():
128
+ """Titles are still rendered on ZOOM_CALL_CENTER and SIT_CENTER."""
129
+ for kind in (LayoutKind.ZOOM_CALL_CENTER, LayoutKind.SIT_CENTER):
130
+ cmd = build_ffmpeg_cmd(
131
+ _req(
132
+ layout=LayoutInstruction(clip_id="1", layout=kind),
133
+ title_text="Hook title",
134
+ )
135
+ )
136
+ fg = cmd[cmd.index("-filter_complex") + 1]
137
+ assert "drawtext=text='Hook title'" in fg
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Title wrapping / auto-shrink (P2: fixes the "Prediction Markets vs
142
+ # Derivatives" clipped-title bug reported against the Cathy Wood run).
143
+ # ---------------------------------------------------------------------------
144
+
145
+
146
+ def test_plan_title_short_stays_single_line_at_72px():
147
+ """Backward compat: short titles keep the pre-P2 single-drawtext form.
148
+
149
+ Byte-identical output for short titles is important because it keeps
150
+ previously-calibrated visual output unchanged and avoids needless cache
151
+ churn on existing renders.
152
+ """
153
+ frag = plan_title_drawtext("Hook title", out_w=1080)
154
+ assert frag is not None
155
+ assert frag.count("drawtext=") == 1
156
+ assert "fontsize=72" in frag
157
+ assert "y=80" in frag
158
+ assert "drawtext=text='Hook title'" in frag
159
+
160
+
161
+ def test_plan_title_long_wraps_to_two_lines_below_72px():
162
+ """Long titles wrap at the best word boundary and shrink to fit.
163
+
164
+ "Prediction Markets vs Derivatives" is 33 chars — it overflows a 1080px
165
+ canvas at 72px. It must wrap into "Prediction Markets" / "vs Derivatives"
166
+ (balanced halves) at a smaller font.
167
+ """
168
+ frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080)
169
+ assert frag is not None
170
+ assert frag.count("drawtext=") == 2, "long titles must split into two drawtext calls"
171
+ assert "drawtext=text='Prediction Markets'" in frag
172
+ assert "drawtext=text='vs Derivatives'" in frag
173
+ assert "fontsize=72" not in frag, "two-line layout must use a smaller font"
174
+ # Both lines share the same shrunken fontsize.
175
+ import re
176
+
177
+ sizes = re.findall(r"fontsize=(\d+)", frag)
178
+ assert len(sizes) == 2 and sizes[0] == sizes[1]
179
+ assert 44 <= int(sizes[0]) <= 64
180
+
181
+
182
+ def test_plan_title_empty_returns_none():
183
+ assert plan_title_drawtext("", out_w=1080) is None
184
+ assert plan_title_drawtext(" ", out_w=1080) is None
185
+
186
+
187
+ def test_plan_title_single_huge_word_shrinks_instead_of_wrapping():
188
+ """A single word cannot be word-wrapped; it must shrink to fit."""
189
+ frag = plan_title_drawtext("Supercalifragilisticexpialidocious", out_w=1080)
190
+ assert frag is not None
191
+ assert frag.count("drawtext=") == 1 # no wrap possible
192
+ assert "fontsize=72" not in frag
193
+
194
+
195
+ def test_title_uses_arial_font_not_default_serif():
196
+ """Titles must render in Arial (matching the ASS subtitle font), not the
197
+ platform default which is Times New Roman on Windows.
198
+
199
+ Regression test for the "ugly serif title on the finance short" bug.
200
+ Both the single-line and the two-line drawtext variants must carry a
201
+ ``font=Arial`` directive so fontconfig resolves to the same family as
202
+ the subtitle ``Fontname=Arial``.
203
+ """
204
+ short = plan_title_drawtext("Hook title", out_w=1080)
205
+ assert short is not None
206
+ assert "font=Arial" in short or "fontfile='" in short
207
+
208
+ long_frag = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080)
209
+ assert long_frag is not None
210
+ if "font=Arial" in long_frag:
211
+ assert long_frag.count("font=Arial") == 2
212
+ else:
213
+ assert long_frag.count("fontfile='") == 2
214
+
215
+
216
+ def test_title_font_matches_subtitle_font_family():
217
+ """Title overlay and subtitle captions must read as one typographic
218
+ family. Both routes through ``build_ffmpeg_cmd`` should carry the same
219
+ Arial reference.
220
+ """
221
+ cmd = build_ffmpeg_cmd(
222
+ _req(
223
+ title_text="Hook title",
224
+ subtitle_path="/tmp/clip.ass",
225
+ )
226
+ )
227
+ fg = cmd[cmd.index("-filter_complex") + 1]
228
+ assert "font=Arial" in fg or "fontfile='" in fg
229
+ assert "Fontname=Arial" in fg
230
+
231
+
232
+ def test_long_title_pipes_through_build_ffmpeg_cmd():
233
+ """End-to-end: a long title routed through the full command builder
234
+ produces a valid filtergraph with two drawtext filters and no syntax
235
+ errors ffmpeg would choke on.
236
+ """
237
+ cmd = build_ffmpeg_cmd(_req(title_text="Prediction Markets vs Derivatives"))
238
+ fg = cmd[cmd.index("-filter_complex") + 1]
239
+ assert fg.count("drawtext=") == 2
240
+ assert "[v_prepad]drawtext=text='Prediction Markets'" in fg
241
+ assert "[vout]" in fg
242
+ assert ";;" not in fg # no empty chain links
243
+ assert ",," not in fg # no stray commas
244
+
245
+
246
+ def test_reference_theme_draws_title_and_caption_bars():
247
+ cmd = build_ffmpeg_cmd(
248
+ _req(
249
+ title_text="A Multi-Trillion Dollar Opportunity",
250
+ subtitle_path="/tmp/clip.ass",
251
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
252
+ )
253
+ )
254
+ fg = cmd[cmd.index("-filter_complex") + 1]
255
+ assert "drawbox=x=28:y=32" in fg
256
+ assert "drawbox=x=0:y=" in fg
257
+ assert "Fontname=Source Sans 3" in fg
258
+ assert "Alignment=2" in fg
259
+ assert "Outline=2" in fg
260
+
261
+
262
+ def test_reference_theme_wraps_long_titles_inside_the_title_bar():
263
+ cmd = build_ffmpeg_cmd(
264
+ _req(
265
+ title_text="12% Youth Unemployment? Start a Business With AI",
266
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
267
+ )
268
+ )
269
+ fg = cmd[cmd.index("-filter_complex") + 1]
270
+ assert fg.count("drawtext=") >= 2
271
+ assert "..." not in fg
272
+
273
+
274
+ def test_reference_theme_draws_frosted_caption_ribbon_when_subtitles_exist():
275
+ cmd = build_ffmpeg_cmd(
276
+ _req(
277
+ title_text="Hook title",
278
+ subtitle_path="/tmp/clip.ass",
279
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
280
+ )
281
+ )
282
+ fg = cmd[cmd.index("-filter_complex") + 1]
283
+ assert "drawbox=x=0:y=" in fg
284
+
285
+
286
+ def test_reference_theme_allows_titles_on_split_layouts():
287
+ cmd = build_ffmpeg_cmd(
288
+ _req(
289
+ layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON),
290
+ title_text="Hook title",
291
+ render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
292
+ )
293
+ )
294
+ fg = cmd[cmd.index("-filter_complex") + 1]
295
+ assert "drawtext=" in fg
296
+
297
+
298
+ def test_native_highlight_theme_skips_title_card_and_keeps_ass_styles():
299
+ cmd = build_ffmpeg_cmd(
300
+ _req(
301
+ title_text="This title should not render",
302
+ subtitle_path="/tmp/clip.ass",
303
+ render_theme=RenderTheme.NATIVE_HIGHLIGHT,
304
+ )
305
+ )
306
+ fg = cmd[cmd.index("-filter_complex") + 1]
307
+ assert "drawtext" not in fg
308
+ assert "subtitles='" in fg
309
+ assert "force_style='" not in fg
310
+
311
+
312
+ def test_ensure_windows_fontconfig_is_noop_off_windows():
313
+ env = _ensure_windows_fontconfig()
314
+ assert isinstance(env, dict)
315
+
316
+
317
+ def test_ensure_windows_fontconfig_creates_config(monkeypatch, tmp_path):
318
+ monkeypatch.setattr(compile_mod.os, "name", "nt", raising=False)
319
+ monkeypatch.delenv("FONTCONFIG_FILE", raising=False)
320
+ monkeypatch.setenv("LOCALAPPDATA", str(tmp_path / "localappdata"))
321
+ monkeypatch.setenv("WINDIR", str(tmp_path / "winroot"))
322
+
323
+ env = _ensure_windows_fontconfig()
324
+
325
+ cfg_file = Path(env["FONTCONFIG_FILE"])
326
+ assert cfg_file.is_file()
327
+ text = cfg_file.read_text(encoding="utf-8")
328
+ assert (tmp_path / "winroot" / "Fonts").as_posix() in text
329
+ assert "fontconfig-cache" in text
humeo-core/tests/test_face_detect.py ADDED
@@ -0,0 +1,73 @@
1
+ """Tests for the MediaPipe-backed face detection primitive.
2
+
3
+ Uses a stub ``face_fn`` so MediaPipe itself is not required to run the
4
+ tests — the primitive contract is what we care about: *given* a face
5
+ bbox, does the primitive produce the right ``SceneRegions``.
6
+ """
7
+
8
+ from humeo_core.primitives.face_detect import detect_face_regions
9
+ from humeo_core.schemas import BoundingBox, Scene
10
+
11
+
12
+ def _scene(i: int, kf: str | None = "/tmp/k.jpg") -> Scene:
13
+ return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf)
14
+
15
+
16
+ def test_no_keyframe_returns_raw_reason():
17
+ out = detect_face_regions([_scene(0, kf=None)], face_fn=lambda _p: None)
18
+ assert out[0].person_bbox is None
19
+ assert "no keyframe" in out[0].raw_reason.lower()
20
+
21
+
22
+ def test_no_face_detected_returns_raw_reason():
23
+ out = detect_face_regions([_scene(0)], face_fn=lambda _p: None)
24
+ assert out[0].person_bbox is None
25
+ assert "no face" in out[0].raw_reason.lower()
26
+
27
+
28
+ def test_face_centered_produces_person_only():
29
+ centered = BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.7, label="face", confidence=0.9)
30
+ out = detect_face_regions([_scene(0)], face_fn=lambda _p: centered)
31
+ r = out[0]
32
+ assert r.person_bbox is not None
33
+ assert r.person_bbox.center_x == centered.center_x
34
+ assert r.chart_bbox is None
35
+
36
+
37
+ def test_face_pushed_right_synthesises_chart_bbox():
38
+ # face center x ~ 0.86 -> above default threshold 0.65 -> chart bbox inferred
39
+ face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9, label="face", confidence=0.95)
40
+ out = detect_face_regions([_scene(0)], face_fn=lambda _p: face)
41
+ r = out[0]
42
+ assert r.person_bbox is not None
43
+ assert r.chart_bbox is not None
44
+ assert r.chart_bbox.x1 == 0.0
45
+ assert r.chart_bbox.x2 <= 0.75 # can't overlap the face
46
+ assert r.chart_bbox.x2 <= 0.65 # bounded by threshold too
47
+ assert "synthetic chart" in r.raw_reason
48
+
49
+
50
+ def test_face_detector_exception_is_isolated_per_scene():
51
+ scenes = [_scene(0), _scene(1)]
52
+ calls: list[str] = []
53
+
54
+ def flaky_fn(path: str) -> BoundingBox | None:
55
+ calls.append(path)
56
+ if len(calls) == 1:
57
+ raise RuntimeError("boom")
58
+ return BoundingBox(x1=0.3, y1=0.2, x2=0.7, y2=0.8)
59
+
60
+ out = detect_face_regions(scenes, face_fn=flaky_fn)
61
+ assert out[0].person_bbox is None
62
+ assert "error" in out[0].raw_reason.lower()
63
+ assert out[1].person_bbox is not None
64
+
65
+
66
+ def test_custom_threshold_prevents_false_chart_split():
67
+ face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9)
68
+ out = detect_face_regions(
69
+ [_scene(0)],
70
+ face_fn=lambda _p: face,
71
+ chart_split_threshold=0.95,
72
+ )
73
+ assert out[0].chart_bbox is None
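For reference, a minimal sketch of the per-scene contract these tests exercise: failures are isolated per scene, and a face pushed past the threshold synthesises a chart bbox on the opposite side. Field names mirror `humeo_core.schemas`; the reason strings and the exact clamp are assumed rather than copied from the shipped `detect_face_regions`.

```python
# Illustrative sketch (not the shipped primitive) of the contract above.
from typing import Callable, Optional

from humeo_core.schemas import BoundingBox, Scene, SceneRegions


def detect_face_regions_sketch(
    scenes: list[Scene],
    face_fn: Callable[[str], Optional[BoundingBox]],
    chart_split_threshold: float = 0.65,
) -> list[SceneRegions]:
    out: list[SceneRegions] = []
    for scene in scenes:
        if not scene.keyframe_path:
            out.append(SceneRegions(scene_id=scene.scene_id, raw_reason="no keyframe"))
            continue
        try:
            face = face_fn(scene.keyframe_path)
        except Exception as exc:  # one flaky frame must not sink the batch
            out.append(
                SceneRegions(scene_id=scene.scene_id, raw_reason=f"detector error: {exc}")
            )
            continue
        if face is None:
            out.append(SceneRegions(scene_id=scene.scene_id, raw_reason="no face detected"))
            continue
        chart = None
        reason = "face detected"
        if face.center_x > chart_split_threshold:
            # Face pushed right: infer a chart occupying the left of the frame,
            # bounded both by the face edge and by the threshold itself.
            chart = BoundingBox(
                x1=0.0, y1=0.0, x2=min(face.x1, chart_split_threshold), y2=1.0
            )
            reason = "face right of threshold; synthetic chart bbox on the left"
        out.append(
            SceneRegions(
                scene_id=scene.scene_id, person_bbox=face, chart_bbox=chart, raw_reason=reason
            )
        )
    return out
```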
humeo-core/tests/test_layout_bbox.py ADDED
@@ -0,0 +1,17 @@
1
+ """Split layout uses optional normalized bbox regions (Gemini vision)."""
2
+
3
+ from humeo_core.primitives.layouts import plan_layout
4
+ from humeo_core.schemas import BoundingBox, FocusStackOrder, LayoutInstruction, LayoutKind
5
+
6
+
7
+ def test_split_with_bbox_regions_not_fixed_thirds():
8
+ instr = LayoutInstruction(
9
+ clip_id="c",
10
+ layout=LayoutKind.SPLIT_CHART_PERSON,
11
+ focus_stack_order=FocusStackOrder.CHART_THEN_PERSON,
12
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.64, y2=1.0),
13
+ split_person_region=BoundingBox(x1=0.64, y1=0.0, x2=1.0, y2=1.0),
14
+ )
15
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
16
+ assert "crop=1228:1080:0:0" in fg or "crop=1224:1080:0:0" in fg
17
+ assert "vstack=inputs=2" in fg
humeo-core/tests/test_layouts.py ADDED
@@ -0,0 +1,312 @@
1
+ import re
2
+
3
+ from humeo_core.primitives.layouts import (
4
+ _center_crop_to_9x16,
5
+ _crop_box,
6
+ plan_layout,
7
+ )
8
+ from humeo_core.schemas import (
9
+ BoundingBox,
10
+ FocusStackOrder,
11
+ LayoutInstruction,
12
+ LayoutKind,
13
+ TimedCenterPoint,
14
+ )
15
+
16
+
17
+ def test_crop_box_aspect_exact():
18
+ cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 1.0, 0.5, 0.5)
19
+ # 9:16 inside 1920x1080 -> height-limited: ch=1080, cw ~= 608
20
+ assert ch == 1080
21
+ assert abs(cw / ch - 9 / 16) < 0.01
22
+ assert 0 <= x <= 1920 - cw
23
+ assert y == 0
24
+
25
+
26
+ def test_crop_box_clamps_inside_frame():
27
+ cw, ch, x, y = _crop_box(1920, 1080, 9 / 16, 2.0, 0.99, 0.5)
28
+ assert x + cw <= 1920
29
+ assert y + ch <= 1080
30
+
31
+
32
+ def test_crop_box_zoom_tightens():
33
+ cw_small, ch_small, _, _ = _center_crop_to_9x16(1920, 1080, 2.0, 0.5)
34
+ cw_large, ch_large, _, _ = _center_crop_to_9x16(1920, 1080, 1.0, 0.5)
35
+ assert cw_small < cw_large
36
+ assert ch_small < ch_large
37
+
38
+
39
+ def test_even_dimensions():
40
+ cw, ch, x, y = _crop_box(1921, 1081, 9 / 16, 1.3, 0.4, 0.5)
41
+ assert cw % 2 == 0 and ch % 2 == 0
42
+ assert x % 2 == 0 and y % 2 == 0
43
+
44
+
45
+ def _contains(s: str, *subs: str) -> bool:
46
+ return all(sub in s for sub in subs)
47
+
48
+
49
+ def test_zoom_call_layout_filtergraph_shape():
50
+ instr = LayoutInstruction(
51
+ clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.5, person_x_norm=0.5
52
+ )
53
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
54
+ fg = plan.filtergraph
55
+ assert _contains(fg, "[0:v]crop=", "scale=1080:1920", "[vout]")
56
+
57
+
58
+ def test_sit_center_layout_filtergraph_shape():
59
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER)
60
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
61
+ assert "[vout]" in plan.filtergraph
62
+ assert plan.out_label == "vout"
63
+
64
+
65
+ def test_sit_center_tracking_uses_dynamic_crop_expression():
66
+ instr = LayoutInstruction(
67
+ clip_id="c",
68
+ layout=LayoutKind.SIT_CENTER,
69
+ person_tracking=[
70
+ TimedCenterPoint(t_sec=0.0, x_norm=0.2),
71
+ TimedCenterPoint(t_sec=10.0, x_norm=0.8),
72
+ ],
73
+ )
74
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
75
+ assert "setpts=PTS-STARTPTS" in fg
76
+ assert "[vsrc]crop=" in fg
77
+ assert "if(lt(t\\,4.850)" in fg
78
+ assert "*(t-4.850)/(0.300)" in fg
79
+
80
+
81
+ def test_sit_center_tracking_with_zoom_uses_dynamic_crop_window_expressions():
82
+ instr = LayoutInstruction(
83
+ clip_id="c",
84
+ layout=LayoutKind.SIT_CENTER,
85
+ person_tracking=[
86
+ TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.28),
87
+ TimedCenterPoint(t_sec=10.0, x_norm=0.8, zoom=1.0),
88
+ ],
89
+ )
90
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
91
+ assert "setpts=PTS-STARTPTS" in fg
92
+ assert "[vsrc]crop=" in fg
93
+ assert "out_w/2" in fg
94
+ assert "out_h/2" in fg
95
+ assert "floor((min(" in fg
96
+
97
+
98
+ def test_split_layout_contains_vstack():
99
+ instr = LayoutInstruction(
100
+ clip_id="c",
101
+ layout=LayoutKind.SPLIT_CHART_PERSON,
102
+ person_x_norm=0.83,
103
+ chart_x_norm=0.0,
104
+ )
105
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
106
+ fg = plan.filtergraph
107
+ assert _contains(fg, "split=2", "vstack=inputs=2", "[vout]")
108
+ assert "[top]" in fg and "[bot]" in fg
109
+
110
+
111
+ def test_split_layout_person_crop_is_right_third():
112
+ """Chart uses left 2/3; person uses right 1/3 (non-overlapping)."""
113
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
114
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
115
+ # Right third: x=1280, w=640 for 1920-wide source.
116
+ assert "crop=640:1080:1280:0" in fg
117
+
118
+
119
+ def test_split_layout_can_swap_stack_order():
120
+ """PERSON_THEN_CHART puts the right-strip (person) crop into the top band."""
121
+ chart_first = plan_layout(
122
+ LayoutInstruction(
123
+ clip_id="c",
124
+ layout=LayoutKind.SPLIT_CHART_PERSON,
125
+ focus_stack_order=FocusStackOrder.CHART_THEN_PERSON,
126
+ ),
127
+ out_w=1080,
128
+ out_h=1920,
129
+ ).filtergraph
130
+ person_first = plan_layout(
131
+ LayoutInstruction(
132
+ clip_id="c",
133
+ layout=LayoutKind.SPLIT_CHART_PERSON,
134
+ focus_stack_order=FocusStackOrder.PERSON_THEN_CHART,
135
+ ),
136
+ out_w=1080,
137
+ out_h=1920,
138
+ ).filtergraph
139
+
140
+ def top_crop(fg: str) -> str:
141
+ m = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg)
142
+ assert m is not None, fg
143
+ return m.group(1)
144
+
145
+ # chart strip = left 1280px of source (2/3 split seam).
146
+ assert top_crop(chart_first) == "1280:1080:0:0"
147
+ # person strip = right 640px -> x=1280.
148
+ assert top_crop(person_first) == "640:1080:1280:0"
149
+ assert "vstack=inputs=2" in chart_first
150
+ assert "vstack=inputs=2" in person_first
151
+
152
+
153
+ def test_split_layout_person_clamped():
154
+ instr = LayoutInstruction(
155
+ clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, person_x_norm=1.0
156
+ )
157
+ plan = plan_layout(instr, out_w=1080, out_h=1920)
158
+ assert "crop=" in plan.filtergraph # no OOB math crash
159
+
160
+
161
+ def test_plan_layout_dispatch_covers_all_kinds():
162
+ for k in LayoutKind:
163
+ instr = LayoutInstruction(clip_id="c", layout=k)
164
+ plan = plan_layout(instr)
165
+ assert plan.out_label == "vout"
166
+ assert plan.filtergraph.endswith("[vout]")
167
+
168
+
169
+ def test_default_split_is_even_50_50_bands():
170
+ """The user-requested symmetric look: top and bottom bands are equal."""
171
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
172
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
173
+ # Each strip should scale to the same height (half of 1920).
174
+ heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg)
175
+ assert len(heights) == 2
176
+ assert heights[0] == heights[1] == "960", f"expected even 960/960, got {heights}"
177
+
178
+
179
+ def test_top_band_ratio_honored_for_uneven_splits():
180
+ instr = LayoutInstruction(
181
+ clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON, top_band_ratio=0.6
182
+ )
183
+ fg = plan_layout(instr, out_w=1080, out_h=1920).filtergraph
184
+ heights = re.findall(r"scale=1080:(\d+):force_original_aspect_ratio", fg)
185
+ assert heights == ["1152", "768"], heights
186
+
187
+
188
+ def test_split_seam_is_midpoint_between_bboxes():
189
+ """When both bboxes are provided, strips partition the source -- no overlap, no gap."""
190
+ instr = LayoutInstruction(
191
+ clip_id="c",
192
+ layout=LayoutKind.SPLIT_CHART_PERSON,
193
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.50, y2=1.0),
194
+ split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0),
195
+ )
196
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
197
+ # chart.x2 = 960px, person.x1 = 1056px -> midpoint = 1008 -> even -> 1008.
198
+ # Chart strip: x=0, cw=1008. Person strip: x=1008, cw=912.
199
+ top_crop = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg).group(1)
200
+ bot_crop = re.search(r"\[src2\]crop=(\d+:\d+:\d+:\d+)", fg).group(1)
201
+ assert top_crop == "1008:1080:0:0"
202
+ assert bot_crop == "912:1080:1008:0"
203
+
204
+
205
+ def test_split_uses_bbox_y_for_tight_band_fill():
206
+ """Chart bboxes anchor the crop, with a little extra height for edge safety."""
207
+ instr = LayoutInstruction(
208
+ clip_id="c",
209
+ layout=LayoutKind.SPLIT_CHART_PERSON,
210
+ split_chart_region=BoundingBox(x1=0.0, y1=0.1, x2=0.5, y2=0.7),
211
+ split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0),
212
+ )
213
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
214
+ # Chart bbox y: 0.1..0.7 -> y=108, ch=648, then a modest 12% pad per side.
215
+ assert "crop=1008:804:0:30" in fg
216
+
217
+
218
+ def test_split_chart_person_adds_vertical_pad_to_reduce_chart_side_crop():
219
+ instr = LayoutInstruction(
220
+ clip_id="c",
221
+ layout=LayoutKind.SPLIT_CHART_PERSON,
222
+ split_chart_region=BoundingBox(x1=0.02, y1=0.03, x2=0.58, y2=0.7),
223
+ split_person_region=BoundingBox(x1=0.585, y1=0.0, x2=0.995, y2=0.62),
224
+ top_band_ratio=0.436,
225
+ )
226
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=640, src_h=360).filtergraph
227
+ assert "[src1]crop=372:280:0:0" in fg
228
+
229
+
230
+ def test_split_minimum_strip_width_enforced():
231
+ """If chart/person bboxes are pathological (seam at edge), don't starve a strip."""
232
+ instr = LayoutInstruction(
233
+ clip_id="c",
234
+ layout=LayoutKind.SPLIT_CHART_PERSON,
235
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.05, y2=1.0),
236
+ split_person_region=BoundingBox(x1=0.05, y1=0.0, x2=1.0, y2=1.0),
237
+ )
238
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
239
+ widths = [int(m) for m in re.findall(r"crop=(\d+):\d+:\d+:\d+", fg)]
240
+ # Min strip = 20% of 1920 = 384 px. Neither strip should be narrower.
241
+ assert all(w >= 384 for w in widths), widths
242
+
243
+
244
+ def test_split_two_persons_stacks_two_crops():
245
+ instr = LayoutInstruction(
246
+ clip_id="c",
247
+ layout=LayoutKind.SPLIT_TWO_PERSONS,
248
+ split_person_region=BoundingBox(x1=0.0, y1=0.05, x2=0.5, y2=0.95),
249
+ split_second_person_region=BoundingBox(x1=0.5, y1=0.05, x2=1.0, y2=0.95),
250
+ )
251
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
252
+ assert "split=2" in fg and "vstack=inputs=2" in fg
253
+ # Seam at x=960. bbox y: 0.05..0.95 -> y=54, ch=972 (even).
254
+ assert "[src1]crop=960:972:0:54" in fg
255
+ assert "[src2]crop=960:972:960:54" in fg
256
+
257
+
258
+ def test_split_two_charts_stacks_two_crops():
259
+ instr = LayoutInstruction(
260
+ clip_id="c",
261
+ layout=LayoutKind.SPLIT_TWO_CHARTS,
262
+ split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.5, y2=1.0),
263
+ split_second_chart_region=BoundingBox(x1=0.5, y1=0.0, x2=1.0, y2=1.0),
264
+ )
265
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
266
+ assert "split=2" in fg and "vstack=inputs=2" in fg
267
+ assert "[src1]crop=960:1080:0:0" in fg
268
+ assert "[src2]crop=960:1080:960:0" in fg
269
+
270
+
271
+ def test_split_two_persons_without_bboxes_defaults_to_centered():
272
+ """No bboxes -> centered 50/50 seam, full source height fallback."""
273
+ instr = LayoutInstruction(
274
+ clip_id="c", layout=LayoutKind.SPLIT_TWO_PERSONS
275
+ )
276
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
277
+ assert "[src1]crop=960:1080:0:0" in fg
278
+ assert "[src2]crop=960:1080:960:0" in fg
279
+
280
+
281
+ def test_split_bands_use_cover_scale_plus_center_crop():
282
+ """Each band is painted edge-to-edge -- no letterbox bars."""
283
+ instr = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
284
+ fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph
285
+ assert fg.count("force_original_aspect_ratio=increase") == 2
286
+ assert fg.count("setsar=1") == 2
287
+
288
+
289
+ def test_zoom_tighter_means_smaller_crop_window():
290
+ from humeo_core.primitives.layouts import plan_zoom_call_center
291
+
292
+ wide = plan_zoom_call_center(
293
+ LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.0),
294
+ out_w=1080,
295
+ out_h=1920,
296
+ )
297
+ tight = plan_zoom_call_center(
298
+ LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=2.0),
299
+ out_w=1080,
300
+ out_h=1920,
301
+ )
302
+ # Parse crop=CW:CH:X:Y out of each filtergraph.
303
+ import re
304
+
305
+ def crop(fg: str) -> tuple[int, int]:
306
+ m = re.search(r"crop=(\d+):(\d+):", fg)
307
+ assert m is not None
308
+ return int(m.group(1)), int(m.group(2))
309
+
310
+ wcw, wch = crop(wide.filtergraph)
311
+ tcw, tch = crop(tight.filtergraph)
312
+ assert tcw < wcw and tch < wch
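A back-of-the-envelope sketch of the crop geometry these tests constrain. The real `_crop_box` may round and clamp slightly differently; this version just shows the aspect/zoom/clamp interplay with assumed floor-to-even rounding.

```python
# Illustrative crop-window math: largest window of the requested aspect that
# fits the source, tightened by zoom, snapped even, and clamped in-frame.
def crop_box_sketch(src_w: int, src_h: int, aspect: float,
                    zoom: float, cx_norm: float, cy_norm: float):
    ch = min(src_h, src_w / aspect) / zoom
    cw = ch * aspect
    cw, ch = int(cw) // 2 * 2, int(ch) // 2 * 2   # even dims for yuv420p encoders
    x = int(cx_norm * src_w - cw / 2)
    y = int(cy_norm * src_h - ch / 2)
    x = max(0, min(x, src_w - cw)) // 2 * 2       # clamp inside the frame
    y = max(0, min(y, src_h - ch)) // 2 * 2
    return cw, ch, x, y


# 9:16 inside 1920x1080 is height-limited: ch=1080, cw=606 (even floor of 607.5).
print(crop_box_sketch(1920, 1080, 9 / 16, 1.0, 0.5, 0.5))
```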
humeo-core/tests/test_schemas.py ADDED
@@ -0,0 +1,267 @@
1
+ import pytest
2
+ from pydantic import ValidationError
3
+
4
+ from humeo_core.schemas import (
5
+ ApprovalResult,
6
+ Clip,
7
+ ClipPlan,
8
+ ClipSubtitleWords,
9
+ FocusStackOrder,
10
+ LayoutInstruction,
11
+ LayoutKind,
12
+ RatingFeedback,
13
+ RenderRequest,
14
+ Scene,
15
+ SessionState,
16
+ TimedCenterPoint,
17
+ TranscriptWord,
18
+ )
19
+
20
+
21
+ def test_scene_requires_end_after_start():
22
+ Scene(scene_id="s1", start_time=0.0, end_time=1.0)
23
+ with pytest.raises(ValueError):
24
+ Scene(scene_id="s1", start_time=5.0, end_time=5.0)
25
+ with pytest.raises(ValueError):
26
+ Scene(scene_id="s1", start_time=5.0, end_time=1.0)
27
+
28
+
29
+ def test_layout_instruction_defaults_and_bounds():
30
+ li = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER)
31
+ assert li.zoom == 1.0
32
+ assert 0 <= li.person_x_norm <= 1
33
+ assert li.person_tracking == []
34
+ assert li.focus_stack_order == FocusStackOrder.CHART_THEN_PERSON
35
+ with pytest.raises(ValueError):
36
+ LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, zoom=0.0)
37
+ with pytest.raises(ValueError):
38
+ LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, person_x_norm=2.0)
39
+
40
+
41
+ def test_layout_instruction_accepts_sorted_tracking_points():
42
+ li = LayoutInstruction(
43
+ clip_id="c",
44
+ layout=LayoutKind.SIT_CENTER,
45
+ person_tracking=[
46
+ TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.25),
47
+ TimedCenterPoint(t_sec=5.0, x_norm=0.8, zoom=1.0),
48
+ ],
49
+ )
50
+ assert [point.t_sec for point in li.person_tracking] == [0.0, 5.0]
51
+ assert li.person_tracking[0].zoom == pytest.approx(1.25)
52
+
53
+
54
+ def test_layout_instruction_rejects_unsorted_tracking_points():
55
+ with pytest.raises(ValueError, match="person_tracking times"):
56
+ LayoutInstruction(
57
+ clip_id="c",
58
+ layout=LayoutKind.SIT_CENTER,
59
+ person_tracking=[
60
+ TimedCenterPoint(t_sec=5.0, x_norm=0.8),
61
+ TimedCenterPoint(t_sec=1.0, x_norm=0.2),
62
+ ],
63
+ )
64
+
65
+
66
+ def test_clip_duration():
67
+ c = Clip(
68
+ clip_id="1",
69
+ topic="t",
70
+ start_time_sec=10.0,
71
+ end_time_sec=42.5,
72
+ )
73
+ assert c.duration_sec == pytest.approx(32.5)
74
+
75
+
76
+ def test_clip_hook_relative_to_clip_in_point():
77
+ c = Clip(
78
+ clip_id="1",
79
+ topic="t",
80
+ start_time_sec=100.0,
81
+ end_time_sec=130.0,
82
+ hook_start_sec=0.0,
83
+ hook_end_sec=3.0,
84
+ )
85
+ assert c.hook_end_sec == 3.0
86
+
87
+
88
+ def test_clip_hook_must_be_within_duration():
89
+ with pytest.raises(ValueError, match="hook window"):
90
+ Clip(
91
+ clip_id="1",
92
+ topic="t",
93
+ start_time_sec=0.0,
94
+ end_time_sec=10.0,
95
+ hook_start_sec=0.0,
96
+ hook_end_sec=15.0,
97
+ )
98
+
99
+
100
+ def test_clip_hook_both_or_neither():
101
+ with pytest.raises(ValueError, match="hook_start_sec and hook_end_sec"):
102
+ Clip(
103
+ clip_id="1",
104
+ topic="t",
105
+ start_time_sec=0.0,
106
+ end_time_sec=10.0,
107
+ hook_start_sec=1.0,
108
+ hook_end_sec=None,
109
+ )
110
+
111
+
112
+ def test_clip_trim_cannot_exceed_duration():
113
+ with pytest.raises(ValueError, match="trim"):
114
+ Clip(
115
+ clip_id="1",
116
+ topic="t",
117
+ start_time_sec=0.0,
118
+ end_time_sec=10.0,
119
+ trim_start_sec=6.0,
120
+ trim_end_sec=6.0,
121
+ )
122
+
123
+
124
+ def test_clip_plan_roundtrip():
125
+ plan = ClipPlan(
126
+ source_path="/tmp/x.mp4",
127
+ clips=[
128
+ Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
129
+ ],
130
+ )
131
+ d = plan.model_dump()
132
+ assert ClipPlan.model_validate(d) == plan
133
+
134
+
135
+ def test_clip_roundtrip_with_extended_fields():
136
+ clip = Clip(
137
+ clip_id="1",
138
+ topic="t",
139
+ start_time_sec=0.0,
140
+ end_time_sec=30.0,
141
+ score_breakdown={"message_wow": 0.9, "hook_emotion": 0.7},
142
+ origin="both",
143
+ visual_notes="Speaker leans in.",
144
+ reasoning="Strong explanation and hook.",
145
+ )
146
+
147
+ dumped = clip.model_dump()
148
+
149
+ assert dumped["score_breakdown"] == {"message_wow": 0.9, "hook_emotion": 0.7}
150
+ assert dumped["origin"] == "both"
151
+ assert dumped["visual_notes"] == "Speaker leans in."
152
+ assert dumped["reasoning"] == "Strong explanation and hook."
153
+ assert Clip.model_validate(dumped) == clip
154
+
155
+
156
+ def test_clip_defaults_validate_and_do_not_serialize_new_fields():
157
+ clip = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
158
+
159
+ assert clip.origin == "text"
160
+ assert clip.score_breakdown is None
161
+ assert clip.visual_notes is None
162
+ assert clip.reasoning is None
163
+
164
+ dumped = clip.model_dump()
165
+ assert "score_breakdown" not in dumped
166
+ assert "origin" not in dumped
167
+ assert "visual_notes" not in dumped
168
+ assert "reasoning" not in dumped
169
+ assert Clip.model_validate(dumped) == clip
170
+
171
+
172
+ def test_clip_score_breakdown_validation():
173
+ with pytest.raises(ValidationError):
174
+ Clip(
175
+ clip_id="1",
176
+ topic="t",
177
+ start_time_sec=0.0,
178
+ end_time_sec=30.0,
179
+ score_breakdown={"hook": -0.1},
180
+ )
181
+
182
+ clip = Clip(
183
+ clip_id="1",
184
+ topic="t",
185
+ start_time_sec=0.0,
186
+ end_time_sec=30.0,
187
+ score_breakdown={"hook": 1.2},
188
+ )
189
+ assert clip.score_breakdown == {"hook": 1.0}
190
+
191
+ clip = Clip(
192
+ clip_id="1",
193
+ topic="t",
194
+ start_time_sec=0.0,
195
+ end_time_sec=30.0,
196
+ score_breakdown={},
197
+ )
198
+ assert clip.score_breakdown == {}
199
+
200
+ clip = Clip(
201
+ clip_id="1",
202
+ topic="t",
203
+ start_time_sec=0.0,
204
+ end_time_sec=30.0,
205
+ score_breakdown={"hook": 0.5},
206
+ )
207
+ assert clip.score_breakdown == {"hook": 0.5}
208
+
209
+
210
+ def test_clip_subtitle_words_relative_times():
211
+ w = ClipSubtitleWords(
212
+ words=[TranscriptWord(word="hi", start_time=0.0, end_time=0.2)]
213
+ )
214
+ assert w.words[0].start_time == 0.0
215
+
216
+
217
+ def test_render_request_modes():
218
+ c = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
219
+ li = LayoutInstruction(clip_id="1", layout=LayoutKind.ZOOM_CALL_CENTER)
220
+ req = RenderRequest(
221
+ source_path="/tmp/x.mp4",
222
+ clip=c,
223
+ layout=li,
224
+ output_path="/tmp/out.mp4",
225
+ )
226
+ assert req.mode == "normal"
227
+ req2 = RenderRequest(**{**req.model_dump(), "mode": "dry_run"})
228
+ assert req2.mode == "dry_run"
229
+
230
+
231
+ def test_approval_result_roundtrip():
232
+ result = ApprovalResult(
233
+ action="proceed",
234
+ selected_ids=["001", "003"],
235
+ steering_note="prefer emotional moments",
236
+ )
237
+ assert ApprovalResult.model_validate(result.model_dump()) == result
238
+
239
+
240
+ def test_approval_result_rejects_invalid_action():
241
+ with pytest.raises(ValidationError):
242
+ ApprovalResult(action="invalid")
243
+
244
+
245
+ def test_rating_feedback_roundtrip():
246
+ feedback = RatingFeedback(
247
+ rating=2,
248
+ issues=["wrong_moments", "other"],
249
+ free_text="needs more context",
250
+ )
251
+ assert RatingFeedback.model_validate(feedback.model_dump()) == feedback
252
+
253
+
254
+ def test_rating_feedback_rejects_invalid_rating():
255
+ with pytest.raises(ValidationError):
256
+ RatingFeedback(rating=4)
257
+
258
+
259
+ def test_session_state_roundtrip():
260
+ state = SessionState(
261
+ source_key="youtube:PdVv_vLkUgk",
262
+ iteration=3,
263
+ steering_notes=["be punchier"],
264
+ last_rating=RatingFeedback(rating=3),
265
+ last_selected_ids=["001", "002"],
266
+ )
267
+ assert SessionState.model_validate(state.model_dump()) == state
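The hook-window rules asserted above follow a standard Pydantic v2 `model_validator` pattern. A stripped-down illustration, with field names borrowed from `Clip` but everything else simplified and not taken from the actual schema module:

```python
# Minimal sketch of the "both-or-neither" hook validation, assuming hook times
# are relative to the clip in-point (as the tests above establish).
from pydantic import BaseModel, model_validator


class MiniClip(BaseModel):
    start_time_sec: float
    end_time_sec: float
    hook_start_sec: float | None = None
    hook_end_sec: float | None = None

    @model_validator(mode="after")
    def _check_hook(self) -> "MiniClip":
        has_start = self.hook_start_sec is not None
        has_end = self.hook_end_sec is not None
        if has_start != has_end:
            raise ValueError("hook_start_sec and hook_end_sec must be set together")
        if has_start and not (
            0 <= self.hook_start_sec < self.hook_end_sec
            <= self.end_time_sec - self.start_time_sec
        ):
            raise ValueError("hook window must lie within the clip duration")
        return self
```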
humeo-core/tests/test_select_clips.py ADDED
@@ -0,0 +1,49 @@
1
+ from humeo_core.primitives.select_clips import select_clips_heuristic
2
+ from humeo_core.schemas import TranscriptWord
3
+
4
+
5
+ def _words(start: float, end: float, n: int) -> list[TranscriptWord]:
6
+ step = (end - start) / max(1, n)
7
+ return [
8
+ TranscriptWord(word=f"w{i}", start_time=start + i * step, end_time=start + (i + 1) * step)
9
+ for i in range(n)
10
+ ]
11
+
12
+
13
+ def test_no_transcript_returns_single_clip():
14
+ plan = select_clips_heuristic("/tmp/x.mp4", [], duration_sec=600.0)
15
+ assert len(plan.clips) == 1
16
+
17
+
18
+ def test_prefers_dense_windows():
19
+ # dense between 30-90, sparse elsewhere
20
+ dense = _words(30.0, 90.0, 240) # 4 words/sec
21
+ sparse_before = _words(0.0, 30.0, 6)
22
+ sparse_after = _words(90.0, 600.0, 30)
23
+ words = sparse_before + dense + sparse_after
24
+ plan = select_clips_heuristic(
25
+ "/tmp/x.mp4", words, duration_sec=600.0, target_count=1, min_sec=30, max_sec=60
26
+ )
27
+ assert len(plan.clips) == 1
28
+ c = plan.clips[0]
29
+ assert 30 <= c.start_time_sec <= 90
30
+ assert c.end_time_sec <= 120
31
+
32
+
33
+ def test_no_overlap_when_multiple_picked():
34
+ dense_a = _words(30.0, 90.0, 240)
35
+ dense_b = _words(200.0, 260.0, 240)
36
+ words = dense_a + dense_b
37
+ plan = select_clips_heuristic(
38
+ "/tmp/x.mp4",
39
+ words,
40
+ duration_sec=400.0,
41
+ target_count=3,
42
+ min_sec=30,
43
+ max_sec=60,
44
+ )
45
+ # Should pick both dense regions without overlap.
46
+ assert len(plan.clips) >= 2
47
+ starts_ends = sorted((c.start_time_sec, c.end_time_sec) for c in plan.clips)
48
+ for (s1, e1), (s2, e2) in zip(starts_ends, starts_ends[1:]):
49
+ assert e1 <= s2
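One plausible shape for the density heuristic these tests constrain: score sliding windows by word count, then greedily pick non-overlapping winners. The window and step sizes here are assumed; the shipped `select_clips_heuristic` may pad and weight differently.

```python
# Sketch of dense-window selection with a greedy non-overlap pass.
from humeo_core.schemas import TranscriptWord


def pick_dense_windows(words: list[TranscriptWord], duration_sec: float,
                       target_count: int, win_sec: float = 45.0,
                       step_sec: float = 5.0) -> list[tuple[float, float]]:
    starts = [
        i * step_sec
        for i in range(int(max(0.0, duration_sec - win_sec) / step_sec) + 1)
    ]
    # Score each candidate window by how many words start inside it.
    scored = sorted(
        ((sum(1 for w in words if s <= w.start_time < s + win_sec), s) for s in starts),
        reverse=True,
    )
    picked: list[tuple[float, float]] = []
    for _score, s in scored:
        window = (s, s + win_sec)
        if all(window[1] <= ps or window[0] >= pe for ps, pe in picked):
            picked.append(window)  # keeps the no-overlap invariant the test checks
        if len(picked) == target_count:
            break
    return sorted(picked)
```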
humeo-core/tests/test_server_tools.py ADDED
@@ -0,0 +1,93 @@
1
+ """Exercise the MCP server tools as plain Python callables.
2
+
3
+ FastMCP tools are registered on the server instance, but the underlying
4
+ functions are ordinary Python functions decorated with ``@mcp.tool()``.
5
+ We import the module and invoke those functions directly to verify the
6
+ end-to-end wiring (schemas validated, dispatch correct, JSON-serializable).
7
+ """
8
+
9
+ import humeo_core.server as srv
10
+ from humeo_core.schemas import LayoutKind
11
+
12
+
13
+ def test_list_layouts_lists_all_kinds():
14
+ result = srv.list_layouts()
15
+ kinds = {layout["kind"] for layout in result["layouts"]}
16
+ assert kinds == {k.value for k in LayoutKind}
17
+
18
+
19
+ def test_plan_layout_tool_returns_filtergraph():
20
+ for k in LayoutKind:
21
+ out = srv.plan_layout(layout=k.value)
22
+ assert out["out_label"] == "vout"
23
+ assert "[vout]" in out["filtergraph"]
24
+
25
+
26
+ def test_build_render_cmd_dry_run():
27
+ req = {
28
+ "source_path": "/tmp/src.mp4",
29
+ "clip": {
30
+ "clip_id": "1",
31
+ "topic": "t",
32
+ "start_time_sec": 0.0,
33
+ "end_time_sec": 30.0,
34
+ },
35
+ "layout": {"clip_id": "1", "layout": LayoutKind.SIT_CENTER.value},
36
+ "output_path": "/tmp/out.mp4",
37
+ }
38
+ out = srv.build_render_cmd(request=req)
39
+ assert out["success"] is True
40
+ assert out["output_path"] == "/tmp/out.mp4"
41
+ assert any("-filter_complex" == part for part in out["ffmpeg_cmd"])
42
+
43
+
44
+ def test_select_clips_tool_happy_path():
45
+ words = [
46
+ {"word": f"w{i}", "start_time": float(i), "end_time": float(i) + 0.5}
47
+ for i in range(120)
48
+ ]
49
+ plan = srv.select_clips(
50
+ source_path="/tmp/x.mp4",
51
+ transcript_words=words,
52
+ duration_sec=120.0,
53
+ target_count=2,
54
+ min_sec=30.0,
55
+ max_sec=60.0,
56
+ )
57
+ assert plan["source_path"] == "/tmp/x.mp4"
58
+ assert 1 <= len(plan["clips"]) <= 2
59
+
60
+
61
+ def test_classify_scenes_tool_no_keyframes():
62
+ scenes = [{"scene_id": "s0", "start_time": 0.0, "end_time": 5.0}]
63
+ out = srv.classify_scenes(scenes=scenes)
64
+ assert out["classifications"][0]["scene_id"] == "s0"
65
+ assert out["classifications"][0]["layout"] in {k.value for k in LayoutKind}
66
+
67
+
68
+ def test_detect_scene_regions_returns_jobs_and_prompt():
69
+ scenes = [
70
+ {"scene_id": "s0", "start_time": 0.0, "end_time": 5.0, "keyframe_path": "/tmp/k0.jpg"},
71
+ {"scene_id": "s1", "start_time": 5.0, "end_time": 10.0, "keyframe_path": "/tmp/k1.jpg"},
72
+ ]
73
+ out = srv.detect_scene_regions(scenes=scenes)
74
+ assert "STRICT JSON" in out["prompt"]
75
+ assert len(out["jobs"]) == 2
76
+ assert out["jobs"][0]["scene_id"] == "s0"
77
+ assert out["jobs"][0]["keyframe_path"] == "/tmp/k0.jpg"
78
+
79
+
80
+ def test_classify_scenes_with_vision_derives_instructions():
81
+ regions = [
82
+ {
83
+ "scene_id": "s0",
84
+ "chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 0.66, "y2": 1.0},
85
+ "person_bbox": {"x1": 0.72, "y1": 0.1, "x2": 0.99, "y2": 0.95},
86
+ "ocr_text": "CPI YoY",
87
+ }
88
+ ]
89
+ out = srv.classify_scenes_with_vision(regions=regions)
90
+ assert out["classifications"][0]["layout"] == LayoutKind.SPLIT_CHART_PERSON.value
91
+ instr = out["layout_instructions"][0]
92
+ assert instr["chart_x_norm"] == 0.0
93
+ assert 0.8 < instr["person_x_norm"] < 0.9
humeo-core/tests/test_vision.py ADDED
@@ -0,0 +1,228 @@
1
+ """Tests for the scene-change + vision-LLM + OCR bbox primitive.
2
+
3
+ Covers:
4
+ * happy path: well-formed JSON -> populated ``SceneRegions``.
5
+ * bad JSON: degrade to empty regions + raw_reason, never raise.
6
+ * bad bbox: one malformed bbox does not take down the whole scene record.
7
+ * classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT.
8
+ * layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come
9
+ from the bboxes when present, defaults when not.
10
+ """
11
+
12
+ import json
13
+
14
+ import pytest
15
+
16
+ from humeo_core.primitives.vision import (
17
+ _CHART_WIDTH_SPLIT_THRESHOLD,
18
+ classify_from_regions,
19
+ classify_scenes_with_vision_llm,
20
+ detect_regions_with_llm,
21
+ layout_instruction_from_regions,
22
+ )
23
+ from humeo_core.schemas import (
24
+ BoundingBox,
25
+ LayoutKind,
26
+ Scene,
27
+ SceneClassification,
28
+ SceneRegions,
29
+ )
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Schema
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
+ def test_bounding_box_requires_x2_gt_x1():
38
+ BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2)
39
+ with pytest.raises(ValueError):
40
+ BoundingBox(x1=0.2, y1=0.1, x2=0.1, y2=0.2)
41
+ with pytest.raises(ValueError):
42
+ BoundingBox(x1=0.1, y1=0.2, x2=0.2, y2=0.1)
43
+
44
+
45
+ def test_bounding_box_center_and_width():
46
+ b = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9)
47
+ assert b.center_x == pytest.approx(0.4)
48
+ assert b.center_y == pytest.approx(0.65)
49
+ assert b.width == pytest.approx(0.4)
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # detect_regions_with_llm
54
+ # ---------------------------------------------------------------------------
55
+
56
+
57
+ def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene:
58
+ return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf)
59
+
60
+
61
+ def test_detect_regions_happy_path():
62
+ scenes = [_scene(0)]
63
+
64
+ def vision_fn(_img: str, _prompt: str) -> str:
65
+ return json.dumps(
66
+ {
67
+ "person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9},
68
+ "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8},
69
+ "ocr_text": "Inflation YoY",
70
+ "reason": "explainer layout",
71
+ }
72
+ )
73
+
74
+ out = detect_regions_with_llm(scenes, vision_fn)
75
+ assert len(out) == 1
76
+ r = out[0]
77
+ assert r.scene_id == "s0"
78
+ assert r.person_bbox and r.person_bbox.center_x > 0.8
79
+ assert r.chart_bbox and r.chart_bbox.width > 0.6
80
+ assert "Inflation" in r.ocr_text
81
+
82
+
83
+ def test_detect_regions_bad_json_is_safe():
84
+ scenes = [_scene(0)]
85
+
86
+ def vision_fn(*_a) -> str:
87
+ return "not json"
88
+
89
+ out = detect_regions_with_llm(scenes, vision_fn)
90
+ assert out[0].person_bbox is None
91
+ assert out[0].chart_bbox is None
92
+ assert "parse error" in out[0].raw_reason.lower()
93
+
94
+
95
+ def test_detect_regions_missing_keyframe_is_safe():
96
+ scenes = [_scene(0, kf=None)]
97
+
98
+ def vision_fn(*_a) -> str: # pragma: no cover - should not be called
99
+ raise AssertionError("vision_fn must not be called without a keyframe")
100
+
101
+ out = detect_regions_with_llm(scenes, vision_fn)
102
+ assert out[0].person_bbox is None
103
+ assert "no keyframe" in out[0].raw_reason.lower()
104
+
105
+
106
+ def test_detect_regions_bad_bbox_degrades_gracefully():
107
+ scenes = [_scene(0)]
108
+
109
+ def vision_fn(*_a) -> str:
110
+ return json.dumps(
111
+ {
112
+ "person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9},
113
+ "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95},
114
+ "ocr_text": "",
115
+ "reason": "person bbox inverted",
116
+ }
117
+ )
118
+
119
+ out = detect_regions_with_llm(scenes, vision_fn)
120
+ assert out[0].person_bbox is None
121
+ assert out[0].chart_bbox is not None
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # classify_from_regions
126
+ # ---------------------------------------------------------------------------
127
+
128
+
129
+ def test_classify_wide_chart_is_split():
130
+ r = SceneRegions(
131
+ scene_id="s0",
132
+ chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0),
133
+ person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95),
134
+ )
135
+ c = classify_from_regions(r)
136
+ assert c.layout == LayoutKind.SPLIT_CHART_PERSON
137
+ assert c.confidence > 0.5
138
+
139
+
140
+ def test_classify_narrow_chart_not_split():
141
+ r = SceneRegions(
142
+ scene_id="s0",
143
+ chart_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4),
144
+ person_bbox=BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95),
145
+ )
146
+ c = classify_from_regions(r)
147
+ # chart width (0.1) is below the split threshold -> not split
148
+ assert c.layout != LayoutKind.SPLIT_CHART_PERSON
149
+
150
+
151
+ def test_classify_wide_person_is_zoom_call():
152
+ r = SceneRegions(
153
+ scene_id="s0",
154
+ person_bbox=BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98),
155
+ )
156
+ c = classify_from_regions(r)
157
+ assert c.layout == LayoutKind.ZOOM_CALL_CENTER
158
+
159
+
160
+ def test_classify_small_person_is_sit_center():
161
+ r = SceneRegions(
162
+ scene_id="s0",
163
+ person_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8),
164
+ )
165
+ c = classify_from_regions(r)
166
+ assert c.layout == LayoutKind.SIT_CENTER
167
+
168
+
169
+ def test_classify_nothing_detected_defaults_sit_center_low_conf():
170
+ r = SceneRegions(scene_id="s0", raw_reason="model returned null")
171
+ c = classify_from_regions(r)
172
+ assert c.layout == LayoutKind.SIT_CENTER
173
+ assert c.confidence <= 0.5
174
+
175
+
176
+ def test_chart_threshold_is_exported():
177
+ # guard against the tuning constant silently being removed
178
+ assert 0.0 < _CHART_WIDTH_SPLIT_THRESHOLD < 1.0
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # layout_instruction_from_regions
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
+ def test_layout_instruction_from_regions_split():
187
+ r = SceneRegions(
188
+ scene_id="s0",
189
+ chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0),
190
+ person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95),
191
+ )
192
+ c = classify_from_regions(r)
193
+ instr = layout_instruction_from_regions(r, c)
194
+ assert instr.layout == LayoutKind.SPLIT_CHART_PERSON
195
+ # person_x_norm = center of (0.72, 0.99) = 0.855
196
+ assert instr.person_x_norm == pytest.approx(0.855, rel=1e-3)
197
+ # chart_x_norm = left edge = 0.0
198
+ assert instr.chart_x_norm == pytest.approx(0.0)
199
+
200
+
201
+ def test_layout_instruction_defaults_when_no_regions():
202
+ r = SceneRegions(scene_id="s0")
203
+ c = SceneClassification(
204
+ scene_id="s0", layout=LayoutKind.SIT_CENTER, confidence=0.3, reason="default"
205
+ )
206
+ instr = layout_instruction_from_regions(r, c)
207
+ assert instr.person_x_norm == 0.5
208
+ assert instr.chart_x_norm == 0.0
209
+
210
+
211
+ def test_classify_scenes_with_vision_llm_returns_pairs():
212
+ scenes = [_scene(0)]
213
+
214
+ def vision_fn(*_a) -> str:
215
+ return json.dumps(
216
+ {
217
+ "person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95},
218
+ "chart_bbox": None,
219
+ "ocr_text": "",
220
+ "reason": "solo subject",
221
+ }
222
+ )
223
+
224
+ pairs = classify_scenes_with_vision_llm(scenes, vision_fn)
225
+ assert len(pairs) == 1
226
+ regions, classification = pairs[0]
227
+ assert regions.person_bbox is not None
228
+ assert classification.layout == LayoutKind.ZOOM_CALL_CENTER
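A decision-tree sketch matching the classifications asserted above. The threshold constants are illustrative stand-ins for the exported tuning values (`_CHART_WIDTH_SPLIT_THRESHOLD` and whatever governs the zoom cutoff), not the real numbers:

```python
# Illustrative dispatch: wide chart + person -> SPLIT; wide person -> ZOOM;
# otherwise SIT_CENTER, at low confidence when nothing was detected.
from humeo_core.schemas import LayoutKind, SceneClassification, SceneRegions

CHART_WIDTH_SPLIT = 0.6   # assumed stand-in for _CHART_WIDTH_SPLIT_THRESHOLD
PERSON_WIDTH_ZOOM = 0.7   # assumed "person fills most of the frame" cutoff


def classify_sketch(r: SceneRegions) -> SceneClassification:
    if r.chart_bbox and r.person_bbox and r.chart_bbox.width >= CHART_WIDTH_SPLIT:
        return SceneClassification(
            scene_id=r.scene_id, layout=LayoutKind.SPLIT_CHART_PERSON,
            confidence=0.8, reason="wide chart + person",
        )
    if r.person_bbox and r.person_bbox.width >= PERSON_WIDTH_ZOOM:
        return SceneClassification(
            scene_id=r.scene_id, layout=LayoutKind.ZOOM_CALL_CENTER,
            confidence=0.7, reason="person dominates the frame",
        )
    conf = 0.6 if r.person_bbox else 0.3  # nothing detected -> low-confidence default
    return SceneClassification(
        scene_id=r.scene_id, layout=LayoutKind.SIT_CENTER,
        confidence=conf, reason="default seated framing",
    )
```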
pyproject.toml ADDED
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "humeo"
7
+ version = "0.1.0"
8
+ description = "Automated podcast-to-shorts pipeline"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "yt-dlp>=2024.0",
13
+ "fastapi>=0.115",
14
+ "openai>=1.0",
15
+ "google-genai>=1.0",
16
+ "httpx>=0.28",
17
+ "jinja2>=3.1",
18
+ "numpy>=1.24",
19
+ "Pillow>=10.0",
20
+ "python-dotenv>=1.0",
21
+ "replicate>=0.34.2",
22
+ "tqdm>=4.60",
23
+ "python-multipart>=0.0.9",
24
+ "uvicorn[standard]>=0.30",
25
+ "humeo-core",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest-asyncio>=0.23",
31
+ "ruff",
32
+ "pytest",
33
+ ]
34
+ whisper = [
35
+ "whisperx @ git+https://github.com/m-bain/whisperX.git",
36
+ ]
37
+
38
+ [tool.uv.sources]
39
+ humeo-core = { path = "humeo-core", editable = true }
40
+
41
+ [project.scripts]
42
+ humeo = "humeo.cli:main"
43
+
44
+ [tool.setuptools.packages.find]
45
+ where = ["src"]
46
+
47
+ [tool.setuptools.package-data]
48
+ humeo = ["prompts/*.jinja2"]
49
+
50
+ [tool.pytest.ini_options]
51
+ testpaths = ["tests", "humeo-core/tests"]
52
+ addopts = "-ra -q"
53
+
54
+ [tool.ruff]
55
+ line-length = 100
56
+ target-version = "py310"
src/humeo.egg-info/PKG-INFO ADDED
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: humeo
3
+ Version: 0.1.0
4
+ Summary: Automated podcast-to-shorts pipeline
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: yt-dlp>=2024.0
9
+ Requires-Dist: openai>=1.0
10
+ Requires-Dist: google-genai>=1.0
11
+ Requires-Dist: httpx>=0.28
12
+ Requires-Dist: jinja2>=3.1
13
+ Requires-Dist: numpy>=1.24
14
+ Requires-Dist: Pillow>=10.0
15
+ Requires-Dist: python-dotenv>=1.0
16
+ Requires-Dist: replicate>=0.34.2
17
+ Requires-Dist: tqdm>=4.60
18
+ Requires-Dist: humeo-core
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
21
+ Requires-Dist: ruff; extra == "dev"
22
+ Requires-Dist: pytest; extra == "dev"
23
+ Provides-Extra: whisper
24
+ Requires-Dist: whisperx @ git+https://github.com/m-bain/whisperX.git ; extra == "whisper"
25
+ Dynamic: license-file
26
+
27
+ ---
28
+ title: Humeo
29
+ sdk: docker
30
+ app_port: 7860
31
+ ---
32
+
33
+ # Humeo
34
+
35
+ Current default preset:
36
+
37
+ - `native_highlight` captions
38
+ - OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages
39
+ - Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available
40
+ - ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set
41
+
42
+ Long podcast or interview → vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render.
43
+
44
+ **Architecture (static HTML, GitHub Pages):**
45
+ [https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html)
46
+
47
+ ## Hugging Face Space
48
+
49
+ This repo includes a Hugging Face Docker Space entrypoint in `app.py`.
50
+
51
+ - Upload one local MP4
52
+ - Watch live pipeline logs and stage progress
53
+ - Download rendered `short_*.mp4` clips from the UI
54
+
55
+ Required Space secrets:
56
+
57
+ - `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
58
+ - `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
59
+
60
+ The Docker image pins `HUMEO_TRANSCRIBE_PROVIDER=openai` for the Space demo.
61
+
62
+ ## Repo layout
63
+
64
+ | Path | Role |
65
+ |------|------|
66
+ | `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters |
67
+ | `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server |
68
+
69
+ ## Pipeline (actual order)
70
+
71
+ ```text
72
+ YouTube URL
73
+ → ingest (source.mp4, transcript.json)
74
+ → clip selection (Gemini → clips.json)
75
+ → hook detection (Gemini → hooks.json)
76
+ → content pruning (Gemini → prune.json)
77
+ → keyframes + layout vision (Gemini vision → layout_vision.json)
78
+ → ASS subtitles + humeo-core ffmpeg render → short_<id>.mp4
79
+ ```
80
+
81
+ Details: **`docs/PIPELINE.md`**.
82
+
83
+ ## Five layouts
84
+
85
+ A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**).
86
+
87
+ ## Requirements
88
+
89
+ - **Python** ≥ 3.10
90
+ - **`uv`** — install: [astral.sh/uv](https://docs.astral.sh/uv/)
91
+ - **`ffmpeg`** — on `PATH` for extract/render
92
+ - **API keys** — see **`docs/ENVIRONMENT.md`**
93
+ - `GOOGLE_API_KEY` or `GEMINI_API_KEY` — preferred for Gemini stages
94
+ - `OPENROUTER_API_KEY` — supported fallback for those same Gemini-like stages when Google keys are unavailable
95
+ - `OPENAI_API_KEY` — if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`)
96
+
97
+ Copy **`.env.example`** → **`.env`** (never commit `.env`).
98
+
99
+ ## Install
100
+
101
+ ```bash
102
+ uv venv
103
+ uv sync
104
+ ```
105
+
106
+ Optional local WhisperX (heavy; Windows often uses OpenAI API instead):
107
+
108
+ ```bash
109
+ uv sync --extra whisper
110
+ ```
111
+
112
+ ## Run
113
+
114
+ ```bash
115
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
116
+ humeo --long-to-shorts "C:\path\to\video.mp4"
117
+ ```
118
+
119
+ Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**).
120
+
121
+ ## CLI guide (all flags)
122
+
123
+ Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`.
124
+
125
+ ### Required
126
+
127
+ | Flag | Meaning |
128
+ |------|---------|
129
+ | `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). |
130
+
131
+ ### Paths and cache behavior
132
+
133
+ | Flag | Meaning |
134
+ |------|---------|
135
+ | `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). |
136
+ | `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). |
137
+ | `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. |
138
+ | `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). |
139
+ | `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. |
140
+
141
+ ### Model selection and stage forcing
142
+
143
+ | Flag | Meaning |
144
+ |------|---------|
145
+ | `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). |
146
+ | `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). |
147
+ | `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. |
148
+ | `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. |
149
+ | `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. |
150
+ | `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. |
151
+ | `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). |
152
+
153
+ ### Pruning and subtitles
154
+
155
+ | Flag | Meaning |
156
+ |------|---------|
157
+ | `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). |
158
+ | `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). |
159
+ | `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). |
160
+ | `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). |
161
+ | `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). |
162
+
163
+ ### Logging
164
+
165
+ | Flag | Meaning |
166
+ |------|---------|
167
+ | `--verbose`, `-v` | Enable debug logging. |
168
+
169
+ ### Common command recipes
170
+
171
+ ```bash
172
+ # Basic run
173
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
174
+
175
+ # Local MP4
176
+ humeo --long-to-shorts "C:\path\to\video.mp4"
177
+
178
+ # Full fresh run for debugging / prompt tuning
179
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose
180
+
181
+ # Re-run only clip selection after prompt edits
182
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection
183
+
184
+ # Keep intermediates in a fixed local folder
185
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work
186
+
187
+ # Compare different prune levels on same source
188
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative
189
+ humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive
190
+ ```
191
+
192
+ ## Documentation
193
+
194
+ | Doc | Purpose |
195
+ |-----|---------|
196
+ | **`docs/README.md`** | Index of all files under `docs/` |
197
+ | **`docs/STUDY_ORDER.md`** | Read order for onboarding |
198
+ | **`docs/PIPELINE.md`** | Stages, caches, JSON contracts |
199
+ | **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout |
200
+ | **`docs/SHARING.md`** | How to share logs/docs/video without bloating git |
201
+ | **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example |
202
+ | **`docs/full_run_output.txt`** | Example full run log (text) |
203
+ | **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping §9 |
204
+ | **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap |
205
+ | **`docs/TODO.md`** | Backlog |
206
+ | **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) |
207
+ | **`docs/SOLUTIONS.md`** | Design rationale |
208
+ | **`TERMINOLOGY.md`** | Glossary |
209
+
210
+ ## Tests
211
+
212
+ ```bash
213
+ uv sync --extra dev
214
+ uv run pytest
215
+ ```
216
+
217
+ ## Sharing outputs
218
+
219
+ `output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**.
220
+
221
+ ## License
222
+
223
+ See **`LICENSE`** (root) and **`humeo-core/LICENSE`**.
src/humeo.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,58 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/humeo/__init__.py
5
+ src/humeo/best_of.py
6
+ src/humeo/cli.py
7
+ src/humeo/clip_assembly.py
8
+ src/humeo/clip_selection_cache.py
9
+ src/humeo/clip_selector.py
10
+ src/humeo/config.py
11
+ src/humeo/content_pruning.py
12
+ src/humeo/cutter.py
13
+ src/humeo/env.py
14
+ src/humeo/gemini_generate.py
15
+ src/humeo/hook_detector.py
16
+ src/humeo/hook_library.py
17
+ src/humeo/ingest.py
18
+ src/humeo/interactive.py
19
+ src/humeo/layout_vision.py
20
+ src/humeo/pipeline.py
21
+ src/humeo/prompt_loader.py
22
+ src/humeo/reframe_ffmpeg.py
23
+ src/humeo/render_window.py
24
+ src/humeo/session_state.py
25
+ src/humeo/transcript_align.py
26
+ src/humeo/video_cache.py
27
+ src/humeo.egg-info/PKG-INFO
28
+ src/humeo.egg-info/SOURCES.txt
29
+ src/humeo.egg-info/dependency_links.txt
30
+ src/humeo.egg-info/entry_points.txt
31
+ src/humeo.egg-info/requires.txt
32
+ src/humeo.egg-info/top_level.txt
33
+ src/humeo/prompts/clip_selection_system.jinja2
34
+ src/humeo/prompts/clip_selection_user.jinja2
35
+ src/humeo/prompts/content_pruning_system.jinja2
36
+ src/humeo/prompts/hook_detection_system.jinja2
37
+ tests/test_ass_subtitles.py
38
+ tests/test_best_of.py
39
+ tests/test_clip_assembly.py
40
+ tests/test_clip_ranking.py
41
+ tests/test_clip_selection_cache.py
42
+ tests/test_clip_selector.py
43
+ tests/test_content_pruning.py
44
+ tests/test_cutter_native_highlight.py
45
+ tests/test_gemini_generate.py
46
+ tests/test_hook_detector.py
47
+ tests/test_hook_library.py
48
+ tests/test_ingest_openai_chunks.py
49
+ tests/test_interactive.py
50
+ tests/test_layout_vision_unit.py
51
+ tests/test_pipeline_interactive.py
52
+ tests/test_pipeline_quality_gate.py
53
+ tests/test_prompt_loader.py
54
+ tests/test_reframe_ffmpeg.py
55
+ tests/test_render_window.py
56
+ tests/test_session_state.py
57
+ tests/test_transcript_align.py
58
+ tests/test_video_cache.py
src/humeo.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
src/humeo.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ humeo = humeo.cli:main
src/humeo.egg-info/requires.txt ADDED
@@ -0,0 +1,19 @@
1
+ yt-dlp>=2024.0
2
+ openai>=1.0
3
+ google-genai>=1.0
4
+ httpx>=0.28
5
+ jinja2>=3.1
6
+ numpy>=1.24
7
+ Pillow>=10.0
8
+ python-dotenv>=1.0
9
+ replicate>=0.34.2
10
+ tqdm>=4.60
11
+ humeo-core
12
+
13
+ [dev]
14
+ pytest-asyncio>=0.23
15
+ ruff
16
+ pytest
17
+
18
+ [whisper]
19
+ whisperx @ git+https://github.com/m-bain/whisperX.git