Spaces:
Sleeping
Sleeping
Deploy ClipForge Docker Space
Browse files · This view is limited to 50 files because it contains too many changes. See raw diff.
- .dockerignore +15 -0
- .gitattributes +1 -0
- Dockerfile +21 -0
- LICENSE +21 -0
- README.md +199 -10
- app.py +808 -0
- humeo-core/.gitignore +9 -0
- humeo-core/LICENSE +21 -0
- humeo-core/README.md +165 -0
- humeo-core/docs/ARCHITECTURE.md +128 -0
- humeo-core/docs/MCP_USAGE.md +100 -0
- humeo-core/examples/render_request.json +23 -0
- humeo-core/pyproject.toml +46 -0
- humeo-core/src/humeo_core.egg-info/PKG-INFO +197 -0
- humeo-core/src/humeo_core.egg-info/SOURCES.txt +33 -0
- humeo-core/src/humeo_core.egg-info/dependency_links.txt +1 -0
- humeo-core/src/humeo_core.egg-info/entry_points.txt +3 -0
- humeo-core/src/humeo_core.egg-info/requires.txt +21 -0
- humeo-core/src/humeo_core.egg-info/top_level.txt +1 -0
- humeo-core/src/humeo_core/__init__.py +49 -0
- humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf +0 -0
- humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt +93 -0
- humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt +93 -0
- humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf +3 -0
- humeo-core/src/humeo_core/primitives/__init__.py +1 -0
- humeo-core/src/humeo_core/primitives/classify.py +232 -0
- humeo-core/src/humeo_core/primitives/compile.py +602 -0
- humeo-core/src/humeo_core/primitives/face_detect.py +135 -0
- humeo-core/src/humeo_core/primitives/ingest.py +187 -0
- humeo-core/src/humeo_core/primitives/layouts.py +707 -0
- humeo-core/src/humeo_core/primitives/select_clips.py +150 -0
- humeo-core/src/humeo_core/primitives/vision.py +210 -0
- humeo-core/src/humeo_core/schemas.py +518 -0
- humeo-core/src/humeo_core/server.py +332 -0
- humeo-core/tests/__init__.py +0 -0
- humeo-core/tests/test_classify.py +39 -0
- humeo-core/tests/test_compile.py +329 -0
- humeo-core/tests/test_face_detect.py +73 -0
- humeo-core/tests/test_layout_bbox.py +17 -0
- humeo-core/tests/test_layouts.py +312 -0
- humeo-core/tests/test_schemas.py +267 -0
- humeo-core/tests/test_select_clips.py +49 -0
- humeo-core/tests/test_server_tools.py +93 -0
- humeo-core/tests/test_vision.py +228 -0
- pyproject.toml +56 -0
- src/humeo.egg-info/PKG-INFO +223 -0
- src/humeo.egg-info/SOURCES.txt +58 -0
- src/humeo.egg-info/dependency_links.txt +1 -0
- src/humeo.egg-info/entry_points.txt +2 -0
- src/humeo.egg-info/requires.txt +19 -0
.dockerignore
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.env
|
| 3 |
+
.env.*
|
| 4 |
+
!.env.example
|
| 5 |
+
.venv
|
| 6 |
+
__pycache__
|
| 7 |
+
.pytest_cache
|
| 8 |
+
.humeo_*
|
| 9 |
+
.tmp_review_frames
|
| 10 |
+
.tmp_review_frames_ticketc
|
| 11 |
+
output
|
| 12 |
+
output*
|
| 13 |
+
*.log
|
| 14 |
+
*.zip
|
| 15 |
+
*.pyc
|
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim-bookworm

ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PORT=7860

WORKDIR /app

# ffmpeg is required at runtime for extract/render. Recommended packages are
# skipped so the image only carries what ffmpeg itself depends on.
RUN apt-get update && \
    apt-get install -y --no-install-recommends ffmpeg && \
    rm -rf /var/lib/apt/lists/*

COPY . /app

# humeo-core is installed first, then the root package.
RUN pip install --upgrade pip && \
    pip install ./humeo-core && \
    pip install .

EXPOSE 7860

CMD ["python", "app.py"]
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 NotABot
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,10 +1,199 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: ClipForge
|
| 3 |
+
sdk: docker
|
| 4 |
+
app_port: 7860
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# ClipForge
|
| 8 |
+
|
| 9 |
+
Current default preset:
|
| 10 |
+
|
| 11 |
+
- `native_highlight` captions
|
| 12 |
+
- OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages
|
| 13 |
+
- Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available
|
| 14 |
+
- ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set
|
| 15 |
+
|
| 16 |
+
Long podcast or interview → vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render.
|
| 17 |
+
|
| 18 |
+
**Architecture (static HTML, GitHub Pages):**
|
| 19 |
+
[https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html)
|
| 20 |
+
|
| 21 |
+
## Hugging Face Space
|
| 22 |
+
|
| 23 |
+
This repo includes a Hugging Face Docker Space entrypoint in `app.py` with the ClipForge upload/link UI.
|
| 24 |
+
|
| 25 |
+
- Paste a YouTube/video URL or upload one local video file
|
| 26 |
+
- Watch live pipeline progress in the ClipForge UI
|
| 27 |
+
- Preview and download rendered `short_*.mp4` clips from the UI
|
| 28 |
+
- Regenerate from the same source with a steering prompt
|
| 29 |
+
|
| 30 |
+
Required Space secrets:
|
| 31 |
+
|
| 32 |
+
- `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
|
| 33 |
+
- `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
|
| 34 |
+
|
| 35 |
+
If `HUMEO_TRANSCRIBE_PROVIDER` is not set, the Space uses ElevenLabs when
|
| 36 |
+
`ELEVENLABS_API_KEY` exists, otherwise OpenAI Whisper.
|
| 37 |
+
|
| 38 |
+
## Repo layout
|
| 39 |
+
|
| 40 |
+
| Path | Role |
|
| 41 |
+
|------|------|
|
| 42 |
+
| `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters |
|
| 43 |
+
| `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server |
|
| 44 |
+
|
| 45 |
+
## Pipeline (actual order)
|
| 46 |
+
|
| 47 |
+
```text
|
| 48 |
+
YouTube URL
|
| 49 |
+
→ ingest (source.mp4, transcript.json)
|
| 50 |
+
→ clip selection (Gemini → clips.json)
|
| 51 |
+
→ hook detection (Gemini → hooks.json)
|
| 52 |
+
→ content pruning (Gemini → prune.json)
|
| 53 |
+
→ keyframes + layout vision (Gemini vision → layout_vision.json)
|
| 54 |
+
→ ASS subtitles + humeo-core ffmpeg render → short_<id>.mp4
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
Details: **`docs/PIPELINE.md`**.
|
| 58 |
+
|
| 59 |
+
## Five layouts
|
| 60 |
+
|
| 61 |
+
A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**).
|
| 62 |
+
|
| 63 |
+
## Requirements
|
| 64 |
+
|
| 65 |
+
- **Python** ≥ 3.10
|
| 66 |
+
- **`uv`** — install: [astral.sh/uv](https://docs.astral.sh/uv/)
|
| 67 |
+
- **`ffmpeg`** — on `PATH` for extract/render
|
| 68 |
+
- **API keys** — see **`docs/ENVIRONMENT.md`**
|
| 69 |
+
- `GOOGLE_API_KEY` or `GEMINI_API_KEY` — preferred for Gemini stages
|
| 70 |
+
- `OPENROUTER_API_KEY` — supported fallback for those same Gemini-like stages when Google keys are unavailable
|
| 71 |
+
- `OPENAI_API_KEY` — if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`)
|
| 72 |
+
|
| 73 |
+
Copy **`.env.example`** → **`.env`** (never commit `.env`).
|
| 74 |
+
|
| 75 |
+
## Install
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
uv venv
|
| 79 |
+
uv sync
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
Optional local WhisperX (heavy; Windows often uses OpenAI API instead):
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
uv sync --extra whisper
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
## Run
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
|
| 92 |
+
humeo --long-to-shorts "C:\path\to\video.mp4"
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**).
|
| 96 |
+
|
| 97 |
+
## CLI guide (all flags)
|
| 98 |
+
|
| 99 |
+
Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`.
|
| 100 |
+
|
| 101 |
+
### Required
|
| 102 |
+
|
| 103 |
+
| Flag | Meaning |
|
| 104 |
+
|------|---------|
|
| 105 |
+
| `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). |
|
| 106 |
+
|
| 107 |
+
### Paths and cache behavior
|
| 108 |
+
|
| 109 |
+
| Flag | Meaning |
|
| 110 |
+
|------|---------|
|
| 111 |
+
| `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). |
|
| 112 |
+
| `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). |
|
| 113 |
+
| `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. |
|
| 114 |
+
| `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). |
|
| 115 |
+
| `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. |
|
| 116 |
+
|
| 117 |
+
### Model selection and stage forcing
|
| 118 |
+
|
| 119 |
+
| Flag | Meaning |
|
| 120 |
+
|------|---------|
|
| 121 |
+
| `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). |
|
| 122 |
+
| `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). |
|
| 123 |
+
| `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. |
|
| 124 |
+
| `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. |
|
| 125 |
+
| `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. |
|
| 126 |
+
| `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. |
|
| 127 |
+
| `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). |
|
| 128 |
+
|
| 129 |
+
### Pruning and subtitles
|
| 130 |
+
|
| 131 |
+
| Flag | Meaning |
|
| 132 |
+
|------|---------|
|
| 133 |
+
| `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). |
|
| 134 |
+
| `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). |
|
| 135 |
+
| `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). |
|
| 136 |
+
| `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). |
|
| 137 |
+
| `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). |
|
| 138 |
+
|
| 139 |
+
### Logging
|
| 140 |
+
|
| 141 |
+
| Flag | Meaning |
|
| 142 |
+
|------|---------|
|
| 143 |
+
| `--verbose`, `-v` | Enable debug logging. |
|
| 144 |
+
|
| 145 |
+
### Common command recipes
|
| 146 |
+
|
| 147 |
+
```bash
|
| 148 |
+
# Basic run
|
| 149 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
|
| 150 |
+
|
| 151 |
+
# Local MP4
|
| 152 |
+
humeo --long-to-shorts "C:\path\to\video.mp4"
|
| 153 |
+
|
| 154 |
+
# Full fresh run for debugging / prompt tuning
|
| 155 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose
|
| 156 |
+
|
| 157 |
+
# Re-run only clip selection after prompt edits
|
| 158 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection
|
| 159 |
+
|
| 160 |
+
# Keep intermediates in a fixed local folder
|
| 161 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work
|
| 162 |
+
|
| 163 |
+
# Compare different prune levels on same source
|
| 164 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative
|
| 165 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
## Documentation
|
| 169 |
+
|
| 170 |
+
| Doc | Purpose |
|
| 171 |
+
|-----|---------|
|
| 172 |
+
| **`docs/README.md`** | Index of all files under `docs/` |
|
| 173 |
+
| **`docs/STUDY_ORDER.md`** | Read order for onboarding |
|
| 174 |
+
| **`docs/PIPELINE.md`** | Stages, caches, JSON contracts |
|
| 175 |
+
| **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout |
|
| 176 |
+
| **`docs/SHARING.md`** | How to share logs/docs/video without bloating git |
|
| 177 |
+
| **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example |
|
| 178 |
+
| **`docs/full_run_output.txt`** | Example full run log (text) |
|
| 179 |
+
| **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping §9 |
|
| 180 |
+
| **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap |
|
| 181 |
+
| **`docs/TODO.md`** | Backlog |
|
| 182 |
+
| **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) |
|
| 183 |
+
| **`docs/SOLUTIONS.md`** | Design rationale |
|
| 184 |
+
| **`TERMINOLOGY.md`** | Glossary |
|
| 185 |
+
|
| 186 |
+
## Tests
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
uv sync --extra dev
|
| 190 |
+
uv run pytest
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
## Sharing outputs
|
| 194 |
+
|
| 195 |
+
`output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**.
|
| 196 |
+
|
| 197 |
+
## License
|
| 198 |
+
|
| 199 |
+
See **`LICENSE`** (root) and **`humeo-core/LICENSE`**.
|
app.py
ADDED
|
@@ -0,0 +1,808 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import html
|
| 4 |
+
import json
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import queue
|
| 8 |
+
import re
|
| 9 |
+
import shutil
|
| 10 |
+
import subprocess
|
| 11 |
+
import sys
|
| 12 |
+
import tempfile
|
| 13 |
+
import threading
|
| 14 |
+
import time
|
| 15 |
+
import traceback
|
| 16 |
+
import uuid
|
| 17 |
+
from dataclasses import dataclass, field
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Annotated
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _bootstrap_local_paths() -> None:
    """Put the repo's bundled source trees at the front of ``sys.path``.

    Checks ``src`` and ``humeo-core/src`` next to this file. Each one that
    exists as a directory and is not already on ``sys.path`` is inserted at
    index 0.
    """
    here = Path(__file__).resolve().parent
    for source_dir in (here.joinpath("src"), here.joinpath("humeo-core", "src")):
        entry = str(source_dir)
        if entry in sys.path or not source_dir.is_dir():
            continue
        sys.path.insert(0, entry)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# Extend sys.path before the ``humeo`` imports further down so the bundled
# source trees can be imported.
_bootstrap_local_paths()
# When HUMEO_TRANSCRIBE_PROVIDER is unset or blank, default it: "elevenlabs"
# if a non-blank ELEVENLABS_API_KEY is present, otherwise "openai".
if not (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "").strip():
    os.environ["HUMEO_TRANSCRIBE_PROVIDER"] = (
        "elevenlabs" if (os.environ.get("ELEVENLABS_API_KEY") or "").strip() else "openai"
    )
|
| 35 |
+
|
| 36 |
+
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
| 37 |
+
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
| 38 |
+
|
| 39 |
+
from humeo.config import PipelineConfig
|
| 40 |
+
from humeo.pipeline import run_pipeline
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
APP_TITLE = "ClipForge"
# Format string for log records captured into a job's log buffer.
LOG_FORMAT = "%(asctime)s | %(levelname)-7s | %(name)s | %(message)s"
# Maximum number of log lines kept per job and returned in a job snapshot.
MAX_LOG_LINES = 700
# Environment variables checked for an LLM credential; one non-blank value is enough.
LLM_KEY_NAMES = ("GOOGLE_API_KEY", "GEMINI_API_KEY", "OPENROUTER_API_KEY")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class QueueLogHandler(logging.Handler):
    """Logging handler that pushes each formatted record onto a queue."""

    def __init__(self, sink: queue.Queue[str]):
        """Create a handler that writes formatted records to *sink*."""
        super().__init__()
        self._sink = sink

    def emit(self, record: logging.LogRecord) -> None:
        """Format *record* and enqueue it without blocking.

        Emission stays best-effort and never raises into the code that logged.
        Failures (formatting errors, a full queue) are passed to
        ``handleError``, the standard logging hook, instead of being
        discarded silently.
        """
        try:
            self._sink.put_nowait(self.format(record))
        except Exception:
            self.handleError(record)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@dataclass
class ClipFile:
    """A rendered clip as reported in a job snapshot."""

    name: str  # file name of the clip, e.g. a short_*.mp4 in the job output dir
    url: str  # API path the clip is served from (/api/jobs/<job id>/files/<name>)
    duration: str  # "M:SS" label
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _default_job_steps() -> list[dict[str, object]]:
    """Build a fresh progress-step list for a newly created job."""
    seed = (
        ("Uploading video", 100, "done"),
        ("Generating transcript", 5, "active"),
        ("Choosing short clips", 0, "pending"),
        ("Producing clips", 0, "pending"),
        ("Adding subtitles & light edits", 0, "pending"),
    )
    return [{"name": name, "pct": pct, "state": state} for name, pct, state in seed]


@dataclass
class Job:
    """State of one pipeline run: locations, progress, logs and rendered clips."""

    # Identity and locations.
    id: str
    run_root: Path
    output_dir: Path
    work_dir: Path
    source: str
    source_path: Path | None = None
    steering_note: str | None = None

    # Progress reported to the UI.
    status: str = "Queued"
    nav_status: str = "Processing..."
    error: str | None = None
    done: bool = False
    created_at: float = field(default_factory=time.time)

    # Per-job mutable collections; each instance gets its own.
    logs: list[str] = field(default_factory=list)
    clips: dict[str, ClipFile] = field(default_factory=dict)
    steps: list[dict[str, object]] = field(default_factory=_default_job_steps)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# In-memory registry of jobs, keyed by job id.
JOBS: dict[str, Job] = {}
# Lock taken by the job runner around lookups in JOBS and updates to a job's state.
JOBS_LOCK = threading.Lock()
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _append_log(job: Job, line: str) -> None:
    """Record *line* on the job, keeping only the newest ``MAX_LOG_LINES`` entries."""
    job.logs.append(line)
    excess = len(job.logs) - MAX_LOG_LINES
    if excess > 0:
        # Rebind to a trimmed copy so the oldest lines are dropped.
        job.logs = job.logs[excess:]
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _set_step(job: Job, idx: int, pct: int, state: str = "active") -> None:
    """Move the job's progress display to step *idx*.

    Steps before *idx* are forced to 100% / "done". The step at *idx* takes
    *state* and a percentage capped at 100 that never drops below its current
    value. Later steps are reset to "pending" unless already "done".
    """
    capped = min(100, pct)
    for position, entry in enumerate(job.steps):
        if position > idx:
            if entry.get("state") != "done":
                entry["state"] = "pending"
            continue
        if position == idx:
            entry["pct"] = max(int(entry.get("pct", 0)), capped)
            entry["state"] = state
            continue
        entry["pct"] = 100
        entry["state"] = "done"
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _update_stage_from_log(job: Job, line: str) -> None:
    """Update the job's status text and step progress from one log line.

    The line is checked against known marker substrings in a fixed order and
    only the first match is applied. Lines with no marker leave the job
    untouched.
    """
    # (markers, new status or None to keep it, step index, pct or None when
    # the percentage is derived from the number of published clips).
    rules = (
        (("STAGE 1: INGESTION",), "Generating transcript", 1, 15),
        (("Transcribing",), "Generating transcript", 1, 45),
        (("Transcript already exists", "Transcription complete"), None, 1, 90),
        (("STAGE 2: CLIP SELECTION",), "Choosing short clips", 2, 20),
        (("STAGE 2.25: HOOK DETECTION",), "Finding hooks", 2, 55),
        (("STAGE 2.5: CONTENT PRUNING",), "Tightening clip windows", 2, 78),
        (("STAGE 2.75: CLIP ASSEMBLY",), "Assembling clips", 3, 18),
        (("STAGE 3: CLIP LAYOUTS",), "Choosing layout", 3, 38),
        (("STAGE 4: RENDER",), "Producing clips", 3, 62),
        (("reframe_clip_ffmpeg",), None, 4, None),
        (("RENDER QA", "Render QA summary"), "Checking clips", 4, 82),
    )
    for markers, status, step_idx, pct in rules:
        if not any(marker in line for marker in markers):
            continue
        if status is not None:
            job.status = status
        if pct is None:
            pct = min(90, 20 + len(job.clips) * 12)
        _set_step(job, step_idx, pct)
        return

    if "PIPELINE COMPLETE" in line:
        job.status = "Complete"
        job.nav_status = "Done"
        for entry in job.steps:
            entry["pct"] = 100
            entry["state"] = "done"
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _install_log_handler(message_queue: queue.Queue[str]) -> tuple[logging.Handler, int, dict[str, int]]:
    """Attach a queue-backed handler to the root logger.

    The root logger is set to INFO and the ``urllib3``, ``httpx`` and
    ``httpcore`` loggers are set to WARNING.

    Returns:
        The installed handler, the root logger's previous level, and the
        previous level of each of those three loggers keyed by name. The
        caller can use these to undo the change.
    """
    root = logging.getLogger()
    saved_root_level = root.level

    queue_handler = QueueLogHandler(message_queue)
    queue_handler.setFormatter(logging.Formatter(LOG_FORMAT, datefmt="%H:%M:%S"))
    root.addHandler(queue_handler)
    root.setLevel(logging.INFO)

    saved_levels: dict[str, int] = {}
    for noisy_name in ("urllib3", "httpx", "httpcore"):
        noisy_logger = logging.getLogger(noisy_name)
        saved_levels[noisy_name] = noisy_logger.level
        noisy_logger.setLevel(logging.WARNING)

    return queue_handler, saved_root_level, saved_levels
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def _remove_log_handler(
    handler: logging.Handler,
    previous_root_level: int,
    previous_logger_levels: dict[str, int],
) -> None:
    """Detach *handler* from the root logger and restore saved log levels.

    Args:
        handler: Handler to remove from the root logger.
        previous_root_level: Level to set on the root logger.
        previous_logger_levels: Levels to set on named loggers, keyed by name.
    """
    root = logging.getLogger()
    root.removeHandler(handler)
    root.setLevel(previous_root_level)
    for name in previous_logger_levels:
        logging.getLogger(name).setLevel(previous_logger_levels[name])
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _duration_label(path: Path) -> str:
    """Return the media duration of *path* as an ``M:SS`` label.

    The duration is read with ``ffprobe`` and rounded to whole seconds. Any
    failure (ffprobe missing, non-zero exit, timeout, unparsable output)
    yields ``"0:00"``.
    """
    try:
        probe = subprocess.run(
            [
                "ffprobe",
                "-v",
                "error",
                "-show_entries",
                "format=duration",
                "-of",
                "default=noprint_wrappers=1:nokey=1",
                str(path),
            ],
            check=True,
            capture_output=True,
            text=True,
            timeout=15,
        )
        seconds = max(0, int(round(float(probe.stdout.strip()))))
    except Exception:
        seconds = 0

    if not seconds:
        return "0:00"
    minutes, remainder = divmod(seconds, 60)
    return f"{minutes}:{remainder:02d}"
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def _publish_files(job: Job) -> None:
    """Register rendered ``short_*.mp4`` files from the job's output directory.

    Files are visited in sorted order. Names already present in ``job.clips``
    and entries that are not regular files are skipped. Each new file is
    stored with its download URL and duration label.
    """
    for clip_path in sorted(job.output_dir.glob("short_*.mp4")):
        filename = clip_path.name
        if filename in job.clips or not clip_path.is_file():
            continue
        job.clips[filename] = ClipFile(
            name=filename,
            url=f"/api/jobs/{job.id}/files/{filename}",
            duration=_duration_label(clip_path),
        )
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def _validate_credentials() -> None:
    """Check that the secrets needed for a run are present in the environment.

    Requires one non-blank LLM key from ``LLM_KEY_NAMES`` plus the key for
    the selected transcription provider. A blank or ``auto`` provider resolves
    to ElevenLabs when its key is set, otherwise OpenAI.

    Raises:
        HTTPException: 400 when a required secret is missing.
    """

    def secret(name: str) -> str:
        # Unset and whitespace-only values both count as missing.
        return (os.environ.get(name) or "").strip()

    if not any(secret(name) for name in LLM_KEY_NAMES):
        raise HTTPException(
            status_code=400,
            detail="Missing LLM secret. Set GOOGLE_API_KEY, GEMINI_API_KEY, or OPENROUTER_API_KEY in the Space secrets.",
        )

    provider = secret("HUMEO_TRANSCRIBE_PROVIDER").lower()
    if provider in {"", "auto"}:
        provider = "elevenlabs" if secret("ELEVENLABS_API_KEY") else "openai"

    if provider == "elevenlabs":
        if not secret("ELEVENLABS_API_KEY"):
            raise HTTPException(status_code=400, detail="Missing ELEVENLABS_API_KEY Space secret.")
    elif provider in {"openai", "api"}:
        if not secret("OPENAI_API_KEY"):
            raise HTTPException(status_code=400, detail="Missing OPENAI_API_KEY Space secret.")
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def _safe_url(value: str | None) -> str | None:
|
| 238 |
+
value = (value or "").strip()
|
| 239 |
+
if not value:
|
| 240 |
+
return None
|
| 241 |
+
if not re.match(r"^https?://", value, flags=re.I):
|
| 242 |
+
raise HTTPException(status_code=400, detail="Paste a valid http(s) video URL.")
|
| 243 |
+
return value
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def _snapshot(job: Job) -> dict[str, object]:
    """Build the status payload describing *job*.

    Contains the job's id, status fields, done/error flags, the most recent
    ``MAX_LOG_LINES`` log lines joined by newlines, the step list, and one
    dict per published clip.
    """
    recent_logs = job.logs[-MAX_LOG_LINES:]
    payload: dict[str, object] = {
        attr: getattr(job, attr)
        for attr in ("id", "status", "nav_status", "done", "error")
    }
    payload["logs"] = "\n".join(recent_logs)
    payload["steps"] = job.steps
    payload["clips"] = [vars(clip) for clip in job.clips.values()]
    return payload
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def _run_job(job_id: str) -> None:
    """Run the clip pipeline for a registered job and record its outcome.

    The pipeline itself runs on a helper thread. This function supervises it:
    roughly every 0.35 s it drains log lines captured by the installed log
    handler into the job, lets ``_update_stage_from_log`` react to each line,
    and calls ``_publish_files``. When the helper thread ends, the produced
    files are registered as clips and the job is marked complete or failed.

    Args:
        job_id: Key of the job in ``JOBS``.

    Raises:
        KeyError: if ``job_id`` is not present in ``JOBS``.
        Exception: any error raised by the supervising code is re-raised after
            the job has been marked failed.
    """
    with JOBS_LOCK:
        job = JOBS[job_id]
    message_queue: queue.Queue[str] = queue.Queue()
    handler, previous_root_level, previous_logger_levels = _install_log_handler(message_queue)

    def drain_queue() -> None:
        """Move every queued log line onto the job, then refresh its files."""
        with JOBS_LOCK:
            local_job = JOBS[job_id]
            while True:
                try:
                    line = message_queue.get_nowait()
                except queue.Empty:
                    break
                _append_log(local_job, line)
                _update_stage_from_log(local_job, line)
            _publish_files(local_job)

    try:
        with JOBS_LOCK:
            _append_log(job, f"Prepared source: {job.source}")
            _append_log(job, f"Run id: {job.id}")
            _set_step(job, 1, 8)

        config = PipelineConfig(
            source=job.source,
            youtube_url=job.source,
            output_dir=job.output_dir,
            work_dir=job.work_dir,
            use_video_cache=False,
            clean_run=True,
            interactive=False,
            prune_level="balanced",
            overwrite_outputs=True,
            steering_notes=[job.steering_note] if job.steering_note else [],
        )

        worker_error: str | None = None
        outputs: list[Path] = []

        def pipeline_worker() -> None:
            """Run the pipeline, capturing its result or its failure."""
            nonlocal outputs, worker_error
            try:
                outputs = run_pipeline(config)
            except Exception as exc:
                # str(exc) is "" for exceptions raised without a message; fall
                # back to the class name so the failure is never mistaken for
                # success further down.
                worker_error = str(exc) or type(exc).__name__
                for line in traceback.format_exc().splitlines():
                    if line.strip():
                        message_queue.put_nowait(line)

        thread = threading.Thread(target=pipeline_worker, daemon=True)
        thread.start()
        while thread.is_alive():
            drain_queue()
            time.sleep(0.35)
        # Pick up anything logged between the last poll and thread exit.
        drain_queue()

        with JOBS_LOCK:
            local_job = JOBS[job_id]
            for output in outputs:
                output_path = Path(output)
                if output_path.exists():
                    local_job.clips[output_path.name] = ClipFile(
                        name=output_path.name,
                        url=f"/api/jobs/{job_id}/files/{output_path.name}",
                        duration=_duration_label(output_path),
                    )
            if worker_error is not None:
                local_job.error = worker_error
                local_job.status = f"Failed: {worker_error}"
                local_job.nav_status = "Failed"
            else:
                local_job.status = "Complete" if local_job.clips else "Complete - no clips generated"
                local_job.nav_status = "Done"
                for step in local_job.steps:
                    step["pct"] = 100
                    step["state"] = "done"
            local_job.done = True
    except Exception as exc:
        # A failure in the supervising code would otherwise leave ``done``
        # False forever, and clients polling the job would never stop.
        reason = str(exc) or type(exc).__name__
        with JOBS_LOCK:
            failed_job = JOBS.get(job_id, job)
            failed_job.error = reason
            failed_job.status = f"Failed: {reason}"
            failed_job.nav_status = "Failed"
            failed_job.done = True
        raise
    finally:
        _remove_log_handler(handler, previous_root_level, previous_logger_levels)
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
async def _stage_upload(uploaded_file: UploadFile, run_root: Path) -> Path:
    """Copy an uploaded file into *run_root* and return the staged path.

    The staged file is named ``input`` plus the suffix of the client-supplied
    filename; ``.mp4`` is used when there is no filename or no suffix. The
    upload is read in 1 MiB chunks.
    """
    client_name = uploaded_file.filename or "input.mp4"
    extension = Path(client_name).suffix or ".mp4"
    destination = run_root / f"input{extension}"
    chunk_size = 1024 * 1024
    with destination.open("wb") as sink:
        while True:
            block = await uploaded_file.read(chunk_size)
            if not block:
                break
            sink.write(block)
    return destination
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
# Application object; the route decorators that follow register handlers on it.
app = FastAPI(title=APP_TITLE)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# Serve the single-page UI: the response body is the INDEX_HTML string,
# returned with an HTML response class.
@app.get("/", response_class=HTMLResponse)
def index() -> str:
    return INDEX_HTML
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
# Create a job from a pasted URL, an uploaded file, or the source of an earlier
# job, then start processing it on a background thread.
#
# Form fields:
#   video_url      http(s) URL of the source video.
#   regen_prompt   optional steering note stored on the job.
#   source_job_id  id of an earlier job whose source is reused; takes
#                  precedence over ``file``.
#   file           uploaded video, staged inside the job's run directory.
#
# Responds with the initial job snapshot. Raises HTTPException 400 for a
# non-http(s) URL or when no source was supplied, and 404 when
# ``source_job_id`` does not match a known job. ``_validate_credentials()``
# runs first and may reject the request on its own.
@app.post("/api/jobs")
async def create_job(
    video_url: Annotated[str | None, Form()] = None,
    regen_prompt: Annotated[str | None, Form()] = None,
    source_job_id: Annotated[str | None, Form()] = None,
    file: Annotated[UploadFile | None, File()] = None,
) -> JSONResponse:
    _validate_credentials()

    # Do every check that needs no disk access before creating the run
    # directory, so a rejected request leaves nothing behind.
    source = _safe_url(video_url)
    source_job_id = (source_job_id or "").strip()
    previous = None
    if source_job_id:
        with JOBS_LOCK:
            previous = JOBS.get(source_job_id)
        if previous is None:
            raise HTTPException(status_code=404, detail="Previous job not found for regeneration.")

    job_id = uuid.uuid4().hex[:12]
    run_root = Path(tempfile.mkdtemp(prefix=f"clipforge-{job_id}-"))
    try:
        work_dir = run_root / "work"
        output_dir = run_root / "output"
        work_dir.mkdir(parents=True, exist_ok=True)
        output_dir.mkdir(parents=True, exist_ok=True)

        source_path: Path | None = None
        if previous is not None:
            if previous.source_path and previous.source_path.exists():
                # Work on a private copy so this run never touches the
                # earlier job's staged file.
                source_path = run_root / previous.source_path.name
                shutil.copy2(previous.source_path, source_path)
                source = str(source_path)
            else:
                source = previous.source
        elif file is not None:
            source_path = await _stage_upload(file, run_root)
            source = str(source_path)

        if not source:
            raise HTTPException(status_code=400, detail="Upload a video file or paste a video URL first.")

        job = Job(
            id=job_id,
            run_root=run_root,
            output_dir=output_dir,
            work_dir=work_dir,
            source=source,
            source_path=source_path,
            steering_note=(regen_prompt or "").strip() or None,
        )
    except BaseException:
        # mkdtemp directories are never removed automatically; drop this one
        # when the request is rejected, fails, or is cancelled mid-upload.
        shutil.rmtree(run_root, ignore_errors=True)
        raise

    with JOBS_LOCK:
        JOBS[job_id] = job

    threading.Thread(target=_run_job, args=(job_id,), daemon=True).start()
    return JSONResponse(_snapshot(job))
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
# Return the current snapshot of a job, refreshing its published files first.
# Responds 404 when the id is unknown.
@app.get("/api/jobs/{job_id}")
def get_job(job_id: str) -> JSONResponse:
    with JOBS_LOCK:
        found = JOBS.get(job_id)
        if found is None:
            raise HTTPException(status_code=404, detail="Job not found.")
        # Refresh and serialise under the lock, since the job runner mutates
        # the same object under it.
        _publish_files(found)
        payload = _snapshot(found)
        return JSONResponse(payload)
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
# Stream one file from a job's output directory as video/mp4.
# Only the final path component of ``filename`` is used, and the resolved path
# must lie under the resolved output directory; otherwise the response is 404.
# An unknown job id is also a 404.
@app.get("/api/jobs/{job_id}/files/{filename}")
def get_job_file(job_id: str, filename: str) -> FileResponse:
    with JOBS_LOCK:
        job = JOBS.get(job_id)
    if job is None:
        raise HTTPException(status_code=404, detail="Job not found.")
    leaf_name = Path(filename).name
    candidate = (job.output_dir / leaf_name).resolve(strict=False)
    output_root = job.output_dir.resolve(strict=False)
    is_servable = output_root in candidate.parents and candidate.is_file()
    if not is_servable:
        raise HTTPException(status_code=404, detail="File not found.")
    return FileResponse(candidate, media_type="video/mp4", filename=candidate.name)
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
# Liveness probe: always answers with the constant payload {"ok": "true"}
# (the value is the string "true", not a boolean).
@app.get("/health")
def health() -> dict[str, str]:
    return {"ok": "true"}
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
INDEX_HTML = r"""<!DOCTYPE html>
|
| 437 |
+
<html lang="en">
|
| 438 |
+
<head>
|
| 439 |
+
<meta charset="UTF-8">
|
| 440 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 441 |
+
<title>ClipForge - Video to Clips</title>
|
| 442 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 443 |
+
<link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:ital,wght@0,300;0,400;0,500;0,600;1,300;1,400&family=DM+Sans:wght@300;400;500&display=swap" rel="stylesheet">
|
| 444 |
+
<style>
|
| 445 |
+
:root {
|
| 446 |
+
--cream: #F7F2E9; --champagne: #EDE3CC; --champagne-deep: #D9C9A6;
|
| 447 |
+
--gold: #B8924A; --gold-light: #D4AA6A; --ink: #2A1F0E;
|
| 448 |
+
--ink-soft: #5C4A2E; --ink-muted: #9A8560; --white: #FDFAF4;
|
| 449 |
+
--surface: #F0E9D8; --border: #DDD0B3; --success: #6B8C5A;
|
| 450 |
+
--radius: 12px; --radius-lg: 20px;
|
| 451 |
+
}
|
| 452 |
+
* { margin: 0; padding: 0; box-sizing: border-box; }
|
| 453 |
+
body { font-family: 'DM Sans', sans-serif; background: var(--cream); color: var(--ink); min-height: 100vh; overflow-x: hidden; }
|
| 454 |
+
nav { display: flex; align-items: center; justify-content: space-between; padding: 20px 32px; border-bottom: 1px solid var(--border); background: var(--white); position: sticky; top: 0; z-index: 100; }
|
| 455 |
+
.logo { font-family: 'Cormorant Garamond', serif; font-size: 1.6rem; font-weight: 600; color: var(--ink); letter-spacing: 0.02em; }
|
| 456 |
+
.logo span { color: var(--gold); }
|
| 457 |
+
.screen { display: none; animation: fadeIn 0.5s ease; }
|
| 458 |
+
.screen.active { display: block; }
|
| 459 |
+
@keyframes fadeIn { from { opacity: 0; transform: translateY(8px); } to { opacity: 1; transform: translateY(0); } }
|
| 460 |
+
#screen-input { display: flex; flex-direction: column; align-items: center; justify-content: center; min-height: calc(100vh - 65px); padding: 40px 20px; text-align: center; }
|
| 461 |
+
.eyebrow { font-size: 0.75rem; letter-spacing: 0.18em; text-transform: uppercase; color: var(--gold); font-weight: 500; margin-bottom: 16px; }
|
| 462 |
+
.hero-title { font-family: 'Cormorant Garamond', serif; font-size: clamp(2rem, 5vw, 3.6rem); font-weight: 500; line-height: 1.15; color: var(--ink); max-width: 620px; margin-bottom: 12px; }
|
| 463 |
+
.hero-title em { font-style: italic; color: var(--gold); }
|
| 464 |
+
.hero-sub { font-size: 0.95rem; color: var(--ink-muted); margin-bottom: 48px; font-weight: 300; }
|
| 465 |
+
.input-card { background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 36px; width: 100%; max-width: 520px; box-shadow: 0 8px 32px rgba(42,31,14,0.07); }
|
| 466 |
+
.mode-tabs { display: flex; background: var(--surface); border-radius: 10px; padding: 4px; margin-bottom: 28px; gap: 4px; }
|
| 467 |
+
.mode-tab { flex: 1; padding: 10px 0; border: none; background: transparent; border-radius: 8px; font-family: 'DM Sans', sans-serif; font-size: 0.85rem; font-weight: 500; color: var(--ink-muted); cursor: pointer; transition: all 0.2s; }
|
| 468 |
+
.mode-tab.active { background: var(--white); color: var(--ink); box-shadow: 0 2px 8px rgba(42,31,14,0.1); }
|
| 469 |
+
.input-section { display: none; } .input-section.active { display: block; }
|
| 470 |
+
.input-label { font-size: 0.78rem; letter-spacing: 0.08em; text-transform: uppercase; color: var(--ink-muted); margin-bottom: 8px; display: block; font-weight: 500; text-align:left; }
|
| 471 |
+
.yt-input { width: 100%; padding: 14px 16px; border: 1.5px solid var(--border); border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.9rem; background: var(--cream); color: var(--ink); outline: none; transition: border-color 0.2s; }
|
| 472 |
+
.yt-input:focus { border-color: var(--gold); } .yt-input::placeholder { color: var(--ink-muted); }
|
| 473 |
+
.upload-zone { border: 2px dashed var(--champagne-deep); border-radius: var(--radius); padding: 36px 20px; text-align: center; cursor: pointer; transition: all 0.2s; background: var(--cream); }
|
| 474 |
+
.upload-zone:hover, .upload-zone.dragover { border-color: var(--gold); background: var(--champagne); }
|
| 475 |
+
.upload-icon { width: 44px; height: 44px; background: var(--champagne); border-radius: 50%; display: flex; align-items: center; justify-content: center; margin: 0 auto 12px; font-size: 1.2rem; }
|
| 476 |
+
.upload-text { font-size: 0.9rem; color: var(--ink-soft); font-weight: 400; }
|
| 477 |
+
.upload-sub { font-size: 0.78rem; color: var(--ink-muted); margin-top: 4px; }
|
| 478 |
+
.convert-btn { width: 100%; margin-top: 28px; padding: 16px; background: var(--ink); color: var(--cream); border: none; border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.95rem; font-weight: 500; cursor: pointer; letter-spacing: 0.03em; transition: all 0.2s; position: relative; overflow: hidden; }
|
| 479 |
+
.convert-btn:hover { background: var(--ink-soft); transform: translateY(-1px); box-shadow: 0 6px 20px rgba(42,31,14,0.2); } .convert-btn:active { transform: translateY(0); }
|
| 480 |
+
.convert-btn:disabled { opacity: .65; cursor: progress; transform:none; }
|
| 481 |
+
#screen-processing { max-width: 780px; margin: 0 auto; padding: 48px 20px 80px; }
|
| 482 |
+
.processing-header { text-align: center; margin-bottom: 40px; }
|
| 483 |
+
.processing-title { font-family: 'Cormorant Garamond', serif; font-size: 2rem; font-weight: 500; color: var(--ink); margin-bottom: 6px; }
|
| 484 |
+
.processing-sub { font-size: 0.88rem; color: var(--ink-muted); font-weight: 300; }
|
| 485 |
+
.pipeline { background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 28px; box-shadow: 0 4px 20px rgba(42,31,14,0.06); margin-bottom: 32px; }
|
| 486 |
+
.pipeline-step { display: flex; align-items: flex-start; gap: 16px; padding: 16px 0; border-bottom: 1px solid var(--champagne); opacity: 0.4; transition: opacity 0.4s; }
|
| 487 |
+
.pipeline-step:last-child { border-bottom: none; } .pipeline-step.active, .pipeline-step.done { opacity: 1; }
|
| 488 |
+
.step-icon { width: 36px; height: 36px; flex-shrink: 0; background: var(--surface); border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 1rem; transition: all 0.4s; border: 1.5px solid var(--border); }
|
| 489 |
+
.pipeline-step.active .step-icon { background: var(--champagne); border-color: var(--gold); }
|
| 490 |
+
.pipeline-step.done .step-icon { background: var(--gold); border-color: var(--gold); color: white; font-size: 0.85rem; }
|
| 491 |
+
.step-content { flex: 1; padding-top: 4px; }
|
| 492 |
+
.step-name { font-size: 0.9rem; font-weight: 500; color: var(--ink); margin-bottom: 8px; display: flex; align-items: center; justify-content: space-between; }
|
| 493 |
+
.step-pct { font-size: 0.8rem; color: var(--gold); font-weight: 500; }
|
| 494 |
+
.progress-track { height: 6px; background: var(--surface); border-radius: 99px; overflow: hidden; }
|
| 495 |
+
.progress-fill { height: 100%; border-radius: 99px; background: linear-gradient(90deg, var(--gold-light), var(--gold)); width: 0%; transition: width 0.25s ease; }
|
| 496 |
+
.pipeline-step.done .progress-fill { width: 100%; background: var(--gold); }
|
| 497 |
+
.tips-section { margin-bottom: 40px; }
|
| 498 |
+
.tips-label { font-size: 0.72rem; letter-spacing: 0.14em; text-transform: uppercase; color: var(--ink-muted); margin-bottom: 12px; font-weight: 500; }
|
| 499 |
+
.tip-card { background: var(--champagne); border-radius: var(--radius); padding: 14px 18px; font-size: 0.85rem; color: var(--ink-soft); display: flex; align-items: flex-start; gap: 10px; margin-bottom: 8px; line-height: 1.5; }
|
| 500 |
+
.tip-dot { color: var(--gold); margin-top: 2px; flex-shrink: 0; }
|
| 501 |
+
.clips-section { margin-top: 8px; }
|
| 502 |
+
.clips-title { font-family: 'Cormorant Garamond', serif; font-size: 1.4rem; font-weight: 500; color: var(--ink); margin-bottom: 6px; }
|
| 503 |
+
.clips-sub { font-size: 0.82rem; color: var(--ink-muted); margin-bottom: 20px; font-weight: 300; }
|
| 504 |
+
.clips-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 16px; }
|
| 505 |
+
.clip-card { border-radius: var(--radius); overflow: hidden; cursor: pointer; background: var(--white); border: 1px solid var(--border); box-shadow: 0 2px 10px rgba(42,31,14,0.06); transition: all 0.2s; animation: clipAppear 0.5s ease both; }
|
| 506 |
+
.clip-card:hover { transform: translateY(-3px); box-shadow: 0 8px 24px rgba(42,31,14,0.13); }
|
| 507 |
+
@keyframes clipAppear { from { opacity: 0; transform: scale(0.9) translateY(10px); } to { opacity: 1; transform: scale(1) translateY(0); } }
|
| 508 |
+
.clip-thumb { aspect-ratio: 9/16; display: flex; align-items: center; justify-content: center; position: relative; overflow: hidden; }
|
| 509 |
+
.clip-play { width: 44px; height: 44px; background: rgba(255,255,255,0.88); border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 1.1rem; z-index: 2; box-shadow: 0 2px 12px rgba(0,0,0,0.2); transition: transform 0.2s; }
|
| 510 |
+
.clip-card:hover .clip-play { transform: scale(1.1); }
|
| 511 |
+
.clip-meta { padding: 10px 12px; } .clip-num { font-size: 0.72rem; color: var(--ink-muted); text-transform: uppercase; letter-spacing: 0.08em; font-weight: 500; }
|
| 512 |
+
.clip-dur { font-size: 0.82rem; color: var(--ink); font-weight: 400; margin-top: 2px; }
|
| 513 |
+
.clip-download { margin-top: 8px; display:inline-block; font-size:.74rem; color:var(--gold); text-decoration:none; }
|
| 514 |
+
.regen-section { margin-top: 56px; background: var(--white); border: 1px solid var(--border); border-radius: var(--radius-lg); padding: 32px; display: none; animation: fadeIn 0.5s ease; box-shadow: 0 4px 20px rgba(42,31,14,0.06); }
|
| 515 |
+
.regen-title { font-family: 'Cormorant Garamond', serif; font-size: 1.5rem; font-weight: 500; margin-bottom: 6px; }
|
| 516 |
+
.regen-sub { font-size: 0.85rem; color: var(--ink-muted); margin-bottom: 20px; font-weight: 300; }
|
| 517 |
+
.regen-textarea { width: 100%; min-height: 100px; padding: 14px 16px; border: 1.5px solid var(--border); border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.9rem; background: var(--cream); color: var(--ink); outline: none; resize: vertical; transition: border-color 0.2s; line-height: 1.6; margin-bottom: 14px; }
|
| 518 |
+
.regen-textarea:focus { border-color: var(--gold); } .regen-textarea::placeholder { color: var(--ink-muted); }
|
| 519 |
+
.regen-row { display: flex; gap: 10px; align-items: center; flex-wrap: wrap; }
|
| 520 |
+
.chip { padding: 7px 14px; background: var(--champagne); border: 1px solid var(--border); border-radius: 99px; font-size: 0.78rem; color: var(--ink-soft); cursor: pointer; transition: all 0.15s; font-weight: 400; white-space: nowrap; }
|
| 521 |
+
.chip:hover { background: var(--champagne-deep); color: var(--ink); border-color: var(--gold); }
|
| 522 |
+
.regen-btn { margin-left: auto; padding: 12px 24px; background: var(--ink); color: var(--cream); border: none; border-radius: var(--radius); font-family: 'DM Sans', sans-serif; font-size: 0.88rem; font-weight: 500; cursor: pointer; transition: all 0.2s; white-space: nowrap; }
|
| 523 |
+
.regen-btn:hover { background: var(--ink-soft); }
|
| 524 |
+
.modal-overlay { display: none; position: fixed; inset: 0; background: rgba(42,31,14,0.65); backdrop-filter: blur(6px); z-index: 500; align-items: center; justify-content: center; padding: 20px; animation: fadeIn 0.25s ease; }
|
| 525 |
+
.modal-overlay.open { display: flex; }
|
| 526 |
+
.modal-box { background: var(--white); border-radius: var(--radius-lg); width: 100%; max-width: 390px; overflow: hidden; box-shadow: 0 24px 64px rgba(42,31,14,0.25); animation: slideUp 0.3s ease; }
|
| 527 |
+
@keyframes slideUp { from { opacity: 0; transform: translateY(20px) scale(0.97); } to { opacity: 1; transform: translateY(0) scale(1); } }
|
| 528 |
+
.modal-video { aspect-ratio: 9/16; max-height: 70vh; display: flex; align-items: center; justify-content: center; position: relative; background:var(--ink); }
|
| 529 |
+
.modal-video video { width:100%; height:100%; object-fit:contain; background:#000; }
|
| 530 |
+
.modal-footer { padding: 16px 20px; border-top: 1px solid var(--border); display: flex; align-items: center; justify-content: space-between; gap:12px; }
|
| 531 |
+
.modal-clip-label { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 500; }
|
| 532 |
+
.modal-actions { display:flex; align-items:center; gap:8px; }
|
| 533 |
+
.modal-close, .modal-download { padding: 8px 14px; background: var(--surface); border: 1px solid var(--border); border-radius: 8px; font-family: 'DM Sans', sans-serif; font-size: 0.82rem; cursor: pointer; transition: all 0.15s; color:var(--ink); text-decoration:none; }
|
| 534 |
+
.modal-close:hover, .modal-download:hover { background: var(--champagne); }
|
| 535 |
+
.log-panel { display:none; margin-top:24px; background:var(--ink); color:var(--cream); border-radius:12px; padding:14px; font:12px/1.45 ui-monospace, SFMono-Regular, Consolas, monospace; white-space:pre-wrap; max-height:240px; overflow:auto; text-align:left; }
|
| 536 |
+
@media (max-width: 600px) { nav { padding: 16px 20px; } .input-card { padding: 24px 20px; } #screen-processing { padding: 32px 16px 60px; } .pipeline { padding: 20px 16px; } .clips-grid { grid-template-columns: repeat(2, 1fr); gap: 10px; } .regen-section { padding: 22px 18px; } .regen-btn { width: 100%; margin-left: 0; } .regen-row { flex-direction: column; align-items: flex-start; } }
|
| 537 |
+
.thumb-1 { background: linear-gradient(135deg, #D4A96A 0%, #8B5E3C 100%); } .thumb-2 { background: linear-gradient(135deg, #7A9E8A 0%, #3D6650 100%); }
|
| 538 |
+
.thumb-3 { background: linear-gradient(135deg, #9E8A7A 0%, #5C3E2E 100%); } .thumb-4 { background: linear-gradient(135deg, #8A7A9E 0%, #4A3866 100%); }
|
| 539 |
+
.thumb-5 { background: linear-gradient(135deg, #9E9A7A 0%, #5C5820 100%); } .thumb-6 { background: linear-gradient(135deg, #C4856A 0%, #7A3020 100%); }
|
| 540 |
+
.thumb-7 { background: linear-gradient(135deg, #7AABBE 0%, #2A5A6E 100%); } .thumb-8 { background: linear-gradient(135deg, #9EAA7A 0%, #4A5E20 100%); }
|
| 541 |
+
.thumb-9 { background: linear-gradient(135deg, #AA7A9E 0%, #5E2060 100%); } .thumb-0 { background: linear-gradient(135deg, #D4C36A 0%, #8B7820 100%); }
|
| 542 |
+
.spin { display: inline-block; width: 14px; height: 14px; border: 2px solid var(--border); border-top-color: var(--gold); border-radius: 50%; animation: spin 0.8s linear infinite; }
|
| 543 |
+
@keyframes spin { to { transform: rotate(360deg); } }
|
| 544 |
+
</style>
|
| 545 |
+
</head>
|
| 546 |
+
<body>
|
| 547 |
+
<nav>
|
| 548 |
+
<div class="logo">Clip<span>Forge</span></div>
|
| 549 |
+
<div style="font-size:0.8rem;color:var(--ink-muted);font-weight:300;display:none" id="nav-status">Processing...</div>
|
| 550 |
+
</nav>
|
| 551 |
+
<div class="screen active" id="screen-input">
|
| 552 |
+
<div style="display:flex;flex-direction:column;align-items:center;justify-content:center;min-height:calc(100vh - 65px);padding:40px 20px;text-align:center;">
|
| 553 |
+
<div class="eyebrow">AI Video Editor</div>
|
| 554 |
+
<h1 class="hero-title">Convert your long video to <em>short clips</em> for social media</h1>
|
| 555 |
+
<p class="hero-sub">Paste a link or upload a file - we handle the rest</p>
|
| 556 |
+
<div class="input-card">
|
| 557 |
+
<div class="mode-tabs">
|
| 558 |
+
<button class="mode-tab active" onclick="switchMode('yt')">Link</button>
|
| 559 |
+
<button class="mode-tab" onclick="switchMode('upload')">Upload File</button>
|
| 560 |
+
</div>
|
| 561 |
+
<div class="input-section active" id="mode-yt">
|
| 562 |
+
<label class="input-label">Video URL</label>
|
| 563 |
+
<input class="yt-input" type="text" placeholder="https://youtube.com/watch?v=..." id="yt-url">
|
| 564 |
+
</div>
|
| 565 |
+
<div class="input-section" id="mode-upload">
|
| 566 |
+
<input type="file" id="file-input" accept="video/mp4,video/quicktime,video/*" hidden>
|
| 567 |
+
<div class="upload-zone" id="upload-zone" onclick="openUpload()">
|
| 568 |
+
<div class="upload-icon">File</div>
|
| 569 |
+
<div class="upload-text">Click to browse or drag & drop</div>
|
| 570 |
+
<div class="upload-sub">MP4, MOV, AVI - up to your Space limit</div>
|
| 571 |
+
</div>
|
| 572 |
+
</div>
|
| 573 |
+
<button class="convert-btn" id="convert-btn" onclick="startProcessing()">Convert to Clips -></button>
|
| 574 |
+
</div>
|
| 575 |
+
</div>
|
| 576 |
+
</div>
|
| 577 |
+
<div class="screen" id="screen-processing">
|
| 578 |
+
<div class="processing-header">
|
| 579 |
+
<div class="eyebrow">Working on it</div>
|
| 580 |
+
<h2 class="processing-title">Your clips are being crafted</h2>
|
| 581 |
+
<p class="processing-sub" id="processing-sub">Sit back - long videos can take a little while</p>
|
| 582 |
+
</div>
|
| 583 |
+
<div class="pipeline" id="pipeline">
|
| 584 |
+
<div class="pipeline-step" id="step-0"><div class="step-icon">Up</div><div class="step-content"><div class="step-name">Uploading video <span class="step-pct" id="pct-0">0%</span></div><div class="progress-track"><div class="progress-fill" id="fill-0"></div></div></div></div>
|
| 585 |
+
<div class="pipeline-step" id="step-1"><div class="step-icon">Text</div><div class="step-content"><div class="step-name">Generating transcript <span class="step-pct" id="pct-1"></span></div><div class="progress-track"><div class="progress-fill" id="fill-1"></div></div></div></div>
|
| 586 |
+
<div class="pipeline-step" id="step-2"><div class="step-icon">Cut</div><div class="step-content"><div class="step-name">Choosing short clips <span class="step-pct" id="pct-2"></span></div><div class="progress-track"><div class="progress-fill" id="fill-2"></div></div></div></div>
|
| 587 |
+
<div class="pipeline-step" id="step-3"><div class="step-icon">Film</div><div class="step-content"><div class="step-name">Producing clips <span class="step-pct" id="pct-3"></span></div><div class="progress-track"><div class="progress-fill" id="fill-3"></div></div></div></div>
|
| 588 |
+
<div class="pipeline-step" id="step-4"><div class="step-icon">Edit</div><div class="step-content"><div class="step-name">Adding subtitles & light edits <span class="step-pct" id="pct-4"></span></div><div class="progress-track"><div class="progress-fill" id="fill-4"></div></div></div></div>
|
| 589 |
+
</div>
|
| 590 |
+
<div class="tips-section" id="tips-section">
|
| 591 |
+
<div class="tips-label">Tips while you wait</div>
|
| 592 |
+
<div class="tip-card"><span class="tip-dot">◆</span> Clips are automatically trimmed around the strongest hook.</div>
|
| 593 |
+
<div class="tip-card"><span class="tip-dot">◆</span> The system can pick centered speaker or split presentation layout per clip.</div>
|
| 594 |
+
<div class="tip-card"><span class="tip-dot">◆</span> Word-by-word subtitles are added by default.</div>
|
| 595 |
+
<div class="tip-card"><span class="tip-dot">◆</span> You can regenerate with different instructions after the first batch.</div>
|
| 596 |
+
</div>
|
| 597 |
+
<div class="clips-section" id="clips-section" style="display:none">
|
| 598 |
+
<div class="clips-title">Your clips</div>
|
| 599 |
+
<p class="clips-sub" id="clips-sub-text">Tap any clip to preview</p>
|
| 600 |
+
<div class="clips-grid" id="clips-grid"></div>
|
| 601 |
+
</div>
|
| 602 |
+
<div class="regen-section" id="regen-section">
|
| 603 |
+
<div class="regen-title">Produce a different set</div>
|
| 604 |
+
<p class="regen-sub">Describe what you're looking for and we'll re-cut your video</p>
|
| 605 |
+
<textarea class="regen-textarea" placeholder="e.g. Focus on the funniest moments, keep clips under 30 seconds, add a text hook at the start..." id="regen-prompt"></textarea>
|
| 606 |
+
<div class="regen-row">
|
| 607 |
+
<span class="chip" onclick="setChip('Highlight key insights')">Key insights</span>
|
| 608 |
+
<span class="chip" onclick="setChip('Funny & entertaining moments')">Funny moments</span>
|
| 609 |
+
<span class="chip" onclick="setChip('Emotional or inspiring clips')">Emotional</span>
|
| 610 |
+
<span class="chip" onclick="setChip('Fast-paced, high energy edits')">High energy</span>
|
| 611 |
+
<button class="regen-btn" onclick="triggerRegen()">Regenerate Clips -></button>
|
| 612 |
+
</div>
|
| 613 |
+
</div>
|
| 614 |
+
<pre class="log-panel" id="log-panel"></pre>
|
| 615 |
+
</div>
|
| 616 |
+
<div class="modal-overlay" id="modal" onclick="closeModal(event)">
|
| 617 |
+
<div class="modal-box">
|
| 618 |
+
<div class="modal-video" id="modal-video"><div class="clip-play" style="width:56px;height:56px;font-size:1.4rem;background:rgba(255,255,255,0.9)">▶</div></div>
|
| 619 |
+
<div class="modal-footer">
|
| 620 |
+
<div class="modal-clip-label" id="modal-label">Clip 1</div>
|
| 621 |
+
<div class="modal-actions"><a class="modal-download" id="modal-download" href="#" download>Download</a><button class="modal-close" onclick="document.getElementById('modal').classList.remove('open')">Close</button></div>
|
| 622 |
+
</div>
|
| 623 |
+
</div>
|
| 624 |
+
</div>
|
| 625 |
+
<script>
|
| 626 |
+
let currentMode = 'yt';
|
| 627 |
+
let selectedFile = null;
|
| 628 |
+
let currentJobId = null;
|
| 629 |
+
let renderedClips = [];
|
| 630 |
+
const iconLabels = ['Up','Text','Cut','Film','Edit'];
|
| 631 |
+
|
| 632 |
+
function switchMode(m) {
|
| 633 |
+
currentMode = m;
|
| 634 |
+
document.querySelectorAll('.mode-tab').forEach((t,i) => t.classList.toggle('active', (i===0 && m==='yt') || (i===1 && m==='upload')));
|
| 635 |
+
document.getElementById('mode-yt').classList.toggle('active', m==='yt');
|
| 636 |
+
document.getElementById('mode-upload').classList.toggle('active', m==='upload');
|
| 637 |
+
}
|
| 638 |
+
|
| 639 |
+
function openUpload() { document.getElementById('file-input').click(); }
|
| 640 |
+
|
| 641 |
+
function setSelectedFile(file) {
|
| 642 |
+
selectedFile = file;
|
| 643 |
+
const zone = document.getElementById('upload-zone');
|
| 644 |
+
zone.innerHTML = `<div class="upload-icon">OK</div><div class="upload-text" style="color:var(--gold)">File selected: ${escapeHtml(file.name)}</div><div class="upload-sub">Ready to convert</div>`;
|
| 645 |
+
}
|
| 646 |
+
|
| 647 |
+
const uploadZone = document.getElementById('upload-zone');
|
| 648 |
+
document.getElementById('file-input').addEventListener('change', e => { if (e.target.files[0]) setSelectedFile(e.target.files[0]); });
|
| 649 |
+
uploadZone.addEventListener('dragover', e => { e.preventDefault(); uploadZone.classList.add('dragover'); });
|
| 650 |
+
uploadZone.addEventListener('dragleave', () => uploadZone.classList.remove('dragover'));
|
| 651 |
+
uploadZone.addEventListener('drop', e => { e.preventDefault(); uploadZone.classList.remove('dragover'); if (e.dataTransfer.files[0]) setSelectedFile(e.dataTransfer.files[0]); });
|
| 652 |
+
|
| 653 |
+
// Escape the five HTML-significant characters so arbitrary text (for example
// an uploaded file's name) can be interpolated into innerHTML safely.
function escapeHtml(s) {
  const entities = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
  return String(s).replace(/[&<>"']/g, c => entities[c]);
}
|
| 656 |
+
|
| 657 |
+
async function createJob(extraPrompt = '') {
|
| 658 |
+
const form = new FormData();
|
| 659 |
+
if (extraPrompt && currentJobId) {
|
| 660 |
+
form.append('source_job_id', currentJobId);
|
| 661 |
+
form.append('regen_prompt', extraPrompt);
|
| 662 |
+
} else if (currentMode === 'upload') {
|
| 663 |
+
if (!selectedFile) throw new Error('Choose a video file first.');
|
| 664 |
+
form.append('file', selectedFile);
|
| 665 |
+
} else {
|
| 666 |
+
const url = document.getElementById('yt-url').value.trim();
|
| 667 |
+
if (!url) throw new Error('Paste a video URL first.');
|
| 668 |
+
form.append('video_url', url);
|
| 669 |
+
}
|
| 670 |
+
const res = await fetch('/api/jobs', { method: 'POST', body: form });
|
| 671 |
+
const data = await res.json();
|
| 672 |
+
if (!res.ok) throw new Error(data.detail || 'Could not start job.');
|
| 673 |
+
return data;
|
| 674 |
+
}
|
| 675 |
+
|
| 676 |
+
async function startProcessing() {
|
| 677 |
+
const btn = document.getElementById('convert-btn');
|
| 678 |
+
try {
|
| 679 |
+
btn.disabled = true;
|
| 680 |
+
btn.textContent = 'Starting...';
|
| 681 |
+
const job = await createJob();
|
| 682 |
+
currentJobId = job.id;
|
| 683 |
+
renderedClips = [];
|
| 684 |
+
document.getElementById('clips-grid').innerHTML = '';
|
| 685 |
+
document.getElementById('screen-input').classList.remove('active');
|
| 686 |
+
document.getElementById('screen-processing').classList.add('active');
|
| 687 |
+
document.getElementById('nav-status').style.display = 'block';
|
| 688 |
+
syncJob(job);
|
| 689 |
+
pollJob(job.id);
|
| 690 |
+
} catch (err) {
|
| 691 |
+
alert(err.message || err);
|
| 692 |
+
} finally {
|
| 693 |
+
btn.disabled = false;
|
| 694 |
+
btn.textContent = 'Convert to Clips ->';
|
| 695 |
+
}
|
| 696 |
+
}
|
| 697 |
+
|
| 698 |
+
/**
 * Poll GET /api/jobs/{id} until the job reports `done` or another job
 * becomes the current one, pushing every snapshot through syncJob().
 *
 * Failed status checks (network error, non-OK HTTP status, unparsable body)
 * are retried; after several consecutive failures polling stops and the
 * error is shown in #processing-sub instead of looping forever or dying
 * with an unhandled rejection.
 *
 * @param {string} id Identifier of the job to watch.
 * @returns {Promise<void>}
 */
async function pollJob(id) {
  const POLL_INTERVAL_MS = 1400;
  const MAX_CONSECUTIVE_FAILURES = 5;

  let failures = 0;
  let done = false;
  while (!done && currentJobId === id) {
    await new Promise(r => setTimeout(r, POLL_INTERVAL_MS));

    let job;
    try {
      const res = await fetch(`/api/jobs/${id}`);
      if (!res.ok) throw new Error(`Status check failed (HTTP ${res.status}).`);
      job = await res.json();
      failures = 0;
    } catch (err) {
      failures += 1;
      if (failures >= MAX_CONSECUTIVE_FAILURES) {
        // Give up, but only report if this job is still the one on screen.
        if (currentJobId === id) {
          const sub = document.getElementById('processing-sub');
          if (sub) sub.textContent = err.message || String(err);
        }
        return;
      }
      continue;
    }

    // A newer job may have been started while the request was in flight;
    // do not let a stale response overwrite its UI state.
    if (currentJobId !== id) return;

    syncJob(job);
    done = Boolean(job.done);
  }
}
|
| 708 |
+
|
| 709 |
+
/**
 * Render one job snapshot into the page: status texts, log panel, pipeline
 * step indicators, newly finished clips and, once the job is done, the
 * regenerate section (plus the log panel when the job failed).
 *
 * @param {object} job Job JSON; reads nav_status, status, error, logs,
 *   steps[] ({state, pct}), clips[] ({name, ...}) and done.
 */
function syncJob(job) {
  document.getElementById('nav-status').textContent = job.nav_status || 'Processing...';
  document.getElementById('processing-sub').textContent = job.error ? job.error : job.status;
  document.getElementById('log-panel').textContent = job.logs || '';

  (job.steps || []).forEach((step, i) => {
    const el = document.getElementById(`step-${i}`);
    const fill = document.getElementById(`fill-${i}`);
    const pct = document.getElementById(`pct-${i}`);
    // Skip steps that have no matching markup so one missing node cannot
    // abort the rest of the update (clips, regenerate section).
    if (!el || !fill || !pct) return;

    el.classList.toggle('active', step.state === 'active');
    el.classList.toggle('done', step.state === 'done');
    const icon = el.querySelector('.step-icon');
    if (icon) {
      icon.innerHTML = step.state === 'done' ? '✓' : (step.state === 'active' ? '<span class="spin"></span>' : iconLabels[i]);
    }
    fill.style.width = `${step.pct || 0}%`;
    pct.textContent = step.pct ? `${Math.floor(step.pct)}%` : '';
  });

  // Append only clips not rendered yet; clips are identified by name.
  (job.clips || []).forEach((clip) => {
    if (!renderedClips.some(c => c.name === clip.name)) {
      renderedClips.push(clip);
      addClip(renderedClips.length - 1);
    }
  });

  if (renderedClips.length) {
    document.getElementById('clips-section').style.display = 'block';
    document.getElementById('clips-sub-text').textContent = job.done
      ? `All ${renderedClips.length} clip${renderedClips.length > 1 ? 's' : ''} ready - tap to preview`
      : `${renderedClips.length} clip${renderedClips.length > 1 ? 's' : ''} ready - more coming...`;
  }

  if (job.done) {
    document.getElementById('regen-section').style.display = 'block';
    if (job.error) document.getElementById('log-panel').style.display = 'block';
  }
}
|
| 740 |
+
|
| 741 |
+
/**
 * Append a card for renderedClips[idx] to #clips-grid. Clicking the card
 * opens the preview modal; clicking its download link does not.
 *
 * @param {number} idx Index into renderedClips.
 */
function addClip(idx) {
  const clip = renderedClips[idx];
  const grid = document.getElementById('clips-grid');
  const card = document.createElement('div');
  card.className = 'clip-card';

  // Only static markup and numeric values go through innerHTML. Values that
  // come from the job payload (duration, url) are assigned via DOM
  // properties below so they can never be parsed as markup.
  card.innerHTML = `<div class="clip-thumb thumb-${idx % 10}"><div class="clip-play">▶</div></div><div class="clip-meta"><div class="clip-num">Clip ${idx + 1}</div><div class="clip-dur"></div><a class="clip-download" download>Download</a></div>`;
  card.querySelector('.clip-dur').textContent = clip.duration || '0:00';

  const link = card.querySelector('.clip-download');
  link.href = clip.url;
  // Keep a download click from also triggering the card's modal handler.
  link.addEventListener('click', (event) => event.stopPropagation());

  card.onclick = () => openModal(idx);
  grid.appendChild(card);
}
|
| 750 |
+
|
| 751 |
+
/**
 * Open the preview modal for renderedClips[idx]: mount an autoplaying video
 * player, set the label and point the modal download link at the clip.
 *
 * @param {number} idx Index into renderedClips.
 */
function openModal(idx) {
  const clip = renderedClips[idx];
  const modal = document.getElementById('modal');
  const video = document.getElementById('modal-video');
  video.className = 'modal-video';

  // Build the player with DOM APIs instead of interpolating clip.url into
  // innerHTML, so the URL is never parsed as markup.
  const player = document.createElement('video');
  player.src = clip.url;
  player.controls = true;
  player.autoplay = true;
  player.setAttribute('playsinline', '');
  video.innerHTML = '';
  video.appendChild(player);

  document.getElementById('modal-label').textContent = `Clip ${idx + 1}`;
  document.getElementById('modal-download').href = clip.url;
  modal.classList.add('open');
}
|
| 761 |
+
|
| 762 |
+
/**
 * Close the preview modal, but only when the event target is the #modal
 * element itself (events from elements inside it are ignored).
 *
 * @param {Event} e The triggering event.
 */
function closeModal(e) {
  const overlay = document.getElementById('modal');
  if (e.target !== overlay) return;

  overlay.classList.remove('open');
  // Emptying the container removes the mounted video player.
  document.getElementById('modal-video').innerHTML = '';
}
|
| 768 |
+
|
| 769 |
+
/**
 * Put `text` into the #regen-prompt field and move focus to it.
 *
 * @param {string} text Prompt text to insert.
 */
function setChip(text) {
  const promptBox = document.getElementById('regen-prompt');
  promptBox.value = text;
  promptBox.focus();
}
|
| 774 |
+
|
| 775 |
+
/**
 * Start a regeneration job from the text in #regen-prompt, based on the
 * current job.
 *
 * The existing clips and pipeline indicators are reset only after the server
 * has accepted the new job. If the request fails, the regenerate section and
 * the previous status text are restored so the user keeps their clips and
 * can retry.
 */
async function triggerRegen() {
  const promptBox = document.getElementById('regen-prompt');
  const prompt = promptBox.value.trim();
  if (!prompt) { promptBox.focus(); return; }
  if (!currentJobId) { alert('Run a video first.'); return; }

  const regenSection = document.getElementById('regen-section');
  const navStatus = document.getElementById('nav-status');
  const previousStatus = navStatus.textContent;

  // Hide the section immediately so the request cannot be submitted twice.
  regenSection.style.display = 'none';
  navStatus.textContent = 'Regenerating...';

  let job = null;
  try {
    job = await createJob(prompt);

    // The new job exists: now it is safe to discard the previous results.
    renderedClips = [];
    document.getElementById('clips-grid').innerHTML = '';
    document.getElementById('clips-section').style.display = 'none';
    document.querySelectorAll('.pipeline-step').forEach((s, i) => {
      s.classList.remove('active', 'done');
      s.querySelector('.step-icon').innerHTML = iconLabels[i];
      document.getElementById(`fill-${i}`).style.width = '0%';
      document.getElementById(`pct-${i}`).textContent = '';
    });
    window.scrollTo({ top: 0, behavior: 'smooth' });

    currentJobId = job.id;
    syncJob(job);
    // Intentionally not awaited: polling continues in the background.
    pollJob(job.id);
  } catch (err) {
    if (!job) {
      // Job creation itself failed; put the UI back the way it was.
      regenSection.style.display = 'block';
      navStatus.textContent = previousStatus;
    }
    alert(err.message || err);
  }
}
|
| 800 |
+
</script>
|
| 801 |
+
</body>
|
| 802 |
+
</html>"""
|
| 803 |
+
|
| 804 |
+
|
| 805 |
+
if __name__ == "__main__":
    # Imported here so uvicorn is only required when run as a script.
    import uvicorn

    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|
humeo-core/.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.pytest_cache/
|
| 6 |
+
build/
|
| 7 |
+
dist/
|
| 8 |
+
.venv/
|
| 9 |
+
.env
|
humeo-core/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026 NotABot
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
humeo-core/README.md
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# humeo-core
|
| 2 |
+
|
| 3 |
+
**Reusable-rocket MCP server for long-video → 9:16 shorts.**
|
| 4 |
+
|
| 5 |
+
First-principles design, from the HIVE paper + Bryan's rocket analogy:
|
| 6 |
+
we don't build doors and windows (general subject-tracker UI, retraining
|
| 7 |
+
models). We build the **container** (schemas), **landing gear** (deterministic
|
| 8 |
+
local extraction), and **five thrusters** (the five 9:16 layouts this video
|
| 9 |
+
format actually uses). Everything else is pluggable.
|
| 10 |
+
|
| 11 |
+
## The rocket, in one picture
|
| 12 |
+
|
| 13 |
+
```
|
| 14 |
+
┌──────────────────────────────────────────┐
|
| 15 |
+
│ Control panel (MCP tools) │ <- any MCP client
|
| 16 |
+
└───────────────────┬──────────────────────┘
|
| 17 |
+
│ strict JSON
|
| 18 |
+
┌────────────────┬───────────┼────────────────┬─────────────────┐
|
| 19 |
+
▼ ▼ ▼ ▼ ▼
|
| 20 |
+
ingest classify_scenes select_clips plan_layout render_clip
|
| 21 |
+
(scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile,
|
| 22 |
+
keyframes + classifier) heuristic + pure filter dry-run safe)
|
| 23 |
+
transcript) LLM-ready) math)
|
| 24 |
+
│
|
| 25 |
+
▼
|
| 26 |
+
┌────────────────────┐
|
| 27 |
+
│ LayoutKind │
|
| 28 |
+
│ ──────────────── │
|
| 29 |
+
│ zoom_call_center │
|
| 30 |
+
│ sit_center │
|
| 31 |
+
│ split_chart_person│
|
| 32 |
+
│ split_two_persons │
|
| 33 |
+
│ split_two_charts │
|
| 34 |
+
└────────────────────┘
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
Only the classifier and clip-selector have optional LLM hooks; everything
|
| 38 |
+
else is deterministic, local, and cheap.
|
| 39 |
+
|
| 40 |
+
## Why five layouts? (the "max 2 items" rule)
|
| 41 |
+
|
| 42 |
+
The hard constraint for this format: **a short shows at most two on-screen
|
| 43 |
+
items** — where an "item" is a `person` (a human speaker) or a `chart`
|
| 44 |
+
(slide, graph, data visual, screenshare). That gives exactly five recipes:
|
| 45 |
+
|
| 46 |
+
1. **`zoom_call_center`** — 1 person, tight zoom-call / webcam framing.
|
| 47 |
+
2. **`sit_center`** — 1 person, interview / seated framing.
|
| 48 |
+
3. **`split_chart_person`** — 1 chart + 1 person, stacked vertically
|
| 49 |
+
(default: **even 50/50** top/bottom, chart on top).
|
| 50 |
+
4. **`split_two_persons`** — 2 speakers, stacked vertically.
|
| 51 |
+
5. **`split_two_charts`** — 2 charts, stacked vertically.
|
| 52 |
+
|
| 53 |
+
Because the geometry is bounded, we do NOT need a general subject-tracker
|
| 54 |
+
ML model or a drag-to-highlight UI. We need five small, correct pieces of
|
| 55 |
+
crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py`
|
| 56 |
+
is.
|
| 57 |
+
|
| 58 |
+
See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms
|
| 59 |
+
used across these docs (subject, crop, band, seam, bbox, layout, etc.).
|
| 60 |
+
|
| 61 |
+
## Install
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
uv venv
|
| 65 |
+
uv sync
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
External requirements: `ffmpeg` and `ffprobe` on PATH.
|
| 69 |
+
|
| 70 |
+
`scenedetect` requires OpenCV. Install `opencv-python-headless` or
|
| 71 |
+
`opencv-python` alongside `scenedetect`.
|
| 72 |
+
|
| 73 |
+
## Use it as an MCP server
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
humeo-core # stdio transport (primary console script)
|
| 77 |
+
# humeo-mcp # same entrypoint — kept so existing MCP configs keep working
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
Example Cursor/Claude Desktop config:
|
| 81 |
+
|
| 82 |
+
```json
|
| 83 |
+
{
|
| 84 |
+
"mcpServers": {
|
| 85 |
+
"humeo": { "command": "humeo-core" }
|
| 86 |
+
}
|
| 87 |
+
}
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Tools exposed:
|
| 91 |
+
|
| 92 |
+
| Tool | Purpose |
|
| 93 |
+
| --------------------------------- | --------------------------------------------------------------------------- |
|
| 94 |
+
| `list_layouts` | Enumerate the 5 supported layouts. |
|
| 95 |
+
| `ingest` | Scene detection + keyframe extraction (+ optional transcript). |
|
| 96 |
+
| `classify_scenes` | Pixel-heuristic per-scene layout classification. |
|
| 97 |
+
| `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). |
|
| 98 |
+
| `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. |
|
| 99 |
+
| `select_clips` | Heuristic clip picker over a word-level transcript. |
|
| 100 |
+
| `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. |
|
| 101 |
+
| `build_render_cmd` | Build the ffmpeg command (no execution) — review before spend. |
|
| 102 |
+
| `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. |
|
| 103 |
+
|
| 104 |
+
Resource: `humeo://layouts` (JSON listing of the 5 layouts).
|
| 105 |
+
|
| 106 |
+
### Three interchangeable region detectors
|
| 107 |
+
|
| 108 |
+
All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used:
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
classify.py (pixel variance, no ML)
|
| 112 |
+
face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg
|
| 113 |
+
vision.py (multimodal LLM + OCR bboxes)
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
## JSON contracts (non-negotiable)
|
| 117 |
+
|
| 118 |
+
All tools take and return Pydantic-validated JSON. The contracts live in
|
| 119 |
+
[`src/humeo_core/schemas.py`](src/humeo_core/schemas.py):
|
| 120 |
+
|
| 121 |
+
- `Scene` `{scene_id, start_time, end_time, keyframe_path?}`
|
| 122 |
+
- `TranscriptWord` `{word, start_time, end_time}`
|
| 123 |
+
- `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}`
|
| 124 |
+
- `SceneClassification` `{scene_id, layout, confidence, reason}`
|
| 125 |
+
- `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized)
|
| 126 |
+
- `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}`
|
| 127 |
+
- `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}`
|
| 128 |
+
- `ClipPlan` `{source_path, clips[]}`
|
| 129 |
+
- `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}`
|
| 130 |
+
- `RenderRequest` / `RenderResult`
|
| 131 |
+
|
| 132 |
+
## First-principles decisions (what we intentionally did NOT build)
|
| 133 |
+
|
| 134 |
+
- **No giant subject-tracker ML.** The video format has 5 fixed layouts
|
| 135 |
+
(with a hard "max 2 items" rule); pixel-level tracking is not needed.
|
| 136 |
+
- **No drag-and-highlight UI.** An MCP tool is a better "UI" for an
|
| 137 |
+
agent-first workflow. If a human wants to override, they pass a
|
| 138 |
+
`LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` /
|
| 139 |
+
`zoom`.
|
| 140 |
+
- **No end-to-end video→video model.** The HIVE paper's core insight is
|
| 141 |
+
that decomposed orchestration beats monolithic generation. We reify
|
| 142 |
+
that insight as a set of small composable tools (see the tool table above).
|
| 143 |
+
|
| 144 |
+
## Extending the pilot
|
| 145 |
+
|
| 146 |
+
- Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`.
|
| 147 |
+
- Plug a real reasoning model into `select_clips_with_llm(text_fn)`.
|
| 148 |
+
- Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)`
|
| 149 |
+
to get per-scene bboxes + OCR text, then feed the results back through
|
| 150 |
+
`classify_scenes_with_vision`. This is the scene-change → v3 images →
|
| 151 |
+
LLM+OCR → bbox path; see `../docs/SOLUTIONS.md §4` for rationale.
|
| 152 |
+
- All enforce strict JSON outputs, so bad model output can't corrupt
|
| 153 |
+
downstream stages.
|
| 154 |
+
|
| 155 |
+
## Testing
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
python -m pytest
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale.
|
| 162 |
+
|
| 163 |
+
## License
|
| 164 |
+
|
| 165 |
+
MIT
|
humeo-core/docs/ARCHITECTURE.md
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture — Reusable Rocket
|
| 2 |
+
|
| 3 |
+
> *"We don't need to build the door or windows — just a container with landing
|
| 4 |
+
> gear and thrusters that move in different directions."*
|
| 5 |
+
> — Bryan
|
| 6 |
+
|
| 7 |
+
That analogy maps exactly onto this MCP:
|
| 8 |
+
|
| 9 |
+
| Rocket part | Codebase | Purpose |
|
| 10 |
+
| --------------- | ---------------------------------------------------------------- | ----------------------------------------------------------------------- |
|
| 11 |
+
| Container | `src/humeo_core/schemas.py` | Strict JSON contracts every stage reads/writes. |
|
| 12 |
+
| Landing gear | `src/humeo_core/primitives/ingest.py` | Deterministic local extraction (scenes, keyframes, transcript). |
|
| 13 |
+
| Thrusters (×5) | `src/humeo_core/primitives/layouts.py` | Five fixed 9:16 crop/compose recipes (max 2 on-screen items). |
|
| 14 |
+
| Pilot | `primitives/classify.py` + `primitives/select_clips.py` | Heuristic + LLM-ready decision makers. |
|
| 15 |
+
| Compiler | `src/humeo_core/primitives/compile.py` | Deterministic ffmpeg assembly. |
|
| 16 |
+
| Control panel | `src/humeo_core/server.py` | MCP tools exposing every primitive. |
|
| 17 |
+
| Control surface | `src/humeo_core/server.py` | MCP tool surface for agents and clients. |
|
| 18 |
+
|
| 19 |
+
## First-principles reasoning
|
| 20 |
+
|
| 21 |
+
The HIVE paper's core insight is that good short-video editing requires
|
| 22 |
+
**staged reasoning with strict intermediate artifacts**, not a single
|
| 23 |
+
giant model call. Three consequences flow from that:
|
| 24 |
+
|
| 25 |
+
1. **Extraction must be local and deterministic.** No model call should
|
| 26 |
+
ever touch raw video bytes. `ingest.py` runs ffprobe + PySceneDetect
|
| 27 |
+
+ ffmpeg + (optional) faster-whisper. Everything it emits is JSON or
|
| 28 |
+
a file path.
|
| 29 |
+
|
| 30 |
+
2. **Reasoning must be decomposed into narrow sub-tasks.** Classifying a
|
| 31 |
+
scene's layout is a completely different task from selecting a viral
|
| 32 |
+
clip. Each has its own schema, its own prompt, its own validation.
|
| 33 |
+
This is why `primitives/` is split into several small files instead of one.
|
| 34 |
+
|
| 35 |
+
3. **Every model call must emit schema-validated JSON.** Free-form model
|
| 36 |
+
output is not allowed to enter the pipeline. `classify_scenes_with_llm`
|
| 37 |
+
and `select_clips_with_llm` both `model_validate(...)` the raw output
|
| 38 |
+
before returning; parse failures degrade gracefully to `SIT_CENTER` +
|
| 39 |
+
low confidence, not crashes.
|
| 40 |
+
|
| 41 |
+
## Why only five layouts?
|
| 42 |
+
|
| 43 |
+
The hard rule for this format: **a short shows at most two on-screen
|
| 44 |
+
items**, where an "item" is a `person` or a `chart`. That gives exactly
|
| 45 |
+
five recipes — all implemented as pure functions from
|
| 46 |
+
`LayoutInstruction` to an ffmpeg filtergraph string in `layouts.py`:
|
| 47 |
+
|
| 48 |
+
| Layout | Items | Recipe |
|
| 49 |
+
| ---------------------- | --------------- | --------------------------------------------- |
|
| 50 |
+
| `zoom_call_center` | 1 person | tight centered 9:16 crop (zoom ≥ 1.25). |
|
| 51 |
+
| `sit_center` | 1 person | wider centered 9:16 crop. |
|
| 52 |
+
| `split_chart_person` | 1 chart + person| source partitioned L/R by bboxes, stacked. |
|
| 53 |
+
| `split_two_persons` | 2 persons | L/R speakers, stacked top/bottom. |
|
| 54 |
+
| `split_two_charts` | 2 charts | L/R charts, stacked top/bottom. |
|
| 55 |
+
|
| 56 |
+
A general subject-tracker ML model is orders of magnitude more expensive
|
| 57 |
+
and less reliable than five hand-written crop recipes. If a new geometry
|
| 58 |
+
ever shows up in future source videos, adding a sixth thruster is
|
| 59 |
+
strictly additive: write a new `plan_*` function, add it to `_DISPATCH`,
|
| 60 |
+
add an enum variant. No existing code has to change.
|
| 61 |
+
|
| 62 |
+
## 9:16 layout math
|
| 63 |
+
|
| 64 |
+
Source is assumed 16:9 (1920×1080 by default, but probed per-clip).
|
| 65 |
+
Target is 1080×1920. For each layout:
|
| 66 |
+
|
| 67 |
+
### `zoom_call_center` and `sit_center`
|
| 68 |
+
|
| 69 |
+
Standard centered aspect-ratio crop to 9:16, then scale to 1080×1920:
|
| 70 |
+
|
| 71 |
+
```
|
| 72 |
+
crop=cw:ch:x:y,scale=1080:1920:flags=lanczos,setsar=1[vout]
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
`cw`, `ch` are the largest 9:16 window that fits in the source, divided
|
| 76 |
+
by `zoom`. `x`, `y` center the window on `person_x_norm` / 0.5.
|
| 77 |
+
Dimensions are rounded to even values so libx264 is happy. The window is
|
| 78 |
+
clamped inside the source so a high `person_x_norm` never crops outside.
|
| 79 |
+
|
| 80 |
+
### Split layouts (`split_chart_person`, `split_two_persons`, `split_two_charts`)
|
| 81 |
+
|
| 82 |
+
All three splits share one recipe — only the items differ:
|
| 83 |
+
|
| 84 |
+
1. **Horizontal partition.** The source is cut at a single vertical seam
|
| 85 |
+
so the two source strips are **complementary** (no overlap, no gap).
|
| 86 |
+
When both bboxes are set (Gemini vision), the seam is the midpoint
|
| 87 |
+
between `left.x2` and `right.x1`. Otherwise the seam defaults to
|
| 88 |
+
either an even 50/50 (two-of-a-kind splits) or a 2/3 | 1/3 split
|
| 89 |
+
(legacy `split_chart_person` fallback).
|
| 90 |
+
2. **Vertical crop.** Each strip's vertical extent comes from the
|
| 91 |
+
corresponding bbox when provided, so each item **fills** its output
|
| 92 |
+
band instead of being lost in full-height source context.
|
| 93 |
+
3. **Cover-scale to the band.** Each strip is scaled with
|
| 94 |
+
`force_original_aspect_ratio=increase` + center-cropped to the band
|
| 95 |
+
dimensions. Bands are always fully painted; no letterbox bars.
|
| 96 |
+
4. **Stack.** Two branches produced by `split=2` are `vstack`-ed into
|
| 97 |
+
the final 1080×1920.
|
| 98 |
+
|
| 99 |
+
**Band heights** are controlled by `LayoutInstruction.top_band_ratio`,
|
| 100 |
+
which defaults to **0.5** (even 50/50 — the symmetric look Bryan asked
|
| 101 |
+
for after the uneven Cathie Wood shorts). Legacy 60/40 is still reachable
|
| 102 |
+
by setting `top_band_ratio=0.6`.
|
| 103 |
+
|
| 104 |
+
**Stack order** (for `split_chart_person`) is controlled by
|
| 105 |
+
`focus_stack_order`: chart-on-top (default) or person-on-top.
|
| 106 |
+
|
| 107 |
+
## Extensibility story
|
| 108 |
+
|
| 109 |
+
- **Smarter classifier:** implement `LLMVisionFn` with any multimodal
|
| 110 |
+
model and pass it to `classify_scenes_with_llm`. The fallback heuristic
|
| 111 |
+
stays available for offline runs and tests.
|
| 112 |
+
- **Smarter clip selector:** same pattern, `LLMTextFn` → `select_clips_with_llm`.
|
| 113 |
+
- **New layout:** add a `plan_*` planner, register in `_DISPATCH`, add a
|
| 114 |
+
`LayoutKind` variant. Tests in `test_layouts.py` automatically iterate
|
| 115 |
+
over all `LayoutKind`s, so the dispatch coverage test will catch a
|
| 116 |
+
missing registration immediately.
|
| 117 |
+
|
| 118 |
+
## What we intentionally did NOT build
|
| 119 |
+
|
| 120 |
+
- Drag-and-highlight subject-selector UI.
|
| 121 |
+
- A general ML subject-tracker.
|
| 122 |
+
- A monolithic video-in-video-out model.
|
| 123 |
+
- Any network calls in the core library. The MCP server is stdio-only;
|
| 124 |
+
the CLI runs fully offline.
|
| 125 |
+
|
| 126 |
+
This keeps the rocket **reusable**: the same primitives power the MCP
|
| 127 |
+
server, the CLI, a Python library, and (soon) a web UI if that's ever
|
| 128 |
+
warranted.
|
humeo-core/docs/MCP_USAGE.md
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Using humeo-core from an MCP client
|
| 2 |
+
|
| 3 |
+
The installed console command is **`humeo-core`**. For backward compatibility,
|
| 4 |
+
**`humeo-mcp`** is also registered (same entrypoint); either works in
|
| 5 |
+
`"command": ...` if both are on `PATH` from the same install.
|
| 6 |
+
|
| 7 |
+
## 1. Add to your client
|
| 8 |
+
|
| 9 |
+
`claude_desktop_config.json` or `.cursor/mcp.json`:
|
| 10 |
+
|
| 11 |
+
```json
|
| 12 |
+
{
|
| 13 |
+
"mcpServers": {
|
| 14 |
+
"humeo": {
|
| 15 |
+
"command": "humeo-core"
|
| 16 |
+
}
|
| 17 |
+
}
|
| 18 |
+
}
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
## 2. A typical agent plan
|
| 22 |
+
|
| 23 |
+
```
|
| 24 |
+
→ humeo.list_layouts()
|
| 25 |
+
# discover the 5 layouts (max 2 on-screen items per short)
|
| 26 |
+
|
| 27 |
+
→ humeo.ingest(source_path="/abs/long.mp4", work_dir="/abs/work", with_transcript=true)
|
| 28 |
+
# IngestResult: scenes[], keyframes, transcript_words[]
|
| 29 |
+
|
| 30 |
+
→ humeo.classify_scenes(scenes=<IngestResult.scenes>)
|
| 31 |
+
# SceneClassification[] — one layout per scene
|
| 32 |
+
|
| 33 |
+
→ humeo.select_clips(
|
| 34 |
+
source_path=..., transcript_words=..., duration_sec=...,
|
| 35 |
+
target_count=5, min_sec=30, max_sec=60
|
| 36 |
+
)
|
| 37 |
+
# ClipPlan — top non-overlapping clips
|
| 38 |
+
|
| 39 |
+
# For each clip, pick the layout of the scene its midpoint falls in,
|
| 40 |
+
# build a LayoutInstruction, and:
|
| 41 |
+
|
| 42 |
+
→ humeo.build_render_cmd(request={...})
|
| 43 |
+
# dry-run: returns the exact ffmpeg argv, no execution
|
| 44 |
+
|
| 45 |
+
→ humeo.render_clip(request={..., "mode": "normal"})
|
| 46 |
+
# actually renders the 9:16 MP4
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
## 3. Strict JSON all the way
|
| 50 |
+
|
| 51 |
+
Every request/response is validated against the schemas in
|
| 52 |
+
[`schemas.py`](../src/humeo_core/schemas.py). Invalid input is rejected
|
| 53 |
+
*before* ffmpeg is touched, so a confused agent can't accidentally
|
| 54 |
+
rm-rf your disk or burn GPU hours.
|
| 55 |
+
|
| 56 |
+
## 4. Override knobs
|
| 57 |
+
|
| 58 |
+
`LayoutInstruction` accepts:
|
| 59 |
+
|
| 60 |
+
- `zoom`, `person_x_norm`, `chart_x_norm` — single-subject knobs.
|
| 61 |
+
- `split_chart_region`, `split_person_region`,
|
| 62 |
+
`split_second_chart_region`, `split_second_person_region` —
|
| 63 |
+
normalized bboxes that drive split-layout cropping.
|
| 64 |
+
- `top_band_ratio` — fraction of output height used by the top band
|
| 65 |
+
(default 0.5 = even 50/50, the symmetric look).
|
| 66 |
+
- `focus_stack_order` — for `split_chart_person`, chart-on-top vs
|
| 67 |
+
person-on-top.
|
| 68 |
+
|
| 69 |
+
Example: chart + person with a precise bbox crop and an even split.
|
| 70 |
+
|
| 71 |
+
```json
|
| 72 |
+
{
|
| 73 |
+
"clip_id": "001",
|
| 74 |
+
"layout": "split_chart_person",
|
| 75 |
+
"split_chart_region": {"x1": 0.00, "y1": 0.10, "x2": 0.52, "y2": 0.95},
|
| 76 |
+
"split_person_region": {"x1": 0.55, "y1": 0.05, "x2": 1.00, "y2": 1.00},
|
| 77 |
+
"top_band_ratio": 0.5,
|
| 78 |
+
"focus_stack_order": "chart_then_person"
|
| 79 |
+
}
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
Example: two-speaker interview.
|
| 83 |
+
|
| 84 |
+
```json
|
| 85 |
+
{
|
| 86 |
+
"clip_id": "002",
|
| 87 |
+
"layout": "split_two_persons",
|
| 88 |
+
"split_person_region": {"x1": 0.02, "y1": 0.05, "x2": 0.48, "y2": 0.95},
|
| 89 |
+
"split_second_person_region": {"x1": 0.52, "y1": 0.05, "x2": 0.98, "y2": 0.95}
|
| 90 |
+
}
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
## 5. When to stay in dry-run
|
| 94 |
+
|
| 95 |
+
- You want to show an approval UI before spending CPU.
|
| 96 |
+
- You want to diff the planned ffmpeg commands against a previous run.
|
| 97 |
+
- You're building tests.
|
| 98 |
+
|
| 99 |
+
`mode="dry_run"` is always safe, never writes output, and returns the
|
| 100 |
+
exact argv list.
|
humeo-core/examples/render_request.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"source_path": "/absolute/path/to/long.mp4",
|
| 3 |
+
"clip": {
|
| 4 |
+
"clip_id": "001",
|
| 5 |
+
"topic": "Prediction Market Explosion",
|
| 6 |
+
"start_time_sec": 289.0,
|
| 7 |
+
"end_time_sec": 331.5,
|
| 8 |
+
"viral_hook": "Prediction markets could explode to $5 trillion.",
|
| 9 |
+
"virality_score": 0.94,
|
| 10 |
+
"transcript": "Full text for subtitle generation...",
|
| 11 |
+
"suggested_overlay_title": "$5T Prediction Markets"
|
| 12 |
+
},
|
| 13 |
+
"layout": {
|
| 14 |
+
"clip_id": "001",
|
| 15 |
+
"layout": "split_chart_person",
|
| 16 |
+
"zoom": 1.0,
|
| 17 |
+
"person_x_norm": 0.83,
|
| 18 |
+
"chart_x_norm": 0.0
|
| 19 |
+
},
|
| 20 |
+
"output_path": "/absolute/path/to/out/clip_001.mp4",
|
| 21 |
+
"title_text": "$5T Prediction Markets",
|
| 22 |
+
"mode": "dry_run"
|
| 23 |
+
}
|
humeo-core/pyproject.toml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "humeo-core"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints)."
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
license = { text = "MIT" }
|
| 12 |
+
authors = [{ name = "Humeo" }]
|
| 13 |
+
keywords = ["mcp", "video", "shorts", "ffmpeg", "editing", "humeo", "hive"]
|
| 14 |
+
classifiers = [
|
| 15 |
+
"Programming Language :: Python :: 3",
|
| 16 |
+
"Programming Language :: Python :: 3.10",
|
| 17 |
+
"Programming Language :: Python :: 3.11",
|
| 18 |
+
"Programming Language :: Python :: 3.12",
|
| 19 |
+
]
|
| 20 |
+
dependencies = [
|
| 21 |
+
"mcp[cli]>=1.2.0",
|
| 22 |
+
"pydantic>=2.0",
|
| 23 |
+
"scenedetect>=0.6",
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
[project.optional-dependencies]
|
| 27 |
+
transcribe = ["faster-whisper>=1.0"]
|
| 28 |
+
download = ["yt-dlp>=2024.0"]
|
| 29 |
+
face = ["mediapipe>=0.10", "opencv-python>=4.8"]
|
| 30 |
+
vision = ["Pillow>=10.0"]
|
| 31 |
+
dev = ["pytest>=7", "pytest-asyncio>=0.23", "Pillow>=10.0"]
|
| 32 |
+
|
| 33 |
+
[project.scripts]
|
| 34 |
+
humeo-core = "humeo_core.server:main"
|
| 35 |
+
# Backward-compatible entry point (same module); existing MCP configs may still call `humeo-mcp`.
|
| 36 |
+
humeo-mcp = "humeo_core.server:main"
|
| 37 |
+
|
| 38 |
+
[tool.setuptools.packages.find]
|
| 39 |
+
where = ["src"]
|
| 40 |
+
|
| 41 |
+
[tool.setuptools.package-data]
|
| 42 |
+
humeo_core = ["assets/fonts/*"]
|
| 43 |
+
|
| 44 |
+
[tool.pytest.ini_options]
|
| 45 |
+
testpaths = ["tests"]
|
| 46 |
+
addopts = "-ra -q"
|
humeo-core/src/humeo_core.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: humeo-core
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Humeo core library: strict schemas, deterministic ffmpeg 9:16 layouts, optional MCP server (humeo-core / humeo-mcp entrypoints).
|
| 5 |
+
Author: Humeo
|
| 6 |
+
License: MIT
|
| 7 |
+
Keywords: mcp,video,shorts,ffmpeg,editing,humeo,hive
|
| 8 |
+
Classifier: Programming Language :: Python :: 3
|
| 9 |
+
Classifier: Programming Language :: Python :: 3.10
|
| 10 |
+
Classifier: Programming Language :: Python :: 3.11
|
| 11 |
+
Classifier: Programming Language :: Python :: 3.12
|
| 12 |
+
Requires-Python: >=3.10
|
| 13 |
+
Description-Content-Type: text/markdown
|
| 14 |
+
License-File: LICENSE
|
| 15 |
+
Requires-Dist: mcp[cli]>=1.2.0
|
| 16 |
+
Requires-Dist: pydantic>=2.0
|
| 17 |
+
Requires-Dist: scenedetect>=0.6
|
| 18 |
+
Provides-Extra: transcribe
|
| 19 |
+
Requires-Dist: faster-whisper>=1.0; extra == "transcribe"
|
| 20 |
+
Provides-Extra: download
|
| 21 |
+
Requires-Dist: yt-dlp>=2024.0; extra == "download"
|
| 22 |
+
Provides-Extra: face
|
| 23 |
+
Requires-Dist: mediapipe>=0.10; extra == "face"
|
| 24 |
+
Requires-Dist: opencv-python>=4.8; extra == "face"
|
| 25 |
+
Provides-Extra: vision
|
| 26 |
+
Requires-Dist: Pillow>=10.0; extra == "vision"
|
| 27 |
+
Provides-Extra: dev
|
| 28 |
+
Requires-Dist: pytest>=7; extra == "dev"
|
| 29 |
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
| 30 |
+
Requires-Dist: Pillow>=10.0; extra == "dev"
|
| 31 |
+
Dynamic: license-file
|
| 32 |
+
|
| 33 |
+
# humeo-core
|
| 34 |
+
|
| 35 |
+
**Reusable-rocket MCP server for long-video → 9:16 shorts.**
|
| 36 |
+
|
| 37 |
+
First-principles design, from the HIVE paper + Bryan's rocket analogy:
|
| 38 |
+
we don't build doors and windows (general subject-tracker UI, retraining
|
| 39 |
+
models). We build the **container** (schemas), **landing gear** (deterministic
|
| 40 |
+
local extraction), and **five thrusters** (the five 9:16 layouts this video
|
| 41 |
+
format actually uses). Everything else is pluggable.
|
| 42 |
+
|
| 43 |
+
## The rocket, in one picture
|
| 44 |
+
|
| 45 |
+
```
|
| 46 |
+
┌──────────────────────────────────────────┐
|
| 47 |
+
│ Control panel (MCP tools) │ <- any MCP client
|
| 48 |
+
└───────────────────┬──────────────────────┘
|
| 49 |
+
│ strict JSON
|
| 50 |
+
┌────────────────┬───────────┼────────────────┬─────────────────┐
|
| 51 |
+
▼ ▼ ▼ ▼ ▼
|
| 52 |
+
ingest classify_scenes select_clips plan_layout render_clip
|
| 53 |
+
(scenes + (5-way layout (clip picker, (5 thrusters, (ffmpeg compile,
|
| 54 |
+
keyframes + classifier) heuristic + pure filter dry-run safe)
|
| 55 |
+
transcript) LLM-ready) math)
|
| 56 |
+
│
|
| 57 |
+
▼
|
| 58 |
+
┌────────────────────┐
|
| 59 |
+
│ LayoutKind │
|
| 60 |
+
│ ──────────────── │
|
| 61 |
+
│ zoom_call_center │
|
| 62 |
+
│ sit_center │
|
| 63 |
+
│ split_chart_person│
|
| 64 |
+
│ split_two_persons │
|
| 65 |
+
│ split_two_charts │
|
| 66 |
+
└────────────────────┘
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
Only the classifier and clip-selector have optional LLM hooks; everything
|
| 70 |
+
else is deterministic, local, and cheap.
|
| 71 |
+
|
| 72 |
+
## Why five layouts? (the "max 2 items" rule)
|
| 73 |
+
|
| 74 |
+
The hard constraint for this format: **a short shows at most two on-screen
|
| 75 |
+
items** — where an "item" is a `person` (a human speaker) or a `chart`
|
| 76 |
+
(slide, graph, data visual, screenshare). That gives exactly five recipes:
|
| 77 |
+
|
| 78 |
+
1. **`zoom_call_center`** — 1 person, tight zoom-call / webcam framing.
|
| 79 |
+
2. **`sit_center`** — 1 person, interview / seated framing.
|
| 80 |
+
3. **`split_chart_person`** — 1 chart + 1 person, stacked vertically
|
| 81 |
+
(default: **even 50/50** top/bottom, chart on top).
|
| 82 |
+
4. **`split_two_persons`** — 2 speakers, stacked vertically.
|
| 83 |
+
5. **`split_two_charts`** — 2 charts, stacked vertically.
|
| 84 |
+
|
| 85 |
+
Because the geometry is bounded, we do NOT need a general subject-tracker
|
| 86 |
+
ML model or a drag-to-highlight UI. We need five small, correct pieces of
|
| 87 |
+
crop/compose math. That is exactly what `src/humeo_core/primitives/layouts.py`
|
| 88 |
+
is.
|
| 89 |
+
|
| 90 |
+
See [`TERMINOLOGY.md`](../TERMINOLOGY.md) for the full glossary of terms
|
| 91 |
+
used across these docs (subject, crop, band, seam, bbox, layout, etc.).
|
| 92 |
+
|
| 93 |
+
## Install
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
uv venv
|
| 97 |
+
uv sync
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
External requirements: `ffmpeg` and `ffprobe` on PATH.
|
| 101 |
+
|
| 102 |
+
`scenedetect` requires OpenCV. Install `opencv-python-headless` or
|
| 103 |
+
`opencv-python` alongside `scenedetect`.
|
| 104 |
+
|
| 105 |
+
## Use it as an MCP server
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
humeo-core # stdio transport (primary console script)
|
| 109 |
+
# humeo-mcp # same entrypoint — kept so existing MCP configs keep working
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
Example Cursor/Claude Desktop config:
|
| 113 |
+
|
| 114 |
+
```json
|
| 115 |
+
{
|
| 116 |
+
"mcpServers": {
|
| 117 |
+
"humeo": { "command": "humeo-core" }
|
| 118 |
+
}
|
| 119 |
+
}
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
Tools exposed:
|
| 123 |
+
|
| 124 |
+
| Tool | Purpose |
|
| 125 |
+
| --------------------------------- | --------------------------------------------------------------------------- |
|
| 126 |
+
| `list_layouts` | Enumerate the 5 supported layouts. |
|
| 127 |
+
| `ingest` | Scene detection + keyframe extraction (+ optional transcript). |
|
| 128 |
+
| `classify_scenes` | Pixel-heuristic per-scene layout classification. |
|
| 129 |
+
| `detect_scene_regions` | Return the bbox prompt + per-scene jobs (agent runs its own vision model). |
|
| 130 |
+
| `classify_scenes_with_vision` | Classify scenes from already-gathered `SceneRegions` bbox JSON + build layout instructions. |
|
| 131 |
+
| `select_clips` | Heuristic clip picker over a word-level transcript. |
|
| 132 |
+
| `plan_layout` | Return the exact `ffmpeg -filter_complex` for a layout. |
|
| 133 |
+
| `build_render_cmd` | Build the ffmpeg command (no execution) — review before spend. |
|
| 134 |
+
| `render_clip` | Build + run ffmpeg to produce a 9:16 MP4. |
|
| 135 |
+
|
| 136 |
+
Resource: `humeo://layouts` (JSON listing of the 5 layouts).
|
| 137 |
+
|
| 138 |
+
### Three interchangeable region detectors
|
| 139 |
+
|
| 140 |
+
All three emit the same `SceneRegions` schema, so the layout planner and renderer don't care which one you used:
|
| 141 |
+
|
| 142 |
+
```
|
| 143 |
+
classify.py (pixel variance, no ML)
|
| 144 |
+
face_detect.py (MediaPipe, local) ──► SceneRegions ──► SceneClassification ──► LayoutInstruction ──► ffmpeg
|
| 145 |
+
vision.py (multimodal LLM + OCR bboxes)
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
## JSON contracts (non-negotiable)
|
| 149 |
+
|
| 150 |
+
All tools take and return Pydantic-validated JSON. The contracts live in
|
| 151 |
+
[`src/humeo_core/schemas.py`](src/humeo_core/schemas.py):
|
| 152 |
+
|
| 153 |
+
- `Scene` `{scene_id, start_time, end_time, keyframe_path?}`
|
| 154 |
+
- `TranscriptWord` `{word, start_time, end_time}`
|
| 155 |
+
- `IngestResult` `{source_path, duration_sec, scenes[], transcript_words[], keyframes_dir?}`
|
| 156 |
+
- `SceneClassification` `{scene_id, layout, confidence, reason}`
|
| 157 |
+
- `BoundingBox` `{x1, y1, x2, y2, label, confidence}` (all coords normalized)
|
| 158 |
+
- `SceneRegions` `{scene_id, person_bbox?, chart_bbox?, ocr_text, raw_reason}`
|
| 159 |
+
- `Clip` `{clip_id, topic, start_time_sec, end_time_sec, viral_hook, virality_score, transcript, suggested_overlay_title, layout?}`
|
| 160 |
+
- `ClipPlan` `{source_path, clips[]}`
|
| 161 |
+
- `LayoutInstruction` `{clip_id, layout, zoom, person_x_norm, chart_x_norm, split_chart_region?, split_person_region?, split_second_chart_region?, split_second_person_region?, top_band_ratio, focus_stack_order}`
|
| 162 |
+
- `RenderRequest` / `RenderResult`
|
| 163 |
+
|
| 164 |
+
## First-principles decisions (what we intentionally did NOT build)
|
| 165 |
+
|
| 166 |
+
- **No giant subject-tracker ML.** The video format has 5 fixed layouts
|
| 167 |
+
(with a hard "max 2 items" rule); pixel-level tracking is not needed.
|
| 168 |
+
- **No drag-and-highlight UI.** An MCP tool is a better "UI" for an
|
| 169 |
+
agent-first workflow. If a human wants to override, they pass a
|
| 170 |
+
`LayoutInstruction` with their own `person_x_norm` / `chart_x_norm` /
|
| 171 |
+
`zoom`.
|
| 172 |
+
- **No end-to-end video→video model.** The HIVE paper's core insight is
|
| 173 |
+
that decomposed orchestration beats monolithic generation. We reify
|
| 174 |
+
that insight as a set of small composable tools (nine MCP tools at present; see "Tools exposed" above).
|
| 175 |
+
|
| 176 |
+
## Extending the pilot
|
| 177 |
+
|
| 178 |
+
- Plug a real multimodal model into `classify_scenes_with_llm(vision_fn)`.
|
| 179 |
+
- Plug a real reasoning model into `select_clips_with_llm(text_fn)`.
|
| 180 |
+
- Plug a real vision-LLM into `detect_regions_with_llm(scenes, vision_fn)`
|
| 181 |
+
to get per-scene bboxes + OCR text, then feed the results back through
|
| 182 |
+
`classify_scenes_with_vision`. This is the scene-change → v3 images →
|
| 183 |
+
LLM+OCR → bbox path; see `../docs/SOLUTIONS.md §4` for rationale.
|
| 184 |
+
- All enforce strict JSON outputs, so bad model output can't corrupt
|
| 185 |
+
downstream stages.
|
| 186 |
+
|
| 187 |
+
## Testing
|
| 188 |
+
|
| 189 |
+
```bash
|
| 190 |
+
python -m pytest
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
See [`docs/ARCHITECTURE.md`](docs/ARCHITECTURE.md) for deeper rationale.
|
| 194 |
+
|
| 195 |
+
## License
|
| 196 |
+
|
| 197 |
+
MIT
|
humeo-core/src/humeo_core.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
README.md
|
| 3 |
+
pyproject.toml
|
| 4 |
+
src/humeo_core/__init__.py
|
| 5 |
+
src/humeo_core/schemas.py
|
| 6 |
+
src/humeo_core/server.py
|
| 7 |
+
src/humeo_core.egg-info/PKG-INFO
|
| 8 |
+
src/humeo_core.egg-info/SOURCES.txt
|
| 9 |
+
src/humeo_core.egg-info/dependency_links.txt
|
| 10 |
+
src/humeo_core.egg-info/entry_points.txt
|
| 11 |
+
src/humeo_core.egg-info/requires.txt
|
| 12 |
+
src/humeo_core.egg-info/top_level.txt
|
| 13 |
+
src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf
|
| 14 |
+
src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt
|
| 15 |
+
src/humeo_core/assets/fonts/SourceSans3-OFL.txt
|
| 16 |
+
src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf
|
| 17 |
+
src/humeo_core/primitives/__init__.py
|
| 18 |
+
src/humeo_core/primitives/classify.py
|
| 19 |
+
src/humeo_core/primitives/compile.py
|
| 20 |
+
src/humeo_core/primitives/face_detect.py
|
| 21 |
+
src/humeo_core/primitives/ingest.py
|
| 22 |
+
src/humeo_core/primitives/layouts.py
|
| 23 |
+
src/humeo_core/primitives/select_clips.py
|
| 24 |
+
src/humeo_core/primitives/vision.py
|
| 25 |
+
tests/test_classify.py
|
| 26 |
+
tests/test_compile.py
|
| 27 |
+
tests/test_face_detect.py
|
| 28 |
+
tests/test_layout_bbox.py
|
| 29 |
+
tests/test_layouts.py
|
| 30 |
+
tests/test_schemas.py
|
| 31 |
+
tests/test_select_clips.py
|
| 32 |
+
tests/test_server_tools.py
|
| 33 |
+
tests/test_vision.py
|
humeo-core/src/humeo_core.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
humeo-core/src/humeo_core.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
humeo-core = humeo_core.server:main
|
| 3 |
+
humeo-mcp = humeo_core.server:main
|
humeo-core/src/humeo_core.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
mcp[cli]>=1.2.0
|
| 2 |
+
pydantic>=2.0
|
| 3 |
+
scenedetect>=0.6
|
| 4 |
+
|
| 5 |
+
[dev]
|
| 6 |
+
pytest>=7
|
| 7 |
+
pytest-asyncio>=0.23
|
| 8 |
+
Pillow>=10.0
|
| 9 |
+
|
| 10 |
+
[download]
|
| 11 |
+
yt-dlp>=2024.0
|
| 12 |
+
|
| 13 |
+
[face]
|
| 14 |
+
mediapipe>=0.10
|
| 15 |
+
opencv-python>=4.8
|
| 16 |
+
|
| 17 |
+
[transcribe]
|
| 18 |
+
faster-whisper>=1.0
|
| 19 |
+
|
| 20 |
+
[vision]
|
| 21 |
+
Pillow>=10.0
|
humeo-core/src/humeo_core.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
humeo_core
|
humeo-core/src/humeo_core/__init__.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""humeo-core: reusable-rocket MCP primitives for long-video-to-shorts editing.
|
| 2 |
+
|
| 3 |
+
First-principles design (rocket analogy):
|
| 4 |
+
Container -> schemas.py (strict JSON contracts)
|
| 5 |
+
Landing gear -> primitives/ingest.py, primitives/compile.py (deterministic local)
|
| 6 |
+
Thrusters -> primitives/layouts.py (5 fixed 9:16 layouts, max 2 items)
|
| 7 |
+
Pilot -> primitives/classify.py, primitives/select_clips.py (heuristic, LLM-ready)
|
| 8 |
+
Control panel -> server.py (FastMCP tools that expose all primitives)
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from .schemas import (
|
| 12 |
+
BoundingBox,
|
| 13 |
+
Clip,
|
| 14 |
+
ClipPlan,
|
| 15 |
+
ClipRenderSpan,
|
| 16 |
+
ClipSubtitleWords,
|
| 17 |
+
FocusStackOrder,
|
| 18 |
+
IngestResult,
|
| 19 |
+
LayoutInstruction,
|
| 20 |
+
LayoutKind,
|
| 21 |
+
RenderRequest,
|
| 22 |
+
RenderResult,
|
| 23 |
+
RenderTheme,
|
| 24 |
+
Scene,
|
| 25 |
+
SceneClassification,
|
| 26 |
+
SceneRegions,
|
| 27 |
+
TranscriptWord,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# Public API of the package: exactly the schema models re-exported from
# ``.schemas`` by the import statement above, listed alphabetically.
__all__ = [
    "BoundingBox",
    "Clip",
    "ClipPlan",
    "ClipRenderSpan",
    "ClipSubtitleWords",
    "FocusStackOrder",
    "IngestResult",
    "LayoutInstruction",
    "LayoutKind",
    "RenderRequest",
    "RenderResult",
    "RenderTheme",
    "Scene",
    "SceneClassification",
    "SceneRegions",
    "TranscriptWord",
]

# Package version string. NOTE(review): keep in sync with the ``version``
# field in pyproject.toml, which is declared separately.
__version__ = "0.1.0"
|
humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-Bold.ttf
ADDED
|
Binary file (95.1 kB). View file
|
|
|
humeo-core/src/humeo_core/assets/fonts/LeagueSpartan-OFL.txt
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright 2020 The League Spartan Project Authors (https://github.com/theleagueof/league-spartan)
|
| 2 |
+
|
| 3 |
+
This Font Software is licensed under the SIL Open Font License, Version 1.1.
|
| 4 |
+
This license is copied below, and is also available with a FAQ at:
|
| 5 |
+
https://scripts.sil.org/OFL
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
-----------------------------------------------------------
|
| 9 |
+
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
|
| 10 |
+
-----------------------------------------------------------
|
| 11 |
+
|
| 12 |
+
PREAMBLE
|
| 13 |
+
The goals of the Open Font License (OFL) are to stimulate worldwide
|
| 14 |
+
development of collaborative font projects, to support the font creation
|
| 15 |
+
efforts of academic and linguistic communities, and to provide a free and
|
| 16 |
+
open framework in which fonts may be shared and improved in partnership
|
| 17 |
+
with others.
|
| 18 |
+
|
| 19 |
+
The OFL allows the licensed fonts to be used, studied, modified and
|
| 20 |
+
redistributed freely as long as they are not sold by themselves. The
|
| 21 |
+
fonts, including any derivative works, can be bundled, embedded,
|
| 22 |
+
redistributed and/or sold with any software provided that any reserved
|
| 23 |
+
names are not used by derivative works. The fonts and derivatives,
|
| 24 |
+
however, cannot be released under any other type of license. The
|
| 25 |
+
requirement for fonts to remain under this license does not apply
|
| 26 |
+
to any document created using the fonts or their derivatives.
|
| 27 |
+
|
| 28 |
+
DEFINITIONS
|
| 29 |
+
"Font Software" refers to the set of files released by the Copyright
|
| 30 |
+
Holder(s) under this license and clearly marked as such. This may
|
| 31 |
+
include source files, build scripts and documentation.
|
| 32 |
+
|
| 33 |
+
"Reserved Font Name" refers to any names specified as such after the
|
| 34 |
+
copyright statement(s).
|
| 35 |
+
|
| 36 |
+
"Original Version" refers to the collection of Font Software components as
|
| 37 |
+
distributed by the Copyright Holder(s).
|
| 38 |
+
|
| 39 |
+
"Modified Version" refers to any derivative made by adding to, deleting,
|
| 40 |
+
or substituting -- in part or in whole -- any of the components of the
|
| 41 |
+
Original Version, by changing formats or by porting the Font Software to a
|
| 42 |
+
new environment.
|
| 43 |
+
|
| 44 |
+
"Author" refers to any designer, engineer, programmer, technical
|
| 45 |
+
writer or other person who contributed to the Font Software.
|
| 46 |
+
|
| 47 |
+
PERMISSION & CONDITIONS
|
| 48 |
+
Permission is hereby granted, free of charge, to any person obtaining
|
| 49 |
+
a copy of the Font Software, to use, study, copy, merge, embed, modify,
|
| 50 |
+
redistribute, and sell modified and unmodified copies of the Font
|
| 51 |
+
Software, subject to the following conditions:
|
| 52 |
+
|
| 53 |
+
1) Neither the Font Software nor any of its individual components,
|
| 54 |
+
in Original or Modified Versions, may be sold by itself.
|
| 55 |
+
|
| 56 |
+
2) Original or Modified Versions of the Font Software may be bundled,
|
| 57 |
+
redistributed and/or sold with any software, provided that each copy
|
| 58 |
+
contains the above copyright notice and this license. These can be
|
| 59 |
+
included either as stand-alone text files, human-readable headers or
|
| 60 |
+
in the appropriate machine-readable metadata fields within text or
|
| 61 |
+
binary files as long as those fields can be easily viewed by the user.
|
| 62 |
+
|
| 63 |
+
3) No Modified Version of the Font Software may use the Reserved Font
|
| 64 |
+
Name(s) unless explicit written permission is granted by the corresponding
|
| 65 |
+
Copyright Holder. This restriction only applies to the primary font name as
|
| 66 |
+
presented to the users.
|
| 67 |
+
|
| 68 |
+
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
|
| 69 |
+
Software shall not be used to promote, endorse or advertise any
|
| 70 |
+
Modified Version, except to acknowledge the contribution(s) of the
|
| 71 |
+
Copyright Holder(s) and the Author(s) or with their explicit written
|
| 72 |
+
permission.
|
| 73 |
+
|
| 74 |
+
5) The Font Software, modified or unmodified, in part or in whole,
|
| 75 |
+
must be distributed entirely under this license, and must not be
|
| 76 |
+
distributed under any other license. The requirement for fonts to
|
| 77 |
+
remain under this license does not apply to any document created
|
| 78 |
+
using the Font Software.
|
| 79 |
+
|
| 80 |
+
TERMINATION
|
| 81 |
+
This license becomes null and void if any of the above conditions are
|
| 82 |
+
not met.
|
| 83 |
+
|
| 84 |
+
DISCLAIMER
|
| 85 |
+
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
| 86 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
|
| 87 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
|
| 88 |
+
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
|
| 89 |
+
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
| 90 |
+
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
|
| 91 |
+
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
| 92 |
+
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
|
| 93 |
+
OTHER DEALINGS IN THE FONT SOFTWARE.
|
humeo-core/src/humeo_core/assets/fonts/SourceSans3-OFL.txt
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright 2010-2020 Adobe (http://www.adobe.com/), with Reserved Font Name 'Source'. All Rights Reserved. Source is a trademark of Adobe in the United States and/or other countries.
|
| 2 |
+
|
| 3 |
+
This Font Software is licensed under the SIL Open Font License, Version 1.1.
|
| 4 |
+
|
| 5 |
+
This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
-----------------------------------------------------------
|
| 9 |
+
SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007
|
| 10 |
+
-----------------------------------------------------------
|
| 11 |
+
|
| 12 |
+
PREAMBLE
|
| 13 |
+
The goals of the Open Font License (OFL) are to stimulate worldwide
|
| 14 |
+
development of collaborative font projects, to support the font creation
|
| 15 |
+
efforts of academic and linguistic communities, and to provide a free and
|
| 16 |
+
open framework in which fonts may be shared and improved in partnership
|
| 17 |
+
with others.
|
| 18 |
+
|
| 19 |
+
The OFL allows the licensed fonts to be used, studied, modified and
|
| 20 |
+
redistributed freely as long as they are not sold by themselves. The
|
| 21 |
+
fonts, including any derivative works, can be bundled, embedded,
|
| 22 |
+
redistributed and/or sold with any software provided that any reserved
|
| 23 |
+
names are not used by derivative works. The fonts and derivatives,
|
| 24 |
+
however, cannot be released under any other type of license. The
|
| 25 |
+
requirement for fonts to remain under this license does not apply
|
| 26 |
+
to any document created using the fonts or their derivatives.
|
| 27 |
+
|
| 28 |
+
DEFINITIONS
|
| 29 |
+
"Font Software" refers to the set of files released by the Copyright
|
| 30 |
+
Holder(s) under this license and clearly marked as such. This may
|
| 31 |
+
include source files, build scripts and documentation.
|
| 32 |
+
|
| 33 |
+
"Reserved Font Name" refers to any names specified as such after the
|
| 34 |
+
copyright statement(s).
|
| 35 |
+
|
| 36 |
+
"Original Version" refers to the collection of Font Software components as
|
| 37 |
+
distributed by the Copyright Holder(s).
|
| 38 |
+
|
| 39 |
+
"Modified Version" refers to any derivative made by adding to, deleting,
|
| 40 |
+
or substituting -- in part or in whole -- any of the components of the
|
| 41 |
+
Original Version, by changing formats or by porting the Font Software to a
|
| 42 |
+
new environment.
|
| 43 |
+
|
| 44 |
+
"Author" refers to any designer, engineer, programmer, technical
|
| 45 |
+
writer or other person who contributed to the Font Software.
|
| 46 |
+
|
| 47 |
+
PERMISSION & CONDITIONS
|
| 48 |
+
Permission is hereby granted, free of charge, to any person obtaining
|
| 49 |
+
a copy of the Font Software, to use, study, copy, merge, embed, modify,
|
| 50 |
+
redistribute, and sell modified and unmodified copies of the Font
|
| 51 |
+
Software, subject to the following conditions:
|
| 52 |
+
|
| 53 |
+
1) Neither the Font Software nor any of its individual components,
|
| 54 |
+
in Original or Modified Versions, may be sold by itself.
|
| 55 |
+
|
| 56 |
+
2) Original or Modified Versions of the Font Software may be bundled,
|
| 57 |
+
redistributed and/or sold with any software, provided that each copy
|
| 58 |
+
contains the above copyright notice and this license. These can be
|
| 59 |
+
included either as stand-alone text files, human-readable headers or
|
| 60 |
+
in the appropriate machine-readable metadata fields within text or
|
| 61 |
+
binary files as long as those fields can be easily viewed by the user.
|
| 62 |
+
|
| 63 |
+
3) No Modified Version of the Font Software may use the Reserved Font
|
| 64 |
+
Name(s) unless explicit written permission is granted by the corresponding
|
| 65 |
+
Copyright Holder. This restriction only applies to the primary font name as
|
| 66 |
+
presented to the users.
|
| 67 |
+
|
| 68 |
+
4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font
|
| 69 |
+
Software shall not be used to promote, endorse or advertise any
|
| 70 |
+
Modified Version, except to acknowledge the contribution(s) of the
|
| 71 |
+
Copyright Holder(s) and the Author(s) or with their explicit written
|
| 72 |
+
permission.
|
| 73 |
+
|
| 74 |
+
5) The Font Software, modified or unmodified, in part or in whole,
|
| 75 |
+
must be distributed entirely under this license, and must not be
|
| 76 |
+
distributed under any other license. The requirement for fonts to
|
| 77 |
+
remain under this license does not apply to any document created
|
| 78 |
+
using the Font Software.
|
| 79 |
+
|
| 80 |
+
TERMINATION
|
| 81 |
+
This license becomes null and void if any of the above conditions are
|
| 82 |
+
not met.
|
| 83 |
+
|
| 84 |
+
DISCLAIMER
|
| 85 |
+
THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
| 86 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF
|
| 87 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
|
| 88 |
+
OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE
|
| 89 |
+
COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
| 90 |
+
INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL
|
| 91 |
+
DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
| 92 |
+
FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM
|
| 93 |
+
OTHER DEALINGS IN THE FONT SOFTWARE.
|
humeo-core/src/humeo_core/assets/fonts/SourceSans3-SemiBoldItalic.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39e3ab05ccd7cb94907c31005bb5bec1d5432f0b096a2b782976e217a540eb6c
|
| 3 |
+
size 395372
|
humeo-core/src/humeo_core/primitives/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Primitives: deterministic, composable building blocks of the rocket."""
|
humeo-core/src/humeo_core/primitives/classify.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scene classifier: assigns one of the 5 layouts to each scene.
|
| 2 |
+
|
| 3 |
+
Two backends share the same contract:
|
| 4 |
+
|
| 5 |
+
* ``classify_scenes_heuristic`` — no model call. Uses keyframe pixel analysis
|
| 6 |
+
(edge density + color variance; deliberately no face-rectangle detection)
|
| 7 |
+
to guess which of the 5 layouts fits best. Fully offline, deterministic.
|
| 8 |
+
Note: the heuristic only picks between ``SIT_CENTER`` / ``ZOOM_CALL_CENTER`` /
|
| 9 |
+
``SPLIT_CHART_PERSON``; the two-of-a-kind splits (``SPLIT_TWO_PERSONS`` /
|
| 10 |
+
``SPLIT_TWO_CHARTS``) are only selectable by the vision-LLM backend.
|
| 11 |
+
* ``classify_scenes_with_llm`` — pluggable LLM hook. Takes a callable
|
| 12 |
+
``(image_path, prompt) -> str`` so the caller (MCP client or test) can
|
| 13 |
+
wire up whatever multimodal model they want. Enforces strict JSON output.
|
| 14 |
+
|
| 15 |
+
Even without a model, the heuristic is good enough for many real inputs and
|
| 16 |
+
keeps the whole pipeline runnable with zero external dependencies.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import json
|
| 22 |
+
import os
|
| 23 |
+
import struct
|
| 24 |
+
from typing import Callable, Iterable
|
| 25 |
+
|
| 26 |
+
from ..schemas import LayoutKind, Scene, SceneClassification
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
# Tiny PNG/JPEG reader → down-sampled grayscale column profile
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# We intentionally avoid a hard dependency on Pillow. If Pillow is available
|
| 33 |
+
# we use it; otherwise we fall back to reading just PNG dimensions, which is
|
| 34 |
+
# enough for a coarse column-variance heuristic on any pre-decoded frame.
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _load_grayscale(path: str) -> tuple[list[list[int]], int, int] | None:
    """Load an image as a small grayscale grid for cheap pixel analysis.

    Pillow is imported lazily so it stays an optional dependency.

    Args:
        path: Filesystem path of the image (keyframe) to read.

    Returns:
        ``(grid, width, height)`` where ``grid`` is a list of ``height`` rows,
        each holding ``width`` grayscale pixel values, or ``None`` when Pillow
        is not installed or the image cannot be opened/decoded. Width is
        capped at 128 and height at 72; each axis is capped independently, so
        the aspect ratio is not necessarily preserved.
    """
    try:
        from PIL import Image  # type: ignore

        # The context manager releases the file handle even if decoding fails
        # part-way; ``convert`` returns an independent in-memory image, so it
        # remains usable after the source file is closed.
        with Image.open(path) as src:
            img = src.convert("L")
        w, h = img.size
        # Down-sample to at most 128 cols x 72 rows for cheap analysis.
        tw = min(128, w)
        th = min(72, h)
        img = img.resize((tw, th))
        px = list(img.getdata())
        # ``getdata`` is flat and row-major: slice it back into rows.
        grid = [px[i * tw : (i + 1) * tw] for i in range(th)]
        return grid, tw, th
    except Exception:
        # Deliberate best-effort: a missing Pillow install or an unreadable
        # image both mean "no pixel data available", not a hard failure.
        return None
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _png_dims(path: str) -> tuple[int, int] | None:
    """Read ``(width, height)`` from a PNG file header without decoding it.

    Only the first 24 bytes are read: the 8-byte PNG signature, the chunk
    length and type of the first chunk, and the two big-endian 32-bit
    dimension fields. Returns ``None`` if the file cannot be read, is
    shorter than 24 bytes, lacks the PNG signature, or its first chunk is
    not ``IHDR``.
    """
    try:
        with open(path, "rb") as f:
            head = f.read(24)
    except (OSError, TypeError, ValueError):
        # Unreadable file or an unusable ``path`` value: treat as "not a PNG".
        return None
    if len(head) < 24:
        return None
    if head[:8] != b"\x89PNG\r\n\x1a\n" or head[12:16] != b"IHDR":
        return None
    w, h = struct.unpack(">II", head[16:24])
    return int(w), int(h)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _column_profile(grid: list[list[int]]) -> list[float]:
    """Return the mean pixel value of every column of ``grid``.

    ``grid`` is a list of equally long rows. The result has one float per
    column; an empty grid yields an empty list.
    """
    if not grid:
        return []
    h = len(grid)
    # zip(*grid) walks the grid column by column.
    return [sum(column) / h for column in zip(*grid)]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _variance(values: Iterable[float]) -> float:
    """Return the population variance of ``values`` (0.0 when empty).

    The iterable is materialised once so generators are accepted.
    """
    vs = list(values)
    if not vs:
        return 0.0
    n = len(vs)
    mean = sum(vs) / n
    return sum((v - mean) ** 2 for v in vs) / n
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
# Heuristic classifier
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _classify_one_heuristic(keyframe_path: str | None) -> SceneClassification:
    """Guess a layout for one keyframe from its column-brightness profile.

    The returned classification always carries the placeholder
    ``scene_id="?"``; the caller is expected to substitute the real id.

    Decision order:

    1. Missing path/file, or unreadable pixels -> ``SIT_CENTER`` with low
       confidence (0.3 / 0.25).
    2. Strong left/right contrast (score > 20.0) -> ``SPLIT_CHART_PERSON``.
    3. Low overall column variance (< 100.0) -> ``ZOOM_CALL_CENTER``.
    4. Otherwise -> ``SIT_CENTER`` (confidence 0.6).
    """
    if not keyframe_path or not os.path.exists(keyframe_path):
        return SceneClassification(
            scene_id="?",
            layout=LayoutKind.SIT_CENTER,
            confidence=0.3,
            reason="no keyframe available — defaulting to SIT_CENTER",
        )

    gs = _load_grayscale(keyframe_path)
    if gs is None:
        # Can't read pixels: still return a safe default with low confidence.
        return SceneClassification(
            scene_id="?",
            layout=LayoutKind.SIT_CENTER,
            confidence=0.25,
            reason="PIL unavailable or image unreadable — defaulting to SIT_CENTER",
        )

    grid, w, _h = gs
    cols = _column_profile(grid)

    def _split_contrast(left: list[float], right: list[float]) -> float:
        """Ratio of between-region to within-region variance of column means."""
        if not left or not right:
            # A region with no columns cannot contrast with anything. Without
            # this guard an empty side is treated as mean 0 / variance 0 and
            # produces a huge bogus score on frames only 1-2 pixels wide.
            return 0.0
        lm = sum(left) / len(left)
        rm = sum(right) / len(right)
        between = (lm - rm) ** 2
        # Small epsilon keeps the ratio finite for perfectly flat regions.
        within = (_variance(left) + _variance(right)) / 2.0 + 1e-6
        return between / within

    # Left/right halves — good for symmetric two-up scenes.
    mid = max(1, w // 2)
    split_halves = _split_contrast(cols[:mid], cols[mid:])

    # Left 2/3 vs right 1/3 — matches explainer slides (chart + talking head).
    t = max(1, w // 3)
    split_thirds = _split_contrast(cols[: 2 * t], cols[2 * t :])

    split_score = max(split_halves, split_thirds)
    # Overall column variance: low variance → flat composition (zoom call).
    overall_var = _variance(cols)

    # Threshold tuned on Ark-style 2/3 chart + 1/3 speaker; "thirds" score catches
    # layouts where half-vs-half contrast was too weak (e.g. clip 005 vs 004).
    if split_score > 20.0:
        return SceneClassification(
            scene_id="?",
            layout=LayoutKind.SPLIT_CHART_PERSON,
            confidence=min(0.95, 0.5 + split_score / 200.0),
            reason=(
                f"chart/person contrast (halves={split_halves:.1f}, "
                f"thirds={split_thirds:.1f} → max={split_score:.1f})"
            ),
        )
    if overall_var < 100.0:
        return SceneClassification(
            scene_id="?",
            layout=LayoutKind.ZOOM_CALL_CENTER,
            confidence=0.7,
            reason=f"low column variance ({overall_var:.1f}) — flat centered framing",
        )
    return SceneClassification(
        scene_id="?",
        layout=LayoutKind.SIT_CENTER,
        confidence=0.6,
        reason=f"moderate composition (score={split_score:.1f}, var={overall_var:.1f})",
    )
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def classify_scenes_heuristic(scenes: list[Scene]) -> list[SceneClassification]:
    """Classify every scene with the offline pixel heuristic.

    Returns one classification per input scene, in order. Each result is a
    copy of the per-keyframe classification with its placeholder
    ``scene_id`` replaced by the scene's own id.
    """
    return [
        _classify_one_heuristic(scene.keyframe_path).model_copy(
            update={"scene_id": scene.scene_id}
        )
        for scene in scenes
    ]
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
# ---------------------------------------------------------------------------
|
| 174 |
+
# LLM-backed classifier (caller provides the model hook)
|
| 175 |
+
# ---------------------------------------------------------------------------
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
LLMVisionFn = Callable[[str, str], str]
"""Signature: (image_path, prompt) -> raw model string (expected JSON)."""


# NOTE(review): only three layouts are offered to the model here; confirm
# whether the two-of-a-kind split layouts should be listed as well.
CLASSIFIER_PROMPT = """You are a scene layout classifier for a short-video editor.
Return ONLY a JSON object of the form:
{"layout": "<one of: zoom_call_center | sit_center | split_chart_person>",
"confidence": <0..1 float>,
"reason": "<=15 words"}

Layout definitions:
- zoom_call_center: one person on a video call (webcam grid / talking head tight crop), subject centered.
- sit_center: one person sitting in frame, subject centered, wider framing than a zoom call.
- split_chart_person: an explainer scene with a chart/graphic on the LEFT (~2/3 of frame) and a person on the RIGHT (~1/3).

Pick the single best match. No prose, no markdown, JSON only.
"""
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def classify_scenes_with_llm(
    scenes: list[Scene], vision_fn: LLMVisionFn
) -> list[SceneClassification]:
    """Classify scenes by asking a caller-supplied vision model.

    For each scene with a keyframe, ``vision_fn(keyframe_path,
    CLASSIFIER_PROMPT)`` is called and its return value is parsed as JSON
    with a required ``layout`` key and optional ``confidence`` (default
    0.5) and ``reason`` (capped at 200 characters).

    Fallbacks, both using ``SIT_CENTER``:

    * no keyframe -> confidence 0.2, the model is not called;
    * response cannot be parsed/validated -> confidence 0.25, with the
      exception recorded in ``reason``.

    Exceptions raised by ``vision_fn`` itself are not caught and propagate
    to the caller.
    """
    out: list[SceneClassification] = []
    for s in scenes:
        if not s.keyframe_path:
            out.append(
                SceneClassification(
                    scene_id=s.scene_id,
                    layout=LayoutKind.SIT_CENTER,
                    confidence=0.2,
                    reason="no keyframe",
                )
            )
            continue

        raw = vision_fn(s.keyframe_path, CLASSIFIER_PROMPT)
        try:
            data = json.loads(raw)
            result = SceneClassification(
                scene_id=s.scene_id,
                layout=LayoutKind(data["layout"]),
                confidence=float(data.get("confidence", 0.5)),
                reason=str(data.get("reason", ""))[:200],
            )
        except Exception as e:
            # Deliberately broad: bad JSON, a non-dict payload, an unknown
            # layout value and schema validation errors all degrade to the
            # same safe default. The reason is capped at 200 characters to
            # match the success path above.
            result = SceneClassification(
                scene_id=s.scene_id,
                layout=LayoutKind.SIT_CENTER,
                confidence=0.25,
                reason=f"LLM parse error: {e!r}"[:200],
            )
        out.append(result)
    return out
|
humeo-core/src/humeo_core/primitives/compile.py
ADDED
|
@@ -0,0 +1,602 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compiler: assemble a final 9:16 clip from source + clip + layout instruction.
|
| 2 |
+
|
| 3 |
+
Builds the ffmpeg invocation, optionally runs it. Keeping ``dry_run`` as a
|
| 4 |
+
first-class mode means the MCP server can return the exact command without
|
| 5 |
+
executing — ideal for an agent that wants to review before spending CPU.
|
| 6 |
+
|
| 7 |
+
Rendering order is fixed and intentional:
|
| 8 |
+
|
| 9 |
+
1. **Cut + crop/compose.** ``plan_layout`` produces the base filtergraph
|
| 10 |
+
that takes the source, applies the layout-specific crops, and emits a
|
| 11 |
+
labelled ``[vout]`` at the exact output resolution (e.g. 1080x1920).
|
| 12 |
+
2. **Overlay title** (``drawtext``) — skipped for split layouts because
|
| 13 |
+
the source itself already has a slide/chart title and an extra overlay
|
| 14 |
+
just obscures content.
|
| 15 |
+
3. **Subtitles.** ``subtitles`` filter runs **last** so text is drawn over
|
| 16 |
+
the finished composition, not the source. ``original_size`` is pinned
|
| 17 |
+
to the output resolution so libass coordinate math (MarginV, FontSize)
|
| 18 |
+
is in *output pixels*, not libass's default PlayResY=288 — which was
|
| 19 |
+
the bug behind the "subtitles blocked / floating in the middle" look.
|
| 20 |
+
4. **Mux** with the source audio stream (``0:a:0``).
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import os
|
| 26 |
+
import shutil
|
| 27 |
+
import subprocess
|
| 28 |
+
import tempfile
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
from ..schemas import RenderRequest, RenderResult, RenderTheme, SPLIT_LAYOUTS
|
| 32 |
+
from .layouts import plan_layout
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _ensure_ffmpeg() -> str:
    """Return the path of the ``ffmpeg`` executable found on ``PATH``.

    Raises:
        RuntimeError: if ``shutil.which`` cannot locate ``ffmpeg``.
    """
    exe = shutil.which("ffmpeg")
    if not exe:
        raise RuntimeError("ffmpeg not found on PATH")
    return exe
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _ensure_windows_fontconfig() -> dict[str, str]:
    """Return subprocess env with a minimal fontconfig setup on Windows.

    Some Windows FFmpeg builds ship libass + fontconfig but do not bundle a
    default fontconfig config, which makes subtitle rendering fail with:

    ``Fontconfig error: Cannot load default config file: No such file: (null)``

    We generate a tiny config that points fontconfig at the Windows fonts
    directory (``%WINDIR%/Fonts``, default ``C:/Windows/Fonts``) and a
    writable cache dir under ``%LOCALAPPDATA%/humeo``. The config file is
    written only if it does not exist yet. Non-Windows platforms, and
    environments that already set ``FONTCONFIG_FILE``, get a plain copy of
    the current environment.
    """
    env = os.environ.copy()
    if os.name != "nt" or env.get("FONTCONFIG_FILE"):
        return env

    # Local import: only needed on the Windows path.
    from xml.sax.saxutils import escape as xml_escape

    fallback_root = Path(tempfile.gettempdir()) / "humeo-local"
    local_appdata = Path(env.get("LOCALAPPDATA", str(fallback_root)))
    cfg_dir = local_appdata / "humeo" / "fontconfig"
    cache_dir = local_appdata / "humeo" / "fontconfig-cache"
    cfg_dir.mkdir(parents=True, exist_ok=True)
    cache_dir.mkdir(parents=True, exist_ok=True)

    cfg_file = cfg_dir / "fonts.conf"
    windows_fonts = Path(env.get("WINDIR", r"C:\Windows")) / "Fonts"
    if not cfg_file.exists():
        # Paths are XML-escaped so characters such as ``&`` in a profile
        # directory name cannot produce a malformed config file.
        lines = [
            '<?xml version="1.0"?>',
            "<fontconfig>",
            f"  <dir>{xml_escape(windows_fonts.as_posix())}</dir>",
            f"  <cachedir>{xml_escape(cache_dir.as_posix())}</cachedir>",
            "</fontconfig>",
            "",
        ]
        cfg_file.write_text("\n".join(lines), encoding="utf-8")

    env["FONTCONFIG_PATH"] = str(cfg_dir)
    env["FONTCONFIG_FILE"] = str(cfg_file)
    return env
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _escape_drawtext(text: str) -> str:
    """Sanitise ``text`` for use as a single-quoted ``drawtext`` text value.

    drawtext quoting is brittle across ffmpeg builds, so keep it simple:
    collapse whitespace runs to single spaces, drop apostrophes, then
    escape backslashes and colons. Backslashes are doubled *before* colons
    are escaped so the backslash added for a colon is not doubled again.
    """
    collapsed = " ".join(text.split())
    without_quotes = collapsed.replace("'", "")
    return without_quotes.replace("\\", "\\\\").replace(":", "\\:")
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
# Title overlay planning
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
#
|
| 102 |
+
# ffmpeg ``drawtext`` does not wrap text by itself; whatever you hand it is
|
| 103 |
+
# emitted as a single line. With a fixed 72px font and no width budget, the
|
| 104 |
+
# "Prediction Markets vs Derivatives" title on a 1080px canvas would spill
|
| 105 |
+
# past both edges and show up clipped (the user reported exactly this bug).
|
| 106 |
+
#
|
| 107 |
+
# The helpers below plan a title layout BEFORE it hits drawtext:
|
| 108 |
+
#
|
| 109 |
+
# 1. Short titles (fit at 72px single line): emit the existing single
|
| 110 |
+
# ``drawtext`` call unchanged so golden tests and previously-calibrated
|
| 111 |
+
# visuals stay byte-for-byte identical.
|
| 112 |
+
# 2. Long titles: split at the best word boundary into two balanced lines and
|
| 113 |
+
# emit two stacked ``drawtext`` filters at a slightly smaller font
|
| 114 |
+
# (60px / 52px / 44px, auto-shrinking until both lines fit).
|
| 115 |
+
# 3. Single-word titles that still overflow: shrink the single line until it
|
| 116 |
+
# fits, then hard-truncate with an ellipsis as a last resort.
|
| 117 |
+
#
|
| 118 |
+
# The character-width estimate is deliberately conservative (0.55 * fontsize)
|
| 119 |
+
# so mixed-case prose with wide letters like W/M still clears the margin.
|
| 120 |
+
# Calibrated visually against Arial Bold on 1080p output.
|
| 121 |
+
|
| 122 |
+
_TITLE_PRIMARY_SIZE = 72  # Current "hero" title size; preserved for short titles.
_TITLE_MIN_SIZE = 44  # Readability floor at 1080x1920 output.
_TITLE_MARGIN_PX = 60  # Horizontal safe-area on each side.
_TITLE_Y_TOP = 80  # Pixel offset of the top title baseline (matches pre-P2 look).
_TITLE_CHAR_WIDTH_RATIO = 0.55
_TITLE_LINE_SPACING_RATIO = 1.3

# Keep the overlay font explicit. Without a ``font=`` directive, drawtext
# falls back to fontconfig's "Sans", which resolves to a serif (Times New
# Roman) on default Windows installs — the "ugly serif title" bug reported
# against v1. Arial matches the ASS subtitle ``Fontname`` below so the
# title and captions read as a single typographic family. Keep this in
# sync with the ``Fontname=Arial`` in the subtitle filter if it ever
# changes.
_TITLE_FONT_NAME = "Arial"
_REFERENCE_TITLE_FONT_NAME = "League Spartan"
_REFERENCE_CAPTION_FONT_NAME = "Source Sans 3"
_REFERENCE_TITLE_BAR_X = 28
_REFERENCE_TITLE_BAR_Y = 32
_REFERENCE_TITLE_BAR_W = 1024
_REFERENCE_TITLE_BAR_H = 148
_REFERENCE_TITLE_TEXT_X = 72
_REFERENCE_TITLE_TEXT_Y = 54
_REFERENCE_TITLE_SIZE = 64
_REFERENCE_CAPTION_BAR_X = 0
_REFERENCE_CAPTION_BAR_W = 1080
_REFERENCE_CAPTION_BAR_H = 120
_REFERENCE_CAPTION_TEXT_MARGIN_L = 92
_REFERENCE_CAPTION_TEXT_MARGIN_R = 92
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _fonts_dir() -> Path:
    """Return the ``assets/fonts`` directory two levels above this module file."""
    package_root = Path(__file__).resolve().parents[1]
    return package_root / "assets" / "fonts"
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _bundled_font_path(filename: str) -> Path | None:
    """Return the path of ``filename`` inside the fonts directory, or ``None``.

    ``None`` is returned when no regular file with that name exists there.
    """
    candidate = _fonts_dir() / filename
    if candidate.is_file():
        return candidate
    return None
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def _title_char_px(size_px: int) -> float:
    """Return the estimated width in pixels of one character at ``size_px``."""
    return size_px * _TITLE_CHAR_WIDTH_RATIO
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _title_fits(text: str, size_px: int, usable_w: int) -> bool:
    """Return True if ``text`` at ``size_px`` is estimated to fit in ``usable_w``.

    The estimate is ``len(text)`` times the per-character width, truncated
    to an integer before comparing.
    """
    estimated_px = int(len(text) * _title_char_px(size_px))
    return estimated_px <= usable_w
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _wrap_title_two_lines(text: str) -> tuple[str, str]:
    """Split ``text`` at the word boundary that most balances the two halves.

    Returns ``(line1, line2)`` with words re-joined by single spaces. If
    ``text`` has fewer than two words, returns ``(text, "")`` unchanged and
    the caller should fall back to single-line shrinking. When several
    split points are equally balanced, the earliest one wins.
    """
    words = text.split()
    if len(words) < 2:
        return text, ""

    # Length of the whole title once joined with single spaces.
    total_len = sum(len(word) for word in words) + len(words) - 1

    best_idx = 1
    best_delta = 10**9
    left_len = -1  # Cancels the separator counted for the first word.
    for idx in range(1, len(words)):
        # Track the left line's length incrementally instead of re-joining
        # both halves at every candidate split.
        left_len += len(words[idx - 1]) + 1
        right_len = total_len - left_len - 1
        delta = abs(left_len - right_len)
        if delta < best_delta:
            best_delta = delta
            best_idx = idx
    return " ".join(words[:best_idx]), " ".join(words[best_idx:])
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def _drawtext_font_arg() -> str:
    """Return a drawtext font selector that is stable on the current platform.

    On Windows, when ``%WINDIR%/Fonts/arial.ttf`` exists, an explicit
    ``fontfile='...'`` selector pointing at it is returned. In every other
    case the font is selected by name via ``font=``.
    """
    if os.name == "nt":
        windir = os.environ.get("WINDIR", r"C:\Windows")
        arial = Path(windir) / "Fonts" / "arial.ttf"
        if arial.is_file():
            return f"fontfile='{_escape_filter_path(str(arial))}'"
    return f"font={_TITLE_FONT_NAME}"
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _reference_title_font_arg() -> str:
    """Return the drawtext font selector for the reference-theme title.

    Prefers a bundled League Spartan file (``LeagueSpartan-Bold-static.ttf``
    first, then ``LeagueSpartan-Bold.ttf``) as ``fontfile='...'``; falls
    back to selecting the font by name when neither file is present.
    """
    for filename in ("LeagueSpartan-Bold-static.ttf", "LeagueSpartan-Bold.ttf"):
        bundled = _bundled_font_path(filename)
        if bundled is not None:
            return f"fontfile='{_escape_filter_path(str(bundled))}'"
    return f"font={_REFERENCE_TITLE_FONT_NAME}"
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def _drawtext_single(text: str, size: int, y: int) -> str:
    """Build one horizontally centred ``drawtext`` filter for ``text``.

    The text is escaped, drawn in white with a 4px black border at font
    size ``size``, centred on the x axis and placed at vertical offset
    ``y``. ``expansion=none`` is set on the filter.
    """
    esc = _escape_drawtext(text)
    return (
        f"drawtext=text='{esc}':"
        "expansion=none:"
        f"{_drawtext_font_arg()}:"
        f"fontcolor=white:fontsize={size}:borderw=4:bordercolor=black:"
        f"x=(w-text_w)/2:y={y}"
    )
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def _drawtext_two(line1: str, line2: str, size: int, y_top: int) -> str:
    """Two drawtext filters chained by comma — one ffmpeg filter chain, two lines.

    ``line1`` is drawn at ``y_top``; ``line2`` is drawn below it, offset by
    ``size * _TITLE_LINE_SPACING_RATIO`` rounded to whole pixels. Each line
    is rendered exactly as :func:`_drawtext_single` would render it.
    """
    y_bottom = y_top + int(round(size * _TITLE_LINE_SPACING_RATIO))
    top = _drawtext_single(line1, size, y_top)
    bottom = _drawtext_single(line2, size, y_bottom)
    return f"{top},{bottom}"
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def plan_title_drawtext(title_text: str, out_w: int = 1080) -> str | None:
    """Return the ``drawtext`` filter fragment for ``title_text`` or None to skip.

    The returned string is intended to be spliced into the main filtergraph
    between the ``[v_prepad]`` and ``[vout]`` labels by
    :func:`build_ffmpeg_cmd`. It does NOT include those labels itself.

    Planning order:

    1. Empty/whitespace-only title -> ``None``.
    2. Fits on one line at the primary size -> single ``drawtext``.
    3. Otherwise try two balanced lines at 60 / 52 / minimum size.
    4. Otherwise try a single line at 64 / 56 / 52 / minimum size.
    5. Last resort: truncate to the character budget at the minimum size,
       ending in ``...``.

    Backward compatibility: when the title fits on one line at the original
    72px size, the output is identical to the pre-P2 single-``drawtext``
    form (same x/y/fontsize/borderw), so golden ffmpeg tests stay green.
    """
    text = " ".join((title_text or "").split())
    if not text:
        return None
    usable_w = max(1, out_w - 2 * _TITLE_MARGIN_PX)

    if _title_fits(text, _TITLE_PRIMARY_SIZE, usable_w):
        return _drawtext_single(text, _TITLE_PRIMARY_SIZE, _TITLE_Y_TOP)

    line1, line2 = _wrap_title_two_lines(text)
    if line2:
        for size in (60, 52, _TITLE_MIN_SIZE):
            if _title_fits(line1, size, usable_w) and _title_fits(line2, size, usable_w):
                return _drawtext_two(line1, line2, size, _TITLE_Y_TOP)

    for size in (64, 56, 52, _TITLE_MIN_SIZE):
        if _title_fits(text, size, usable_w):
            return _drawtext_single(text, size, _TITLE_Y_TOP)

    max_chars = max(4, int(usable_w / _title_char_px(_TITLE_MIN_SIZE)))
    # Reserve three characters for the "..." suffix so the truncated title
    # stays within ``max_chars`` instead of overshooting it by two.
    truncated = text[: max_chars - 3].rstrip() + "..."
    return _drawtext_single(truncated, _TITLE_MIN_SIZE, _TITLE_Y_TOP)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _reference_title_fragment(title_text: str, out_w: int = 1080) -> str:
    """Build the reference-theme title bar: two ``drawbox`` fills plus text.

    The fragment always contains a dark background bar and a narrow accent
    strip at its left edge. When ``title_text`` is non-empty after
    whitespace normalisation, one or two ``drawtext`` filters follow:

    * one line at the reference size if it fits;
    * else two balanced lines, shrinking from 54 down to 42 in steps of 2;
    * else one line shrunk from the reference size down to 38, truncated
      with ``...`` if it still does not fit.

    All filters are joined with commas; no stream labels are included.
    """
    bar_w = min(_REFERENCE_TITLE_BAR_W, max(320, out_w - 2 * _REFERENCE_TITLE_BAR_X))
    accent_w = 16
    title = " ".join((title_text or "").split())
    usable_w = max(220, bar_w - (_REFERENCE_TITLE_TEXT_X - _REFERENCE_TITLE_BAR_X) - 30)

    def _text_filter(line: str, size: int, y: int) -> str:
        # Single template shared by the one-line, two-line and shrunk forms.
        esc = _escape_drawtext(line)
        return (
            f"drawtext=text='{esc}':expansion=none:{_reference_title_font_arg()}:"
            f"fontcolor=white:fontsize={size}:"
            "borderw=1.2:bordercolor=0x101010@0.18:"
            f"x={_REFERENCE_TITLE_TEXT_X}:y={y}"
        )

    text_filters: list[str] = []
    if title:
        if _title_fits(title, _REFERENCE_TITLE_SIZE, usable_w):
            text_filters.append(
                _text_filter(title, _REFERENCE_TITLE_SIZE, _REFERENCE_TITLE_TEXT_Y)
            )
        else:
            line1, line2 = _wrap_title_two_lines(title)

            def _both_fit(size: int) -> bool:
                return _title_fits(line1, size, usable_w) and _title_fits(
                    line2, size, usable_w
                )

            two_line_size = 54
            while line2 and two_line_size > 42 and not _both_fit(two_line_size):
                two_line_size -= 2

            if line2 and _both_fit(two_line_size):
                y_top = 36
                y_bottom = y_top + int(round(two_line_size * 1.08))
                text_filters.append(_text_filter(line1, two_line_size, y_top))
                text_filters.append(_text_filter(line2, two_line_size, y_bottom))
            else:
                size = _REFERENCE_TITLE_SIZE
                while size > 38 and not _title_fits(title, size, usable_w):
                    size -= 2
                if not _title_fits(title, size, usable_w):
                    max_chars = max(8, int(usable_w / _title_char_px(size)))
                    # Reserve three characters for "..." so the truncated
                    # title does not exceed the character budget.
                    title = title[: max_chars - 3].rstrip() + "..."
                text_filters.append(_text_filter(title, size, _REFERENCE_TITLE_TEXT_Y))

    text_filter = f",{','.join(text_filters)}" if text_filters else ""
    return (
        f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:"
        f"w={bar_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x1F1F1F@0.84:t=fill,"
        f"drawbox=x={_REFERENCE_TITLE_BAR_X}:y={_REFERENCE_TITLE_BAR_Y}:"
        f"w={accent_w}:h={_REFERENCE_TITLE_BAR_H}:color=0x2A2453@0.98:t=fill"
        f"{text_filter}"
    )
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
def _reference_caption_bar_fragment(
    *,
    out_w: int = 1080,
    out_h: int = 1920,
    margin_v: int = 166,
    font_size: int = 38,
) -> str:
    """Build the reference-theme caption bar as two chained ``drawbox`` filters.

    The bar is a solid fill with a 3px highlight strip along its top edge.

    * Width: ``out_w`` less twice the bar's x offset, clamped to
      ``[320, _REFERENCE_CAPTION_BAR_W]``.
    * Height: the larger of ``_REFERENCE_CAPTION_BAR_H`` and
      ``font_size * 2.05`` rounded.
    * Vertical position: ``max(40, margin_v)`` above the bottom edge, but
      never higher than 36px below the title bar.
    """
    bar_w = min(_REFERENCE_CAPTION_BAR_W, max(320, out_w - 2 * _REFERENCE_CAPTION_BAR_X))
    bar_h = max(_REFERENCE_CAPTION_BAR_H, int(round(font_size * 2.05)))
    # Keep the caption bar clear of the title bar on very short outputs.
    min_y = _REFERENCE_TITLE_BAR_Y + _REFERENCE_TITLE_BAR_H + 36
    bar_y = max(min_y, out_h - max(40, margin_v) - bar_h)
    return (
        f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:"
        f"w={bar_w}:h={bar_h}:color=0x6570E6@1.0:t=fill,"
        f"drawbox=x={_REFERENCE_CAPTION_BAR_X}:y={bar_y}:"
        f"w={bar_w}:h=3:color=0xE4E7FF@0.14:t=fill"
    )
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def _escape_filter_path(path: str) -> str:
    """Escape a filesystem path for use inside an ffmpeg filter option value.

    Backslashes become forward slashes, then colons and apostrophes are
    prefixed with a backslash.
    """
    # NOTE(review): callers wrap the result in single quotes; a backslash
    # may not escape an apostrophe inside a single-quoted filtergraph value,
    # so paths containing "'" should be verified against a real ffmpeg build.
    forward = path.replace("\\", "/")
    return forward.replace(":", "\\:").replace("'", "\\'")
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def _has_audio_stream(media_path: str) -> bool:
    """Return True if ``ffprobe`` reports an audio stream in ``media_path``.

    ``ffprobe`` is asked for the ``codec_type`` of the first audio stream
    (``a:0``) in CSV form. The result is False when ``ffprobe`` is not on
    ``PATH``, cannot be launched, exits non-zero, or prints nothing that
    contains ``"audio"``.
    """
    probe = shutil.which("ffprobe")
    if not probe:
        return False
    cmd = [
        probe,
        "-v",
        "error",
        "-select_streams",
        "a:0",
        "-show_entries",
        "stream=codec_type",
        "-of",
        "csv=p=0",
        media_path,
    ]
    try:
        out = subprocess.run(cmd, check=False, capture_output=True, text=True)
    except OSError:
        # ffprobe was found but could not be executed: same answer as
        # "ffprobe missing" rather than crashing the caller.
        return False
    return out.returncode == 0 and "audio" in (out.stdout or "").lower()
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
def build_ffmpeg_cmd(
    req: RenderRequest,
    *,
    src_w: int = 1920,
    src_h: int = 1080,
    include_audio: bool = True,
) -> list[str]:
    """Build the ffmpeg argv that renders ``req`` to ``req.output_path``.

    The layout planner supplies a filtergraph ending in ``[vout]``; theme
    chrome (title / caption bar) and the subtitle burn-in are spliced in by
    rewriting that final label, so the finished graph still ends in ``[vout]``.

    Args:
        req: Render request (source, clip window, layout, theme, subtitles).
        src_w: Source video width in pixels, forwarded to the layout planner.
        src_h: Source video height in pixels, forwarded to the layout planner.
        include_audio: When true, map ``0:a:0`` and encode it as AAC.

    Returns:
        The argv list, executable first.

    Side effects:
        Creates the parent directory of ``req.output_path`` (also in
        ``dry_run`` mode). Outside ``dry_run`` the executable is resolved via
        ``_ensure_ffmpeg()``.
    """
    exe = _ensure_ffmpeg() if req.mode != "dry_run" else "ffmpeg"

    plan = plan_layout(
        req.layout, out_w=req.width, out_h=req.height, src_w=src_w, src_h=src_h
    )
    fg = plan.filtergraph

    is_reference = req.render_theme == RenderTheme.REFERENCE_LOWER_THIRD
    is_native = req.render_theme == RenderTheme.NATIVE_HIGHLIGHT

    # Reference-theme caption metrics, clamped once so the drawn caption bar
    # and the libass style below can never drift apart. Only evaluated when
    # subtitles are actually burned in.
    ref_margin_v = ref_font_size = 0
    if is_reference and req.subtitle_path:
        ref_margin_v = min(req.subtitle_margin_v, 136)
        ref_font_size = max(req.subtitle_font_size, 124)

    if is_reference:
        chrome_parts = [_reference_title_fragment(req.title_text, out_w=req.width)]
        if req.subtitle_path:
            chrome_parts.append(
                _reference_caption_bar_fragment(
                    out_w=req.width,
                    out_h=req.height,
                    margin_v=ref_margin_v,
                    font_size=ref_font_size,
                )
            )
        chrome = ",".join(part for part in chrome_parts if part)
        # Guard: with no chrome at all the splice would leave
        # "[v_prepad];[v_prepad][vout]" -- a chain containing no filter.
        if chrome:
            fg = fg.replace("[vout]", f"[v_prepad];[v_prepad]{chrome}[vout]")
    elif not is_native:
        # (The native-highlight theme mirrors the reference short in
        # videoplayback (12): no separate top title card, just centered
        # floating captions with per-word highlight timing -- so it adds no
        # chrome here.)
        #
        # Skip the drawtext title overlay on split layouts: the top band already
        # shows a slide/chart with its own baked-in title, so adding an overlay
        # on top of that is pure noise (and was stacking over the chart title
        # in the SPLIT_CHART_PERSON Cathy Wood shorts).
        title_allowed = req.layout.layout not in SPLIT_LAYOUTS
        if req.title_text and title_allowed:
            # ``plan_title_drawtext`` returns a full filter fragment (possibly
            # two chained ``drawtext`` calls) that fits within the output width.
            # For short titles it is byte-identical to the pre-P2 single-line
            # form, keeping existing golden tests green while fixing the
            # "Prediction Markets vs Derivatives" edge-clip report.
            title_fragment = plan_title_drawtext(req.title_text, out_w=req.width)
            if title_fragment:
                fg = fg.replace(
                    "[vout]",
                    f"[v_prepad];[v_prepad]{title_fragment}[vout]",
                )

    if req.subtitle_path:
        subtitle_esc = _escape_filter_path(req.subtitle_path)
        fonts_dir = _fonts_dir()
        fontsdir_arg = (
            f":fontsdir='{_escape_filter_path(str(fonts_dir))}'" if fonts_dir.is_dir() else ""
        )
        # ``original_size`` pins libass's PlayResY to the actual output so
        # ``FontSize`` and ``MarginV`` are interpreted in output pixels. Without
        # this, libass defaults to PlayResY=288 and then upscales to the real
        # canvas (1920) -- blowing font sizes and pushing subtitles to the
        # middle of the frame. ``WrapStyle=0`` enables smart word wrap so long
        # lines break into readable stacks instead of running off-screen.
        if is_reference:
            force_style = (
                f"Fontname={_REFERENCE_CAPTION_FONT_NAME},"
                f"FontSize={ref_font_size},Alignment=2,"
                f"MarginV={ref_margin_v},"
                "MarginL=56,MarginR=56,"
                "WrapStyle=0,BorderStyle=1,Outline=2,Shadow=0,"
                "BackColour=&H00000000&,PrimaryColour=&H00FFFFFF&,"
                "Bold=1,Italic=0,Spacing=-1"
            )
        elif is_native:
            # Native-highlight passes no style override to the subtitles filter.
            force_style = ""
        else:
            force_style = (
                "Fontname=Arial,"
                f"FontSize={req.subtitle_font_size},Alignment=2,"
                f"MarginV={req.subtitle_margin_v},MarginL=60,MarginR=60,"
                "WrapStyle=0,BorderStyle=4,"
                "BackColour=&H70000000&,PrimaryColour=&H00FFFFFF&,"
                "Outline=0,Shadow=0,Bold=1"
            )

        subtitle_filter = (
            "[v_sub_in];"
            f"[v_sub_in]subtitles='{subtitle_esc}'{fontsdir_arg}:"
            f"original_size={req.width}x{req.height}"
        )
        if force_style:
            subtitle_filter += f":force_style='{force_style}'"
        subtitle_filter += "[vout]"
        fg = fg.replace("[vout]", subtitle_filter)

    start = req.clip.start_time_sec
    # Never ask ffmpeg for a clip shorter than 0.1 s.
    dur = max(0.1, req.clip.duration_sec)

    Path(req.output_path).parent.mkdir(parents=True, exist_ok=True)

    cmd: list[str] = [
        exe,
        "-y",
        # -ss/-t placed before -i so they apply to the input.
        "-ss",
        f"{start:.3f}",
        "-t",
        f"{dur:.3f}",
        "-i",
        req.source_path,
        "-filter_complex",
        fg,
        "-map",
        "[vout]",
        "-c:v",
        "libx264",
        "-preset",
        "veryfast",
        "-crf",
        "20",
    ]

    if include_audio:
        cmd.extend(["-map", "0:a:0", "-c:a", "aac", "-b:a", "160k"])

    cmd.extend(["-movflags", "+faststart", req.output_path])
    return cmd
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
def probe_source_size(source_path: str) -> tuple[int, int]:
    """Return ``(width, height)`` of the first video stream in ``source_path``.

    This is a best-effort probe: it never raises for probe problems and
    falls back to ``(1920, 1080)`` when ffprobe is missing, cannot be run,
    or prints nothing that parses as a positive size.

    Args:
        source_path: Path of the media file to inspect.

    Returns:
        The probed size in pixels, or ``(1920, 1080)`` as the fallback.
    """
    fallback = (1920, 1080)

    exe = shutil.which("ffprobe")
    if not exe:
        return fallback

    try:
        out = subprocess.run(
            [
                exe,
                "-v",
                "error",
                "-select_streams",
                "v:0",
                "-show_entries",
                "stream=width,height",
                "-of",
                "csv=p=0",
                source_path,
            ],
            check=False,
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError):
        # Could not launch ffprobe or decode its output; stay best-effort.
        return fallback

    # ffprobe's CSV writer may append a trailing separator or print the
    # stream line more than once, so take the first line that yields two
    # integers instead of unpacking the whole output strictly.
    for line in (out.stdout or "").splitlines():
        fields = [field.strip() for field in line.split(",") if field.strip()]
        if len(fields) < 2:
            continue
        try:
            width, height = int(fields[0]), int(fields[1])
        except ValueError:
            continue
        if width > 0 and height > 0:
            return width, height
    return fallback
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
def render_clip(req: RenderRequest) -> RenderResult:
    """Render one clip with ffmpeg and report the outcome as a ``RenderResult``.

    In ``dry_run`` mode nothing is probed or executed: the command is built
    against an assumed 1920x1080 source and returned with ``success=True``.
    Otherwise the source must expose an audio stream, ffmpeg is run, and the
    output is re-probed to confirm the audio stream survived. A failing
    ffmpeg process is reported as a failed result carrying the last 4000
    characters of stderr rather than raised.
    """

    def _outcome(cmd: list[str], *, ok: bool, error: str | None = None) -> RenderResult:
        # All results share clip id and output path; ``error`` is only
        # passed when there is one, leaving the field's default otherwise.
        fields = {
            "clip_id": req.clip.clip_id,
            "output_path": req.output_path,
            "ffmpeg_cmd": cmd,
            "success": ok,
        }
        if error is not None:
            fields["error"] = error
        return RenderResult(**fields)

    dry_run = req.mode == "dry_run"

    if dry_run:
        src_w, src_h = 1920, 1080
    else:
        try:
            src_w, src_h = probe_source_size(req.source_path)
        except Exception:
            src_w, src_h = 1920, 1080

    # A real render refuses sources without audio before building anything.
    if not dry_run and not _has_audio_stream(req.source_path):
        return _outcome(
            [],
            ok=False,
            error="Source media has no detectable audio stream (a:0).",
        )

    # Past the guard above audio is always included.
    cmd = build_ffmpeg_cmd(req, src_w=src_w, src_h=src_h, include_audio=True)

    if dry_run:
        return _outcome(cmd, ok=True)

    try:
        subprocess.run(cmd, check=True, capture_output=True, env=_ensure_windows_fontconfig())
        if not _has_audio_stream(req.output_path):
            return _outcome(
                cmd,
                ok=False,
                error="Rendered output is missing audio stream (a:0).",
            )
        return _outcome(cmd, ok=True)
    except subprocess.CalledProcessError as e:
        if e.stderr:
            detail = e.stderr.decode("utf-8", errors="replace")[-4000:]
        else:
            detail = str(e)
        return _outcome(cmd, ok=False, error=detail)
|
humeo-core/src/humeo_core/primitives/face_detect.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Local face-detection primitive — the MediaPipe path as another ``SceneRegions`` producer.
|
| 2 |
+
|
| 3 |
+
Three detection backends share the *same output schema* (``SceneRegions``):
|
| 4 |
+
|
| 5 |
+
* ``primitives/classify.py`` — pixel variance heuristic, no model.
|
| 6 |
+
* ``primitives/face_detect.py`` — MediaPipe face rectangle (this file).
|
| 7 |
+
* ``primitives/vision.py`` — multimodal LLM + OCR bboxes.
|
| 8 |
+
|
| 9 |
+
Because all three emit ``SceneRegions``, the layout planner in
|
| 10 |
+
``primitives/vision.py`` (``classify_from_regions`` + ``layout_instruction_from_regions``)
|
| 11 |
+
works on all of them unchanged. That is the whole point of the primitive
|
| 12 |
+
boundary — the *detector* is swappable, the *renderer* is fixed.
|
| 13 |
+
|
| 14 |
+
MediaPipe is imported lazily so it remains an optional extra.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import logging
|
| 20 |
+
from typing import Callable
|
| 21 |
+
|
| 22 |
+
from ..schemas import BoundingBox, Scene, SceneRegions
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# A bbox loader for any future cloud face API. Takes a keyframe path,
|
| 28 |
+
# returns a normalized face bbox or ``None``. Same shape as the MediaPipe
|
| 29 |
+
# wrapper below, which lets tests pass a stub and skip MediaPipe.
|
| 30 |
+
FaceBBoxFn = Callable[[str], BoundingBox | None]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def detect_face_regions(
    scenes: list[Scene],
    face_fn: FaceBBoxFn | None = None,
    chart_split_threshold: float = 0.65,
) -> list[SceneRegions]:
    """Populate ``SceneRegions.person_bbox`` (+ ``chart_bbox``) from a face detector.

    The face bbox is treated as the *person bbox*. If the face sits in the
    right ``(1 - chart_split_threshold)`` of the frame, a *chart bbox* is
    synthesised over the left region — mirroring the original
    ``reframe.py`` split heuristic.

    Exactly one ``SceneRegions`` is emitted per input scene, in order. A
    scene without a keyframe, without a detected face, or whose detector
    call raised gets a regions object with only ``raw_reason`` set.

    Args:
        scenes: scenes with ``keyframe_path`` populated.
        face_fn: pluggable face detector. Defaults to MediaPipe (lazy
            import) if not supplied. Pass a stub in tests.
        chart_split_threshold: face x-center at or above this normalized
            value triggers a synthetic chart bbox on the left.

    Returns:
        One ``SceneRegions`` per scene.
    """

    if face_fn is None:
        face_fn = _mediapipe_face_bbox

    out: list[SceneRegions] = []
    for s in scenes:
        if not s.keyframe_path:
            out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no keyframe available"))
            continue
        try:
            face = face_fn(s.keyframe_path)
        except Exception as e:  # one bad scene should not kill the batch
            logger.warning("face detector failed on %s: %r", s.keyframe_path, e)
            out.append(SceneRegions(scene_id=s.scene_id, raw_reason=f"face detector error: {e!r}"))
            continue

        if face is None:
            out.append(SceneRegions(scene_id=s.scene_id, raw_reason="no face detected"))
            continue

        chart = None
        if face.center_x >= chart_split_threshold:
            # Face pushed right → assume a chart occupies the left region,
            # ending at the threshold or the face's left edge, whichever
            # comes first.
            chart_right = min(chart_split_threshold, face.x1)
            # A face starting at the left edge leaves no room for a chart;
            # skip it rather than build a zero-width box.
            if chart_right > 0.0:
                chart = BoundingBox(
                    x1=0.0,
                    y1=0.0,
                    x2=chart_right,
                    y2=1.0,
                    label="chart_inferred",
                    # Heuristic score; clamped because thresholds below 0.5
                    # would otherwise push it above 1.0.
                    confidence=min(
                        1.0, max(0.0, face.center_x - chart_split_threshold + 0.5)
                    ),
                )

        out.append(
            SceneRegions(
                scene_id=s.scene_id,
                person_bbox=face,
                chart_bbox=chart,
                raw_reason="face detected" + (" + synthetic chart bbox" if chart else ""),
            )
        )

    return out
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _mediapipe_face_bbox(keyframe_path: str) -> BoundingBox | None:
    """Return the highest-confidence face as a ``BoundingBox``, or ``None``.

    Imports MediaPipe + OpenCV lazily so they remain optional dependencies
    (install ``humeo-core[face]``).

    Args:
        keyframe_path: Path of the image to scan.

    Returns:
        A bbox labelled ``"face"`` with coordinates clamped to ``[0, 1]``,
        or ``None`` when the image cannot be read or no face is detected.

    Raises:
        RuntimeError: If OpenCV or MediaPipe cannot be imported.
    """

    try:
        import cv2  # type: ignore
        import mediapipe as mp  # type: ignore
    except ImportError as e:
        raise RuntimeError(
            "MediaPipe face detection requires `pip install humeo-core[face]`"
        ) from e

    img = cv2.imread(keyframe_path)
    if img is None:
        return None
    # OpenCV loads BGR; the detector is fed RGB.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    with mp.solutions.face_detection.FaceDetection(
        model_selection=1, min_detection_confidence=0.5
    ) as detector:
        results = detector.process(rgb)
    if not results.detections:
        return None

    best = max(results.detections, key=lambda d: d.score[0])
    box = best.location_data.relative_bounding_box

    # Compute the far edges from the RAW origin before clamping. Deriving
    # them from the clamped origin would shift the right/bottom edge inward
    # whenever the detector reports a box starting outside the frame
    # (negative xmin/ymin).
    eps = 1e-6
    raw_x1 = float(box.xmin)
    raw_y1 = float(box.ymin)
    raw_x2 = raw_x1 + float(box.width)
    raw_y2 = raw_y1 + float(box.height)

    # Keep the origin strictly inside the frame so the minimum-size far
    # edge below can never exceed 1.0.
    x1 = max(0.0, min(1.0 - eps, raw_x1))
    y1 = max(0.0, min(1.0 - eps, raw_y1))
    x2 = max(x1 + eps, min(1.0, raw_x2))
    y2 = max(y1 + eps, min(1.0, raw_y2))

    return BoundingBox(
        x1=x1,
        y1=y1,
        x2=x2,
        y2=y2,
        label="face",
        confidence=float(best.score[0]),
    )
|
humeo-core/src/humeo_core/primitives/ingest.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Landing gear: deterministic, local extraction.
|
| 2 |
+
|
| 3 |
+
Everything here can run without a GPU, without an API key, and without the
|
| 4 |
+
internet (once inputs are present). This follows the HIVE guide's rule
|
| 5 |
+
"extraction stays local; LLMs only reason".
|
| 6 |
+
|
| 7 |
+
Functions:
|
| 8 |
+
probe_duration — ffprobe wrapper
|
| 9 |
+
detect_scenes — PySceneDetect (ContentDetector)
|
| 10 |
+
extract_keyframes — ffmpeg snapshot at each scene midpoint
|
| 11 |
+
transcribe_audio — faster-whisper (optional dependency)
|
| 12 |
+
ingest — one-shot convenience runner that returns IngestResult
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import shutil
|
| 20 |
+
import subprocess
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
from ..schemas import IngestResult, Scene, TranscriptWord
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class IngestError(RuntimeError):
    """Raised when an ingest stage cannot run.

    Used in this module for a missing external binary, a missing optional
    Python dependency, or a source path that does not exist.
    """
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _require(binary: str) -> str:
|
| 31 |
+
path = shutil.which(binary)
|
| 32 |
+
if not path:
|
| 33 |
+
raise IngestError(
|
| 34 |
+
f"Required binary not on PATH: {binary!r}. Install it or add the path."
|
| 35 |
+
)
|
| 36 |
+
return path
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def probe_duration(source_path: str) -> float:
    """Return the container duration of ``source_path`` in seconds via ffprobe.

    ffprobe is asked for ``format=duration`` as JSON. The call uses
    ``check=True``, so a failing probe raises
    ``subprocess.CalledProcessError``; a missing ffprobe raises
    ``IngestError`` from ``_require``.
    """
    argv = [
        _require("ffprobe"),
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "json",
        source_path,
    ]
    completed = subprocess.run(argv, check=True, capture_output=True, text=True)
    payload = json.loads(completed.stdout)
    return float(payload["format"]["duration"])
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def detect_scenes(
    source_path: str,
    threshold: float = 27.0,
    min_scene_sec: float = 1.0,
    *,
    assumed_fps: float = 24.0,
) -> list[Scene]:
    """Use PySceneDetect's ContentDetector to split the video into scenes.

    Args:
        source_path: Video file to analyse.
        threshold: ``ContentDetector`` threshold.
        min_scene_sec: Minimum scene length in seconds.
        assumed_fps: Frame rate used to convert ``min_scene_sec`` into the
            frame count passed as ``min_scene_len``. The default keeps the
            historical 24 fps assumption; pass the real rate for sources
            that differ, otherwise the effective minimum scales with the
            mismatch.

    Returns:
        Scenes with ids ``s0000``, ``s0001``, ... in detection order. When
        the detector reports nothing, a single scene spanning the whole
        video is returned.

    Raises:
        IngestError: If ``scenedetect`` (or the module it needs) is missing.
    """

    try:
        from scenedetect import detect, ContentDetector  # type: ignore
    except ModuleNotFoundError as e:
        # scenedetect depends on OpenCV; surface the real missing module.
        missing = getattr(e, "name", "") or str(e)
        hint = "pip install 'scenedetect[opencv]'" if "cv2" in missing else "pip install scenedetect"
        raise IngestError(
            f"Scene detection unavailable (missing module: {missing}). Install with: {hint}"
        ) from e

    min_scene_frames = int(min_scene_sec * assumed_fps)
    result = detect(
        source_path,
        ContentDetector(threshold=threshold, min_scene_len=min_scene_frames),
    )
    scenes: list[Scene] = [
        Scene(
            scene_id=f"s{i:04d}",
            start_time=float(start.get_seconds()),
            end_time=float(end.get_seconds()),
        )
        for i, (start, end) in enumerate(result)
    ]
    # Guard: if PySceneDetect returns empty (e.g. a single long shot),
    # fall back to one scene spanning the whole video.
    if not scenes:
        duration = probe_duration(source_path)
        scenes.append(Scene(scene_id="s0000", start_time=0.0, end_time=duration))
    return scenes
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def extract_keyframes(
    source_path: str, scenes: list[Scene], out_dir: str
) -> list[Scene]:
    """Write one JPG per scene, taken at the scene midpoint.

    The input scenes are left untouched; each returned scene is a copy with
    ``keyframe_path`` set to ``<out_dir>/<scene_id>.jpg``. ``out_dir`` is
    created if needed. ffmpeg runs with ``check=True``, so a failed snapshot
    raises ``subprocess.CalledProcessError``.
    """

    ffmpeg = _require("ffmpeg")
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    def _snapshot(scene: Scene) -> Scene:
        # Grab a single frame from the middle of the scene.
        midpoint = scene.start_time + (scene.end_time - scene.start_time) / 2.0
        jpg_path = os.path.join(out_dir, f"{scene.scene_id}.jpg")
        argv = [
            ffmpeg,
            "-y",
            "-loglevel",
            "error",
            "-ss",
            f"{midpoint:.3f}",
            "-i",
            source_path,
            "-frames:v",
            "1",
            "-q:v",
            "3",
            jpg_path,
        ]
        subprocess.run(argv, check=True)
        return scene.model_copy(update={"keyframe_path": jpg_path})

    return [_snapshot(scene) for scene in scenes]
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def transcribe_audio(
    source_path: str, model_name: str = "base", language: str | None = None
) -> list[TranscriptWord]:
    """Produce a word-level transcript with faster-whisper (optional dependency).

    Words whose text is ``None`` are dropped; a missing start or end
    timestamp is recorded as ``0.0``. Word text is stripped of surrounding
    whitespace.

    Raises:
        IngestError: If ``faster_whisper`` is not installed.
    """

    try:
        from faster_whisper import WhisperModel  # type: ignore
    except ImportError as e:
        raise IngestError(
            "faster-whisper is not installed. pip install faster-whisper"
        ) from e

    whisper = WhisperModel(model_name, device="auto", compute_type="auto")
    segments, _info = whisper.transcribe(source_path, word_timestamps=True, language=language)

    def _timed_words():
        # Flatten segments into their words; a segment may have no word list.
        for segment in segments:
            for item in getattr(segment, "words", []) or []:
                if item.word is not None:
                    yield item

    return [
        TranscriptWord(
            word=str(item.word).strip(),
            start_time=float(item.start or 0.0),
            end_time=float(item.end or 0.0),
        )
        for item in _timed_words()
    ]
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def ingest(
    source_path: str,
    work_dir: str,
    *,
    with_transcript: bool = False,
    whisper_model: str = "base",
) -> IngestResult:
    """Run every extraction stage for one source and bundle the results.

    Stages run in order: duration probe, scene detection, keyframe
    extraction into ``<work_dir>/keyframes``, and (only when
    ``with_transcript`` is true) transcription.

    Raises:
        IngestError: If ``source_path`` does not exist.
    """

    if not os.path.exists(source_path):
        raise IngestError(f"source_path does not exist: {source_path}")

    Path(work_dir).mkdir(parents=True, exist_ok=True)
    keyframes_dir = os.path.join(work_dir, "keyframes")

    duration = probe_duration(source_path)
    detected = detect_scenes(source_path)
    scenes_with_keyframes = extract_keyframes(source_path, detected, keyframes_dir)

    if with_transcript:
        words: list[TranscriptWord] = transcribe_audio(source_path, model_name=whisper_model)
    else:
        words = []

    return IngestResult(
        source_path=os.path.abspath(source_path),
        duration_sec=duration,
        scenes=scenes_with_keyframes,
        transcript_words=words,
        keyframes_dir=keyframes_dir,
    )
|
humeo-core/src/humeo_core/primitives/layouts.py
ADDED
|
@@ -0,0 +1,707 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""The 9:16 layout thrusters — deterministic crop + compose math.
|
| 2 |
+
|
| 3 |
+
First principles: this video format has a hard constraint of **at most two
|
| 4 |
+
on-screen items** per short (see :class:`humeo_core.schemas.LayoutKind`). That
|
| 5 |
+
gives exactly five recipes:
|
| 6 |
+
|
| 7 |
+
* 1 person alone, tight → ``ZOOM_CALL_CENTER``
|
| 8 |
+
* 1 person alone, wider → ``SIT_CENTER``
|
| 9 |
+
* 1 chart + 1 person → ``SPLIT_CHART_PERSON``
|
| 10 |
+
* 2 persons → ``SPLIT_TWO_PERSONS``
|
| 11 |
+
* 2 charts → ``SPLIT_TWO_CHARTS``
|
| 12 |
+
|
| 13 |
+
Each planner returns a pure ``ffmpeg -filter_complex`` fragment ending in
|
| 14 |
+
``[vout]``. The compiler (``compile.py``) glues the fragment to the cut +
|
| 15 |
+
audio + subtitle chain. Because every planner is a pure function that
|
| 16 |
+
returns a string, the whole layout system is unit-testable without ever
|
| 17 |
+
invoking ffmpeg.
|
| 18 |
+
|
| 19 |
+
Split layouts share one contract:
|
| 20 |
+
|
| 21 |
+
* Output: 9:16 frame split into a **top band** and **bottom band**.
|
| 22 |
+
Band heights are driven by :attr:`LayoutInstruction.top_band_ratio`.
|
| 23 |
+
Default is ``0.5`` (even 50/50), matching the user-requested symmetric look.
|
| 24 |
+
* Source strips for the two items are **complementary** — they partition
|
| 25 |
+
the source width at a single seam so the two items never overlap and
|
| 26 |
+
together cover the full frame width.
|
| 27 |
+
* Each strip is scaled to fill its output band using the "cover"
|
| 28 |
+
convention (``force_original_aspect_ratio=increase`` + center crop), so
|
| 29 |
+
the band is fully painted (no letterbox bars, no stretch).
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
from dataclasses import dataclass
|
| 35 |
+
|
| 36 |
+
from ..schemas import (
|
| 37 |
+
BoundingBox,
|
| 38 |
+
FocusStackOrder,
|
| 39 |
+
LayoutInstruction,
|
| 40 |
+
LayoutKind,
|
| 41 |
+
TimedCenterPoint,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Source geometry assumption. Most podcast sources are 1920x1080; we still
|
| 46 |
+
# normalize everything by the actual source size so changing this is safe.
|
| 47 |
+
DEFAULT_SRC_W = 1920
|
| 48 |
+
DEFAULT_SRC_H = 1080
|
| 49 |
+
TRACKING_BLEND_SEC = 0.30
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@dataclass(frozen=True)
class FilterPlan:
    """Immutable outcome of planning a layout.

    ``filtergraph`` holds the text passed to ``-filter_complex``; its last
    labelled stream is ``[vout]``. ``out_label`` names that stream without
    the brackets.
    """

    # Body of ``-filter_complex``.
    filtergraph: str
    # Label of the final output stream.
    out_label: str = "vout"
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
# Tiny pixel helpers
|
| 66 |
+
# ---------------------------------------------------------------------------
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _clamp01(v: float) -> float:
|
| 70 |
+
return max(0.0, min(1.0, v))
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _even(v: int) -> int:
|
| 74 |
+
"""Floor ``v`` to an even integer (ffmpeg ``crop``/``scale`` need even dims)."""
|
| 75 |
+
return v - (v % 2)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _bbox_to_crop_pixels(
    box: BoundingBox, src_w: int, src_h: int
) -> tuple[int, int, int, int]:
    """Convert a normalized bbox into ``(cw, ch, x, y)`` crop values for ffmpeg.

    Coordinates are clamped to the frame, the box is forced to be at least
    2 px in each direction, and all four returned values are even.
    """

    def _to_px(norm: float, extent: int) -> int:
        return int(round(_clamp01(norm) * float(extent)))

    # Near edges first: leave room for a 2 px box inside the frame.
    left = max(0, min(src_w - 2, _to_px(box.x1, src_w)))
    top = max(0, min(src_h - 2, _to_px(box.y1, src_h)))
    # Far edges: at least 2 px past the near edge, never past the frame.
    right = max(left + 2, min(src_w, _to_px(box.x2, src_w)))
    bottom = max(top + 2, min(src_h, _to_px(box.y2, src_h)))

    crop_w = max(2, _even(right - left))
    crop_h = max(2, _even(bottom - top))
    return crop_w, crop_h, _even(left), _even(top)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _base_crop_size(
    src_w: int,
    src_h: int,
    target_aspect: float,
) -> tuple[int, int]:
    """Return the largest even ``(width, height)`` of ``target_aspect`` that fits the source.

    When the source is at least as wide as the target aspect, the full
    height is used and the width is derived from it; otherwise the full
    width is used and the height is derived.
    """
    source_is_wider = src_w / src_h >= target_aspect
    if source_is_wider:
        width, height = int(round(src_h * target_aspect)), src_h
    else:
        width, height = src_w, int(round(src_w / target_aspect))
    return _even(max(2, width)), _even(max(2, height))
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _crop_box(
    src_w: int,
    src_h: int,
    target_aspect: float,
    zoom: float,
    center_x_norm: float,
    center_y_norm: float = 0.5,
) -> tuple[int, int, int, int]:
    """Compute ``(cw, ch, x, y)`` for an aspect-ratio crop around a normalized center.

    ``zoom > 1`` tightens the crop (a smaller window around the center);
    values below 1 are treated as 1. The window is shifted as needed so it
    stays fully inside the source frame.
    """

    zoom = max(1.0, zoom)
    base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect)

    def _zoomed(extent: int) -> int:
        # Shrink by the zoom factor, keeping an even size of at least 2 px.
        return _even(max(2, int(round(extent / zoom))))

    def _origin(center_norm: float, src_extent: int, crop_extent: int) -> int:
        # Center the window on the requested point, then pull it back in-bounds.
        center_px = int(round(_clamp01(center_norm) * src_extent))
        return _even(max(0, min(src_extent - crop_extent, center_px - crop_extent // 2)))

    cw = _zoomed(base_cw)
    ch = _zoomed(base_ch)
    return cw, ch, _origin(center_x_norm, src_w, cw), _origin(center_y_norm, src_h, ch)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _center_crop_to_9x16(
    src_w: int, src_h: int, zoom: float, person_x_norm: float
) -> tuple[int, int, int, int]:
    """Return ``(cw, ch, x, y)`` for a 9:16 crop centered on the person.

    Horizontal center is ``person_x_norm``; vertical center is fixed at the
    middle of the frame.
    """
    return _crop_box(
        src_w,
        src_h,
        target_aspect=9 / 16,
        zoom=zoom,
        center_x_norm=person_x_norm,
        center_y_norm=0.5,
    )
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _crop_x_from_center(src_w: int, cw: int, center_x_norm: float) -> int:
    """Return the even, in-bounds left edge of a ``cw``-wide crop.

    The crop is centered on ``center_x_norm`` (normalized, clamped to 0..1)
    and then shifted as needed to stay within ``[0, src_w - cw]``.
    """
    center_px = int(round(_clamp01(center_x_norm) * src_w))
    rightmost_left_edge = src_w - cw
    left_edge = min(rightmost_left_edge, center_px - cw // 2)
    return _even(max(0, left_edge))
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _tracked_value_expr(
    values: list[tuple[float, float]],
    *,
    clamp_min: float | None = None,
    clamp_max: float | None = None,
    round_even: bool = False,
) -> str:
    """Build an ffmpeg expression for a value that follows timed keyframes.

    Each keyframe value is held until shortly before the midpoint to the next
    keyframe, then blended linearly to the next value over a window of at most
    ``TRACKING_BLEND_SEC`` seconds centered on that midpoint. Keyframes whose
    timestamps do not increase, or whose blend window collapses to zero once
    rendered at millisecond precision, switch instantly instead of blending.

    Args:
        values: ``(value, t_sec)`` pairs in playback order.
        clamp_min: Optional lower bound wrapped around the whole expression.
        clamp_max: Optional upper bound wrapped around the whole expression.
        round_even: When true, floor the final result to an even number.

    Returns:
        The expression text in terms of ``t``, with commas backslash-escaped.

    Raises:
        ValueError: If ``values`` is empty.
    """
    if not values:
        raise ValueError("values must not be empty")

    # Loop-invariant: half of the blend window on each side of the midpoint.
    blend_half = TRACKING_BLEND_SEC / 2.0

    # Start from the last keyframe and wrap earlier segments around it so the
    # earliest segment ends up as the outermost ``if``.
    expr = f"{float(values[-1][0]):.3f}"
    for idx in range(len(values) - 2, -1, -1):
        v0, t0 = float(values[idx][0]), float(values[idx][1])
        v1, t1 = float(values[idx + 1][0]), float(values[idx + 1][1])
        if t1 <= t0:
            # Non-increasing timestamps: no room to blend, switch at t1.
            expr = f"if(lt(t\\,{t1:.3f})\\,{v0:.3f}\\,{expr})"
            continue

        switch_t = (t0 + t1) / 2.0
        blend_start = max(t0, switch_t - blend_half)
        blend_end = min(t1, switch_t + blend_half)

        # The span is emitted with three decimals, so test the *rendered*
        # value: a positive span below 0.0005 s would print as "0.000" and
        # turn the blend into a division by zero inside ffmpeg.
        span_text = f"{blend_end - blend_start:.3f}"
        if float(span_text) <= 0.0:
            expr = f"if(lt(t\\,{switch_t:.3f})\\,{v0:.3f}\\,{expr})"
            continue

        blend_expr = (
            f"{v0:.3f}+({v1 - v0:.3f})*(t-{blend_start:.3f})/({span_text})"
        )
        expr = (
            f"if(lt(t\\,{blend_start:.3f})\\,{v0:.3f}\\,"
            f"if(lt(t\\,{blend_end:.3f})\\,{blend_expr}\\,{expr}))"
        )

    if clamp_min is not None:
        expr = f"max({clamp_min:.3f}\\,{expr})"
    if clamp_max is not None:
        expr = f"min({clamp_max:.3f}\\,{expr})"
    if round_even:
        expr = f"floor(({expr})/2)*2"
    return expr
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def _tracked_crop_x_expr(
    *,
    src_w: int,
    crop_w: int,
    tracking: list[TimedCenterPoint],
) -> str:
    """Build the ffmpeg expression for a crop ``x`` that follows ``tracking``.

    The horizontal center comes from ``_tracked_value_expr``: each framing is
    held until around the midpoint to the next sample and then blended over a
    short window, which avoids both slow drifting after a cut and a one-frame
    jump. The resulting left edge is clamped to ``[0, src_w - crop_w]`` and
    floored to an even pixel.

    Raises:
        ValueError: If ``tracking`` is empty.
    """
    if not tracking:
        raise ValueError("tracking must not be empty")

    samples: list[tuple[float, float]] = []
    for point in tracking:
        samples.append((_clamp01(point.x_norm) * src_w, float(point.t_sec)))

    center = _tracked_value_expr(samples, clamp_min=0.0, clamp_max=float(src_w))
    rightmost_x = max(0, src_w - crop_w)
    # Shift from center to left edge, then keep the window inside the frame.
    clamped_left = f"max(0\\,min({rightmost_x}\\,({center})-{crop_w}/2))"
    return f"floor({clamped_left}/2)*2"
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def _tracked_crop_exprs(
    *,
    src_w: int,
    src_h: int,
    target_aspect: float,
    default_zoom: float,
    center_y_norm: float,
    tracking: list[TimedCenterPoint],
) -> tuple[str, str, str, str]:
    """Build ``(w, h, x, y)`` crop expressions for tracked position and zoom.

    Each tracking point contributes a crop width/height (from its own ``zoom``
    or ``default_zoom`` when unset, never below 1) and a horizontal center.
    The vertical center is fixed at ``center_y_norm``. ``x``/``y`` are
    expressed in terms of the filter's ``out_w``/``out_h`` so they follow the
    current crop size.

    Raises:
        ValueError: If ``tracking`` is empty.
    """
    if not tracking:
        raise ValueError("tracking must not be empty")

    base_cw, base_ch = _base_crop_size(src_w, src_h, target_aspect)

    def _zoomed(base: int, zoom: float) -> float:
        # Same sizing rule as the static crop: divide by zoom, floor to even.
        return float(_even(max(2, int(round(base / zoom)))))

    width_points: list[tuple[float, float]] = []
    height_points: list[tuple[float, float]] = []
    center_points: list[tuple[float, float]] = []
    for point in tracking:
        requested_zoom = default_zoom if point.zoom is None else point.zoom
        zoom = max(1.0, float(requested_zoom))
        t_sec = float(point.t_sec)
        width_points.append((_zoomed(base_cw, zoom), t_sec))
        height_points.append((_zoomed(base_ch, zoom), t_sec))
        center_points.append((_clamp01(point.x_norm) * src_w, t_sec))

    # NOTE(review): ffmpeg's crop documentation describes w/h as evaluated at
    # filter configuration rather than per frame -- confirm that time-varying
    # width/height expressions actually take effect in the target build.
    w_expr = _tracked_value_expr(
        width_points, clamp_min=2.0, clamp_max=float(base_cw), round_even=True
    )
    h_expr = _tracked_value_expr(
        height_points, clamp_min=2.0, clamp_max=float(base_ch), round_even=True
    )
    center_expr = _tracked_value_expr(
        center_points, clamp_min=0.0, clamp_max=float(src_w)
    )

    center_y_px = _clamp01(center_y_norm) * src_h
    x_expr = f"floor(max(0\\,min({src_w}-out_w\\,({center_expr})-out_w/2))/2)*2"
    y_expr = f"floor(max(0\\,min({src_h}-out_h\\,{center_y_px:.3f}-out_h/2))/2)*2"
    return w_expr, h_expr, x_expr, y_expr
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# ---------------------------------------------------------------------------
|
| 267 |
+
# Split helpers — shared by all three split layouts
|
| 268 |
+
# ---------------------------------------------------------------------------
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
# Minimum source-strip width for a split, as a fraction of source width.
|
| 272 |
+
# Prevents a chart/person bbox that hugs one edge from starving the other.
|
| 273 |
+
_MIN_SPLIT_STRIP_FRAC = 0.2
|
| 274 |
+
# Vertical padding added above and below a chart strip, as a fraction of the
# strip's own height. Default ``pad_frac`` of ``_chart_strip_with_vertical_pad``.
_CHART_STRIP_VERTICAL_PAD_FRAC = 0.12
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
@dataclass(frozen=True)
|
| 278 |
+
class _SplitStrip:
|
| 279 |
+
"""A source-frame crop rectangle destined for one output band."""
|
| 280 |
+
|
| 281 |
+
cw: int
|
| 282 |
+
ch: int
|
| 283 |
+
x: int
|
| 284 |
+
y: int
|
| 285 |
+
|
| 286 |
+
def filter_crop(self, input_label: str, out_w: int, band_h: int, out_label: str) -> str:
|
| 287 |
+
"""Return ``[input]crop=...,scale=...,crop=...,setsar=1[out_label]``.
|
| 288 |
+
|
| 289 |
+
Uses the "cover" convention: scale so the band is fully painted, then
|
| 290 |
+
center-crop any overflow. Bands always get filled — no letterbox bars.
|
| 291 |
+
"""
|
| 292 |
+
return (
|
| 293 |
+
f"[{input_label}]crop={self.cw}:{self.ch}:{self.x}:{self.y},"
|
| 294 |
+
f"scale={out_w}:{band_h}:force_original_aspect_ratio=increase,"
|
| 295 |
+
f"crop={out_w}:{band_h},setsar=1[{out_label}]"
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def _bbox_strip(
    box: BoundingBox | None,
    *,
    src_w: int,
    src_h: int,
    x_start: int,
    x_end: int,
) -> _SplitStrip:
    """Build the source crop for one output band.

    The horizontal range is dictated by ``[x_start, x_end)`` (the seam math,
    so strips partition the source width). The vertical range is taken from
    ``box`` when one is given, so the subject fills its band; without a box
    the strip spans the full source height.
    """
    x = _even(max(0, min(src_w - 2, x_start)))
    cw = _even(max(2, min(src_w - x, x_end - x)))

    if box is None:
        return _SplitStrip(cw=cw, ch=_even(src_h), x=x, y=0)

    top_px = int(round(_clamp01(box.y1) * float(src_h)))
    bottom_px = int(round(_clamp01(box.y2) * float(src_h)))
    y = _even(max(0, min(src_h - 2, top_px)))
    ch = _even(max(2, min(src_h - y, bottom_px - y)))
    return _SplitStrip(cw=cw, ch=ch, x=x, y=y)
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def _chart_strip_with_vertical_pad(
    strip: _SplitStrip,
    *,
    src_h: int,
    pad_frac: float = _CHART_STRIP_VERTICAL_PAD_FRAC,
) -> _SplitStrip:
    """Grow a chart strip vertically so cover-scaling trims fewer chart edges.

    Adds ``pad_frac`` of the strip height above and below, limited to the
    source frame. The original strip is returned unchanged when the padding
    rounds to nothing or does not make the strip taller.
    """
    margin = _even(max(0, int(round(strip.ch * max(0.0, pad_frac)))))
    padded_top = max(0, strip.y - margin)
    padded_bottom = min(src_h, strip.y + strip.ch + margin)
    padded_h = _even(max(2, padded_bottom - padded_top))

    if margin <= 0 or padded_h <= strip.ch:
        return strip

    new_y = _even(max(0, min(src_h - padded_h, padded_top)))
    return _SplitStrip(cw=strip.cw, ch=padded_h, x=strip.x, y=new_y)
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
def _compute_seam(
    *,
    left_box: BoundingBox | None,
    right_box: BoundingBox | None,
    src_w: int,
    src_h: int,
    default_fraction: float = 0.5,
) -> int:
    """Return an even x-coordinate that partitions the source into two strips.

    When both bboxes are known, the seam is the midpoint of the gap/overlap
    between the right edge of ``left_box`` and the left edge of ``right_box``
    (both in even-snapped crop pixels). Otherwise it falls back to
    ``default_fraction * src_w``. The seam is clamped so neither strip is
    thinner than :data:`_MIN_SPLIT_STRIP_FRAC` of the source width; when that
    minimum would not leave room for two strips, a quarter of the source
    width is used as the minimum instead.
    """
    if left_box is not None and right_box is not None:
        # One conversion per box: width and x of the left box come from the
        # same call.
        left_cw, _, left_x, _ = _bbox_to_crop_pixels(left_box, src_w, src_h)
        _, _, right_x, _ = _bbox_to_crop_pixels(right_box, src_w, src_h)
        seam = int(round((left_x + left_cw + right_x) / 2.0))
    else:
        seam = int(round(default_fraction * float(src_w)))

    seam = _even(seam)
    min_strip = _even(max(2, int(round(src_w * _MIN_SPLIT_STRIP_FRAC))))
    if min_strip * 2 >= src_w:
        min_strip = _even(max(2, src_w // 4))
    # Floor the upper bound to even as well: with an odd ``src_w`` the raw
    # ``src_w - min_strip`` is odd and would leak an odd seam to callers.
    return max(min_strip, min(_even(src_w - min_strip), seam))
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def _band_heights(out_h: int, top_ratio: float) -> tuple[int, int]:
    """Split ``out_h`` into ``(top_h, bot_h)`` band heights.

    The top band is ``out_h * top_ratio`` floored to even and clamped to
    ``[2, out_h - 2]``; the bottom band takes the remainder, so the two
    always sum to ``out_h``.
    """
    desired_top = _even(int(round(out_h * top_ratio)))
    top_h = max(2, min(out_h - 2, desired_top))
    return top_h, out_h - top_h
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def _stack_filtergraph(
|
| 391 |
+
*,
|
| 392 |
+
top_strip: _SplitStrip,
|
| 393 |
+
bot_strip: _SplitStrip,
|
| 394 |
+
out_w: int,
|
| 395 |
+
top_h: int,
|
| 396 |
+
bot_h: int,
|
| 397 |
+
) -> str:
|
| 398 |
+
"""Compose the split filter graph: ``[0:v]split=2 → two crops → vstack → [vout]``."""
|
| 399 |
+
top_fg = top_strip.filter_crop("src1", out_w, top_h, "top")
|
| 400 |
+
bot_fg = bot_strip.filter_crop("src2", out_w, bot_h, "bot")
|
| 401 |
+
return (
|
| 402 |
+
f"[0:v]split=2[src1][src2];"
|
| 403 |
+
f"{top_fg};"
|
| 404 |
+
f"{bot_fg};"
|
| 405 |
+
f"[top][bot]vstack=inputs=2[vout]"
|
| 406 |
+
)
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
# ---------------------------------------------------------------------------
|
| 410 |
+
# Layout: single-subject (centered) — 1 person
|
| 411 |
+
# ---------------------------------------------------------------------------
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
def plan_zoom_call_center(
    instruction: LayoutInstruction,
    *,
    out_w: int,
    out_h: int,
    src_w: int = DEFAULT_SRC_W,
    src_h: int = DEFAULT_SRC_H,
) -> FilterPlan:
    """Plan a single-person, tight zoom-call framing.

    ``instruction.zoom`` is raised to at least ``1.25``. Without tracking the
    crop is static. With tracking, the crop follows the person horizontally,
    and also varies in size when any tracking point carries its own zoom.
    """
    zoom = max(instruction.zoom, 1.25)
    cw, ch, x, y = _center_crop_to_9x16(src_w, src_h, zoom, instruction.person_x_norm)
    scale_tail = f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
    tracking = instruction.person_tracking

    if not tracking:
        return FilterPlan(filtergraph=f"[0:v]crop={cw}:{ch}:{x}:{y},{scale_tail}")

    if any(point.zoom is not None for point in tracking):
        w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs(
            src_w=src_w,
            src_h=src_h,
            target_aspect=9 / 16,
            default_zoom=zoom,
            center_y_norm=0.5,
            tracking=tracking,
        )
        crop = f"crop={w_expr}:{h_expr}:{x_expr}:{y_expr}"
    else:
        x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=tracking)
        crop = f"crop={cw}:{ch}:{x_expr}:{y}"

    # Tracked graphs reset timestamps first so ``t`` in the expressions
    # starts at zero.
    return FilterPlan(
        filtergraph=f"[0:v]setpts=PTS-STARTPTS[vsrc];[vsrc]{crop},{scale_tail}"
    )
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
def plan_sit_center(
    instruction: LayoutInstruction,
    *,
    out_w: int,
    out_h: int,
    src_w: int = DEFAULT_SRC_W,
    src_h: int = DEFAULT_SRC_H,
) -> FilterPlan:
    """Plan a single-person interview/seated framing.

    The vertical center is biased to ``0.48`` so faces sit slightly above the
    9:16 middle rather than the crop centering on the subject's chest.
    ``instruction.zoom`` is raised to at least ``1.0``. Tracking is handled as
    in the zoom-call layout: horizontal follow, plus variable crop size when
    any tracking point carries its own zoom.
    """
    vertical_center = 0.48
    zoom = max(instruction.zoom, 1.0)
    cw, ch, x, y = _crop_box(
        src_w, src_h, 9 / 16, zoom, instruction.person_x_norm, vertical_center
    )
    scale_tail = f"scale={out_w}:{out_h}:flags=lanczos,setsar=1[vout]"
    tracking = instruction.person_tracking

    if not tracking:
        return FilterPlan(filtergraph=f"[0:v]crop={cw}:{ch}:{x}:{y},{scale_tail}")

    if any(point.zoom is not None for point in tracking):
        w_expr, h_expr, x_expr, y_expr = _tracked_crop_exprs(
            src_w=src_w,
            src_h=src_h,
            target_aspect=9 / 16,
            default_zoom=zoom,
            center_y_norm=vertical_center,
            tracking=tracking,
        )
        crop = f"crop={w_expr}:{h_expr}:{x_expr}:{y_expr}"
    else:
        x_expr = _tracked_crop_x_expr(src_w=src_w, crop_w=cw, tracking=tracking)
        crop = f"crop={cw}:{ch}:{x_expr}:{y}"

    return FilterPlan(
        filtergraph=f"[0:v]setpts=PTS-STARTPTS[vsrc];[vsrc]{crop},{scale_tail}"
    )
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
# ---------------------------------------------------------------------------
|
| 502 |
+
# Split layouts — 2 items stacked vertically
|
| 503 |
+
# ---------------------------------------------------------------------------
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
def plan_split_chart_person(
    instruction: LayoutInstruction,
    *,
    out_w: int,
    out_h: int,
    src_w: int = DEFAULT_SRC_W,
    src_h: int = DEFAULT_SRC_H,
) -> FilterPlan:
    """Plan the one-chart-plus-one-person split layout.

    **Horizontal partition.** The chart takes the left source strip and the
    person the right one. With both bboxes available the seam comes from
    ``_compute_seam`` (midpoint between the chart's right edge and the
    person's left edge), so the strips are complementary. Otherwise the seam
    sits at 2/3 of the source width (chart left, person right) and
    ``chart_x_norm`` trims the chart strip from its left edge.

    **Vertical crop.** Each strip takes its vertical extent from its bbox when
    one is set, so the chart fills its band instead of being lost inside
    full-height source context; without a bbox the full source height is
    used. A bbox-derived chart strip additionally gets vertical padding.

    **Output bands.** Band heights follow ``instruction.top_band_ratio`` and
    ``instruction.focus_stack_order`` decides which strip goes on top.
    """
    top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)

    chart_box = instruction.split_chart_region
    person_box = instruction.split_person_region

    if chart_box is None or person_box is None:
        # Historical default geometry: chart = left 2/3, person = right 1/3.
        # With no pair of vision bboxes, ``chart_x_norm`` is the only hint
        # for trimming the chart strip's left edge.
        two_thirds = int(round((2.0 / 3.0) * float(src_w)))
        seam = _even(max(2, min(src_w - 2, two_thirds)))
        trim = int(round(_clamp01(instruction.chart_x_norm) * float(seam)))
        chart_start = _even(max(0, min(seam - 2, trim)))
    else:
        seam = _compute_seam(
            left_box=chart_box, right_box=person_box, src_w=src_w, src_h=src_h
        )
        chart_start = 0

    chart_strip = _bbox_strip(
        chart_box, src_w=src_w, src_h=src_h, x_start=chart_start, x_end=seam
    )
    if chart_box is not None:
        chart_strip = _chart_strip_with_vertical_pad(chart_strip, src_h=src_h)
    person_strip = _bbox_strip(
        person_box, src_w=src_w, src_h=src_h, x_start=seam, x_end=src_w
    )

    return _emit_split(
        chart_strip=chart_strip,
        person_strip=person_strip,
        order=instruction.focus_stack_order,
        out_w=out_w,
        top_h=top_h,
        bot_h=bot_h,
    )
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def _emit_split(
    *,
    chart_strip: _SplitStrip,
    person_strip: _SplitStrip,
    order: FocusStackOrder,
    out_w: int,
    top_h: int,
    bot_h: int,
) -> FilterPlan:
    """Stack the chart and person strips in the requested order.

    ``FocusStackOrder.CHART_THEN_PERSON`` puts the chart on top; any other
    order puts the person on top.
    """
    if order == FocusStackOrder.CHART_THEN_PERSON:
        upper, lower = chart_strip, person_strip
    else:
        upper, lower = person_strip, chart_strip

    graph = _stack_filtergraph(
        top_strip=upper,
        bot_strip=lower,
        out_w=out_w,
        top_h=top_h,
        bot_h=bot_h,
    )
    return FilterPlan(filtergraph=graph)
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
def plan_split_two_persons(
    instruction: LayoutInstruction,
    *,
    out_w: int,
    out_h: int,
    src_w: int = DEFAULT_SRC_W,
    src_h: int = DEFAULT_SRC_H,
) -> FilterPlan:
    """Plan the two-person (interview two-up) layout, stacked vertically.

    The first person is ``split_person_region`` (left source strip, top
    band); the second is ``split_second_person_region`` (right source strip,
    bottom band). The seam is the midpoint between the two bboxes when both
    are known, otherwise a centered 50/50 split.
    """
    top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)

    first_box = instruction.split_person_region
    second_box = instruction.split_second_person_region
    source = {"src_w": src_w, "src_h": src_h}

    seam = _compute_seam(left_box=first_box, right_box=second_box, **source)
    first_strip = _bbox_strip(first_box, x_start=0, x_end=seam, **source)
    second_strip = _bbox_strip(second_box, x_start=seam, x_end=src_w, **source)

    graph = _stack_filtergraph(
        top_strip=first_strip,
        bot_strip=second_strip,
        out_w=out_w,
        top_h=top_h,
        bot_h=bot_h,
    )
    return FilterPlan(filtergraph=graph)
|
| 636 |
+
|
| 637 |
+
|
| 638 |
+
def plan_split_two_charts(
    instruction: LayoutInstruction,
    *,
    out_w: int,
    out_h: int,
    src_w: int = DEFAULT_SRC_W,
    src_h: int = DEFAULT_SRC_H,
) -> FilterPlan:
    """Plan the two-chart layout, stacked vertically.

    The first chart is ``split_chart_region`` (left source strip, top band);
    the second is ``split_second_chart_region`` (right source strip, bottom
    band). It uses the same seam and bbox-driven vertical crop as the other
    splits, and each bbox-derived strip also gets vertical chart padding, so
    every chart fills its band rather than sitting inside source context.
    """
    top_h, bot_h = _band_heights(out_h, instruction.top_band_ratio)

    first_box = instruction.split_chart_region
    second_box = instruction.split_second_chart_region
    source = {"src_w": src_w, "src_h": src_h}

    seam = _compute_seam(left_box=first_box, right_box=second_box, **source)

    first_strip = _bbox_strip(first_box, x_start=0, x_end=seam, **source)
    if first_box is not None:
        first_strip = _chart_strip_with_vertical_pad(first_strip, src_h=src_h)

    second_strip = _bbox_strip(second_box, x_start=seam, x_end=src_w, **source)
    if second_box is not None:
        second_strip = _chart_strip_with_vertical_pad(second_strip, src_h=src_h)

    graph = _stack_filtergraph(
        top_strip=first_strip,
        bot_strip=second_strip,
        out_w=out_w,
        top_h=top_h,
        bot_h=bot_h,
    )
    return FilterPlan(filtergraph=graph)
|
| 679 |
+
|
| 680 |
+
|
| 681 |
+
# Planner registry consulted by ``plan_layout``: one entry per ``LayoutKind``,
# mapping the layout to the function that builds its filtergraph.
_DISPATCH = {
    LayoutKind.ZOOM_CALL_CENTER: plan_zoom_call_center,
    LayoutKind.SIT_CENTER: plan_sit_center,
    LayoutKind.SPLIT_CHART_PERSON: plan_split_chart_person,
    LayoutKind.SPLIT_TWO_PERSONS: plan_split_two_persons,
    LayoutKind.SPLIT_TWO_CHARTS: plan_split_two_charts,
}
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
def plan_layout(
    instruction: LayoutInstruction,
    *,
    out_w: int = 1080,
    out_h: int = 1920,
    src_w: int = DEFAULT_SRC_W,
    src_h: int = DEFAULT_SRC_H,
) -> FilterPlan:
    """Build the filter plan for ``instruction`` via its registered planner.

    The planner is looked up in :data:`_DISPATCH` by ``instruction.layout``.
    Adding a new layout therefore requires both a planner function and a
    matching :data:`_DISPATCH` entry.

    Raises:
        ValueError: If no planner is registered for ``instruction.layout``.
    """
    planner = _DISPATCH.get(instruction.layout)
    if planner is not None:
        return planner(instruction, out_w=out_w, out_h=out_h, src_w=src_w, src_h=src_h)
    raise ValueError(f"Unknown layout: {instruction.layout!r}")
|
humeo-core/src/humeo_core/primitives/select_clips.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Clip selection: pick the strongest 30-60s segments from a long source.
|
| 2 |
+
|
| 3 |
+
Two backends, same contract:
|
| 4 |
+
|
| 5 |
+
* ``select_clips_heuristic`` — greedy word-density scoring. Uses the
|
| 6 |
+
transcript alone; zero model calls. Good baseline when transcript exists.
|
| 7 |
+
* ``select_clips_with_llm`` — pluggable LLM hook. Caller provides a
|
| 8 |
+
``(prompt_text) -> str`` function that must return strict JSON matching
|
| 9 |
+
the ``ClipPlan`` schema. We re-validate before returning.
|
| 10 |
+
|
| 11 |
+
Both return a ``ClipPlan``.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
from typing import Callable
|
| 18 |
+
|
| 19 |
+
from ..schemas import Clip, ClipPlan, TranscriptWord
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
LLMTextFn = Callable[[str], str]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
CLIP_SELECTOR_PROMPT_TEMPLATE = """You are a viral-clip selector for a podcast editor.
|
| 26 |
+
Return ONLY JSON matching this shape:
|
| 27 |
+
|
| 28 |
+
{{
|
| 29 |
+
"source_path": "{source_path}",
|
| 30 |
+
"clips": [
|
| 31 |
+
{{
|
| 32 |
+
"clip_id": "001",
|
| 33 |
+
"topic": "<short topic>",
|
| 34 |
+
"start_time_sec": <float>,
|
| 35 |
+
"end_time_sec": <float>,
|
| 36 |
+
"viral_hook": "<one line>",
|
| 37 |
+
"virality_score": <0..1>,
|
| 38 |
+
"transcript": "<full clip transcript>",
|
| 39 |
+
"suggested_overlay_title": "<<=6 words>"
|
| 40 |
+
}}
|
| 41 |
+
]
|
| 42 |
+
}}
|
| 43 |
+
|
| 44 |
+
Pick {target_count} clips, each {min_sec}-{max_sec} seconds long, NO overlaps, sorted by virality_score desc.
|
| 45 |
+
|
| 46 |
+
Transcript (word, start, end):
|
| 47 |
+
{transcript}
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _words_in_window(
|
| 52 |
+
words: list[TranscriptWord], start: float, end: float
|
| 53 |
+
) -> list[TranscriptWord]:
|
| 54 |
+
return [w for w in words if w.start_time >= start and w.end_time <= end]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def select_clips_heuristic(
    source_path: str,
    words: list[TranscriptWord],
    duration_sec: float,
    *,
    target_count: int = 5,
    min_sec: float = 30.0,
    max_sec: float = 60.0,
    step_sec: float = 5.0,
) -> ClipPlan:
    """Pick clips greedily by transcript word density.

    A fixed-length window slides over the source in ``step_sec`` increments;
    each window is scored by words per second, and the highest-scoring
    non-overlapping windows are kept (up to ``target_count``), returned in
    chronological order.

    Args:
        source_path: Path recorded on the returned plan.
        words: Transcript words with ``start_time``/``end_time``/``word``.
        duration_sec: Total source duration in seconds.
        target_count: Maximum number of clips to return.
        min_sec: Minimum clip length; sources no longer than this yield a
            single whole-source clip.
        max_sec: Maximum clip length.
        step_sec: Slide increment for the window; must be positive.

    Returns:
        A ``ClipPlan`` for ``source_path``.

    Raises:
        ValueError: If windowing is needed and ``step_sec`` is not positive.
    """
    if duration_sec <= min_sec or not words:
        # No sensible windowing possible; return one clip of the whole thing.
        end = min(duration_sec, max_sec) if duration_sec > 0 else max_sec
        return ClipPlan(
            source_path=source_path,
            clips=[
                Clip(
                    clip_id="001",
                    topic="Full source",
                    start_time_sec=0.0,
                    end_time_sec=max(end, 1.0),
                    viral_hook="",
                    virality_score=0.5,
                    transcript=" ".join(w.word for w in words),
                    suggested_overlay_title="Highlight",
                )
            ],
        )

    if step_sec <= 0:
        # A non-positive step would never advance the window below.
        raise ValueError("step_sec must be positive")

    # Nominal window is the midpoint of the allowed clip lengths. Clamp it to
    # the source so a source longer than ``min_sec`` but shorter than the
    # nominal window still produces one candidate instead of an empty plan.
    window = min((min_sec + max_sec) / 2.0, duration_sec)

    # Candidate tuples: (density, start, end, text).
    candidates: list[tuple[float, float, float, str]] = []
    t = 0.0
    while t + window <= duration_sec:
        ws = _words_in_window(words, t, t + window)
        if ws:
            density = len(ws) / window
            text = " ".join(w.word for w in ws)
            candidates.append((density, t, t + window, text))
        t += step_sec

    # Densest first; the sort is stable, so ties keep chronological order.
    candidates.sort(key=lambda c: c[0], reverse=True)
    picked: list[tuple[float, float, float, str]] = []
    for c in candidates:
        if len(picked) >= target_count:
            break
        # Accept only windows that do not overlap anything already picked.
        if all(c[2] <= p[1] or c[1] >= p[2] for p in picked):
            picked.append(c)
    picked.sort(key=lambda c: c[1])

    clips: list[Clip] = []
    for i, (density, s, e, text) in enumerate(picked, start=1):
        norm = min(1.0, density / 3.0)  # ~3 words/sec is dense talking
        first_sentence = text.split(".")[0]
        fallback_label = f"Clip {i}"
        clips.append(
            Clip(
                clip_id=f"{i:03d}",
                topic=first_sentence[:60] or fallback_label,
                start_time_sec=round(s, 2),
                end_time_sec=round(e, 2),
                viral_hook=text[:120],
                virality_score=round(norm, 3),
                transcript=text,
                suggested_overlay_title=(first_sentence[:40] or fallback_label),
            )
        )
    return ClipPlan(source_path=source_path, clips=clips)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def select_clips_with_llm(
    source_path: str,
    words: list[TranscriptWord],
    *,
    target_count: int,
    min_sec: float,
    max_sec: float,
    text_fn: LLMTextFn,
) -> ClipPlan:
    """Ask a caller-supplied LLM to pick clips and validate its reply.

    The transcript is rendered as one tab-separated ``word, start, end`` line
    per word, inserted into ``CLIP_SELECTOR_PROMPT_TEMPLATE`` and passed to
    ``text_fn``. The reply must be JSON and is validated against ``ClipPlan``
    before being returned.

    Args:
        source_path: Path shown to the model inside the example JSON.
        words: Transcript words to render into the prompt.
        target_count: Number of clips requested in the prompt.
        min_sec: Minimum clip length requested in the prompt.
        max_sec: Maximum clip length requested in the prompt.
        text_fn: ``(prompt_text) -> str`` callable that runs the model.

    Raises:
        ValueError: If the model reply is not parseable JSON.
    """
    transcript_lines = "\n".join(
        f"{w.word}\t{w.start_time:.2f}\t{w.end_time:.2f}" for w in words
    )
    # The template places the path inside a JSON string literal, so escape it
    # as JSON (minus the surrounding quotes). Otherwise backslashes or quotes
    # in the path make the example the model is asked to echo invalid JSON.
    # ``ensure_ascii=False`` keeps ordinary and non-ASCII paths unchanged.
    escaped_path = json.dumps(source_path, ensure_ascii=False)[1:-1]
    prompt = CLIP_SELECTOR_PROMPT_TEMPLATE.format(
        source_path=escaped_path,
        target_count=target_count,
        min_sec=min_sec,
        max_sec=max_sec,
        transcript=transcript_lines,
    )
    raw = text_fn(prompt)
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"LLM did not return JSON: {e}; raw={raw[:200]!r}") from e
    return ClipPlan.model_validate(data)
|
humeo-core/src/humeo_core/primitives/vision.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Vision-LLM + OCR primitive — the alt path to per-scene framing decisions.
|
| 2 |
+
|
| 3 |
+
Design (Bryan's "big screen change -> v3 images -> LLM+OCR -> bbox" idea):
|
| 4 |
+
|
| 5 |
+
1. Scene detection already produces one keyframe per scene (deterministic,
|
| 6 |
+
local, cheap). That is ``primitives/ingest.py::extract_keyframes``.
|
| 7 |
+
2. For each keyframe, call a pluggable vision LLM with an OCR hint. The
|
| 8 |
+
model returns normalized bboxes for the on-screen roles it cares about
|
| 9 |
+
(``person``, ``chart``) plus any OCR text it reads.
|
| 10 |
+
3. Fold those bboxes into ``LayoutInstruction`` values so the existing
|
| 11 |
+
layout planner (``primitives/layouts.py``) does the actual ffmpeg math.
|
| 12 |
+
|
| 13 |
+
Why this shape:
|
| 14 |
+
|
| 15 |
+
* **Pluggable**. Caller supplies ``LLMRegionFn``. We never hard-code a
|
| 16 |
+
provider. The same primitive works for Gemini, GPT-4o, internal models,
|
| 17 |
+
tests, or mocks.
|
| 18 |
+
* **Schema-validated**. Raw model output is parsed into ``SceneRegions``
|
| 19 |
+
(Pydantic). Malformed output degrades to ``None`` regions rather than
|
| 20 |
+
crashing or corrupting downstream state.
|
| 21 |
+
* **Separable**. ``detect_regions_with_llm`` is one function. Mapping
|
| 22 |
+
regions to ``LayoutInstruction`` is another. Mapping a ``LayoutKind``
|
| 23 |
+
guess from regions is a third. Each is independently testable.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import json
|
| 29 |
+
from typing import Callable
|
| 30 |
+
|
| 31 |
+
from ..schemas import (
|
| 32 |
+
BoundingBox,
|
| 33 |
+
LayoutInstruction,
|
| 34 |
+
LayoutKind,
|
| 35 |
+
Scene,
|
| 36 |
+
SceneClassification,
|
| 37 |
+
SceneRegions,
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
LLMRegionFn = Callable[[str, str], str]
"""Contract for the pluggable vision model.

Invoked as ``vision_fn(keyframe_path, prompt)``; the return value is the
model's raw reply as a string, which is expected to be a JSON document.

Encoding the image (base64, multipart, ...) is up to the caller. This module
only hands over the keyframe path and the prompt, then validates whatever
string comes back.
"""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
REGION_PROMPT = """You are a vision+OCR system for a short-video editor.
|
| 50 |
+
Look at the provided keyframe and return a STRICT JSON object of this shape:
|
| 51 |
+
|
| 52 |
+
{
|
| 53 |
+
"person_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null,
|
| 54 |
+
"chart_bbox": {"x1": <0..1>, "y1": <0..1>, "x2": <0..1>, "y2": <0..1>, "confidence": <0..1>} | null,
|
| 55 |
+
"ocr_text": "<text visible on screen, empty string if none>",
|
| 56 |
+
"reason": "<= 20 words of rationale"
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
Rules:
|
| 60 |
+
- All bbox coordinates are normalized to the frame (0=left/top, 1=right/bottom).
|
| 61 |
+
- x2 > x1, y2 > y1.
|
| 62 |
+
- Return null for any region that is not present (e.g. a pure talking-head
|
| 63 |
+
scene has no chart).
|
| 64 |
+
- "person_bbox" is the *speaker's* body/head region if visible.
|
| 65 |
+
- "chart_bbox" is any chart, graph, slide, screenshare, or diagram.
|
| 66 |
+
- OCR text should be the readable text on screen (titles, labels, chart
|
| 67 |
+
axis values). Omit subtitle captions.
|
| 68 |
+
- NO markdown, NO prose outside JSON. JSON only.
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ---------------------------------------------------------------------------
|
| 73 |
+
# Core: detect regions per scene via pluggable LLM
|
| 74 |
+
# ---------------------------------------------------------------------------
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def detect_regions_with_llm(
    scenes: list[Scene], vision_fn: LLMRegionFn
) -> list[SceneRegions]:
    """Call ``vision_fn`` for each scene's keyframe and return parsed regions.

    Args:
        scenes: Scenes to inspect. A scene without a ``keyframe_path`` is not
            sent to the model.
        vision_fn: ``(keyframe_path, prompt) -> raw reply`` callable; it is
            always given ``REGION_PROMPT`` as the prompt.

    Returns:
        One ``SceneRegions`` per input scene, in input order.

    This function never raises for a single scene. A missing keyframe, an
    exception from ``vision_fn``, or a reply that cannot be parsed all degrade
    to an empty ``SceneRegions`` whose ``raw_reason`` describes the problem,
    so one bad scene can't take down the whole pipeline.
    """

    results: list[SceneRegions] = []
    for scene in scenes:
        if not scene.keyframe_path:
            results.append(
                SceneRegions(scene_id=scene.scene_id, raw_reason="no keyframe available")
            )
            continue
        try:
            raw = vision_fn(scene.keyframe_path, REGION_PROMPT)
            regions = _parse_region_reply(scene.scene_id, raw)
        except Exception as exc:  # provider errors are arbitrary by design
            # Keep the failure visible in the artifact instead of aborting
            # the remaining scenes.
            regions = SceneRegions(
                scene_id=scene.scene_id,
                raw_reason=f"vision region detection failed: {exc!r}"[:400],
            )
        results.append(regions)
    return results
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _parse_region_reply(scene_id: str, raw: str) -> SceneRegions:
    """Turn one raw model reply into a validated ``SceneRegions``.

    Args:
        scene_id: Identifier copied onto the result.
        raw: The model's reply, expected to be a JSON object with the keys
            ``person_bbox``, ``chart_bbox``, ``ocr_text`` and ``reason``.

    Returns:
        A ``SceneRegions``. Any reply that is not a JSON object yields an
        empty result whose ``raw_reason`` starts with ``"JSON parse error"``.
        A bbox that fails ``BoundingBox`` validation becomes ``None``.
        ``ocr_text`` is capped at 4000 characters and ``reason`` at 400.
    """
    text = raw
    if isinstance(raw, str):
        text = raw.strip()
        if text.startswith("```"):
            # Models often wrap JSON in a markdown fence despite the prompt:
            # drop the opening fence line (``` or ```json) and the closer.
            _, _, text = text.partition("\n")
            text = text.rstrip().removesuffix("```")

    try:
        data = json.loads(text)
    except (TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers a reply
        # that is not str/bytes at all (e.g. ``None``).
        return SceneRegions(scene_id=scene_id, raw_reason=f"JSON parse error: {e!r}")

    if not isinstance(data, dict):
        # Valid JSON, but a list/string/number has no keys to read.
        return SceneRegions(
            scene_id=scene_id,
            raw_reason=f"JSON parse error: expected an object, got {type(data).__name__}",
        )

    def _opt_bbox(value: object) -> BoundingBox | None:
        # null / missing / empty means "region not present".
        if not value:
            return None
        try:
            return BoundingBox.model_validate(value)
        except Exception:
            # Malformed box: drop it rather than fail the whole scene.
            return None

    def _text(key: str, limit: int) -> str:
        # JSON null must become "", not the literal string "None".
        value = data.get(key)
        return ("" if value is None else str(value))[:limit]

    return SceneRegions(
        scene_id=scene_id,
        person_bbox=_opt_bbox(data.get("person_bbox")),
        chart_bbox=_opt_bbox(data.get("chart_bbox")),
        ocr_text=_text("ocr_text", 4000),
        raw_reason=_text("reason", 400),
    )
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# ---------------------------------------------------------------------------
|
| 123 |
+
# Derivation: regions -> LayoutKind / LayoutInstruction
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Minimum chart-bbox width, as a fraction of the frame width, for a scene to
# be classified as split_chart_person (compared with ``>=``). Tuned for the
# source videos described in the spec, where the chart takes ~2/3 of the
# width.
_CHART_WIDTH_SPLIT_THRESHOLD = 0.45
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def classify_from_regions(regions: SceneRegions) -> SceneClassification:
    """Derive a ``LayoutKind`` for one scene from its ``SceneRegions`` alone.

    Rules, first match wins:
      1. Chart bbox at least ``_CHART_WIDTH_SPLIT_THRESHOLD`` wide
         -> ``SPLIT_CHART_PERSON``.
      2. Person bbox at least half the frame wide (tight, webcam-style
         framing) -> ``ZOOM_CALL_CENTER``.
      3. Any other person bbox -> ``SIT_CENTER`` with confidence 0.7.
      4. Nothing usable -> ``SIT_CENTER`` with confidence 0.3, carrying the
         regions' ``raw_reason`` when it is non-empty.

    For rules 1 and 2 the confidence is ``0.5 + width / 2``, capped at 1.0.
    """

    chart = regions.chart_bbox
    person = regions.person_bbox

    if chart and chart.width >= _CHART_WIDTH_SPLIT_THRESHOLD:
        layout = LayoutKind.SPLIT_CHART_PERSON
        confidence = float(min(1.0, 0.5 + chart.width / 2.0))
        reason = f"chart bbox covers {chart.width:.2f} of width"
    elif person and person.width >= 0.5:
        layout = LayoutKind.ZOOM_CALL_CENTER
        confidence = float(min(1.0, 0.5 + person.width / 2.0))
        reason = f"person bbox wide ({person.width:.2f}) — tight framing"
    elif person:
        layout = LayoutKind.SIT_CENTER
        confidence = 0.7
        reason = "person present, no wide chart, wider framing"
    else:
        layout = LayoutKind.SIT_CENTER
        confidence = 0.3
        reason = regions.raw_reason or "no regions detected — defaulting to sit_center"

    return SceneClassification(
        scene_id=regions.scene_id,
        layout=layout,
        confidence=confidence,
        reason=reason,
    )
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def layout_instruction_from_regions(
    regions: SceneRegions,
    classification: SceneClassification,
    *,
    clip_id: str | None = None,
    zoom: float = 1.0,
) -> LayoutInstruction:
    """Create a ``LayoutInstruction`` with its x-knobs taken from the bboxes.

    Args:
        regions: Detected regions for the scene.
        classification: Supplies the layout, and the fallback id.
        clip_id: Id for the instruction; when empty or ``None`` the
            classification's ``scene_id`` is used.
        zoom: Passed through unchanged.

    ``person_x_norm`` is the horizontal center of the person bbox (0.5 when
    there is none). ``chart_x_norm`` is the left edge of the chart bbox (0.0
    when there is none).
    """

    person = regions.person_bbox
    chart = regions.chart_bbox
    return LayoutInstruction(
        clip_id=clip_id or classification.scene_id,
        layout=classification.layout,
        zoom=zoom,
        person_x_norm=0.5 if not person else person.center_x,
        chart_x_norm=0.0 if not chart else chart.x1,
    )
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def classify_scenes_with_vision_llm(
    scenes: list[Scene], vision_fn: LLMRegionFn
) -> list[tuple[SceneRegions, SceneClassification]]:
    """Run the whole path in one call: keyframes -> regions -> classifications.

    Each scene yields a ``(regions, classification)`` pair, so the caller can
    persist both: the regions as the detailed record, the classification as
    the part a renderer consumes.
    """

    pairs: list[tuple[SceneRegions, SceneClassification]] = []
    for scene_regions in detect_regions_with_llm(scenes, vision_fn):
        pairs.append((scene_regions, classify_from_regions(scene_regions)))
    return pairs
|
humeo-core/src/humeo_core/schemas.py
ADDED
|
@@ -0,0 +1,518 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Strict JSON contracts — the "container" of the rocket.
|
| 2 |
+
|
| 3 |
+
Every primitive reads and writes these. No primitive takes or returns free-form
|
| 4 |
+
strings. This is the non-negotiable interface described in the HIVE paper
|
| 5 |
+
guide (section 7): machine-checkable intermediate artifacts at every stage.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from enum import Enum
|
| 11 |
+
from typing import Literal
|
| 12 |
+
|
| 13 |
+
from pydantic import BaseModel, Field, field_validator, model_serializer, model_validator
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
# Extraction artifacts
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Scene(BaseModel):
    """One shot/scene found in the source video, bounded by two timestamps."""

    scene_id: str
    start_time: float = Field(ge=0)
    end_time: float = Field(gt=0)
    keyframe_path: str | None = None

    @field_validator("end_time")
    @classmethod
    def _end_after_start(cls, end: float, info) -> float:
        # ``start_time`` is declared first, so it is normally present in
        # ``info.data``; 0.0 is the fallback when it is not.
        begin = info.data.get("start_time", 0.0)
        if end <= begin:
            raise ValueError("end_time must be strictly greater than start_time")
        return end

    @property
    def duration(self) -> float:
        """Scene length: ``end_time - start_time``."""
        return self.end_time - self.start_time
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class TranscriptWord(BaseModel):
    """A single ASR token, timed in **seconds on the source video** timeline."""

    word: str
    # Both timestamps must be non-negative; their ordering is not checked here.
    start_time: float = Field(ge=0)
    end_time: float = Field(ge=0)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class ClipSubtitleWords(BaseModel):
    """Words of one clip, timed in **seconds relative to clip start** (t=0 at the cut in-point)."""

    # A fresh empty list per instance when no words are supplied.
    words: list[TranscriptWord] = Field(default_factory=list)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class FocusStackOrder(str, Enum):
    """Which item sits in the top band and which in the bottom of a split layout.

    The band boundary is set by :attr:`LayoutInstruction.top_band_ratio`
    (default 0.5, an even split).

    * ``SPLIT_CHART_PERSON``: selects chart-on-top or person-on-top.
    * ``SPLIT_TWO_PERSONS`` / ``SPLIT_TWO_CHARTS``: no visible effect, since
      both bands hold the same kind of item. The value is kept only so one
      stacking recipe can drive all three split layouts.
    """

    CHART_THEN_PERSON = "chart_then_person"
    PERSON_THEN_CHART = "person_then_chart"
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class RenderTheme(str, Enum):
    """Named visual treatment that the final renderer applies."""

    LEGACY = "legacy"
    REFERENCE_LOWER_THIRD = "reference_lower_third"
    NATIVE_HIGHLIGHT = "native_highlight"
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class IngestResult(BaseModel):
    """The full output of Stage 1, the deterministic local extraction."""

    source_path: str
    duration_sec: float
    scenes: list[Scene]
    transcript_words: list[TranscriptWord]
    keyframes_dir: str | None = None
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
# Layout system — the 5 "thrusters" (max 2 on-screen items per short)
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class LayoutKind(str, Enum):
    """The allowed 9:16 layouts; a short shows **at most two** on-screen items.

    An item is either a ``person`` (a human speaker) or a ``chart`` (slide,
    graph, data visual, screenshare). The five permitted combinations:

    - ``ZOOM_CALL_CENTER``: **1 person**, tight webcam/zoom-call framing,
      centered.
    - ``SIT_CENTER``: **1 person**, interview/seated framing, centered.
    - ``SPLIT_CHART_PERSON``: **1 chart + 1 person** sharing the source frame;
      the output stacks them vertically (``focus_stack_order`` defaults to
      chart-on-top).
    - ``SPLIT_TWO_PERSONS``: **2 persons**, e.g. an interview two-up; the
      output stacks them vertically.
    - ``SPLIT_TWO_CHARTS``: **2 charts** side by side in the source; the
      output stacks them vertically.

    Capping a short at two items is the keep-it-simple rule: every render is
    either one centered item or two items stacked top/bottom.
    """

    ZOOM_CALL_CENTER = "zoom_call_center"
    SIT_CENTER = "sit_center"
    SPLIT_CHART_PERSON = "split_chart_person"
    SPLIT_TWO_PERSONS = "split_two_persons"
    SPLIT_TWO_CHARTS = "split_two_charts"
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Layouts that stack two items vertically in the 9:16 output. A frozenset
# gives immutable, O(1) ``layout in SPLIT_LAYOUTS`` membership checks.
SPLIT_LAYOUTS: frozenset[LayoutKind] = frozenset(
    {
        LayoutKind.SPLIT_CHART_PERSON,
        LayoutKind.SPLIT_TWO_PERSONS,
        LayoutKind.SPLIT_TWO_CHARTS,
    }
)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class TimedCenterPoint(BaseModel):
    """One tracked-centering sample: the speaker's x-center at a clip-relative time."""

    # Time of the sample, never negative.
    t_sec: float = Field(ge=0.0)
    # Normalized horizontal center, bounded to [0, 1].
    x_norm: float = Field(ge=0.0, le=1.0)
    zoom: float | None = Field(
        default=None,
        gt=0.0,
        le=4.0,
        description=(
            "Optional per-sample crop zoom. When unset, the layout uses the "
            "clip-level ``zoom`` value for that moment."
        ),
    )
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class ClipRenderSpan(BaseModel):
    """A single source-timeline span that is kept inside a selected clip."""

    start_time_sec: float = Field(ge=0.0)
    end_time_sec: float = Field(gt=0.0)

    @field_validator("end_time_sec")
    @classmethod
    def _end_after_start(cls, end: float, info) -> float:
        # ``start_time_sec`` is declared first, so it is normally present in
        # ``info.data``; 0.0 is the fallback when it is not.
        begin = info.data.get("start_time_sec", 0.0)
        if end <= begin:
            raise ValueError("render span end_time_sec must be greater than start_time_sec")
        return end

    @property
    def duration_sec(self) -> float:
        """Span length: ``end_time_sec - start_time_sec``."""
        return self.end_time_sec - self.start_time_sec
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class LayoutInstruction(BaseModel):
    """The per-clip layout decision handed to the compiler, including crop hints.

    Exactly one instruction describes each short, keyed by ``clip_id``. Split
    layouts may also carry up to two normalized bounding boxes (chart/person,
    or two of a kind) so the compiler crops source strips that **partition**
    the source width without overlap or gap.
    """

    clip_id: str
    layout: LayoutKind

    # Optional per-layout knobs. Defaults are sane for a 1920x1080 source.
    zoom: float = Field(default=1.0, gt=0, le=4.0)
    person_x_norm: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Normalized x-center of the human subject in source frame (0=left, 1=right).",
    )
    person_tracking: list[TimedCenterPoint] = Field(
        default_factory=list,
        description=(
            "Optional clip-relative speaker framing samples for moving 9:16 crops. "
            "Each point can shift the x-center and optionally widen/tighten the crop "
            "for that moment. When empty, the compiler uses the static "
            "person_x_norm/zoom settings."
        ),
    )
    chart_x_norm: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description=(
            "split_chart_person only: left-edge trim of the chart strip, as a fraction of the "
            "left 2/3 pane (0 = use full chart area)."
        ),
    )
    focus_stack_order: FocusStackOrder = Field(
        default=FocusStackOrder.CHART_THEN_PERSON,
        description="For split_chart_person only: chart-on-top vs person-on-top in the 9:16 stack.",
    )

    # Optional explicit crop boxes for the split layouts.
    split_chart_region: BoundingBox | None = Field(
        default=None,
        description=(
            "Optional normalized rect for the chart/slide crop (Gemini vision). "
            "When set with split_person_region, the split layout uses these boxes instead of fixed 2/3|1/3."
        ),
    )
    split_person_region: BoundingBox | None = Field(
        default=None,
        description="Optional normalized rect for the speaker crop (Gemini vision).",
    )
    split_second_chart_region: BoundingBox | None = Field(
        default=None,
        description=(
            "For ``SPLIT_TWO_CHARTS`` only: second chart bbox. The first chart occupies "
            "the top output band, this one occupies the bottom band."
        ),
    )
    split_second_person_region: BoundingBox | None = Field(
        default=None,
        description=(
            "For ``SPLIT_TWO_PERSONS`` only: second speaker bbox. The first person "
            "occupies the top output band, this one occupies the bottom band."
        ),
    )
    top_band_ratio: float = Field(
        default=0.5,
        ge=0.2,
        le=0.8,
        description=(
            "Fraction of 9:16 output height used by the top band for split layouts. "
            "0.5 = EVEN 50/50 split (default — the user-requested symmetric look). "
            "0.6 historically matched the 'chart dominant / person small' look."
        ),
    )

    @field_validator("person_tracking")
    @classmethod
    def _tracking_times_non_decreasing(
        cls, points: list[TimedCenterPoint]
    ) -> list[TimedCenterPoint]:
        # Compare each sample with its predecessor. The first sample is
        # compared with a -1.0 sentinel. Equal timestamps are allowed.
        times = [sample.t_sec for sample in points]
        for earlier, later in zip([-1.0] + times, times):
            if later < earlier:
                raise ValueError("person_tracking times must be non-decreasing")
        return points
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
class SceneClassification(BaseModel):
    """Classifier verdict: the layout a given scene should use."""

    scene_id: str
    layout: LayoutKind
    # Bounded to [0, 1].
    confidence: float = Field(ge=0.0, le=1.0)
    # Free-text rationale; empty when none was given.
    reason: str = ""
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
# ---------------------------------------------------------------------------
|
| 266 |
+
# Vision bounding boxes — the LLM+OCR path (alt to pixel heuristics)
|
| 267 |
+
# ---------------------------------------------------------------------------
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
class BoundingBox(BaseModel):
    """A rectangle in normalized [0..1] source-frame coordinates.

    Normalized coordinates make the boxes portable across source resolutions
    and stop the model hallucinating pixel values. The validators enforce
    ``x2 > x1`` and ``y2 > y1``.
    """

    x1: float = Field(ge=0.0, le=1.0)
    y1: float = Field(ge=0.0, le=1.0)
    x2: float = Field(ge=0.0, le=1.0)
    y2: float = Field(ge=0.0, le=1.0)
    label: str = ""
    confidence: float = Field(default=1.0, ge=0.0, le=1.0)

    @field_validator("x2")
    @classmethod
    def _x2_after_x1(cls, right: float, info) -> float:
        # ``x1`` is declared before ``x2``; 0.0 is the fallback when it is
        # absent from ``info.data``.
        left = info.data.get("x1", 0.0)
        if right <= left:
            raise ValueError("x2 must be > x1")
        return right

    @field_validator("y2")
    @classmethod
    def _y2_after_y1(cls, bottom: float, info) -> float:
        # Same check as for x2, on the vertical axis.
        top = info.data.get("y1", 0.0)
        if bottom <= top:
            raise ValueError("y2 must be > y1")
        return bottom

    @property
    def center_x(self) -> float:
        """Midpoint between ``x1`` and ``x2``."""
        return (self.x1 + self.x2) / 2.0

    @property
    def center_y(self) -> float:
        """Midpoint between ``y1`` and ``y2``."""
        return (self.y1 + self.y2) / 2.0

    @property
    def width(self) -> float:
        """Horizontal extent: ``x2 - x1``."""
        return self.x2 - self.x1
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
class SceneRegions(BaseModel):
    """What the vision LLM reported for one scene keyframe.

    Flow: a scene change is detected locally (cheap), one keyframe is
    extracted per scene, and that keyframe is sent to a vision LLM with an
    OCR hint. The reply carries normalized bounding boxes for the on-screen
    roles (``person``, ``chart``). Those boxes drive ``person_x_norm`` /
    ``chart_x_norm`` on a ``LayoutInstruction`` without any pixel code
    running in Python.
    """

    scene_id: str
    # ``None`` means the role was not reported (or its box was unusable).
    person_bbox: BoundingBox | None = None
    chart_bbox: BoundingBox | None = None
    ocr_text: str = ""
    raw_reason: str = ""
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
# ---------------------------------------------------------------------------
|
| 332 |
+
# Clip planning
|
| 333 |
+
# ---------------------------------------------------------------------------
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
class Clip(BaseModel):
    """A candidate short: a span of the source timeline plus its metadata.

    ``start_time_sec`` / ``end_time_sec`` are on the source timeline. The
    hook window is expressed relative to the clip in-point. Render spans are
    on the source timeline again and must lie inside the clip.
    """

    clip_id: str
    topic: str
    start_time_sec: float = Field(ge=0)
    end_time_sec: float = Field(gt=0)
    viral_hook: str = ""
    virality_score: float = Field(default=0.0, ge=0.0, le=1.0)
    transcript: str = ""
    suggested_overlay_title: str = ""
    layout: LayoutKind | None = None
    score_breakdown: dict[str, float] | None = None
    origin: Literal["text", "visual", "both"] = "text"
    visual_notes: str | None = None
    reasoning: str | None = None

    # Optional LLM metadata (source timeline is start_time_sec / end_time_sec).
    hook_start_sec: float | None = Field(
        default=None,
        description="Seconds from clip in-point where the viral hook begins (0 = clip start).",
    )
    hook_end_sec: float | None = Field(
        default=None,
        description="Seconds from clip in-point where the hook ends (exclusive upper bound).",
    )
    trim_start_sec: float = Field(
        default=0.0,
        ge=0,
        description="Seconds to remove from the start of this segment when exporting.",
    )
    trim_end_sec: float = Field(
        default=0.0,
        ge=0,
        description="Seconds to remove from the end of this segment when exporting.",
    )
    render_spans: list[ClipRenderSpan] = Field(
        default_factory=list,
        description=(
            "Optional ordered source-timeline spans to keep when exporting. "
            "When present, these spans override contiguous trim_start/trim_end export."
        ),
    )
    shorts_title: str = ""
    description: str = ""
    hashtags: list[str] = Field(default_factory=list)
    layout_hint: LayoutKind | None = None
    needs_review: bool = False
    review_reason: str = ""

    @field_validator("score_breakdown")
    @classmethod
    def _score_breakdown_in_range(
        cls, v: dict[str, float] | None
    ) -> dict[str, float] | None:
        """Reject negative scores and clamp scores above 1.0 down to 1.0."""
        if v is None:
            return None
        for axis, score in v.items():
            if score < 0.0:
                raise ValueError(f"score_breakdown[{axis!r}] must be non-negative")
        return {axis: min(score, 1.0) for axis, score in v.items()}

    @model_validator(mode="after")
    def _timing_consistency(self) -> "Clip":
        """Cross-field checks: clip bounds, hook window, trims and render spans."""
        clip_start, clip_end = self.start_time_sec, self.end_time_sec
        if clip_end <= clip_start:
            raise ValueError("end_time_sec must be greater than start_time_sec")
        duration = clip_end - clip_start

        hook_start, hook_end = self.hook_start_sec, self.hook_end_sec
        if (hook_start is None) != (hook_end is None):
            raise ValueError("hook_start_sec and hook_end_sec must both be set or both omitted")
        if (
            hook_start is not None
            and hook_end is not None
            and not (0 <= hook_start < hook_end <= duration)
        ):
            raise ValueError(
                "hook window must satisfy 0 <= hook_start_sec < hook_end_sec <= clip duration"
            )

        if self.trim_start_sec + self.trim_end_sec > duration:
            raise ValueError("trim_start_sec + trim_end_sec must not exceed clip duration")

        # Spans must lie inside the clip and follow one another, with a
        # 1e-6 s tolerance on every comparison.
        previous_end = None
        for span in self.render_spans:
            if span.start_time_sec < clip_start - 1e-6:
                raise ValueError("render_spans must stay within the clip start_time_sec")
            if span.end_time_sec > clip_end + 1e-6:
                raise ValueError("render_spans must stay within the clip end_time_sec")
            if previous_end is not None and span.start_time_sec < previous_end - 1e-6:
                raise ValueError("render_spans must be ordered and non-overlapping")
            previous_end = span.end_time_sec
        return self

    @model_serializer(mode="wrap")
    def _serialize_without_default_extensions(self, handler):
        """Serialize normally, then drop extension fields left at their defaults."""
        payload = handler(self)
        # Extension fields whose default is None.
        for key in ("score_breakdown", "visual_notes", "reasoning"):
            if payload.get(key) is None:
                payload.pop(key, None)
        # ``origin`` defaults to "text".
        if payload.get("origin") == "text":
            payload.pop("origin", None)
        return payload

    @property
    def duration_sec(self) -> float:
        """Clip length: ``end_time_sec - start_time_sec``."""
        return self.end_time_sec - self.start_time_sec
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
class ClipPlan(BaseModel):
    """What the clip-selection stage emits: the selected clips and their layouts."""

    source_path: str
    clips: list[Clip]
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
class ApprovalResult(BaseModel):
    """A decision record: one ``action`` plus an optional id selection and note."""

    action: Literal["proceed", "refine", "quit", "accept_all"]
    # Both optional; ``None`` when not supplied.
    selected_ids: list[str] | None = None
    steering_note: str | None = None
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
class RatingFeedback(BaseModel):
    """A rating of 1, 2 or 3 with optional issue tags and free text."""

    rating: Literal[1, 2, 3]
    # Closed vocabulary of issue tags; empty list when none are given.
    issues: list[
        Literal[
            "wrong_moments",
            "bad_cuts",
            "boring",
            "confusing",
            "wrong_layout",
            "length_off",
            "other",
        ]
    ] = Field(default_factory=list)
    free_text: str | None = None
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
class SessionState(BaseModel):
    """Session bookkeeping: iteration counter, steering notes, last rating and selection."""

    source_key: str = ""
    iteration: int = 0
    steering_notes: list[str] = Field(default_factory=list)
    # ``None`` until a rating / selection has been recorded.
    last_rating: RatingFeedback | None = None
    last_selected_ids: list[str] | None = None
|
| 477 |
+
|
| 478 |
+
|
| 479 |
+
# ---------------------------------------------------------------------------
|
| 480 |
+
# Render
|
| 481 |
+
# ---------------------------------------------------------------------------
|
| 482 |
+
|
| 483 |
+
|
| 484 |
+
class RenderRequest(BaseModel):
    """Inputs for rendering one clip: source, clip, layout, output and caption styling."""

    source_path: str
    clip: Clip
    layout: LayoutInstruction
    output_path: str

    # Output frame size; the defaults give a 1080x1920 (9:16) frame.
    width: int = 1080
    height: int = 1920

    # Caption settings.
    subtitle_path: str | None = None
    subtitle_font_size: int = Field(
        default=48,
        ge=10,
        le=120,
        description=(
            "Caption font size in **output pixels** (libass is pinned to "
            "``original_size=width x height`` by the compiler, so this is a "
            "true pixel value, not the old PlayResY=288 unit)."
        ),
    )
    subtitle_margin_v: int = Field(
        default=160,
        ge=0,
        le=800,
        description="Vertical caption margin in output pixels (bottom-anchored).",
    )

    title_text: str = ""
    render_theme: RenderTheme = RenderTheme.NATIVE_HIGHLIGHT
    mode: Literal["normal", "dry_run"] = "normal"
|
| 511 |
+
|
| 512 |
+
|
| 513 |
+
class RenderResult(BaseModel):
    """Outcome of a render request, including the ffmpeg argv that was built."""

    clip_id: str
    output_path: str
    # The ffmpeg command as an argument list.
    ffmpeg_cmd: list[str]
    success: bool
    # Error text; defaults to the empty string.
    error: str = ""
|
humeo-core/src/humeo_core/server.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastMCP server — the control panel for the reusable rocket.
|
| 2 |
+
|
| 3 |
+
Every primitive is exposed as a single MCP ``tool``. Each tool takes and
|
| 4 |
+
returns strict Pydantic-validated JSON, so an MCP client (Cursor, Claude
|
| 5 |
+
Desktop, etc.) can compose a full long-to-short pipeline without guessing
|
| 6 |
+
any interface.
|
| 7 |
+
|
| 8 |
+
Tools:
|
| 9 |
+
|
| 10 |
+
humeo.ingest — Stage 1 extraction (scenes + keyframes [+ transcript])
|
| 11 |
+
humeo.classify_scenes — Assign one of 5 layouts to each scene (pixel heuristic)
|
| 12 |
+
humeo.classify_scenes_with_vision — Assign layouts using bboxes from a vision LLM + OCR
|
| 13 |
+
humeo.detect_scene_regions — Prompt + per-scene jobs for the agent's own vision-LLM bbox pass (OCR-assisted)
|
| 14 |
+
humeo.select_clips — Pick top clips from a transcript (heuristic)
|
| 15 |
+
humeo.plan_layout — Return the ffmpeg filtergraph for a given layout
|
| 16 |
+
humeo.build_render_cmd — Build the full ffmpeg command (dry-run safe)
|
| 17 |
+
humeo.render_clip — Build + actually run ffmpeg to produce a 9:16 clip
|
| 18 |
+
humeo.list_layouts — List the 5 available layouts (discovery)
|
| 19 |
+
|
| 20 |
+
Resources:
|
| 21 |
+
|
| 22 |
+
humeo://layouts — JSON listing of the 5 layouts + description
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import json
|
| 28 |
+
from typing import Any
|
| 29 |
+
|
| 30 |
+
from mcp.server.fastmcp import FastMCP
|
| 31 |
+
|
| 32 |
+
from .primitives import classify as classify_mod
|
| 33 |
+
from .primitives import compile as compile_mod
|
| 34 |
+
from .primitives import ingest as ingest_mod
|
| 35 |
+
from .primitives import layouts as layouts_mod
|
| 36 |
+
from .primitives import select_clips as select_mod
|
| 37 |
+
from .primitives import vision as vision_mod
|
| 38 |
+
from .schemas import (
|
| 39 |
+
IngestResult,
|
| 40 |
+
LayoutInstruction,
|
| 41 |
+
LayoutKind,
|
| 42 |
+
RenderRequest,
|
| 43 |
+
RenderResult,
|
| 44 |
+
Scene,
|
| 45 |
+
SceneRegions,
|
| 46 |
+
TranscriptWord,
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Single module-level server instance; every tool/resource below registers on it.
mcp = FastMCP(
    "humeo-core",
    instructions=(
        "Humeo MCP: reusable primitives for turning long videos into 9:16 shorts. "
        "Compose tools in this order: "
        "ingest -> classify_scenes -> select_clips -> plan_layout/build_render_cmd -> render_clip. "
        "All IO is strict JSON."
    ),
)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
# Discovery
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@mcp.tool()
def list_layouts() -> dict[str, Any]:
    """Return the 5 fixed 9:16 layouts this server supports.

    Every short shows **at most two** on-screen items (person/chart), which
    gives exactly five recipes. Use this to discover the set of
    :class:`LayoutKind` values before classifying scenes or requesting
    renders.
    """

    # One row per layout: (kind, on-screen items, human-readable description).
    catalogue = (
        (
            LayoutKind.ZOOM_CALL_CENTER,
            ("person",),
            "1 person, tight zoom-call / webcam framing, centered.",
        ),
        (
            LayoutKind.SIT_CENTER,
            ("person",),
            "1 person, interview / seated framing, centered.",
        ),
        (
            LayoutKind.SPLIT_CHART_PERSON,
            ("chart", "person"),
            "1 chart + 1 person. Source is partitioned left/right by the chart and "
            "person bboxes (falling back to a 2/3 | 1/3 split); each strip is scaled "
            "to fill its output band. Bands default to an even 50/50 vertical split; "
            "configurable via ``top_band_ratio`` and swappable via ``focus_stack_order``.",
        ),
        (
            LayoutKind.SPLIT_TWO_PERSONS,
            ("person", "person"),
            "2 people (interview two-up / panel). Left speaker in the top band, right "
            "speaker in the bottom band; seam sits between the two person bboxes.",
        ),
        (
            LayoutKind.SPLIT_TWO_CHARTS,
            ("chart", "chart"),
            "2 charts / slides side-by-side in source. Left chart on top, right chart "
            "on bottom; each is scaled to fill its band.",
        ),
    )

    # Fresh dicts and lists on every call so callers may mutate the result freely.
    return {
        "layouts": [
            {"kind": kind.value, "items": list(items), "description": blurb}
            for kind, items, blurb in catalogue
        ]
    }
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
@mcp.resource("humeo://layouts")
def layouts_resource() -> str:
    # Same payload as the ``list_layouts`` tool, serialized as indented JSON text.
    payload = list_layouts()
    return json.dumps(payload, indent=2)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# ---------------------------------------------------------------------------
|
| 123 |
+
# Landing gear: ingest
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
@mcp.tool()
def ingest(
    source_path: str,
    work_dir: str,
    with_transcript: bool = False,
    whisper_model: str = "base",
) -> dict[str, Any]:
    """Run deterministic local extraction (scenes + keyframes, optional transcript).

    Args:
        source_path: absolute path to a local video file.
        work_dir: directory where keyframes/ and temp artifacts will be written.
        with_transcript: if True, run faster-whisper word-level transcription.
        whisper_model: whisper model name (e.g. "tiny", "base", "small").
    """

    # Delegate to the ingest primitive, then flatten its model to a plain dict.
    extracted: IngestResult = ingest_mod.ingest(
        source_path,
        work_dir,
        with_transcript=with_transcript,
        whisper_model=whisper_model,
    )
    return extracted.model_dump()
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# ---------------------------------------------------------------------------
|
| 153 |
+
# Pilot: classify scenes
|
| 154 |
+
# ---------------------------------------------------------------------------
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@mcp.tool()
def classify_scenes(scenes: list[dict[str, Any]]) -> dict[str, Any]:
    """Classify each scene into exactly one of the 5 supported layouts.

    Uses an offline pixel heuristic on each scene's keyframe. Agents that
    want a smarter classifier can post-process or overwrite the result,
    or call ``classify_scenes_with_vision`` with bboxes from a vision LLM.
    """

    # Validate every incoming dict before running the heuristic on any of them.
    validated = []
    for raw in scenes:
        validated.append(Scene.model_validate(raw))

    verdicts = classify_mod.classify_scenes_heuristic(validated)
    dumped = [verdict.model_dump() for verdict in verdicts]
    return {"classifications": dumped}
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# ---------------------------------------------------------------------------
|
| 172 |
+
# Pilot (alt path): vision-LLM + OCR bbox classifier
|
| 173 |
+
# ---------------------------------------------------------------------------
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
@mcp.tool()
def detect_scene_regions(scenes: list[dict[str, Any]]) -> dict[str, Any]:
    """Return the prompt + per-scene stubs used for LLM+OCR bbox detection.

    This tool is the *adapter* half of the vision primitive. The MCP server
    itself never calls an LLM — the agent does. So this endpoint returns:

    1. the exact ``REGION_PROMPT`` to send along with each keyframe, and
    2. a list of ``{scene_id, keyframe_path, prompt}`` jobs.

    The agent runs its own vision model for each job, then feeds the
    resulting JSON back via ``classify_scenes_with_vision``.
    """

    # All scenes are validated up front, before any job is assembled.
    validated = [Scene.model_validate(raw) for raw in scenes]

    jobs = []
    for scene in validated:
        jobs.append(
            {
                "scene_id": scene.scene_id,
                "keyframe_path": scene.keyframe_path,
                "prompt": vision_mod.REGION_PROMPT,
            }
        )

    # The prompt is repeated at the top level and inside every job.
    return {"prompt": vision_mod.REGION_PROMPT, "jobs": jobs}
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
@mcp.tool()
def classify_scenes_with_vision(regions: list[dict[str, Any]]) -> dict[str, Any]:
    """Classify scenes from already-gathered ``SceneRegions`` bbox records.

    Input is a list of ``SceneRegions`` JSON dicts (output of the agent's
    vision-LLM pass). Output is a ``{classifications, layout_instructions}``
    pair — the layout kind per scene plus a ready-to-render
    ``LayoutInstruction`` with ``person_x_norm`` / ``chart_x_norm`` already
    populated from the bboxes.
    """

    # Three separate passes, in this order: validate all, classify all, then
    # derive one layout instruction per (regions, classification) pair.
    validated = []
    for raw in regions:
        validated.append(SceneRegions.model_validate(raw))

    verdicts = []
    for record in validated:
        verdicts.append(vision_mod.classify_from_regions(record))

    plans = []
    for record, verdict in zip(validated, verdicts):
        plans.append(vision_mod.layout_instruction_from_regions(record, verdict))

    return {
        "classifications": [verdict.model_dump() for verdict in verdicts],
        "layout_instructions": [plan.model_dump() for plan in plans],
    }
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
# ---------------------------------------------------------------------------
|
| 228 |
+
# Pilot: select clips
|
| 229 |
+
# ---------------------------------------------------------------------------
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
@mcp.tool()
def select_clips(
    source_path: str,
    transcript_words: list[dict[str, Any]],
    duration_sec: float,
    target_count: int = 5,
    min_sec: float = 30.0,
    max_sec: float = 60.0,
) -> dict[str, Any]:
    """Heuristically select top clips from a word-level transcript.

    Scoring is word-density per window. Returns a ``ClipPlan`` with up to
    ``target_count`` non-overlapping clips.
    """

    # Validate each raw word dict before handing the list to the selector.
    validated_words = []
    for raw in transcript_words:
        validated_words.append(TranscriptWord.model_validate(raw))

    clip_plan = select_mod.select_clips_heuristic(
        source_path,
        validated_words,
        duration_sec,
        target_count=target_count,
        min_sec=min_sec,
        max_sec=max_sec,
    )
    return clip_plan.model_dump()
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# ---------------------------------------------------------------------------
|
| 260 |
+
# Thrusters: plan + render
|
| 261 |
+
# ---------------------------------------------------------------------------
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
@mcp.tool()
def plan_layout(
    layout: str,
    out_w: int = 1080,
    out_h: int = 1920,
    src_w: int = 1920,
    src_h: int = 1080,
    zoom: float = 1.0,
    person_x_norm: float = 0.5,
    chart_x_norm: float = 0.0,
    clip_id: str = "preview",
) -> dict[str, Any]:
    """Return the ffmpeg filter_complex fragment for one layout.

    This is the pure, deterministic function underpinning the 5 thrusters.
    No rendering is performed. Useful for agents that want to preview the
    filtergraph or compose it with their own ffmpeg invocation.
    """

    # Converting the string to the enum rejects unknown layout names.
    kind = LayoutKind(layout)
    instruction = LayoutInstruction(
        clip_id=clip_id,
        layout=kind,
        zoom=zoom,
        person_x_norm=person_x_norm,
        chart_x_norm=chart_x_norm,
    )
    planned = layouts_mod.plan_layout(
        instruction,
        out_w=out_w,
        out_h=out_h,
        src_w=src_w,
        src_h=src_h,
    )
    return {"filtergraph": planned.filtergraph, "out_label": planned.out_label}
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
@mcp.tool()
def build_render_cmd(request: dict[str, Any]) -> dict[str, Any]:
    """Build (but do NOT run) the ffmpeg command for a render request.

    ``request`` must conform to the ``RenderRequest`` schema. This is a
    dry-run helper so an agent can review the command before executing it.
    """

    # Work on a copy and force dry-run mode, overriding any caller-supplied mode.
    forced = {**request}
    forced["mode"] = "dry_run"

    dry_request = RenderRequest.model_validate(forced)
    outcome = compile_mod.render_clip(dry_request)
    return outcome.model_dump()
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
@mcp.tool()
def render_clip(request: dict[str, Any]) -> dict[str, Any]:
    """Render a single 9:16 clip with the specified layout.

    ``request`` must conform to ``RenderRequest``. If ``request.mode`` is
    ``"dry_run"`` the ffmpeg command is returned without execution.
    """

    # Validate the raw dict, run the compile primitive, return a plain dict.
    validated = RenderRequest.model_validate(request)
    outcome: RenderResult = compile_mod.render_clip(validated)
    return outcome.model_dump()
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
# ---------------------------------------------------------------------------
|
| 321 |
+
# Entrypoint
|
| 322 |
+
# ---------------------------------------------------------------------------
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def main() -> None:
    """stdio entrypoint for ``humeo-core`` console-script."""

    # Hand control to the FastMCP server instance defined at module level.
    mcp.run()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
humeo-core/tests/__init__.py
ADDED
|
File without changes
|
humeo-core/tests/test_classify.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
from humeo_core.primitives.classify import (
|
| 4 |
+
classify_scenes_heuristic,
|
| 5 |
+
classify_scenes_with_llm,
|
| 6 |
+
)
|
| 7 |
+
from humeo_core.schemas import LayoutKind, Scene
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_heuristic_no_keyframe_defaults_sit_center():
    """A scene with no keyframe yields exactly one SIT_CENTER classification."""
    scene = Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path=None)

    classified = classify_scenes_heuristic([scene])

    assert len(classified) == 1
    only = classified[0]
    assert only.scene_id == "s0"
    assert only.layout == LayoutKind.SIT_CENTER
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_llm_classifier_uses_callback_and_validates():
    """The layout and confidence returned by the vision callback are used."""
    scene = Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")
    canned_reply = json.dumps(
        {"layout": "split_chart_person", "confidence": 0.88, "reason": "chart left"}
    )

    def fake_vision(image_path: str, prompt: str) -> str:
        # Stand-in for a vision model: always answers with the canned JSON.
        return canned_reply

    classified = classify_scenes_with_llm([scene], fake_vision)

    first = classified[0]
    assert first.layout == LayoutKind.SPLIT_CHART_PERSON
    assert first.confidence == 0.88
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_llm_classifier_parse_error_is_safe():
    """Non-JSON model output falls back to SIT_CENTER and reports a parse error."""
    scene = Scene(scene_id="s0", start_time=0.0, end_time=1.0, keyframe_path="/tmp/x.jpg")

    def bad_vision(image_path: str, prompt: str) -> str:
        # Deliberately unparseable reply.
        return "not json"

    classified = classify_scenes_with_llm([scene], bad_vision)

    first = classified[0]
    assert first.layout == LayoutKind.SIT_CENTER
    assert "parse error" in first.reason.lower()
|
humeo-core/tests/test_compile.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from humeo_core.primitives import compile as compile_mod
|
| 4 |
+
from humeo_core.primitives.compile import (
|
| 5 |
+
_ensure_windows_fontconfig,
|
| 6 |
+
build_ffmpeg_cmd,
|
| 7 |
+
plan_title_drawtext,
|
| 8 |
+
)
|
| 9 |
+
from humeo_core.schemas import Clip, LayoutInstruction, LayoutKind, RenderRequest, RenderTheme
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _req(**overrides):
    """Build a dry-run, LEGACY-theme RenderRequest; keyword args override defaults."""
    default_clip = Clip(clip_id="1", topic="t", start_time_sec=10.0, end_time_sec=40.0)
    default_layout = LayoutInstruction(clip_id="1", layout=LayoutKind.SIT_CENTER)
    defaults = {
        "source_path": "/tmp/src.mp4",
        "clip": default_clip,
        "layout": default_layout,
        "output_path": "/tmp/out.mp4",
        "render_theme": RenderTheme.LEGACY,
        "mode": "dry_run",
    }
    # Later keys win, so overrides replace the defaults above.
    return RenderRequest(**{**defaults, **overrides})
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_ffmpeg_cmd_has_ss_duration_filtergraph_output():
    """The command carries -ss/-t/-filter_complex and ends with the output path."""
    argv = build_ffmpeg_cmd(_req())

    for flag in ("-ss", "-t", "-filter_complex"):
        assert flag in argv

    # The default clip runs from 10.0s to 40.0s, so duration is 30.0s.
    duration_arg = argv[argv.index("-t") + 1]
    assert float(duration_arg) == 30.0
    seek_arg = argv[argv.index("-ss") + 1]
    assert float(seek_arg) == 10.0
    assert argv[-1] == "/tmp/out.mp4"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_title_text_injects_drawtext():
    """A title adds a drawtext filter with the colon escaped and the quote removed."""
    argv = build_ffmpeg_cmd(_req(title_text="Hello: world's"))
    graph = argv[argv.index("-filter_complex") + 1]

    assert "drawtext" in graph
    # The colon must be backslash-escaped in the filtergraph.
    assert "Hello\\:" in graph
    # The apostrophe is dropped rather than passed through.
    assert "worlds" in graph
    assert "world's" not in graph
    assert "expansion=none" in graph
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def test_map_vout_and_primary_audio():
    """The command references the [vout] label and the first audio stream."""
    argv = build_ffmpeg_cmd(_req())
    for expected in ("[vout]", "0:a:0"):
        assert expected in argv
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def test_subtitle_style_uses_requested_font_and_margin():
    """Requested caption font size and margin appear in the subtitles filter."""
    request = _req(
        subtitle_path="/tmp/clip.srt",
        subtitle_font_size=18,
        subtitle_margin_v=64,
    )
    argv = build_ffmpeg_cmd(request)
    graph = argv[argv.index("-filter_complex") + 1]

    assert "subtitles='" in graph
    assert "FontSize=18" in graph
    assert "MarginV=64" in graph
    # WrapStyle=0 is smart wrapping, so long captions break into readable lines.
    assert "WrapStyle=0" in graph
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_subtitle_original_size_pins_libass_to_output_resolution():
    """The subtitles filter must carry original_size equal to the output size.

    Without it libass falls back to PlayResY=288 and inflates fonts and
    margins, which was the root cause of the reported "subtitles floating in
    the middle of the frame / blocked" bug.
    """
    argv = build_ffmpeg_cmd(_req(subtitle_path="/tmp/clip.srt"))
    graph = argv[argv.index("-filter_complex") + 1]
    assert "original_size=1080x1920" in graph
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def test_subtitles_applied_after_crop_and_title():
    """Filter order is crop/compose, then drawtext title, then subtitles.

    Cropping has to come first so that text is drawn on the finished frame.
    """
    request = _req(title_text="Hook", subtitle_path="/tmp/clip.srt")
    argv = build_ffmpeg_cmd(request)
    graph = argv[argv.index("-filter_complex") + 1]

    # Position of each stage's first occurrence in the filtergraph string.
    crop_at = graph.index("[0:v]crop=")
    title_at = graph.index("drawtext")
    captions_at = graph.index("subtitles=")

    assert crop_at < title_at < captions_at
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_build_is_layout_specific():
    """A split chart/person layout produces a vstack in the filtergraph."""
    short_clip = Clip(clip_id="1", topic="t", start_time_sec=0, end_time_sec=10)
    split_layout = LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON)
    request = _req(clip=short_clip, layout=split_layout)

    argv = build_ffmpeg_cmd(request)
    graph = argv[argv.index("-filter_complex") + 1]

    assert "vstack" in graph
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_title_is_suppressed_on_split_layouts():
    """No drawtext title is emitted for any split layout (LEGACY theme).

    Split layouts already show a slide or chart with its own title; an extra
    overlay only hides content, as in the Cathy Wood "chart overlaps subject"
    report.
    """
    split_kinds = (
        LayoutKind.SPLIT_CHART_PERSON,
        LayoutKind.SPLIT_TWO_PERSONS,
        LayoutKind.SPLIT_TWO_CHARTS,
    )
    for kind in split_kinds:
        request = _req(
            layout=LayoutInstruction(clip_id="1", layout=kind),
            title_text="This should not render",
        )
        argv = build_ffmpeg_cmd(request)
        graph = argv[argv.index("-filter_complex") + 1]
        assert "drawtext" not in graph, f"title leaked into split layout {kind}"
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def test_title_is_drawn_on_single_subject_layouts():
    """ZOOM_CALL_CENTER and SIT_CENTER still get their title overlay."""
    single_subject_kinds = (LayoutKind.ZOOM_CALL_CENTER, LayoutKind.SIT_CENTER)
    for kind in single_subject_kinds:
        request = _req(
            layout=LayoutInstruction(clip_id="1", layout=kind),
            title_text="Hook title",
        )
        argv = build_ffmpeg_cmd(request)
        graph = argv[argv.index("-filter_complex") + 1]
        assert "drawtext=text='Hook title'" in graph
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
# ---------------------------------------------------------------------------
|
| 141 |
+
# Title wrapping / auto-shrink (P2: fixes the "Prediction Markets vs
|
| 142 |
+
# Derivatives" clipped-title bug reported against the Cathy Wood run).
|
| 143 |
+
# ---------------------------------------------------------------------------
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def test_plan_title_short_stays_single_line_at_72px():
    """Short titles keep the pre-P2 form: one drawtext at 72px, y=80.

    Keeping the output byte-identical for short titles preserves previously
    calibrated visuals and avoids needless cache churn on existing renders.
    """
    fragment = plan_title_drawtext("Hook title", out_w=1080)

    assert fragment is not None
    assert fragment.count("drawtext=") == 1
    for expected in ("fontsize=72", "y=80", "drawtext=text='Hook title'"):
        assert expected in fragment
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def test_plan_title_long_wraps_to_two_lines_below_72px():
    """A long title wraps at a word boundary and shrinks to fit.

    "Prediction Markets vs Derivatives" (33 chars) overflows a 1080px canvas
    at 72px, so it must become the balanced pair "Prediction Markets" /
    "vs Derivatives" at a smaller font size.
    """
    import re

    fragment = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080)

    assert fragment is not None
    assert fragment.count("drawtext=") == 2, "long titles must split into two drawtext calls"
    assert "drawtext=text='Prediction Markets'" in fragment
    assert "drawtext=text='vs Derivatives'" in fragment
    assert "fontsize=72" not in fragment, "two-line layout must use a smaller font"

    # Both lines must share one shrunken font size, within the expected range.
    font_sizes = re.findall(r"fontsize=(\d+)", fragment)
    assert len(font_sizes) == 2 and font_sizes[0] == font_sizes[1]
    assert 44 <= int(font_sizes[0]) <= 64
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def test_plan_title_empty_returns_none():
    """Empty and whitespace-only titles produce no drawtext fragment."""
    empty_result = plan_title_drawtext("", out_w=1080)
    assert empty_result is None
    blank_result = plan_title_drawtext("   ", out_w=1080)
    assert blank_result is None
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def test_plan_title_single_huge_word_shrinks_instead_of_wrapping():
    """One very long word has no wrap point, so it shrinks on a single line."""
    fragment = plan_title_drawtext("Supercalifragilisticexpialidocious", out_w=1080)

    assert fragment is not None
    # A single drawtext means the word was not wrapped.
    assert fragment.count("drawtext=") == 1
    assert "fontsize=72" not in fragment
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def test_title_uses_arial_font_not_default_serif():
    """Every title drawtext names its font (``font=Arial`` or a ``fontfile``).

    Regression test for the "ugly serif title on the finance short" bug: the
    platform default is Times New Roman on Windows, while subtitles use
    ``Fontname=Arial``. Both the single-line and the two-line variants must
    carry an explicit font directive.
    """
    short_fragment = plan_title_drawtext("Hook title", out_w=1080)
    assert short_fragment is not None
    assert "font=Arial" in short_fragment or "fontfile='" in short_fragment

    long_fragment = plan_title_drawtext("Prediction Markets vs Derivatives", out_w=1080)
    assert long_fragment is not None
    # Whichever directive is in use must appear once per wrapped line.
    directive = "font=Arial" if "font=Arial" in long_fragment else "fontfile='"
    assert long_fragment.count(directive) == 2
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def test_title_font_matches_subtitle_font_family():
    """Title and captions should read as one typographic family.

    Through ``build_ffmpeg_cmd`` the title carries an explicit font directive
    and the subtitle style names Arial.
    """
    request = _req(title_text="Hook title", subtitle_path="/tmp/clip.ass")
    argv = build_ffmpeg_cmd(request)
    graph = argv[argv.index("-filter_complex") + 1]

    assert "font=Arial" in graph or "fontfile='" in graph
    assert "Fontname=Arial" in graph
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def test_long_title_pipes_through_build_ffmpeg_cmd():
    """End-to-end: a long title yields two drawtext filters in a clean graph.

    The filtergraph must not contain empty chain links or stray commas that
    ffmpeg would reject.
    """
    request = _req(title_text="Prediction Markets vs Derivatives")
    argv = build_ffmpeg_cmd(request)
    graph = argv[argv.index("-filter_complex") + 1]

    assert graph.count("drawtext=") == 2
    assert "[v_prepad]drawtext=text='Prediction Markets'" in graph
    assert "[vout]" in graph
    assert ";;" not in graph  # no empty chain links
    assert ",," not in graph  # no stray commas
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def test_reference_theme_draws_title_and_caption_bars():
    """REFERENCE_LOWER_THIRD emits both drawboxes and its own subtitle style."""
    request = _req(
        title_text="A Multi-Trillion Dollar Opportunity",
        subtitle_path="/tmp/clip.ass",
        render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
    )
    argv = build_ffmpeg_cmd(request)
    graph = argv[argv.index("-filter_complex") + 1]

    expected_fragments = (
        "drawbox=x=28:y=32",
        "drawbox=x=0:y=",
        "Fontname=Source Sans 3",
        "Alignment=2",
        "Outline=2",
    )
    for expected in expected_fragments:
        assert expected in graph
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def test_reference_theme_wraps_long_titles_inside_the_title_bar():
    """Long reference-theme titles wrap across drawtexts instead of truncating."""
    request = _req(
        title_text="12% Youth Unemployment? Start a Business With AI",
        render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
    )
    argv = build_ffmpeg_cmd(request)
    graph = argv[argv.index("-filter_complex") + 1]

    assert graph.count("drawtext=") >= 2
    # No ellipsis, i.e. the title was not cut short.
    assert "..." not in graph
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
def test_reference_theme_draws_frosted_caption_ribbon_when_subtitles_exist():
    """With a subtitle path set, the reference theme adds a ``drawbox`` anchored at x=0."""
    request = _req(
        title_text="Hook title",
        subtitle_path="/tmp/clip.ass",
        render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
    )
    argv = build_ffmpeg_cmd(request)
    graph_position = argv.index("-filter_complex") + 1
    assert "drawbox=x=0:y=" in argv[graph_position]
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def test_reference_theme_allows_titles_on_split_layouts():
    """A split chart/person layout still gets a ``drawtext`` title under the reference theme."""
    request = _req(
        layout=LayoutInstruction(clip_id="1", layout=LayoutKind.SPLIT_CHART_PERSON),
        title_text="Hook title",
        render_theme=RenderTheme.REFERENCE_LOWER_THIRD,
    )
    argv = build_ffmpeg_cmd(request)
    graph_position = argv.index("-filter_complex") + 1
    assert "drawtext=" in argv[graph_position]
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def test_native_highlight_theme_skips_title_card_and_keeps_ass_styles():
    """Native-highlight theme: no ``drawtext``, subtitles burned in without ``force_style``."""
    request = _req(
        title_text="This title should not render",
        subtitle_path="/tmp/clip.ass",
        render_theme=RenderTheme.NATIVE_HIGHLIGHT,
    )
    argv = build_ffmpeg_cmd(request)
    filtergraph = argv[argv.index("-filter_complex") + 1]

    # The title must be dropped entirely for this theme.
    assert "drawtext" not in filtergraph
    # Subtitles are still applied, but without a style override.
    assert "subtitles='" in filtergraph
    assert "force_style='" not in filtergraph
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def test_ensure_windows_fontconfig_is_noop_off_windows():
    """Calling the helper in the ambient environment returns a ``dict``.

    NOTE(review): ``os.name`` is not pinned here, so on a Windows host this
    presumably runs the Windows branch as well -- confirm that is intended.
    """
    result = _ensure_windows_fontconfig()
    assert isinstance(result, dict)
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def test_ensure_windows_fontconfig_creates_config(monkeypatch, tmp_path):
    """With ``os.name`` forced to ``"nt"``, the helper writes a fontconfig file.

    The returned environment must point ``FONTCONFIG_FILE`` at an existing
    file whose text mentions the fake ``WINDIR`` fonts directory and a
    ``fontconfig-cache`` entry.
    """
    windows_root = tmp_path / "winroot"
    local_app_data = tmp_path / "localappdata"

    # Simulate a Windows host with no pre-existing fontconfig override.
    monkeypatch.setattr(compile_mod.os, "name", "nt", raising=False)
    monkeypatch.delenv("FONTCONFIG_FILE", raising=False)
    monkeypatch.setenv("LOCALAPPDATA", str(local_app_data))
    monkeypatch.setenv("WINDIR", str(windows_root))

    env = _ensure_windows_fontconfig()

    config_path = Path(env["FONTCONFIG_FILE"])
    assert config_path.is_file()

    config_text = config_path.read_text(encoding="utf-8")
    fonts_dir = (windows_root / "Fonts").as_posix()
    assert fonts_dir in config_text
    assert "fontconfig-cache" in config_text
|
humeo-core/tests/test_face_detect.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the MediaPipe-backed face detection primitive.
|
| 2 |
+
|
| 3 |
+
Uses a stub ``face_fn`` so MediaPipe itself is not required to run the
|
| 4 |
+
tests — the primitive contract is what we care about: *given* a face
|
| 5 |
+
bbox, does the primitive produce the right ``SceneRegions``.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from humeo_core.primitives.face_detect import detect_face_regions
|
| 9 |
+
from humeo_core.schemas import BoundingBox, Scene
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _scene(i: int, kf: str | None = "/tmp/k.jpg") -> Scene:
    """Build a one-second ``Scene`` ``s{i}`` starting at second ``i`` with keyframe ``kf``."""
    start = float(i)
    return Scene(
        scene_id=f"s{i}",
        start_time=start,
        end_time=start + 1.0,
        keyframe_path=kf,
    )
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_no_keyframe_returns_raw_reason():
    """A scene without a keyframe gets no person bbox and a "no keyframe" reason."""
    scene_without_keyframe = _scene(0, kf=None)
    results = detect_face_regions([scene_without_keyframe], face_fn=lambda _p: None)
    region = results[0]
    assert region.person_bbox is None
    assert "no keyframe" in region.raw_reason.lower()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_no_face_detected_returns_raw_reason():
    """When the detector returns ``None`` the region has no person bbox and says "no face"."""
    results = detect_face_regions([_scene(0)], face_fn=lambda _p: None)
    region = results[0]
    assert region.person_bbox is None
    assert "no face" in region.raw_reason.lower()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_face_centered_produces_person_only():
    """A face bbox centred horizontally yields a person bbox and no chart bbox."""
    centered_face = BoundingBox(
        x1=0.4, y1=0.2, x2=0.6, y2=0.7, label="face", confidence=0.9
    )

    def always_centered(_path):
        return centered_face

    region = detect_face_regions([_scene(0)], face_fn=always_centered)[0]

    assert region.person_bbox is not None
    assert region.person_bbox.center_x == centered_face.center_x
    assert region.chart_bbox is None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def test_face_pushed_right_synthesises_chart_bbox():
    """A face far to the right makes the primitive infer a chart bbox on the left."""
    # Face centre x is ~0.86, above the default 0.65 threshold, so a chart
    # bbox should be inferred.
    right_face = BoundingBox(
        x1=0.75, y1=0.1, x2=0.97, y2=0.9, label="face", confidence=0.95
    )
    region = detect_face_regions([_scene(0)], face_fn=lambda _p: right_face)[0]

    assert region.person_bbox is not None
    assert region.chart_bbox is not None
    assert region.chart_bbox.x1 == 0.0
    # The synthetic chart must stop before the face ...
    assert region.chart_bbox.x2 <= 0.75
    # ... and is additionally capped by the split threshold.
    assert region.chart_bbox.x2 <= 0.65
    assert "synthetic chart" in region.raw_reason
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_face_detector_exception_is_isolated_per_scene():
    """A detector failure on one scene must not prevent later scenes from being processed."""
    seen_paths: list[str] = []

    def flaky_fn(path: str) -> BoundingBox | None:
        # Fail on the very first invocation only, succeed afterwards.
        seen_paths.append(path)
        is_first_call = len(seen_paths) == 1
        if is_first_call:
            raise RuntimeError("boom")
        return BoundingBox(x1=0.3, y1=0.2, x2=0.7, y2=0.8)

    scene_pair = [_scene(0), _scene(1)]
    results = detect_face_regions(scene_pair, face_fn=flaky_fn)

    failed, succeeded = results[0], results[1]
    assert failed.person_bbox is None
    assert "error" in failed.raw_reason.lower()
    assert succeeded.person_bbox is not None
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_custom_threshold_prevents_false_chart_split():
    """Raising ``chart_split_threshold`` to 0.95 suppresses the chart bbox for a right-side face."""
    right_face = BoundingBox(x1=0.75, y1=0.1, x2=0.97, y2=0.9)
    results = detect_face_regions(
        [_scene(0)],
        face_fn=lambda _p: right_face,
        chart_split_threshold=0.95,
    )
    region = results[0]
    assert region.chart_bbox is None
|
humeo-core/tests/test_layout_bbox.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Split layout uses optional normalized bbox regions (Gemini vision)."""
|
| 2 |
+
|
| 3 |
+
from humeo_core.primitives.layouts import plan_layout
|
| 4 |
+
from humeo_core.schemas import BoundingBox, FocusStackOrder, LayoutInstruction, LayoutKind
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_split_with_bbox_regions_not_fixed_thirds():
    """Explicit split regions drive the chart crop width instead of a fixed 2/3 split."""
    chart_region = BoundingBox(x1=0.0, y1=0.0, x2=0.64, y2=1.0)
    person_region = BoundingBox(x1=0.64, y1=0.0, x2=1.0, y2=1.0)
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        focus_stack_order=FocusStackOrder.CHART_THEN_PERSON,
        split_chart_region=chart_region,
        split_person_region=person_region,
    )
    plan = plan_layout(instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080)
    filtergraph = plan.filtergraph

    # Either of two chart crop widths is accepted.
    accepted_chart_crops = ("crop=1228:1080:0:0", "crop=1224:1080:0:0")
    assert any(crop in filtergraph for crop in accepted_chart_crops)
    assert "vstack=inputs=2" in filtergraph
|
humeo-core/tests/test_layouts.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
from humeo_core.primitives.layouts import (
|
| 4 |
+
_center_crop_to_9x16,
|
| 5 |
+
_crop_box,
|
| 6 |
+
plan_layout,
|
| 7 |
+
)
|
| 8 |
+
from humeo_core.schemas import (
|
| 9 |
+
BoundingBox,
|
| 10 |
+
FocusStackOrder,
|
| 11 |
+
LayoutInstruction,
|
| 12 |
+
LayoutKind,
|
| 13 |
+
TimedCenterPoint,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_crop_box_aspect_exact():
    """A 9:16 crop of a 1920x1080 frame is full-height with a ~9:16 aspect ratio."""
    target_aspect = 9 / 16
    crop_w, crop_h, left, top = _crop_box(1920, 1080, target_aspect, 1.0, 0.5, 0.5)

    # 9:16 inside 1920x1080 is height-limited: crop_h=1080, crop_w ~= 608.
    assert crop_h == 1080
    assert abs(crop_w / crop_h - 9 / 16) < 0.01
    assert 0 <= left <= 1920 - crop_w
    assert top == 0
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_crop_box_clamps_inside_frame():
    """A crop centred near the right edge must still lie inside the 1920x1080 frame."""
    crop_w, crop_h, left, top = _crop_box(1920, 1080, 9 / 16, 2.0, 0.99, 0.5)
    right_edge = left + crop_w
    bottom_edge = top + crop_h
    assert right_edge <= 1920
    assert bottom_edge <= 1080
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_crop_box_zoom_tightens():
    """Zoom 2.0 produces a strictly smaller crop window than zoom 1.0."""
    zoomed_w, zoomed_h, _zx, _zy = _center_crop_to_9x16(1920, 1080, 2.0, 0.5)
    base_w, base_h, _bx, _by = _center_crop_to_9x16(1920, 1080, 1.0, 0.5)
    assert zoomed_w < base_w
    assert zoomed_h < base_h
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def test_even_dimensions():
    """Crop size and offsets come out even, even for an odd-sized source frame."""
    crop_w, crop_h, left, top = _crop_box(1921, 1081, 9 / 16, 1.3, 0.4, 0.5)
    assert crop_w % 2 == 0
    assert crop_h % 2 == 0
    assert left % 2 == 0
    assert top % 2 == 0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _contains(s: str, *subs: str) -> bool:
    """Return ``True`` when every string in ``subs`` occurs in ``s``.

    With no ``subs`` the result is ``True``.
    """
    for needle in subs:
        if needle not in s:
            return False
    return True
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def test_zoom_call_layout_filtergraph_shape():
    """The zoom-call layout crops ``[0:v]``, scales to 1080x1920 and ends in ``[vout]``."""
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.ZOOM_CALL_CENTER,
        zoom=1.5,
        person_x_norm=0.5,
    )
    filtergraph = plan_layout(instruction, out_w=1080, out_h=1920).filtergraph
    assert _contains(filtergraph, "[0:v]crop=", "scale=1080:1920", "[vout]")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_sit_center_layout_filtergraph_shape():
    """The sit-center plan exposes ``vout`` as its label and in its filtergraph."""
    instruction = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER)
    layout_plan = plan_layout(instruction, out_w=1080, out_h=1920)
    assert "[vout]" in layout_plan.filtergraph
    assert layout_plan.out_label == "vout"
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def test_sit_center_tracking_uses_dynamic_crop_expression():
    """Two tracking points produce a time-dependent crop expression in the filtergraph."""
    tracking = [
        TimedCenterPoint(t_sec=0.0, x_norm=0.2),
        TimedCenterPoint(t_sec=10.0, x_norm=0.8),
    ]
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SIT_CENTER,
        person_tracking=tracking,
    )
    filtergraph = plan_layout(instruction, out_w=1080, out_h=1920).filtergraph

    expected_fragments = (
        "setpts=PTS-STARTPTS",
        "[vsrc]crop=",
        "if(lt(t\\,4.850)",
        "*(t-4.850)/(0.300)",
    )
    for fragment in expected_fragments:
        assert fragment in filtergraph
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_sit_center_tracking_with_zoom_uses_dynamic_crop_window_expressions():
    """Tracking points that carry zoom values lead to expression-based crop window sizes."""
    tracking = [
        TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.28),
        TimedCenterPoint(t_sec=10.0, x_norm=0.8, zoom=1.0),
    ]
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SIT_CENTER,
        person_tracking=tracking,
    )
    filtergraph = plan_layout(instruction, out_w=1080, out_h=1920).filtergraph

    expected_fragments = (
        "setpts=PTS-STARTPTS",
        "[vsrc]crop=",
        "out_w/2",
        "out_h/2",
        "floor((min(",
    )
    for fragment in expected_fragments:
        assert fragment in filtergraph
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def test_split_layout_contains_vstack():
    """The split layout splits the source in two, labels both bands and stacks them."""
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        person_x_norm=0.83,
        chart_x_norm=0.0,
    )
    filtergraph = plan_layout(instruction, out_w=1080, out_h=1920).filtergraph

    assert _contains(filtergraph, "split=2", "vstack=inputs=2", "[vout]")
    assert "[top]" in filtergraph
    assert "[bot]" in filtergraph
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def test_split_layout_person_crop_is_right_third():
    """By default the chart takes the left 2/3 and the person the right 1/3 (no overlap)."""
    instruction = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
    layout_plan = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080
    )
    # For a 1920-wide source the right third starts at x=1280 and is 640 wide.
    assert "crop=640:1080:1280:0" in layout_plan.filtergraph
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def test_split_layout_can_swap_stack_order():
    """PERSON_THEN_CHART puts the right-strip (person) crop into the top band."""

    def render(order):
        # Build the default split layout with the requested stacking order.
        instruction = LayoutInstruction(
            clip_id="c",
            layout=LayoutKind.SPLIT_CHART_PERSON,
            focus_stack_order=order,
        )
        return plan_layout(instruction, out_w=1080, out_h=1920).filtergraph

    def top_crop(fg: str) -> str:
        # The first split output ([src1]) is the top band.
        found = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg)
        assert found is not None, fg
        return found.group(1)

    chart_first = render(FocusStackOrder.CHART_THEN_PERSON)
    person_first = render(FocusStackOrder.PERSON_THEN_CHART)

    # Chart strip = left 1280px of the source (2/3 split seam).
    assert top_crop(chart_first) == "1280:1080:0:0"
    # Person strip = right 640px, so x=1280.
    assert top_crop(person_first) == "640:1080:1280:0"
    for graph in (chart_first, person_first):
        assert "vstack=inputs=2" in graph
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def test_split_layout_person_clamped():
    """``person_x_norm=1.0`` must still plan a crop rather than crash on out-of-bounds math."""
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        person_x_norm=1.0,
    )
    filtergraph = plan_layout(instruction, out_w=1080, out_h=1920).filtergraph
    assert "crop=" in filtergraph
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def test_plan_layout_dispatch_covers_all_kinds():
    """Every ``LayoutKind`` member plans to a filtergraph that terminates in ``[vout]``."""
    for kind in LayoutKind:
        layout_plan = plan_layout(LayoutInstruction(clip_id="c", layout=kind))
        assert layout_plan.out_label == "vout"
        assert layout_plan.filtergraph.endswith("[vout]")
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def test_default_split_is_even_50_50_bands():
    """The user-requested symmetric look: top and bottom bands are equal."""
    instruction = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
    filtergraph = plan_layout(instruction, out_w=1080, out_h=1920).filtergraph

    # Each strip should scale to the same height (half of 1920).
    scale_pattern = r"scale=1080:(\d+):force_original_aspect_ratio"
    heights = re.findall(scale_pattern, filtergraph)
    assert len(heights) == 2
    assert heights[0] == heights[1] == "960", f"expected even 960/960, got {heights}"
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def test_top_band_ratio_honored_for_uneven_splits():
    """``top_band_ratio=0.6`` splits a 1920px output into 1152px and 768px bands."""
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        top_band_ratio=0.6,
    )
    filtergraph = plan_layout(instruction, out_w=1080, out_h=1920).filtergraph
    scale_pattern = r"scale=1080:(\d+):force_original_aspect_ratio"
    heights = re.findall(scale_pattern, filtergraph)
    assert heights == ["1152", "768"], heights
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def test_split_seam_is_midpoint_between_bboxes():
    """When both bboxes are provided, strips partition the source -- no overlap, no gap.

    Each regex match is checked before ``.group`` is called, so a filtergraph
    without the expected ``[src1]``/``[src2]`` crop fails with the filtergraph
    in the assertion message rather than an ``AttributeError`` on ``None``.
    """
    instr = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.50, y2=1.0),
        split_person_region=BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0),
    )
    fg = plan_layout(instr, out_w=1080, out_h=1920, src_w=1920, src_h=1080).filtergraph

    # chart.x2 = 960px, person.x1 = 1056px -> midpoint = 1008 -> even -> 1008.
    # Chart strip: x=0, cw=1008. Person strip: x=1008, cw=912.
    top_match = re.search(r"\[src1\]crop=(\d+:\d+:\d+:\d+)", fg)
    bot_match = re.search(r"\[src2\]crop=(\d+:\d+:\d+:\d+)", fg)
    assert top_match is not None, fg
    assert bot_match is not None, fg
    assert top_match.group(1) == "1008:1080:0:0"
    assert bot_match.group(1) == "912:1080:1008:0"
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def test_split_uses_bbox_y_for_tight_band_fill():
    """Chart bboxes anchor the crop, with a little extra height for edge safety."""
    chart_region = BoundingBox(x1=0.0, y1=0.1, x2=0.5, y2=0.7)
    person_region = BoundingBox(x1=0.55, y1=0.0, x2=1.0, y2=1.0)
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        split_chart_region=chart_region,
        split_person_region=person_region,
    )
    layout_plan = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080
    )
    # Chart bbox y 0.1..0.7 gives y=108, ch=648, then a modest 12% pad per side.
    assert "crop=1008:804:0:30" in layout_plan.filtergraph
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def test_split_chart_person_adds_vertical_pad_to_reduce_chart_side_crop():
    """For a 640x360 source with these regions the top-band crop is ``372:280:0:0``."""
    chart_region = BoundingBox(x1=0.02, y1=0.03, x2=0.58, y2=0.7)
    person_region = BoundingBox(x1=0.585, y1=0.0, x2=0.995, y2=0.62)
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        split_chart_region=chart_region,
        split_person_region=person_region,
        top_band_ratio=0.436,
    )
    layout_plan = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=640, src_h=360
    )
    assert "[src1]crop=372:280:0:0" in layout_plan.filtergraph
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def test_split_minimum_strip_width_enforced():
    """If chart/person bboxes are pathological (seam at edge), don't starve a strip."""
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_CHART_PERSON,
        split_chart_region=BoundingBox(x1=0.0, y1=0.0, x2=0.05, y2=1.0),
        split_person_region=BoundingBox(x1=0.05, y1=0.0, x2=1.0, y2=1.0),
    )
    filtergraph = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080
    ).filtergraph

    widths = list(map(int, re.findall(r"crop=(\d+):\d+:\d+:\d+", filtergraph)))
    # Min strip = 20% of 1920 = 384 px. Neither strip should be narrower.
    narrowest_allowed = 384
    assert all(width >= narrowest_allowed for width in widths), widths
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
def test_split_two_persons_stacks_two_crops():
    """Two person regions become two vertically stacked crops meeting at x=960."""
    left_person = BoundingBox(x1=0.0, y1=0.05, x2=0.5, y2=0.95)
    right_person = BoundingBox(x1=0.5, y1=0.05, x2=1.0, y2=0.95)
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_TWO_PERSONS,
        split_person_region=left_person,
        split_second_person_region=right_person,
    )
    filtergraph = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080
    ).filtergraph

    assert "split=2" in filtergraph
    assert "vstack=inputs=2" in filtergraph
    # Seam at x=960. bbox y 0.05..0.95 gives y=54, ch=972 (even).
    assert "[src1]crop=960:972:0:54" in filtergraph
    assert "[src2]crop=960:972:960:54" in filtergraph
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def test_split_two_charts_stacks_two_crops():
    """Two full-height chart regions become two stacked 960x1080 crops."""
    left_chart = BoundingBox(x1=0.0, y1=0.0, x2=0.5, y2=1.0)
    right_chart = BoundingBox(x1=0.5, y1=0.0, x2=1.0, y2=1.0)
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SPLIT_TWO_CHARTS,
        split_chart_region=left_chart,
        split_second_chart_region=right_chart,
    )
    filtergraph = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080
    ).filtergraph

    assert "split=2" in filtergraph
    assert "vstack=inputs=2" in filtergraph
    assert "[src1]crop=960:1080:0:0" in filtergraph
    assert "[src2]crop=960:1080:960:0" in filtergraph
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def test_split_two_persons_without_bboxes_defaults_to_centered():
    """No bboxes -> centered 50/50 seam, full source height fallback."""
    instruction = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_TWO_PERSONS)
    filtergraph = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080
    ).filtergraph
    for expected_crop in ("[src1]crop=960:1080:0:0", "[src2]crop=960:1080:960:0"):
        assert expected_crop in filtergraph
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def test_split_bands_use_cover_scale_plus_center_crop():
    """Each band is painted edge-to-edge -- no letterbox bars."""
    instruction = LayoutInstruction(clip_id="c", layout=LayoutKind.SPLIT_CHART_PERSON)
    filtergraph = plan_layout(
        instruction, out_w=1080, out_h=1920, src_w=1920, src_h=1080
    ).filtergraph
    # One occurrence per band is expected for each fragment.
    for fragment in ("force_original_aspect_ratio=increase", "setsar=1"):
        assert filtergraph.count(fragment) == 2
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def test_zoom_tighter_means_smaller_crop_window():
    """Zoom 2.0 must plan a strictly smaller crop window than zoom 1.0.

    Uses the module-level ``re`` import; the redundant function-local
    ``import re`` that used to sit mid-body has been dropped.
    """
    from humeo_core.primitives.layouts import plan_zoom_call_center

    wide = plan_zoom_call_center(
        LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=1.0),
        out_w=1080,
        out_h=1920,
    )
    tight = plan_zoom_call_center(
        LayoutInstruction(clip_id="c", layout=LayoutKind.ZOOM_CALL_CENTER, zoom=2.0),
        out_w=1080,
        out_h=1920,
    )

    def crop(fg: str) -> tuple[int, int]:
        # Parse the width and height of the first crop=CW:CH:... in the graph.
        m = re.search(r"crop=(\d+):(\d+):", fg)
        assert m is not None
        return int(m.group(1)), int(m.group(2))

    wcw, wch = crop(wide.filtergraph)
    tcw, tch = crop(tight.filtergraph)
    assert tcw < wcw and tch < wch
|
humeo-core/tests/test_schemas.py
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from pydantic import ValidationError
|
| 3 |
+
|
| 4 |
+
from humeo_core.schemas import (
|
| 5 |
+
ApprovalResult,
|
| 6 |
+
Clip,
|
| 7 |
+
ClipPlan,
|
| 8 |
+
ClipSubtitleWords,
|
| 9 |
+
FocusStackOrder,
|
| 10 |
+
LayoutInstruction,
|
| 11 |
+
LayoutKind,
|
| 12 |
+
RatingFeedback,
|
| 13 |
+
RenderRequest,
|
| 14 |
+
Scene,
|
| 15 |
+
SessionState,
|
| 16 |
+
TimedCenterPoint,
|
| 17 |
+
TranscriptWord,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_scene_requires_end_after_start():
    """A ``Scene`` needs ``end_time`` strictly greater than ``start_time``."""
    # Valid: end after start.
    Scene(scene_id="s1", start_time=0.0, end_time=1.0)
    # Invalid: end equal to start, then end before start.
    for bad_end in (5.0, 1.0):
        with pytest.raises(ValueError):
            Scene(scene_id="s1", start_time=5.0, end_time=bad_end)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_layout_instruction_defaults_and_bounds():
    """Check ``LayoutInstruction`` defaults and that out-of-range values are rejected."""
    instruction = LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER)
    assert instruction.zoom == 1.0
    assert 0 <= instruction.person_x_norm <= 1
    assert instruction.person_tracking == []
    assert instruction.focus_stack_order == FocusStackOrder.CHART_THEN_PERSON

    # A zero zoom is rejected.
    with pytest.raises(ValueError):
        LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, zoom=0.0)
    # A normalised x of 2.0 is rejected.
    with pytest.raises(ValueError):
        LayoutInstruction(clip_id="c", layout=LayoutKind.SIT_CENTER, person_x_norm=2.0)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_layout_instruction_accepts_sorted_tracking_points():
    """Tracking points given in ascending time order are kept with their zoom values."""
    tracking = [
        TimedCenterPoint(t_sec=0.0, x_norm=0.2, zoom=1.25),
        TimedCenterPoint(t_sec=5.0, x_norm=0.8, zoom=1.0),
    ]
    instruction = LayoutInstruction(
        clip_id="c",
        layout=LayoutKind.SIT_CENTER,
        person_tracking=tracking,
    )
    stored_times = [pt.t_sec for pt in instruction.person_tracking]
    assert stored_times == [0.0, 5.0]
    assert instruction.person_tracking[0].zoom == pytest.approx(1.25)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def test_layout_instruction_rejects_unsorted_tracking_points():
    """Tracking points whose times go backwards raise a ``person_tracking times`` error."""
    out_of_order = [
        TimedCenterPoint(t_sec=5.0, x_norm=0.8),
        TimedCenterPoint(t_sec=1.0, x_norm=0.2),
    ]
    with pytest.raises(ValueError, match="person_tracking times"):
        LayoutInstruction(
            clip_id="c",
            layout=LayoutKind.SIT_CENTER,
            person_tracking=out_of_order,
        )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_clip_duration():
    """``duration_sec`` is the span between the clip's start and end times."""
    clip = Clip(clip_id="1", topic="t", start_time_sec=10.0, end_time_sec=42.5)
    assert clip.duration_sec == pytest.approx(32.5)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def test_clip_hook_relative_to_clip_in_point():
    """A 0..3s hook is accepted on a clip that starts at 100s in the source."""
    clip_fields = dict(
        clip_id="1",
        topic="t",
        start_time_sec=100.0,
        end_time_sec=130.0,
        hook_start_sec=0.0,
        hook_end_sec=3.0,
    )
    clip = Clip(**clip_fields)
    assert clip.hook_end_sec == 3.0
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_clip_hook_must_be_within_duration():
    """A hook ending at 15s on a 10s clip raises a ``hook window`` error."""
    clip_fields = dict(
        clip_id="1",
        topic="t",
        start_time_sec=0.0,
        end_time_sec=10.0,
        hook_start_sec=0.0,
        hook_end_sec=15.0,
    )
    with pytest.raises(ValueError, match="hook window"):
        Clip(**clip_fields)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def test_clip_hook_both_or_neither():
    """Supplying ``hook_start_sec`` with ``hook_end_sec=None`` is rejected."""
    clip_fields = dict(
        clip_id="1",
        topic="t",
        start_time_sec=0.0,
        end_time_sec=10.0,
        hook_start_sec=1.0,
        hook_end_sec=None,
    )
    with pytest.raises(ValueError, match="hook_start_sec and hook_end_sec"):
        Clip(**clip_fields)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def test_clip_trim_cannot_exceed_duration():
    """Trims of 6s + 6s on a 10s clip raise a ``trim`` error."""
    clip_fields = dict(
        clip_id="1",
        topic="t",
        start_time_sec=0.0,
        end_time_sec=10.0,
        trim_start_sec=6.0,
        trim_end_sec=6.0,
    )
    with pytest.raises(ValueError, match="trim"):
        Clip(**clip_fields)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def test_clip_plan_roundtrip():
    """A ``ClipPlan`` survives ``model_dump`` followed by ``model_validate`` unchanged."""
    only_clip = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
    original = ClipPlan(source_path="/tmp/x.mp4", clips=[only_clip])
    restored = ClipPlan.model_validate(original.model_dump())
    assert restored == original
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def test_clip_roundtrip_with_extended_fields():
    """The extended clip fields are dumped as given and validate back to an equal clip."""
    original = Clip(
        clip_id="1",
        topic="t",
        start_time_sec=0.0,
        end_time_sec=30.0,
        score_breakdown={"message_wow": 0.9, "hook_emotion": 0.7},
        origin="both",
        visual_notes="Speaker leans in.",
        reasoning="Strong explanation and hook.",
    )

    payload = original.model_dump()

    assert payload["score_breakdown"] == {"message_wow": 0.9, "hook_emotion": 0.7}
    assert payload["origin"] == "both"
    assert payload["visual_notes"] == "Speaker leans in."
    assert payload["reasoning"] == "Strong explanation and hook."
    assert Clip.model_validate(payload) == original
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def test_clip_defaults_validate_and_do_not_serialize_new_fields():
    """The newer clip fields default sensibly and are left out of the dump when unset."""
    original = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)

    assert original.origin == "text"
    assert original.score_breakdown is None
    assert original.visual_notes is None
    assert original.reasoning is None

    payload = original.model_dump()
    for omitted_key in ("score_breakdown", "origin", "visual_notes", "reasoning"):
        assert omitted_key not in payload
    assert Clip.model_validate(payload) == original
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def test_clip_score_breakdown_validation():
    """Negative scores are rejected, scores above 1 come back as 1.0, others pass through."""

    def make_clip(breakdown):
        # Same minimal clip each time; only the score breakdown varies.
        return Clip(
            clip_id="1",
            topic="t",
            start_time_sec=0.0,
            end_time_sec=30.0,
            score_breakdown=breakdown,
        )

    with pytest.raises(ValidationError):
        make_clip({"hook": -0.1})

    clamped = make_clip({"hook": 1.2})
    assert clamped.score_breakdown == {"hook": 1.0}

    empty = make_clip({})
    assert empty.score_breakdown == {}

    in_range = make_clip({"hook": 0.5})
    assert in_range.score_breakdown == {"hook": 0.5}
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def test_clip_subtitle_words_relative_times():
    """A word starting at 0.0 keeps that start time inside ``ClipSubtitleWords``."""
    first_word = TranscriptWord(word="hi", start_time=0.0, end_time=0.2)
    subtitle_words = ClipSubtitleWords(words=[first_word])
    assert subtitle_words.words[0].start_time == 0.0
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def test_render_request_modes():
|
| 218 |
+
c = Clip(clip_id="1", topic="t", start_time_sec=0.0, end_time_sec=30.0)
|
| 219 |
+
li = LayoutInstruction(clip_id="1", layout=LayoutKind.ZOOM_CALL_CENTER)
|
| 220 |
+
req = RenderRequest(
|
| 221 |
+
source_path="/tmp/x.mp4",
|
| 222 |
+
clip=c,
|
| 223 |
+
layout=li,
|
| 224 |
+
output_path="/tmp/out.mp4",
|
| 225 |
+
)
|
| 226 |
+
assert req.mode == "normal"
|
| 227 |
+
req2 = RenderRequest(**{**req.model_dump(), "mode": "dry_run"})
|
| 228 |
+
assert req2.mode == "dry_run"
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def test_approval_result_roundtrip():
|
| 232 |
+
result = ApprovalResult(
|
| 233 |
+
action="proceed",
|
| 234 |
+
selected_ids=["001", "003"],
|
| 235 |
+
steering_note="prefer emotional moments",
|
| 236 |
+
)
|
| 237 |
+
assert ApprovalResult.model_validate(result.model_dump()) == result
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def test_approval_result_rejects_invalid_action():
|
| 241 |
+
with pytest.raises(ValidationError):
|
| 242 |
+
ApprovalResult(action="invalid")
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def test_rating_feedback_roundtrip():
|
| 246 |
+
feedback = RatingFeedback(
|
| 247 |
+
rating=2,
|
| 248 |
+
issues=["wrong_moments", "other"],
|
| 249 |
+
free_text="needs more context",
|
| 250 |
+
)
|
| 251 |
+
assert RatingFeedback.model_validate(feedback.model_dump()) == feedback
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def test_rating_feedback_rejects_invalid_rating():
|
| 255 |
+
with pytest.raises(ValidationError):
|
| 256 |
+
RatingFeedback(rating=4)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def test_session_state_roundtrip():
|
| 260 |
+
state = SessionState(
|
| 261 |
+
source_key="youtube:PdVv_vLkUgk",
|
| 262 |
+
iteration=3,
|
| 263 |
+
steering_notes=["be punchier"],
|
| 264 |
+
last_rating=RatingFeedback(rating=3),
|
| 265 |
+
last_selected_ids=["001", "002"],
|
| 266 |
+
)
|
| 267 |
+
assert SessionState.model_validate(state.model_dump()) == state
|
humeo-core/tests/test_select_clips.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from humeo_core.primitives.select_clips import select_clips_heuristic
|
| 2 |
+
from humeo_core.schemas import TranscriptWord
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def _words(start: float, end: float, n: int) -> list[TranscriptWord]:
|
| 6 |
+
step = (end - start) / max(1, n)
|
| 7 |
+
return [
|
| 8 |
+
TranscriptWord(word=f"w{i}", start_time=start + i * step, end_time=start + (i + 1) * step)
|
| 9 |
+
for i in range(n)
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_no_transcript_returns_single_clip():
|
| 14 |
+
plan = select_clips_heuristic("/tmp/x.mp4", [], duration_sec=600.0)
|
| 15 |
+
assert len(plan.clips) == 1
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_prefers_dense_windows():
|
| 19 |
+
# dense between 30-90, sparse elsewhere
|
| 20 |
+
dense = _words(30.0, 90.0, 240) # 4 words/sec
|
| 21 |
+
sparse_before = _words(0.0, 30.0, 6)
|
| 22 |
+
sparse_after = _words(90.0, 600.0, 30)
|
| 23 |
+
words = sparse_before + dense + sparse_after
|
| 24 |
+
plan = select_clips_heuristic(
|
| 25 |
+
"/tmp/x.mp4", words, duration_sec=600.0, target_count=1, min_sec=30, max_sec=60
|
| 26 |
+
)
|
| 27 |
+
assert len(plan.clips) == 1
|
| 28 |
+
c = plan.clips[0]
|
| 29 |
+
assert 30 <= c.start_time_sec <= 90
|
| 30 |
+
assert c.end_time_sec <= 120
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_no_overlap_when_multiple_picked():
|
| 34 |
+
dense_a = _words(30.0, 90.0, 240)
|
| 35 |
+
dense_b = _words(200.0, 260.0, 240)
|
| 36 |
+
words = dense_a + dense_b
|
| 37 |
+
plan = select_clips_heuristic(
|
| 38 |
+
"/tmp/x.mp4",
|
| 39 |
+
words,
|
| 40 |
+
duration_sec=400.0,
|
| 41 |
+
target_count=3,
|
| 42 |
+
min_sec=30,
|
| 43 |
+
max_sec=60,
|
| 44 |
+
)
|
| 45 |
+
# Should pick both dense regions without overlap.
|
| 46 |
+
assert len(plan.clips) >= 2
|
| 47 |
+
starts_ends = sorted((c.start_time_sec, c.end_time_sec) for c in plan.clips)
|
| 48 |
+
for (s1, e1), (s2, e2) in zip(starts_ends, starts_ends[1:]):
|
| 49 |
+
assert e1 <= s2
|
humeo-core/tests/test_server_tools.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Exercise the MCP server tools as plain Python callables.
|
| 2 |
+
|
| 3 |
+
FastMCP tools are registered on the server instance, but the underlying
|
| 4 |
+
functions are ordinary Python functions decorated with ``@mcp.tool()``.
|
| 5 |
+
We import the module and invoke those functions directly to verify the
|
| 6 |
+
end-to-end wiring (schemas validated, dispatch correct, JSON-serializable).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import humeo_core.server as srv
|
| 10 |
+
from humeo_core.schemas import LayoutKind
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def test_list_layouts_lists_all_three():
|
| 14 |
+
result = srv.list_layouts()
|
| 15 |
+
kinds = {layout["kind"] for layout in result["layouts"]}
|
| 16 |
+
assert kinds == {k.value for k in LayoutKind}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def test_plan_layout_tool_returns_filtergraph():
|
| 20 |
+
for k in LayoutKind:
|
| 21 |
+
out = srv.plan_layout(layout=k.value)
|
| 22 |
+
assert out["out_label"] == "vout"
|
| 23 |
+
assert "[vout]" in out["filtergraph"]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_build_render_cmd_dry_run():
|
| 27 |
+
req = {
|
| 28 |
+
"source_path": "/tmp/src.mp4",
|
| 29 |
+
"clip": {
|
| 30 |
+
"clip_id": "1",
|
| 31 |
+
"topic": "t",
|
| 32 |
+
"start_time_sec": 0.0,
|
| 33 |
+
"end_time_sec": 30.0,
|
| 34 |
+
},
|
| 35 |
+
"layout": {"clip_id": "1", "layout": LayoutKind.SIT_CENTER.value},
|
| 36 |
+
"output_path": "/tmp/out.mp4",
|
| 37 |
+
}
|
| 38 |
+
out = srv.build_render_cmd(request=req)
|
| 39 |
+
assert out["success"] is True
|
| 40 |
+
assert out["output_path"] == "/tmp/out.mp4"
|
| 41 |
+
assert any("-filter_complex" == part for part in out["ffmpeg_cmd"])
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_select_clips_tool_happy_path():
|
| 45 |
+
words = [
|
| 46 |
+
{"word": f"w{i}", "start_time": float(i), "end_time": float(i) + 0.5}
|
| 47 |
+
for i in range(120)
|
| 48 |
+
]
|
| 49 |
+
plan = srv.select_clips(
|
| 50 |
+
source_path="/tmp/x.mp4",
|
| 51 |
+
transcript_words=words,
|
| 52 |
+
duration_sec=120.0,
|
| 53 |
+
target_count=2,
|
| 54 |
+
min_sec=30.0,
|
| 55 |
+
max_sec=60.0,
|
| 56 |
+
)
|
| 57 |
+
assert plan["source_path"] == "/tmp/x.mp4"
|
| 58 |
+
assert 1 <= len(plan["clips"]) <= 2
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def test_classify_scenes_tool_no_keyframes():
|
| 62 |
+
scenes = [{"scene_id": "s0", "start_time": 0.0, "end_time": 5.0}]
|
| 63 |
+
out = srv.classify_scenes(scenes=scenes)
|
| 64 |
+
assert out["classifications"][0]["scene_id"] == "s0"
|
| 65 |
+
assert out["classifications"][0]["layout"] in {k.value for k in LayoutKind}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def test_detect_scene_regions_returns_jobs_and_prompt():
|
| 69 |
+
scenes = [
|
| 70 |
+
{"scene_id": "s0", "start_time": 0.0, "end_time": 5.0, "keyframe_path": "/tmp/k0.jpg"},
|
| 71 |
+
{"scene_id": "s1", "start_time": 5.0, "end_time": 10.0, "keyframe_path": "/tmp/k1.jpg"},
|
| 72 |
+
]
|
| 73 |
+
out = srv.detect_scene_regions(scenes=scenes)
|
| 74 |
+
assert "STRICT JSON" in out["prompt"]
|
| 75 |
+
assert len(out["jobs"]) == 2
|
| 76 |
+
assert out["jobs"][0]["scene_id"] == "s0"
|
| 77 |
+
assert out["jobs"][0]["keyframe_path"] == "/tmp/k0.jpg"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def test_classify_scenes_with_vision_derives_instructions():
|
| 81 |
+
regions = [
|
| 82 |
+
{
|
| 83 |
+
"scene_id": "s0",
|
| 84 |
+
"chart_bbox": {"x1": 0.0, "y1": 0.0, "x2": 0.66, "y2": 1.0},
|
| 85 |
+
"person_bbox": {"x1": 0.72, "y1": 0.1, "x2": 0.99, "y2": 0.95},
|
| 86 |
+
"ocr_text": "CPI YoY",
|
| 87 |
+
}
|
| 88 |
+
]
|
| 89 |
+
out = srv.classify_scenes_with_vision(regions=regions)
|
| 90 |
+
assert out["classifications"][0]["layout"] == LayoutKind.SPLIT_CHART_PERSON.value
|
| 91 |
+
instr = out["layout_instructions"][0]
|
| 92 |
+
assert instr["chart_x_norm"] == 0.0
|
| 93 |
+
assert 0.8 < instr["person_x_norm"] < 0.9
|
humeo-core/tests/test_vision.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the scene-change + vision-LLM + OCR bbox primitive.
|
| 2 |
+
|
| 3 |
+
Covers:
|
| 4 |
+
* happy path: well-formed JSON -> populated ``SceneRegions``.
|
| 5 |
+
* bad JSON: degrade to empty regions + raw_reason, never raise.
|
| 6 |
+
* bad bbox: one malformed bbox does not take down the whole scene record.
|
| 7 |
+
* classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT.
|
| 8 |
+
* layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come
|
| 9 |
+
from the bboxes when present, defaults when not.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
|
| 14 |
+
import pytest
|
| 15 |
+
|
| 16 |
+
from humeo_core.primitives.vision import (
|
| 17 |
+
_CHART_WIDTH_SPLIT_THRESHOLD,
|
| 18 |
+
classify_from_regions,
|
| 19 |
+
classify_scenes_with_vision_llm,
|
| 20 |
+
detect_regions_with_llm,
|
| 21 |
+
layout_instruction_from_regions,
|
| 22 |
+
)
|
| 23 |
+
from humeo_core.schemas import (
|
| 24 |
+
BoundingBox,
|
| 25 |
+
LayoutKind,
|
| 26 |
+
Scene,
|
| 27 |
+
SceneClassification,
|
| 28 |
+
SceneRegions,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
# Schema
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def test_bounding_box_requires_x2_gt_x1():
|
| 38 |
+
BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2)
|
| 39 |
+
with pytest.raises(ValueError):
|
| 40 |
+
BoundingBox(x1=0.2, y1=0.1, x2=0.1, y2=0.2)
|
| 41 |
+
with pytest.raises(ValueError):
|
| 42 |
+
BoundingBox(x1=0.1, y1=0.2, x2=0.2, y2=0.1)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_bounding_box_center_and_width():
|
| 46 |
+
b = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9)
|
| 47 |
+
assert b.center_x == pytest.approx(0.4)
|
| 48 |
+
assert b.center_y == pytest.approx(0.65)
|
| 49 |
+
assert b.width == pytest.approx(0.4)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
# detect_regions_with_llm
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene:
|
| 58 |
+
return Scene(scene_id=f"s{i}", start_time=float(i), end_time=float(i) + 1.0, keyframe_path=kf)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def test_detect_regions_happy_path():
|
| 62 |
+
scenes = [_scene(0)]
|
| 63 |
+
|
| 64 |
+
def vision_fn(_img: str, _prompt: str) -> str:
|
| 65 |
+
return json.dumps(
|
| 66 |
+
{
|
| 67 |
+
"person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9},
|
| 68 |
+
"chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8},
|
| 69 |
+
"ocr_text": "Inflation YoY",
|
| 70 |
+
"reason": "explainer layout",
|
| 71 |
+
}
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
out = detect_regions_with_llm(scenes, vision_fn)
|
| 75 |
+
assert len(out) == 1
|
| 76 |
+
r = out[0]
|
| 77 |
+
assert r.scene_id == "s0"
|
| 78 |
+
assert r.person_bbox and r.person_bbox.center_x > 0.8
|
| 79 |
+
assert r.chart_bbox and r.chart_bbox.width > 0.6
|
| 80 |
+
assert "Inflation" in r.ocr_text
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def test_detect_regions_bad_json_is_safe():
|
| 84 |
+
scenes = [_scene(0)]
|
| 85 |
+
|
| 86 |
+
def vision_fn(*_a) -> str:
|
| 87 |
+
return "not json"
|
| 88 |
+
|
| 89 |
+
out = detect_regions_with_llm(scenes, vision_fn)
|
| 90 |
+
assert out[0].person_bbox is None
|
| 91 |
+
assert out[0].chart_bbox is None
|
| 92 |
+
assert "parse error" in out[0].raw_reason.lower()
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_detect_regions_missing_keyframe_is_safe():
|
| 96 |
+
scenes = [_scene(0, kf=None)]
|
| 97 |
+
|
| 98 |
+
def vision_fn(*_a) -> str: # pragma: no cover - should not be called
|
| 99 |
+
raise AssertionError("vision_fn must not be called without a keyframe")
|
| 100 |
+
|
| 101 |
+
out = detect_regions_with_llm(scenes, vision_fn)
|
| 102 |
+
assert out[0].person_bbox is None
|
| 103 |
+
assert "no keyframe" in out[0].raw_reason.lower()
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_detect_regions_bad_bbox_degrades_gracefully():
|
| 107 |
+
scenes = [_scene(0)]
|
| 108 |
+
|
| 109 |
+
def vision_fn(*_a) -> str:
|
| 110 |
+
return json.dumps(
|
| 111 |
+
{
|
| 112 |
+
"person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9},
|
| 113 |
+
"chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95},
|
| 114 |
+
"ocr_text": "",
|
| 115 |
+
"reason": "person bbox inverted",
|
| 116 |
+
}
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
out = detect_regions_with_llm(scenes, vision_fn)
|
| 120 |
+
assert out[0].person_bbox is None
|
| 121 |
+
assert out[0].chart_bbox is not None
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
# classify_from_regions
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def test_classify_wide_chart_is_split():
|
| 130 |
+
r = SceneRegions(
|
| 131 |
+
scene_id="s0",
|
| 132 |
+
chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0),
|
| 133 |
+
person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95),
|
| 134 |
+
)
|
| 135 |
+
c = classify_from_regions(r)
|
| 136 |
+
assert c.layout == LayoutKind.SPLIT_CHART_PERSON
|
| 137 |
+
assert c.confidence > 0.5
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def test_classify_narrow_chart_not_split():
|
| 141 |
+
r = SceneRegions(
|
| 142 |
+
scene_id="s0",
|
| 143 |
+
chart_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4),
|
| 144 |
+
person_bbox=BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95),
|
| 145 |
+
)
|
| 146 |
+
c = classify_from_regions(r)
|
| 147 |
+
# chart width (0.1) is below the split threshold -> not split
|
| 148 |
+
assert c.layout != LayoutKind.SPLIT_CHART_PERSON
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def test_classify_wide_person_is_zoom_call():
|
| 152 |
+
r = SceneRegions(
|
| 153 |
+
scene_id="s0",
|
| 154 |
+
person_bbox=BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98),
|
| 155 |
+
)
|
| 156 |
+
c = classify_from_regions(r)
|
| 157 |
+
assert c.layout == LayoutKind.ZOOM_CALL_CENTER
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def test_classify_small_person_is_sit_center():
|
| 161 |
+
r = SceneRegions(
|
| 162 |
+
scene_id="s0",
|
| 163 |
+
person_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8),
|
| 164 |
+
)
|
| 165 |
+
c = classify_from_regions(r)
|
| 166 |
+
assert c.layout == LayoutKind.SIT_CENTER
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def test_classify_nothing_detected_defaults_sit_center_low_conf():
|
| 170 |
+
r = SceneRegions(scene_id="s0", raw_reason="model returned null")
|
| 171 |
+
c = classify_from_regions(r)
|
| 172 |
+
assert c.layout == LayoutKind.SIT_CENTER
|
| 173 |
+
assert c.confidence <= 0.5
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def test_chart_threshold_is_exported():
|
| 177 |
+
# guard against the tuning constant silently being removed
|
| 178 |
+
assert 0.0 < _CHART_WIDTH_SPLIT_THRESHOLD < 1.0
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# ---------------------------------------------------------------------------
|
| 182 |
+
# layout_instruction_from_regions
|
| 183 |
+
# ---------------------------------------------------------------------------
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def test_layout_instruction_from_regions_split():
|
| 187 |
+
r = SceneRegions(
|
| 188 |
+
scene_id="s0",
|
| 189 |
+
chart_bbox=BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0),
|
| 190 |
+
person_bbox=BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95),
|
| 191 |
+
)
|
| 192 |
+
c = classify_from_regions(r)
|
| 193 |
+
instr = layout_instruction_from_regions(r, c)
|
| 194 |
+
assert instr.layout == LayoutKind.SPLIT_CHART_PERSON
|
| 195 |
+
# person_x_norm = center of (0.72, 0.99) = 0.855
|
| 196 |
+
assert instr.person_x_norm == pytest.approx(0.855, rel=1e-3)
|
| 197 |
+
# chart_x_norm = left edge = 0.0
|
| 198 |
+
assert instr.chart_x_norm == pytest.approx(0.0)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def test_layout_instruction_defaults_when_no_regions():
|
| 202 |
+
r = SceneRegions(scene_id="s0")
|
| 203 |
+
c = SceneClassification(
|
| 204 |
+
scene_id="s0", layout=LayoutKind.SIT_CENTER, confidence=0.3, reason="default"
|
| 205 |
+
)
|
| 206 |
+
instr = layout_instruction_from_regions(r, c)
|
| 207 |
+
assert instr.person_x_norm == 0.5
|
| 208 |
+
assert instr.chart_x_norm == 0.0
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def test_classify_scenes_with_vision_llm_returns_pairs():
|
| 212 |
+
scenes = [_scene(0)]
|
| 213 |
+
|
| 214 |
+
def vision_fn(*_a) -> str:
|
| 215 |
+
return json.dumps(
|
| 216 |
+
{
|
| 217 |
+
"person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95},
|
| 218 |
+
"chart_bbox": None,
|
| 219 |
+
"ocr_text": "",
|
| 220 |
+
"reason": "solo subject",
|
| 221 |
+
}
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
pairs = classify_scenes_with_vision_llm(scenes, vision_fn)
|
| 225 |
+
assert len(pairs) == 1
|
| 226 |
+
regions, classification = pairs[0]
|
| 227 |
+
assert regions.person_bbox is not None
|
| 228 |
+
assert classification.layout == LayoutKind.ZOOM_CALL_CENTER
|
pyproject.toml
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61.0", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "humeo"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Automated podcast-to-shorts pipeline"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"yt-dlp>=2024.0",
|
| 13 |
+
"fastapi>=0.115",
|
| 14 |
+
"openai>=1.0",
|
| 15 |
+
"google-genai>=1.0",
|
| 16 |
+
"httpx>=0.28",
|
| 17 |
+
"jinja2>=3.1",
|
| 18 |
+
"numpy>=1.24",
|
| 19 |
+
"Pillow>=10.0",
|
| 20 |
+
"python-dotenv>=1.0",
|
| 21 |
+
"replicate>=0.34.2",
|
| 22 |
+
"tqdm>=4.60",
|
| 23 |
+
"python-multipart>=0.0.9",
|
| 24 |
+
"uvicorn[standard]>=0.30",
|
| 25 |
+
"humeo-core",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
[project.optional-dependencies]
|
| 29 |
+
dev = [
|
| 30 |
+
"pytest-asyncio>=0.23",
|
| 31 |
+
"ruff",
|
| 32 |
+
"pytest",
|
| 33 |
+
]
|
| 34 |
+
whisper = [
|
| 35 |
+
"whisperx @ git+https://github.com/m-bain/whisperX.git",
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
[tool.uv.sources]
|
| 39 |
+
humeo-core = { path = "humeo-core", editable = true }
|
| 40 |
+
|
| 41 |
+
[project.scripts]
|
| 42 |
+
humeo = "humeo.cli:main"
|
| 43 |
+
|
| 44 |
+
[tool.setuptools.packages.find]
|
| 45 |
+
where = ["src"]
|
| 46 |
+
|
| 47 |
+
[tool.setuptools.package-data]
|
| 48 |
+
humeo = ["prompts/*.jinja2"]
|
| 49 |
+
|
| 50 |
+
[tool.pytest.ini_options]
|
| 51 |
+
testpaths = ["tests", "humeo-core/tests"]
|
| 52 |
+
addopts = "-ra -q"
|
| 53 |
+
|
| 54 |
+
[tool.ruff]
|
| 55 |
+
line-length = 100
|
| 56 |
+
target-version = "py310"
|
src/humeo.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: humeo
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Automated podcast-to-shorts pipeline
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Description-Content-Type: text/markdown
|
| 7 |
+
License-File: LICENSE
|
| 8 |
+
Requires-Dist: yt-dlp>=2024.0
|
| 9 |
+
Requires-Dist: openai>=1.0
|
| 10 |
+
Requires-Dist: google-genai>=1.0
|
| 11 |
+
Requires-Dist: httpx>=0.28
|
| 12 |
+
Requires-Dist: jinja2>=3.1
|
| 13 |
+
Requires-Dist: numpy>=1.24
|
| 14 |
+
Requires-Dist: Pillow>=10.0
|
| 15 |
+
Requires-Dist: python-dotenv>=1.0
|
| 16 |
+
Requires-Dist: replicate>=0.34.2
|
| 17 |
+
Requires-Dist: tqdm>=4.60
|
| 18 |
+
Requires-Dist: humeo-core
|
| 19 |
+
Provides-Extra: dev
|
| 20 |
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
| 21 |
+
Requires-Dist: ruff; extra == "dev"
|
| 22 |
+
Requires-Dist: pytest; extra == "dev"
|
| 23 |
+
Provides-Extra: whisper
|
| 24 |
+
Requires-Dist: whisperx @ git+https://github.com/m-bain/whisperX.git ; extra == "whisper"
|
| 25 |
+
Dynamic: license-file
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
title: Humeo
|
| 29 |
+
sdk: docker
|
| 30 |
+
app_port: 7860
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
# Humeo
|
| 34 |
+
|
| 35 |
+
Current default preset:
|
| 36 |
+
|
| 37 |
+
- `native_highlight` captions
|
| 38 |
+
- OpenRouter + `google/gemini-2.5-pro` for Gemini-like stages
|
| 39 |
+
- Replicate SAM speaker-lock when `REPLICATE_API_TOKEN` is available
|
| 40 |
+
- ElevenLabs Scribe v2 transcription when `ELEVENLABS_API_KEY` is set
|
| 41 |
+
|
| 42 |
+
Long podcast or interview → vertical 9:16 shorts. Pipeline: download, transcribe, Gemini (clip selection, hook detection, content pruning, layout vision), ffmpeg render.
|
| 43 |
+
|
| 44 |
+
**Architecture (static HTML, GitHub Pages):**
|
| 45 |
+
[https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html](https://bryanthelai.github.io/long-to-shorts/hive_architecture_visualization.html)
|
| 46 |
+
|
| 47 |
+
## Hugging Face Space
|
| 48 |
+
|
| 49 |
+
This repo includes a Hugging Face Docker Space entrypoint in `app.py`.
|
| 50 |
+
|
| 51 |
+
- Upload one local MP4
|
| 52 |
+
- Watch live pipeline logs and stage progress
|
| 53 |
+
- Download rendered `short_*.mp4` clips from the UI
|
| 54 |
+
|
| 55 |
+
Required Space secrets:
|
| 56 |
+
|
| 57 |
+
- `GOOGLE_API_KEY` or `GEMINI_API_KEY`, or `OPENROUTER_API_KEY`
|
| 58 |
+
- `OPENAI_API_KEY` or `ELEVENLABS_API_KEY`
|
| 59 |
+
|
| 60 |
+
The Docker image pins `HUMEO_TRANSCRIBE_PROVIDER=openai` for the Space demo.
|
| 61 |
+
|
| 62 |
+
## Repo layout
|
| 63 |
+
|
| 64 |
+
| Path | Role |
|
| 65 |
+
|------|------|
|
| 66 |
+
| `src/humeo/` | CLI, pipeline, ingest, Gemini prompts, render adapters |
|
| 67 |
+
| `humeo-core/` | Schemas, ffmpeg compile, primitives, optional MCP server |
|
| 68 |
+
|
| 69 |
+
## Pipeline (actual order)
|
| 70 |
+
|
| 71 |
+
```text
|
| 72 |
+
YouTube URL or local MP4 path
|
| 73 |
+
→ ingest (source.mp4, transcript.json)
|
| 74 |
+
→ clip selection (Gemini → clips.json)
|
| 75 |
+
→ hook detection (Gemini → hooks.json)
|
| 76 |
+
→ content pruning (Gemini → prune.json)
|
| 77 |
+
→ keyframes + layout vision (Gemini vision → layout_vision.json)
|
| 78 |
+
→ ASS subtitles + humeo-core ffmpeg render → short_<id>.mp4
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
Details: **`docs/PIPELINE.md`**.
|
| 82 |
+
|
| 83 |
+
## Five layouts
|
| 84 |
+
|
| 85 |
+
A short shows at most two on-screen items (`person` or `chart`). That yields five layout modes (see **`TERMINOLOGY.md`**).
|
| 86 |
+
|
| 87 |
+
## Requirements
|
| 88 |
+
|
| 89 |
+
- **Python** ≥ 3.10
|
| 90 |
+
- **`uv`** — install: [astral.sh/uv](https://docs.astral.sh/uv/)
|
| 91 |
+
- **`ffmpeg`** — on `PATH` for extract/render
|
| 92 |
+
- **API keys** — see **`docs/ENVIRONMENT.md`**
|
| 93 |
+
- `GOOGLE_API_KEY` or `GEMINI_API_KEY` — preferred for Gemini stages
|
| 94 |
+
- `OPENROUTER_API_KEY` — supported fallback for those same Gemini-like stages when Google keys are unavailable
|
| 95 |
+
- `OPENAI_API_KEY` — if using OpenAI Whisper API (`HUMEO_TRANSCRIBE_PROVIDER=openai`)
|
| 96 |
+
|
| 97 |
+
Copy **`.env.example`** → **`.env`** (never commit `.env`).
|
| 98 |
+
|
| 99 |
+
## Install
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
uv venv
|
| 103 |
+
uv sync
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
Optional local WhisperX (heavy; Windows often uses OpenAI API instead):
|
| 107 |
+
|
| 108 |
+
```bash
|
| 109 |
+
uv sync --extra whisper
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
## Run
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
|
| 116 |
+
humeo --long-to-shorts "C:\path\to\video.mp4"
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
Use **`--work-dir`** or **`--no-video-cache`** to control where `source.mp4` and intermediates live (see **`docs/ENVIRONMENT.md`**).
|
| 120 |
+
|
| 121 |
+
## CLI guide (all flags)
|
| 122 |
+
|
| 123 |
+
Use `humeo --help` for the live source of truth. This table matches `src/humeo/cli.py`.
|
| 124 |
+
|
| 125 |
+
### Required
|
| 126 |
+
|
| 127 |
+
| Flag | Meaning |
|
| 128 |
+
|------|---------|
|
| 129 |
+
| `--long-to-shorts SOURCE` | YouTube URL or local MP4 path to process (required). |
|
| 130 |
+
|
| 131 |
+
### Paths and cache behavior
|
| 132 |
+
|
| 133 |
+
| Flag | Meaning |
|
| 134 |
+
|------|---------|
|
| 135 |
+
| `--output`, `-o` | Output directory for final `short_*.mp4` (default: `./output`). |
|
| 136 |
+
| `--work-dir PATH` | Directory for intermediate artifacts (`source.mp4`, `transcript.json`, caches). |
|
| 137 |
+
| `--no-video-cache` | Disable per-video cache dirs; uses `./.humeo_work` unless `--work-dir` is set. |
|
| 138 |
+
| `--cache-root PATH` | Override cache root (env equivalent: `HUMEO_CACHE_ROOT`). |
|
| 139 |
+
| `--clean-run` | Fresh run: disables video cache, forces all model stages, overwrites outputs, and auto-creates a timestamped work dir if `--work-dir` is not provided. |
|
| 140 |
+
|
| 141 |
+
### Model selection and stage forcing
|
| 142 |
+
|
| 143 |
+
| Flag | Meaning |
|
| 144 |
+
|------|---------|
|
| 145 |
+
| `--gemini-model MODEL_ID` | Gemini model for clip selection / text stages (default from env/config). |
|
| 146 |
+
| `--gemini-vision-model MODEL_ID` | Gemini model for keyframe layout vision (defaults to `GEMINI_VISION_MODEL` or clip model). |
|
| 147 |
+
| `--force-clip-selection` | Re-run clip selection even if `clips.meta.json` cache matches. |
|
| 148 |
+
| `--force-hook-detection` | Re-run Stage 2.25 hook detection even if `hooks.meta.json` cache matches. |
|
| 149 |
+
| `--force-content-pruning` | Re-run Stage 2.5 pruning even if `prune.meta.json` cache matches. |
|
| 150 |
+
| `--force-layout-vision` | Re-run layout vision even if `layout_vision.meta.json` cache matches. |
|
| 151 |
+
| `--no-hook-detection` | Skip Stage 2.25 hook detection (pruning still runs with fallback behavior). |
|
| 152 |
+
|
| 153 |
+
### Pruning and subtitles
|
| 154 |
+
|
| 155 |
+
| Flag | Meaning |
|
| 156 |
+
|------|---------|
|
| 157 |
+
| `--prune-level {off,conservative,balanced,aggressive}` | Stage 2.5 aggressiveness (default: `balanced`). |
|
| 158 |
+
| `--subtitle-font-size INT` | Subtitle font size in output pixels (default: `48`). |
|
| 159 |
+
| `--subtitle-margin-v INT` | Bottom subtitle margin in output pixels (default: `160`). |
|
| 160 |
+
| `--subtitle-max-words INT` | Max words per subtitle cue (default: `4`). |
|
| 161 |
+
| `--subtitle-max-cue-sec FLOAT` | Max subtitle cue duration in seconds (default: `2.2`). |
|
| 162 |
+
|
| 163 |
+
### Logging
|
| 164 |
+
|
| 165 |
+
| Flag | Meaning |
|
| 166 |
+
|------|---------|
|
| 167 |
+
| `--verbose`, `-v` | Enable debug logging. |
|
| 168 |
+
|
| 169 |
+
### Common command recipes
|
| 170 |
+
|
| 171 |
+
```bash
|
| 172 |
+
# Basic run
|
| 173 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID"
|
| 174 |
+
|
| 175 |
+
# Local MP4
|
| 176 |
+
humeo --long-to-shorts "C:\path\to\video.mp4"
|
| 177 |
+
|
| 178 |
+
# Full fresh run for debugging / prompt tuning
|
| 179 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --clean-run --verbose
|
| 180 |
+
|
| 181 |
+
# Re-run only clip selection after prompt edits
|
| 182 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --force-clip-selection
|
| 183 |
+
|
| 184 |
+
# Keep intermediates in a fixed local folder
|
| 185 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --work-dir .humeo_work
|
| 186 |
+
|
| 187 |
+
# Compare different prune levels on same source
|
| 188 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level conservative
|
| 189 |
+
humeo --long-to-shorts "https://www.youtube.com/watch?v=VIDEO_ID" --prune-level aggressive
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
## Documentation
|
| 193 |
+
|
| 194 |
+
| Doc | Purpose |
|
| 195 |
+
|-----|---------|
|
| 196 |
+
| **`docs/README.md`** | Index of all files under `docs/` |
|
| 197 |
+
| **`docs/STUDY_ORDER.md`** | Read order for onboarding |
|
| 198 |
+
| **`docs/PIPELINE.md`** | Stages, caches, JSON contracts |
|
| 199 |
+
| **`docs/ENVIRONMENT.md`** | Keys, env vars, cache layout |
|
| 200 |
+
| **`docs/SHARING.md`** | How to share logs/docs/video without bloating git |
|
| 201 |
+
| **`docs/TARGET_VIDEO_ANALYSIS.md`** | Reference input analysis example |
|
| 202 |
+
| **`docs/full_run_output.txt`** | Example full run log (text) |
|
| 203 |
+
| **`docs/hive-paper/PAPER_BREAKDOWN.md`** | HIVE paper, file mapping §9 |
|
| 204 |
+
| **`docs/hive-paper/hive_paper_blunt_guide.md`** | Short HIVE recap |
|
| 205 |
+
| **`docs/TODO.md`** | Backlog |
|
| 206 |
+
| **`docs/KNOWN_LIMITATIONS_AND_PROMPT_CONTRACT_GAP.md`** | Prompt vs code (ranking, hooks, unused fields, scene detect) |
|
| 207 |
+
| **`docs/SOLUTIONS.md`** | Design rationale |
|
| 208 |
+
| **`TERMINOLOGY.md`** | Glossary |
|
| 209 |
+
|
| 210 |
+
## Tests
|
| 211 |
+
|
| 212 |
+
```bash
|
| 213 |
+
uv sync --extra dev
|
| 214 |
+
uv run pytest
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
## Sharing outputs
|
| 218 |
+
|
| 219 |
+
`output/`, `*.mp4`, and `keyframes/` are **gitignored**. Put rendered shorts on **YouTube** or **GitHub Releases**; keep the repo for source and docs. See **`docs/SHARING.md`**.
|
| 220 |
+
|
| 221 |
+
## License
|
| 222 |
+
|
| 223 |
+
See **`LICENSE`** (root) and **`humeo-core/LICENSE`**.
|
src/humeo.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
LICENSE
|
| 2 |
+
README.md
|
| 3 |
+
pyproject.toml
|
| 4 |
+
src/humeo/__init__.py
|
| 5 |
+
src/humeo/best_of.py
|
| 6 |
+
src/humeo/cli.py
|
| 7 |
+
src/humeo/clip_assembly.py
|
| 8 |
+
src/humeo/clip_selection_cache.py
|
| 9 |
+
src/humeo/clip_selector.py
|
| 10 |
+
src/humeo/config.py
|
| 11 |
+
src/humeo/content_pruning.py
|
| 12 |
+
src/humeo/cutter.py
|
| 13 |
+
src/humeo/env.py
|
| 14 |
+
src/humeo/gemini_generate.py
|
| 15 |
+
src/humeo/hook_detector.py
|
| 16 |
+
src/humeo/hook_library.py
|
| 17 |
+
src/humeo/ingest.py
|
| 18 |
+
src/humeo/interactive.py
|
| 19 |
+
src/humeo/layout_vision.py
|
| 20 |
+
src/humeo/pipeline.py
|
| 21 |
+
src/humeo/prompt_loader.py
|
| 22 |
+
src/humeo/reframe_ffmpeg.py
|
| 23 |
+
src/humeo/render_window.py
|
| 24 |
+
src/humeo/session_state.py
|
| 25 |
+
src/humeo/transcript_align.py
|
| 26 |
+
src/humeo/video_cache.py
|
| 27 |
+
src/humeo.egg-info/PKG-INFO
|
| 28 |
+
src/humeo.egg-info/SOURCES.txt
|
| 29 |
+
src/humeo.egg-info/dependency_links.txt
|
| 30 |
+
src/humeo.egg-info/entry_points.txt
|
| 31 |
+
src/humeo.egg-info/requires.txt
|
| 32 |
+
src/humeo.egg-info/top_level.txt
|
| 33 |
+
src/humeo/prompts/clip_selection_system.jinja2
|
| 34 |
+
src/humeo/prompts/clip_selection_user.jinja2
|
| 35 |
+
src/humeo/prompts/content_pruning_system.jinja2
|
| 36 |
+
src/humeo/prompts/hook_detection_system.jinja2
|
| 37 |
+
tests/test_ass_subtitles.py
|
| 38 |
+
tests/test_best_of.py
|
| 39 |
+
tests/test_clip_assembly.py
|
| 40 |
+
tests/test_clip_ranking.py
|
| 41 |
+
tests/test_clip_selection_cache.py
|
| 42 |
+
tests/test_clip_selector.py
|
| 43 |
+
tests/test_content_pruning.py
|
| 44 |
+
tests/test_cutter_native_highlight.py
|
| 45 |
+
tests/test_gemini_generate.py
|
| 46 |
+
tests/test_hook_detector.py
|
| 47 |
+
tests/test_hook_library.py
|
| 48 |
+
tests/test_ingest_openai_chunks.py
|
| 49 |
+
tests/test_interactive.py
|
| 50 |
+
tests/test_layout_vision_unit.py
|
| 51 |
+
tests/test_pipeline_interactive.py
|
| 52 |
+
tests/test_pipeline_quality_gate.py
|
| 53 |
+
tests/test_prompt_loader.py
|
| 54 |
+
tests/test_reframe_ffmpeg.py
|
| 55 |
+
tests/test_render_window.py
|
| 56 |
+
tests/test_session_state.py
|
| 57 |
+
tests/test_transcript_align.py
|
| 58 |
+
tests/test_video_cache.py
|
src/humeo.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
src/humeo.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
humeo = humeo.cli:main
|
src/humeo.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
yt-dlp>=2024.0
|
| 2 |
+
openai>=1.0
|
| 3 |
+
google-genai>=1.0
|
| 4 |
+
httpx>=0.28
|
| 5 |
+
jinja2>=3.1
|
| 6 |
+
numpy>=1.24
|
| 7 |
+
Pillow>=10.0
|
| 8 |
+
python-dotenv>=1.0
|
| 9 |
+
replicate>=0.34.2
|
| 10 |
+
tqdm>=4.60
|
| 11 |
+
humeo-core
|
| 12 |
+
|
| 13 |
+
[dev]
|
| 14 |
+
pytest-asyncio>=0.23
|
| 15 |
+
ruff
|
| 16 |
+
pytest
|
| 17 |
+
|
| 18 |
+
[whisper]
|
| 19 |
+
whisperx @ git+https://github.com/m-bain/whisperX.git
|