Spaces:
Sleeping
Sleeping
Fix preview modal and false split layouts
Browse files
- .env.example +54 -0
- app.py +3 -3
- src/humeo/layout_vision.py +16 -15
- src/humeo/pipeline.py +24 -28
.env.example
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copy to ".env" in the project root (or export in your shell). See docs/ENVIRONMENT.md.
|
| 2 |
+
|
| 3 |
+
# --- Gemini / OpenRouter (clip selection + hooks + layout vision) ---
|
| 4 |
+
# Choose the backend explicitly. Recommended for this pipeline: openrouter.
|
| 5 |
+
HUMEO_LLM_PROVIDER=openrouter
|
| 6 |
+
|
| 7 |
+
# Google AI Studio key (only needed when HUMEO_LLM_PROVIDER=google or auto).
|
| 8 |
+
GOOGLE_API_KEY=
|
| 9 |
+
# Legacy alias — only used if GOOGLE_API_KEY is unset:
|
| 10 |
+
# GEMINI_API_KEY=
|
| 11 |
+
|
| 12 |
+
# OpenRouter API key (recommended backend for all Gemini stages):
|
| 13 |
+
OPENROUTER_API_KEY=
|
| 14 |
+
|
| 15 |
+
# Model ids (override per run with: humeo ... --gemini-model <id>)
|
| 16 |
+
GEMINI_MODEL=google/gemini-2.5-pro
|
| 17 |
+
# Optional override for layout vision; leave blank to reuse GEMINI_MODEL.
|
| 18 |
+
# GEMINI_VISION_MODEL=google/gemini-2.5-pro
|
| 19 |
+
|
| 20 |
+
# Optional zip or directory with the viral hook library corpus.
|
| 21 |
+
# HUMEO_HOOK_LIBRARY_PATH=C:\Users\you\Downloads\5000_viral_hooks.zip
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Optional: directory with clip_selection_system.jinja2 and clip_selection_user.jinja2
|
| 25 |
+
# HUMEO_PROMPTS_DIR=
|
| 26 |
+
|
| 27 |
+
# --- Transcription ---
|
| 28 |
+
# Recommended default: ElevenLabs Scribe v2 with No Verbatim cleanup.
|
| 29 |
+
ELEVENLABS_API_KEY=
|
| 30 |
+
ELEVENLABS_NO_VERBATIM=true
|
| 31 |
+
HUMEO_TRANSCRIBE_PROVIDER=elevenlabs
|
| 32 |
+
|
| 33 |
+
# Optional fallback if you deliberately use the OpenAI Whisper API instead.
|
| 34 |
+
# OPENAI_API_KEY=sk-...
|
| 35 |
+
|
| 36 |
+
# Optional local fallback:
|
| 37 |
+
# HUMEO_TRANSCRIBE_PROVIDER=whisperx
|
| 38 |
+
|
| 39 |
+
# --- Tracking fallback ---
|
| 40 |
+
# REPLICATE_API_TOKEN=
|
| 41 |
+
HUMEO_SEGMENTATION_PROVIDER=replicate
|
| 42 |
+
# HUMEO_SEGMENTATION_MODEL=meta/sam-2-video
|
| 43 |
+
|
| 44 |
+
# --- YouTube downloads on cloud hosts ---
|
| 45 |
+
# Hugging Face/cloud IPs are sometimes blocked by YouTube. If link downloads fail
|
| 46 |
+
# with a bot/sign-in message, export browser cookies in Netscape cookies.txt format,
|
| 47 |
+
# base64 encode the file, and set this as a Hugging Face Space secret.
|
| 48 |
+
# YTDLP_COOKIES_B64=
|
| 49 |
+
# Advanced yt-dlp override:
|
| 50 |
+
# YTDLP_EXTRACTOR_ARGS=youtube:player_client=default,web_creator
|
| 51 |
+
|
| 52 |
+
# --- Video cache (optional) ---
|
| 53 |
+
# Override default: ~/.cache/humeo (Unix) or %LOCALAPPDATA%/humeo (Windows)
|
| 54 |
+
# HUMEO_CACHE_ROOT=
|
app.py
CHANGED
|
@@ -570,10 +570,10 @@ INDEX_HTML = r"""<!DOCTYPE html>
|
|
| 570 |
.regen-btn:hover { background: var(--ink-soft); }
|
| 571 |
.modal-overlay { display: none; position: fixed; inset: 0; background: rgba(42,31,14,0.65); backdrop-filter: blur(6px); z-index: 500; align-items: center; justify-content: center; padding: 20px; animation: fadeIn 0.25s ease; }
|
| 572 |
.modal-overlay.open { display: flex; }
|
| 573 |
-
.modal-box { background: var(--white); border-radius: var(--radius-lg); width:
|
| 574 |
@keyframes slideUp { from { opacity: 0; transform: translateY(20px) scale(0.97); } to { opacity: 1; transform: translateY(0) scale(1); } }
|
| 575 |
-
.modal-video { aspect-ratio: 9/16;
|
| 576 |
-
.modal-video video { width:100%; height:100%; object-fit:
|
| 577 |
.modal-footer { padding: 16px 20px; border-top: 1px solid var(--border); display: flex; align-items: center; justify-content: space-between; gap:12px; }
|
| 578 |
.modal-clip-label { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 500; }
|
| 579 |
.modal-actions { display:flex; align-items:center; gap:8px; }
|
|
|
|
| 570 |
.regen-btn:hover { background: var(--ink-soft); }
|
| 571 |
.modal-overlay { display: none; position: fixed; inset: 0; background: rgba(42,31,14,0.65); backdrop-filter: blur(6px); z-index: 500; align-items: center; justify-content: center; padding: 20px; animation: fadeIn 0.25s ease; }
|
| 572 |
.modal-overlay.open { display: flex; }
|
| 573 |
+
.modal-box { background: var(--white); border-radius: var(--radius-lg); width: min(390px, calc((100vh - 130px) * 9 / 16), calc(100vw - 40px)); max-width: none; overflow: hidden; box-shadow: 0 24px 64px rgba(42,31,14,0.25); animation: slideUp 0.3s ease; }
|
| 574 |
@keyframes slideUp { from { opacity: 0; transform: translateY(20px) scale(0.97); } to { opacity: 1; transform: translateY(0) scale(1); } }
|
| 575 |
+
.modal-video { width: 100%; aspect-ratio: 9/16; display: flex; align-items: center; justify-content: center; position: relative; background:#000; }
|
| 576 |
+
.modal-video video { width:100%; height:100%; object-fit:cover; background:#000; display:block; }
|
| 577 |
.modal-footer { padding: 16px 20px; border-top: 1px solid var(--border); display: flex; align-items: center; justify-content: space-between; gap:12px; }
|
| 578 |
.modal-clip-label { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 500; }
|
| 579 |
.modal-actions { display:flex; align-items:center; gap:8px; }
|
src/humeo/layout_vision.py
CHANGED
|
@@ -46,8 +46,8 @@ logger = logging.getLogger(__name__)
|
|
| 46 |
LAYOUT_VISION_CACHE_VERSION = 8
|
| 47 |
LAYOUT_VISION_META = "layout_vision.meta.json"
|
| 48 |
LAYOUT_VISION_JSON = "layout_vision.json"
|
| 49 |
-
TRACKING_SAMPLE_FRACTIONS = tuple(i / 10.0 for i in range(1, 10))
|
| 50 |
-
TRACKING_MIN_SPREAD_NORM = 0.08
|
| 51 |
TRACKING_OUTLIER_DELTA_NORM = 0.16
|
| 52 |
TRACKING_OUTLIER_NEIGHBOR_MAX_NORM = 0.10
|
| 53 |
TRACKING_DEADBAND_NORM = 0.025
|
|
@@ -1304,9 +1304,9 @@ def _call_vision_json(keyframe_path: str, model_name: str, prompt: str) -> dict[
|
|
| 1304 |
provider = resolve_llm_provider()
|
| 1305 |
resolved_model = model_name_for_provider(model_name, provider)
|
| 1306 |
|
| 1307 |
-
if provider == "google":
|
| 1308 |
-
client = genai.Client(api_key=resolve_gemini_api_key())
|
| 1309 |
-
response = client.models.generate_content(
|
| 1310 |
model=resolved_model,
|
| 1311 |
contents=[
|
| 1312 |
types.Part.from_text(text=prompt),
|
|
@@ -1338,10 +1338,10 @@ def _call_vision_json(keyframe_path: str, model_name: str, prompt: str) -> dict[
|
|
| 1338 |
{"type": "image_url", "image_url": {"url": data_url}},
|
| 1339 |
],
|
| 1340 |
},
|
| 1341 |
-
],
|
| 1342 |
-
temperature=0.2,
|
| 1343 |
-
response_format={"type": "json_object"},
|
| 1344 |
-
)
|
| 1345 |
text = _openai_message_text(response.choices[0].message.content)
|
| 1346 |
if not text:
|
| 1347 |
raise RuntimeError("OpenRouter vision returned empty response")
|
|
@@ -1376,12 +1376,13 @@ def infer_layout_instructions(
|
|
| 1376 |
sid = s.scene_id
|
| 1377 |
if not s.keyframe_path:
|
| 1378 |
logger.warning("No keyframe for %s; using sit_center.", sid)
|
| 1379 |
-
out[sid] = LayoutInstruction(clip_id=sid, layout=LayoutKind.SIT_CENTER)
|
| 1380 |
-
raw_by_clip[sid] = {"error": "no keyframe", "layout": "sit_center"}
|
| 1381 |
-
continue
|
| 1382 |
-
try:
|
| 1383 |
-
|
| 1384 |
-
|
|
|
|
| 1385 |
instr = _instruction_from_gemini_json(
|
| 1386 |
sid,
|
| 1387 |
data,
|
|
|
|
| 46 |
LAYOUT_VISION_CACHE_VERSION = 8
|
| 47 |
LAYOUT_VISION_META = "layout_vision.meta.json"
|
| 48 |
LAYOUT_VISION_JSON = "layout_vision.json"
|
| 49 |
+
TRACKING_SAMPLE_FRACTIONS = tuple(i / 10.0 for i in range(1, 10))
|
| 50 |
+
TRACKING_MIN_SPREAD_NORM = 0.08
|
| 51 |
TRACKING_OUTLIER_DELTA_NORM = 0.16
|
| 52 |
TRACKING_OUTLIER_NEIGHBOR_MAX_NORM = 0.10
|
| 53 |
TRACKING_DEADBAND_NORM = 0.025
|
|
|
|
| 1304 |
provider = resolve_llm_provider()
|
| 1305 |
resolved_model = model_name_for_provider(model_name, provider)
|
| 1306 |
|
| 1307 |
+
if provider == "google":
|
| 1308 |
+
client = genai.Client(api_key=resolve_gemini_api_key())
|
| 1309 |
+
response = client.models.generate_content(
|
| 1310 |
model=resolved_model,
|
| 1311 |
contents=[
|
| 1312 |
types.Part.from_text(text=prompt),
|
|
|
|
| 1338 |
{"type": "image_url", "image_url": {"url": data_url}},
|
| 1339 |
],
|
| 1340 |
},
|
| 1341 |
+
],
|
| 1342 |
+
temperature=0.2,
|
| 1343 |
+
response_format={"type": "json_object"},
|
| 1344 |
+
)
|
| 1345 |
text = _openai_message_text(response.choices[0].message.content)
|
| 1346 |
if not text:
|
| 1347 |
raise RuntimeError("OpenRouter vision returned empty response")
|
|
|
|
| 1376 |
sid = s.scene_id
|
| 1377 |
if not s.keyframe_path:
|
| 1378 |
logger.warning("No keyframe for %s; using sit_center.", sid)
|
| 1379 |
+
out[sid] = LayoutInstruction(clip_id=sid, layout=LayoutKind.SIT_CENTER)
|
| 1380 |
+
raw_by_clip[sid] = {"error": "no keyframe", "layout": "sit_center"}
|
| 1381 |
+
continue
|
| 1382 |
+
try:
|
| 1383 |
+
logger.info("Layout vision for %s (model=%s)...", sid, model_name)
|
| 1384 |
+
data = _call_gemini_vision(s.keyframe_path, model_name)
|
| 1385 |
+
image_size = _keyframe_dimensions(s.keyframe_path)
|
| 1386 |
instr = _instruction_from_gemini_json(
|
| 1387 |
sid,
|
| 1388 |
data,
|
src/humeo/pipeline.py
CHANGED
|
@@ -69,13 +69,28 @@ _NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 = 0.12
|
|
| 69 |
_NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM = 1.20
|
| 70 |
_PRESENTATION_REFERENCE_RE = re.compile(
|
| 71 |
r"\b("
|
| 72 |
-
r"as you can see|you can see|what you can see|look at|take a look|shown here|"
|
| 73 |
r"shown on|on the screen|on this slide|this chart|the chart|this graph|"
|
| 74 |
r"the graph|this slide|this matrix|the matrix|red line|yellow line|"
|
| 75 |
r"blue line|green line|top there|bottom there|x-axis|y-axis"
|
| 76 |
r")\b",
|
| 77 |
flags=re.IGNORECASE,
|
| 78 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
def _rerun_config(config: PipelineConfig, steering_notes: list[str]) -> PipelineConfig:
|
|
@@ -286,34 +301,15 @@ def _normalize_layout_for_render(
|
|
| 286 |
) -> LayoutInstruction:
|
| 287 |
if render_theme != RenderTheme.NATIVE_HIGHLIGHT:
|
| 288 |
return instruction
|
| 289 |
-
if instruction.layout != LayoutKind.SPLIT_CHART_PERSON:
|
| 290 |
-
return instruction
|
| 291 |
-
chart = instruction.split_chart_region
|
| 292 |
-
person = instruction.split_person_region
|
| 293 |
-
if chart is None or person is None:
|
| 294 |
-
return instruction
|
| 295 |
-
chart_dominates = chart.y2 >= _NATIVE_HIGHLIGHT_CHART_DOMINANCE_Y2
|
| 296 |
-
person_too_small = person.width <= _NATIVE_HIGHLIGHT_MIN_PERSON_WIDTH
|
| 297 |
-
# Keep Bryan's newer head-and-shoulders presenter crops in split mode even
|
| 298 |
-
# when the speaker strip is narrow; the older fallback-to-center rule was
|
| 299 |
-
# written for lower-anchored full-body crops that rendered badly here.
|
| 300 |
-
person_is_top_anchored = person.y1 <= _NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1
|
| 301 |
-
if not (chart_dominates and person_too_small and not person_is_top_anchored):
|
| 302 |
-
return instruction
|
| 303 |
-
if clip is not None and _clip_references_presentation(clip):
|
| 304 |
return instruction
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
"split_second_person_region": None,
|
| 313 |
-
"chart_x_norm": 0.0,
|
| 314 |
-
"top_band_ratio": 0.5,
|
| 315 |
-
}
|
| 316 |
-
)
|
| 317 |
|
| 318 |
|
| 319 |
def _load_layout_raw_by_clip(work_dir: Path) -> dict[str, dict]:
|
|
|
|
| 69 |
_NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM = 1.20
|
| 70 |
_PRESENTATION_REFERENCE_RE = re.compile(
|
| 71 |
r"\b("
|
| 72 |
+
r"as you can(?: also)? see|you can(?: also)? see|what you can(?: also)? see|look at|take a look|shown here|"
|
| 73 |
r"shown on|on the screen|on this slide|this chart|the chart|this graph|"
|
| 74 |
r"the graph|this slide|this matrix|the matrix|red line|yellow line|"
|
| 75 |
r"blue line|green line|top there|bottom there|x-axis|y-axis"
|
| 76 |
r")\b",
|
| 77 |
flags=re.IGNORECASE,
|
| 78 |
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _split_chart_person_to_center(instruction: LayoutInstruction) -> LayoutInstruction:
|
| 82 |
+
return instruction.model_copy(
|
| 83 |
+
update={
|
| 84 |
+
"layout": LayoutKind.SIT_CENTER,
|
| 85 |
+
"zoom": max(float(instruction.zoom), _NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM),
|
| 86 |
+
"split_chart_region": None,
|
| 87 |
+
"split_person_region": None,
|
| 88 |
+
"split_second_chart_region": None,
|
| 89 |
+
"split_second_person_region": None,
|
| 90 |
+
"chart_x_norm": 0.0,
|
| 91 |
+
"top_band_ratio": 0.5,
|
| 92 |
+
}
|
| 93 |
+
)
|
| 94 |
|
| 95 |
|
| 96 |
def _rerun_config(config: PipelineConfig, steering_notes: list[str]) -> PipelineConfig:
|
|
|
|
| 301 |
) -> LayoutInstruction:
|
| 302 |
if render_theme != RenderTheme.NATIVE_HIGHLIGHT:
|
| 303 |
return instruction
|
| 304 |
+
if instruction.layout != LayoutKind.SPLIT_CHART_PERSON:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
return instruction
|
| 306 |
+
if clip is None or not _clip_references_presentation(clip):
|
| 307 |
+
return _split_chart_person_to_center(instruction)
|
| 308 |
+
chart = instruction.split_chart_region
|
| 309 |
+
person = instruction.split_person_region
|
| 310 |
+
if chart is None or person is None:
|
| 311 |
+
return _split_chart_person_to_center(instruction)
|
| 312 |
+
return instruction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
|
| 315 |
def _load_layout_raw_by_clip(work_dir: Path) -> dict[str, dict]:
|