moonlantern1 committed on
Commit
c86f8a6
·
verified ·
1 Parent(s): ed35a5c

Fix preview modal and false split layouts

Browse files
Files changed (4) hide show
  1. .env.example +54 -0
  2. app.py +3 -3
  3. src/humeo/layout_vision.py +16 -15
  4. src/humeo/pipeline.py +24 -28
.env.example ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy to ".env" in the project root (or export in your shell). See docs/ENVIRONMENT.md.
2
+
3
+ # --- Gemini / OpenRouter (clip selection + hooks + layout vision) ---
4
+ # Choose the backend explicitly. Recommended for this pipeline: openrouter.
5
+ HUMEO_LLM_PROVIDER=openrouter
6
+
7
+ # Google AI Studio key (only needed when HUMEO_LLM_PROVIDER=google or auto).
8
+ GOOGLE_API_KEY=
9
+ # Legacy alias — only used if GOOGLE_API_KEY is unset:
10
+ # GEMINI_API_KEY=
11
+
12
+ # Recommended backend for all Gemini stages:
13
+ OPENROUTER_API_KEY=
14
+
15
+ # Model ids (override per run with: humeo ... --gemini-model <id>)
16
+ GEMINI_MODEL=google/gemini-2.5-pro
17
+ # Optional override for layout vision; leave blank to reuse GEMINI_MODEL.
18
+ # GEMINI_VISION_MODEL=google/gemini-2.5-pro
19
+
20
+ # Optional zip or directory with the viral hook library corpus.
21
+ # HUMEO_HOOK_LIBRARY_PATH=C:\Users\you\Downloads\5000_viral_hooks.zip
22
+
23
+
24
+ # Optional: directory with clip_selection_system.jinja2 and clip_selection_user.jinja2
25
+ # HUMEO_PROMPTS_DIR=
26
+
27
+ # --- Transcription ---
28
+ # Recommended default: ElevenLabs Scribe v2 with No Verbatim cleanup.
29
+ ELEVENLABS_API_KEY=
30
+ ELEVENLABS_NO_VERBATIM=true
31
+ HUMEO_TRANSCRIBE_PROVIDER=elevenlabs
32
+
33
+ # Optional fallback if you deliberately use the OpenAI Whisper API instead.
34
+ # OPENAI_API_KEY=sk-...
35
+
36
+ # Optional local fallback:
37
+ # HUMEO_TRANSCRIBE_PROVIDER=whisperx
38
+
39
+ # --- Tracking fallback ---
40
+ # REPLICATE_API_TOKEN=
41
+ HUMEO_SEGMENTATION_PROVIDER=replicate
42
+ # HUMEO_SEGMENTATION_MODEL=meta/sam-2-video
43
+
44
+ # --- YouTube downloads on cloud hosts ---
45
+ # Hugging Face/cloud IPs are sometimes blocked by YouTube. If link downloads fail
46
+ # with a bot/sign-in message, export browser cookies in Netscape cookies.txt format,
47
+ # base64-encode the file, and store the result in YTDLP_COOKIES_B64 as a Hugging Face Space secret.
48
+ # YTDLP_COOKIES_B64=
49
+ # Advanced yt-dlp override:
50
+ # YTDLP_EXTRACTOR_ARGS=youtube:player_client=default,web_creator
51
+
52
+ # --- Video cache (optional) ---
53
+ # Override the default cache root: ~/.cache/humeo (Unix) or %LOCALAPPDATA%/humeo (Windows)
54
+ # HUMEO_CACHE_ROOT=
app.py CHANGED
@@ -570,10 +570,10 @@ INDEX_HTML = r"""<!DOCTYPE html>
570
  .regen-btn:hover { background: var(--ink-soft); }
571
  .modal-overlay { display: none; position: fixed; inset: 0; background: rgba(42,31,14,0.65); backdrop-filter: blur(6px); z-index: 500; align-items: center; justify-content: center; padding: 20px; animation: fadeIn 0.25s ease; }
572
  .modal-overlay.open { display: flex; }
573
- .modal-box { background: var(--white); border-radius: var(--radius-lg); width: 100%; max-width: 390px; overflow: hidden; box-shadow: 0 24px 64px rgba(42,31,14,0.25); animation: slideUp 0.3s ease; }
574
  @keyframes slideUp { from { opacity: 0; transform: translateY(20px) scale(0.97); } to { opacity: 1; transform: translateY(0) scale(1); } }
575
- .modal-video { aspect-ratio: 9/16; max-height: 70vh; display: flex; align-items: center; justify-content: center; position: relative; background:var(--ink); }
576
- .modal-video video { width:100%; height:100%; object-fit:contain; background:#000; }
577
  .modal-footer { padding: 16px 20px; border-top: 1px solid var(--border); display: flex; align-items: center; justify-content: space-between; gap:12px; }
578
  .modal-clip-label { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 500; }
579
  .modal-actions { display:flex; align-items:center; gap:8px; }
 
570
  .regen-btn:hover { background: var(--ink-soft); }
571
  .modal-overlay { display: none; position: fixed; inset: 0; background: rgba(42,31,14,0.65); backdrop-filter: blur(6px); z-index: 500; align-items: center; justify-content: center; padding: 20px; animation: fadeIn 0.25s ease; }
572
  .modal-overlay.open { display: flex; }
573
+ .modal-box { background: var(--white); border-radius: var(--radius-lg); width: min(390px, calc((100vh - 130px) * 9 / 16), calc(100vw - 40px)); max-width: none; overflow: hidden; box-shadow: 0 24px 64px rgba(42,31,14,0.25); animation: slideUp 0.3s ease; }
574
  @keyframes slideUp { from { opacity: 0; transform: translateY(20px) scale(0.97); } to { opacity: 1; transform: translateY(0) scale(1); } }
575
+ .modal-video { width: 100%; aspect-ratio: 9/16; display: flex; align-items: center; justify-content: center; position: relative; background:#000; }
576
+ .modal-video video { width:100%; height:100%; object-fit:cover; background:#000; display:block; }
577
  .modal-footer { padding: 16px 20px; border-top: 1px solid var(--border); display: flex; align-items: center; justify-content: space-between; gap:12px; }
578
  .modal-clip-label { font-family: 'Cormorant Garamond', serif; font-size: 1.1rem; font-weight: 500; }
579
  .modal-actions { display:flex; align-items:center; gap:8px; }
src/humeo/layout_vision.py CHANGED
@@ -46,8 +46,8 @@ logger = logging.getLogger(__name__)
46
  LAYOUT_VISION_CACHE_VERSION = 8
47
  LAYOUT_VISION_META = "layout_vision.meta.json"
48
  LAYOUT_VISION_JSON = "layout_vision.json"
49
- TRACKING_SAMPLE_FRACTIONS = tuple(i / 10.0 for i in range(1, 10))
50
- TRACKING_MIN_SPREAD_NORM = 0.08
51
  TRACKING_OUTLIER_DELTA_NORM = 0.16
52
  TRACKING_OUTLIER_NEIGHBOR_MAX_NORM = 0.10
53
  TRACKING_DEADBAND_NORM = 0.025
@@ -1304,9 +1304,9 @@ def _call_vision_json(keyframe_path: str, model_name: str, prompt: str) -> dict[
1304
  provider = resolve_llm_provider()
1305
  resolved_model = model_name_for_provider(model_name, provider)
1306
 
1307
- if provider == "google":
1308
- client = genai.Client(api_key=resolve_gemini_api_key())
1309
- response = client.models.generate_content(
1310
  model=resolved_model,
1311
  contents=[
1312
  types.Part.from_text(text=prompt),
@@ -1338,10 +1338,10 @@ def _call_vision_json(keyframe_path: str, model_name: str, prompt: str) -> dict[
1338
  {"type": "image_url", "image_url": {"url": data_url}},
1339
  ],
1340
  },
1341
- ],
1342
- temperature=0.2,
1343
- response_format={"type": "json_object"},
1344
- )
1345
  text = _openai_message_text(response.choices[0].message.content)
1346
  if not text:
1347
  raise RuntimeError("OpenRouter vision returned empty response")
@@ -1376,12 +1376,13 @@ def infer_layout_instructions(
1376
  sid = s.scene_id
1377
  if not s.keyframe_path:
1378
  logger.warning("No keyframe for %s; using sit_center.", sid)
1379
- out[sid] = LayoutInstruction(clip_id=sid, layout=LayoutKind.SIT_CENTER)
1380
- raw_by_clip[sid] = {"error": "no keyframe", "layout": "sit_center"}
1381
- continue
1382
- try:
1383
- data = _call_gemini_vision(s.keyframe_path, model_name)
1384
- image_size = _keyframe_dimensions(s.keyframe_path)
 
1385
  instr = _instruction_from_gemini_json(
1386
  sid,
1387
  data,
 
46
  LAYOUT_VISION_CACHE_VERSION = 8
47
  LAYOUT_VISION_META = "layout_vision.meta.json"
48
  LAYOUT_VISION_JSON = "layout_vision.json"
49
+ TRACKING_SAMPLE_FRACTIONS = tuple(i / 10.0 for i in range(1, 10))
50
+ TRACKING_MIN_SPREAD_NORM = 0.08
51
  TRACKING_OUTLIER_DELTA_NORM = 0.16
52
  TRACKING_OUTLIER_NEIGHBOR_MAX_NORM = 0.10
53
  TRACKING_DEADBAND_NORM = 0.025
 
1304
  provider = resolve_llm_provider()
1305
  resolved_model = model_name_for_provider(model_name, provider)
1306
 
1307
+ if provider == "google":
1308
+ client = genai.Client(api_key=resolve_gemini_api_key())
1309
+ response = client.models.generate_content(
1310
  model=resolved_model,
1311
  contents=[
1312
  types.Part.from_text(text=prompt),
 
1338
  {"type": "image_url", "image_url": {"url": data_url}},
1339
  ],
1340
  },
1341
+ ],
1342
+ temperature=0.2,
1343
+ response_format={"type": "json_object"},
1344
+ )
1345
  text = _openai_message_text(response.choices[0].message.content)
1346
  if not text:
1347
  raise RuntimeError("OpenRouter vision returned empty response")
 
1376
  sid = s.scene_id
1377
  if not s.keyframe_path:
1378
  logger.warning("No keyframe for %s; using sit_center.", sid)
1379
+ out[sid] = LayoutInstruction(clip_id=sid, layout=LayoutKind.SIT_CENTER)
1380
+ raw_by_clip[sid] = {"error": "no keyframe", "layout": "sit_center"}
1381
+ continue
1382
+ try:
1383
+ logger.info("Layout vision for %s (model=%s)...", sid, model_name)
1384
+ data = _call_gemini_vision(s.keyframe_path, model_name)
1385
+ image_size = _keyframe_dimensions(s.keyframe_path)
1386
  instr = _instruction_from_gemini_json(
1387
  sid,
1388
  data,
src/humeo/pipeline.py CHANGED
@@ -69,13 +69,28 @@ _NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1 = 0.12
69
  _NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM = 1.20
70
  _PRESENTATION_REFERENCE_RE = re.compile(
71
  r"\b("
72
- r"as you can see|you can see|what you can see|look at|take a look|shown here|"
73
  r"shown on|on the screen|on this slide|this chart|the chart|this graph|"
74
  r"the graph|this slide|this matrix|the matrix|red line|yellow line|"
75
  r"blue line|green line|top there|bottom there|x-axis|y-axis"
76
  r")\b",
77
  flags=re.IGNORECASE,
78
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81
  def _rerun_config(config: PipelineConfig, steering_notes: list[str]) -> PipelineConfig:
@@ -286,34 +301,15 @@ def _normalize_layout_for_render(
286
  ) -> LayoutInstruction:
287
  if render_theme != RenderTheme.NATIVE_HIGHLIGHT:
288
  return instruction
289
- if instruction.layout != LayoutKind.SPLIT_CHART_PERSON:
290
- return instruction
291
- chart = instruction.split_chart_region
292
- person = instruction.split_person_region
293
- if chart is None or person is None:
294
- return instruction
295
- chart_dominates = chart.y2 >= _NATIVE_HIGHLIGHT_CHART_DOMINANCE_Y2
296
- person_too_small = person.width <= _NATIVE_HIGHLIGHT_MIN_PERSON_WIDTH
297
- # Keep Bryan's newer head-and-shoulders presenter crops in split mode even
298
- # when the speaker strip is narrow; the older fallback-to-center rule was
299
- # written for lower-anchored full-body crops that rendered badly here.
300
- person_is_top_anchored = person.y1 <= _NATIVE_HIGHLIGHT_MAX_TOP_ANCHORED_PERSON_Y1
301
- if not (chart_dominates and person_too_small and not person_is_top_anchored):
302
- return instruction
303
- if clip is not None and _clip_references_presentation(clip):
304
  return instruction
305
- return instruction.model_copy(
306
- update={
307
- "layout": LayoutKind.SIT_CENTER,
308
- "zoom": max(float(instruction.zoom), _NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM),
309
- "split_chart_region": None,
310
- "split_person_region": None,
311
- "split_second_chart_region": None,
312
- "split_second_person_region": None,
313
- "chart_x_norm": 0.0,
314
- "top_band_ratio": 0.5,
315
- }
316
- )
317
 
318
 
319
  def _load_layout_raw_by_clip(work_dir: Path) -> dict[str, dict]:
 
69
  _NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM = 1.20
70
  _PRESENTATION_REFERENCE_RE = re.compile(
71
  r"\b("
72
+ r"as you can(?: also)? see|you can(?: also)? see|what you can(?: also)? see|look at|take a look|shown here|"
73
  r"shown on|on the screen|on this slide|this chart|the chart|this graph|"
74
  r"the graph|this slide|this matrix|the matrix|red line|yellow line|"
75
  r"blue line|green line|top there|bottom there|x-axis|y-axis"
76
  r")\b",
77
  flags=re.IGNORECASE,
78
  )
79
+
80
+
81
+ def _split_chart_person_to_center(instruction: LayoutInstruction) -> LayoutInstruction:
82
+ return instruction.model_copy(
83
+ update={
84
+ "layout": LayoutKind.SIT_CENTER,
85
+ "zoom": max(float(instruction.zoom), _NATIVE_HIGHLIGHT_SPLIT_TO_CENTER_MIN_ZOOM),
86
+ "split_chart_region": None,
87
+ "split_person_region": None,
88
+ "split_second_chart_region": None,
89
+ "split_second_person_region": None,
90
+ "chart_x_norm": 0.0,
91
+ "top_band_ratio": 0.5,
92
+ }
93
+ )
94
 
95
 
96
  def _rerun_config(config: PipelineConfig, steering_notes: list[str]) -> PipelineConfig:
 
301
  ) -> LayoutInstruction:
302
  if render_theme != RenderTheme.NATIVE_HIGHLIGHT:
303
  return instruction
304
+ if instruction.layout != LayoutKind.SPLIT_CHART_PERSON:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  return instruction
306
+ if clip is None or not _clip_references_presentation(clip):
307
+ return _split_chart_person_to_center(instruction)
308
+ chart = instruction.split_chart_region
309
+ person = instruction.split_person_region
310
+ if chart is None or person is None:
311
+ return _split_chart_person_to_center(instruction)
312
+ return instruction
 
 
 
 
 
313
 
314
 
315
  def _load_layout_raw_by_clip(work_dir: Path) -> dict[str, dict]: