clipforge / humeo-core /tests /test_vision.py
moonlantern1's picture
Deploy ClipForge Docker Space
eda316b verified
"""Tests for the scene-change + vision-LLM + OCR bbox primitive.
Covers:
* happy path: well-formed JSON -> populated ``SceneRegions``.
* bad JSON: degrade to empty regions + raw_reason, never raise.
* bad bbox: one malformed bbox does not take down the whole scene record.
* classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT.
* layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come
from the bboxes when present, defaults when not.
"""
import json
import pytest
from humeo_core.primitives.vision import (
_CHART_WIDTH_SPLIT_THRESHOLD,
classify_from_regions,
classify_scenes_with_vision_llm,
detect_regions_with_llm,
layout_instruction_from_regions,
)
from humeo_core.schemas import (
BoundingBox,
LayoutKind,
Scene,
SceneClassification,
SceneRegions,
)
# ---------------------------------------------------------------------------
# Schema
# ---------------------------------------------------------------------------
def test_bounding_box_requires_x2_gt_x1():
    """``BoundingBox`` accepts ordered corners and rejects inverted ones.

    Despite the name, both axes are exercised: one case has ``x2 < x1`` and
    the other has ``y2 < y1``; each must raise ``ValueError``.
    """
    # A well-ordered box must construct without raising.
    BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2)

    inverted_corner_sets = (
        {"x1": 0.2, "y1": 0.1, "x2": 0.1, "y2": 0.2},  # x2 < x1
        {"x1": 0.1, "y1": 0.2, "x2": 0.2, "y2": 0.1},  # y2 < y1
    )
    for corners in inverted_corner_sets:
        with pytest.raises(ValueError):
            BoundingBox(**corners)
def test_bounding_box_center_and_width():
    """Check the derived ``center_x``, ``center_y`` and ``width`` of a box.

    For corners (0.2, 0.4)-(0.6, 0.9) the expected values are the midpoints
    0.4 and 0.65 and the horizontal extent 0.4.
    """
    box = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9)
    derived = (box.center_x, box.center_y, box.width)
    # approx() compares element-wise, tolerating float rounding.
    assert derived == pytest.approx((0.4, 0.65, 0.4))
# ---------------------------------------------------------------------------
# detect_regions_with_llm
# ---------------------------------------------------------------------------
def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene:
    """Build a one-second ``Scene`` fixture for index ``i``.

    The scene id is ``"s<i>"``, it starts at ``i`` seconds and ends one
    second later. ``kf`` is passed through as ``keyframe_path``; pass
    ``None`` to build a scene without a keyframe.
    """
    start = float(i)
    return Scene(
        scene_id=f"s{i}",
        start_time=start,
        end_time=start + 1.0,
        keyframe_path=kf,
    )
def test_detect_regions_happy_path():
    """Well-formed JSON from the vision callable yields populated regions.

    The stub reports a person box on the right of the frame and a wide chart
    box on the left; both boxes and the OCR text must survive parsing.
    """
    llm_payload = {
        "person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9},
        "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8},
        "ocr_text": "Inflation YoY",
        "reason": "explainer layout",
    }

    def fake_vision(_img: str, _prompt: str) -> str:
        # Stand-in for the real vision model: always answers with the payload.
        return json.dumps(llm_payload)

    results = detect_regions_with_llm([_scene(0)], fake_vision)

    assert len(results) == 1
    region = results[0]
    assert region.scene_id == "s0"

    person, chart = region.person_bbox, region.chart_bbox
    assert person and person.center_x > 0.8
    assert chart and chart.width > 0.6
    assert "Inflation" in region.ocr_text
def test_detect_regions_bad_json_is_safe():
    """Unparseable model output degrades to empty regions instead of raising.

    Both boxes must be ``None`` and ``raw_reason`` must mention a parse error.
    """
    results = detect_regions_with_llm([_scene(0)], lambda *_args: "not json")
    region = results[0]

    assert region.person_bbox is None
    assert region.chart_bbox is None
    assert "parse error" in region.raw_reason.lower()
def test_detect_regions_missing_keyframe_is_safe():
    """A scene without a keyframe yields empty regions and skips the model.

    The vision callable raises if invoked, so the test also proves that
    ``detect_regions_with_llm`` does not call it when ``keyframe_path`` is
    ``None``. Both boxes are checked, mirroring the bad-JSON test, so that an
    "empty regions" result is pinned for the chart as well as the person.
    """
    scenes = [_scene(0, kf=None)]

    def vision_fn(*_a) -> str:  # pragma: no cover - should not be called
        raise AssertionError("vision_fn must not be called without a keyframe")

    out = detect_regions_with_llm(scenes, vision_fn)
    assert out[0].person_bbox is None
    # No model call was made, so no chart can have been detected either.
    assert out[0].chart_bbox is None
    assert "no keyframe" in out[0].raw_reason.lower()
def test_detect_regions_bad_bbox_degrades_gracefully():
    """One malformed bbox is dropped without discarding the valid one.

    The stub returns an inverted person box (``x2 < x1``) next to a valid
    chart box; only the person box is expected to come back as ``None``.
    """
    llm_payload = {
        "person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9},
        "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95},
        "ocr_text": "",
        "reason": "person bbox inverted",
    }

    def fake_vision(*_args) -> str:
        return json.dumps(llm_payload)

    region = detect_regions_with_llm([_scene(0)], fake_vision)[0]

    assert region.person_bbox is None
    assert region.chart_bbox is not None
# ---------------------------------------------------------------------------
# classify_from_regions
# ---------------------------------------------------------------------------
def test_classify_wide_chart_is_split():
    """A 0.66-wide chart beside a person classifies as SPLIT_CHART_PERSON.

    The classification is also expected to carry confidence above 0.5.
    """
    wide_chart = BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0)
    presenter = BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95)
    regions = SceneRegions(
        scene_id="s0",
        chart_bbox=wide_chart,
        person_bbox=presenter,
    )

    verdict = classify_from_regions(regions)

    assert verdict.layout == LayoutKind.SPLIT_CHART_PERSON
    assert verdict.confidence > 0.5
def test_classify_narrow_chart_not_split():
    """A chart narrower than the split threshold must not produce SPLIT.

    The chart here is 0.1 wide. The test first asserts that this width is
    below ``_CHART_WIDTH_SPLIT_THRESHOLD`` so that its premise is checked
    against the real tuning constant rather than assumed in a comment.
    """
    r = SceneRegions(
        scene_id="s0",
        chart_bbox=BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4),
        person_bbox=BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95),
    )
    # Pin the premise: if the threshold is ever tuned below this width, fail
    # here instead of silently no longer testing the narrow-chart case.
    assert r.chart_bbox.width < _CHART_WIDTH_SPLIT_THRESHOLD

    c = classify_from_regions(r)
    # chart width (0.1) is below the split threshold -> not split
    assert c.layout != LayoutKind.SPLIT_CHART_PERSON
def test_classify_wide_person_is_zoom_call():
    """A person box spanning 0.1-0.9 with no chart is ZOOM_CALL_CENTER."""
    wide_person = BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98)
    regions = SceneRegions(scene_id="s0", person_bbox=wide_person)

    verdict = classify_from_regions(regions)

    assert verdict.layout == LayoutKind.ZOOM_CALL_CENTER
def test_classify_small_person_is_sit_center():
    """A narrow person box (0.4-0.6) with no chart is SIT_CENTER."""
    small_person = BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8)
    regions = SceneRegions(scene_id="s0", person_bbox=small_person)

    verdict = classify_from_regions(regions)

    assert verdict.layout == LayoutKind.SIT_CENTER
def test_classify_nothing_detected_defaults_sit_center_low_conf():
    """With no boxes at all the fallback is SIT_CENTER at confidence <= 0.5."""
    empty_regions = SceneRegions(scene_id="s0", raw_reason="model returned null")

    verdict = classify_from_regions(empty_regions)

    assert verdict.layout == LayoutKind.SIT_CENTER
    assert verdict.confidence <= 0.5
def test_chart_threshold_is_exported():
    """Guard against the tuning constant silently being removed.

    The constant must still be importable and lie strictly between 0 and 1.
    """
    threshold = _CHART_WIDTH_SPLIT_THRESHOLD
    assert threshold > 0.0
    assert threshold < 1.0
# ---------------------------------------------------------------------------
# layout_instruction_from_regions
# ---------------------------------------------------------------------------
def test_layout_instruction_from_regions_split():
    """For a split scene the layout instruction takes its x values from the boxes.

    Expected: ``person_x_norm`` is the person box's horizontal centre and
    ``chart_x_norm`` is the chart box's left edge.
    """
    wide_chart = BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0)
    presenter = BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95)
    regions = SceneRegions(
        scene_id="s0",
        chart_bbox=wide_chart,
        person_bbox=presenter,
    )

    verdict = classify_from_regions(regions)
    instruction = layout_instruction_from_regions(regions, verdict)

    assert instruction.layout == LayoutKind.SPLIT_CHART_PERSON
    # Midpoint of the person box: (0.72 + 0.99) / 2 = 0.855.
    assert instruction.person_x_norm == pytest.approx(0.855, rel=1e-3)
    # Left edge of the chart box.
    assert instruction.chart_x_norm == pytest.approx(0.0)
def test_layout_instruction_defaults_when_no_regions():
    """Without any boxes the instruction falls back to fixed defaults.

    Expected defaults: ``person_x_norm == 0.5`` and ``chart_x_norm == 0.0``.
    """
    empty_regions = SceneRegions(scene_id="s0")
    fallback = SceneClassification(
        scene_id="s0",
        layout=LayoutKind.SIT_CENTER,
        confidence=0.3,
        reason="default",
    )

    instruction = layout_instruction_from_regions(empty_regions, fallback)

    # Exact comparison on purpose: these are literal defaults, not computed.
    assert (instruction.person_x_norm, instruction.chart_x_norm) == (0.5, 0.0)
def test_classify_scenes_with_vision_llm_returns_pairs():
    """The end-to-end helper returns one (regions, classification) pair per scene.

    The stub reports a single wide person box and no chart, so the pair is
    expected to hold a person box and a ZOOM_CALL_CENTER classification.
    """
    llm_payload = {
        "person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95},
        "chart_bbox": None,
        "ocr_text": "",
        "reason": "solo subject",
    }

    def fake_vision(*_args) -> str:
        return json.dumps(llm_payload)

    results = classify_scenes_with_vision_llm([_scene(0)], fake_vision)

    assert len(results) == 1
    detected, verdict = results[0]
    assert detected.person_bbox is not None
    assert verdict.layout == LayoutKind.ZOOM_CALL_CENTER