Spaces:

moonlantern1
/

clipforge

Sleeping

File size: 7,600 Bytes

eda316b

"""Tests for the scene-change + vision-LLM + OCR bbox primitive.



Covers:

* happy path: well-formed JSON -> populated ``SceneRegions``.
* bad JSON: degrade to empty regions + raw_reason, never raise.
* missing keyframe: empty regions + raw_reason; the vision callable is
  expected not to be invoked.
* bad bbox: one malformed bbox does not take down the whole scene record.
* classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT.
* layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come
  from the bboxes when present, defaults when not.
"""

import json

import pytest

from humeo_core.primitives.vision import (
    _CHART_WIDTH_SPLIT_THRESHOLD,
    classify_from_regions,
    classify_scenes_with_vision_llm,
    detect_regions_with_llm,
    layout_instruction_from_regions,
)
from humeo_core.schemas import (
    BoundingBox,
    LayoutKind,
    Scene,
    SceneClassification,
    SceneRegions,
)


# ---------------------------------------------------------------------------
# Schema
# ---------------------------------------------------------------------------


def test_bounding_box_requires_x2_gt_x1():
    """Well-ordered corners are accepted; inverted ranges raise ``ValueError``.

    Despite the name, both axes are exercised: first an inverted x-range
    (``x2 < x1``), then an inverted y-range (``y2 < y1``).
    """
    # Sanity check: a correctly ordered box constructs without error.
    BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2)

    inverted_x = {"x1": 0.2, "y1": 0.1, "x2": 0.1, "y2": 0.2}
    inverted_y = {"x1": 0.1, "y1": 0.2, "x2": 0.2, "y2": 0.1}
    for bad_corners in (inverted_x, inverted_y):
        with pytest.raises(ValueError):
            BoundingBox(**bad_corners)


def test_bounding_box_center_and_width():
    """``center_x``, ``center_y`` and ``width`` match the values implied by the corners."""
    box = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9)
    # Expected values: x midpoint of (0.2, 0.6), y midpoint of (0.4, 0.9),
    # and the x extent 0.6 - 0.2.
    expected = {"center_x": 0.4, "center_y": 0.65, "width": 0.4}
    for attr, value in expected.items():
        assert getattr(box, attr) == pytest.approx(value)


# ---------------------------------------------------------------------------
# detect_regions_with_llm
# ---------------------------------------------------------------------------


def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene:
    """Build a one-second ``Scene`` with id ``s{i}`` that starts at ``i`` seconds.

    ``kf`` is passed through as ``keyframe_path``; pass ``None`` to build a
    scene without a keyframe.
    """
    start = float(i)
    return Scene(
        scene_id=f"s{i}",
        start_time=start,
        end_time=start + 1.0,
        keyframe_path=kf,
    )


def test_detect_regions_happy_path():
    """Well-formed JSON from the vision callable yields a populated region record."""
    # Canned model answer: person on the right, wide chart on the left.
    payload = {
        "person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9},
        "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8},
        "ocr_text": "Inflation YoY",
        "reason": "explainer layout",
    }

    def vision_fn(_img: str, _prompt: str) -> str:
        return json.dumps(payload)

    results = detect_regions_with_llm([_scene(0)], vision_fn)

    assert len(results) == 1
    regions = results[0]
    assert regions.scene_id == "s0"
    assert regions.person_bbox and regions.person_bbox.center_x > 0.8
    assert regions.chart_bbox and regions.chart_bbox.width > 0.6
    assert "Inflation" in regions.ocr_text


def test_detect_regions_bad_json_is_safe():
    """Unparseable model output gives empty bboxes and a parse-error reason."""

    def vision_fn(*_args) -> str:
        return "not json"

    results = detect_regions_with_llm([_scene(0)], vision_fn)
    first = results[0]

    assert first.person_bbox is None
    assert first.chart_bbox is None
    # The failure is reported through raw_reason rather than an exception.
    assert "parse error" in first.raw_reason.lower()


def test_detect_regions_missing_keyframe_is_safe():
    """A scene with no keyframe yields an empty record whose reason says so."""

    def vision_fn(*_args) -> str:  # pragma: no cover - should not be called
        # Tripwire: reaching this means the primitive tried to analyse a
        # scene that has no image to look at.
        raise AssertionError("vision_fn must not be called without a keyframe")

    keyframeless = [_scene(0, kf=None)]
    results = detect_regions_with_llm(keyframeless, vision_fn)
    first = results[0]

    assert first.person_bbox is None
    assert "no keyframe" in first.raw_reason.lower()


def test_detect_regions_bad_bbox_degrades_gracefully():
    """An invalid person bbox is dropped while the valid chart bbox is kept."""
    # person_bbox is inverted on x (x2 < x1); chart_bbox is well-formed.
    payload = {
        "person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9},
        "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95},
        "ocr_text": "",
        "reason": "person bbox inverted",
    }

    def vision_fn(*_args) -> str:
        return json.dumps(payload)

    results = detect_regions_with_llm([_scene(0)], vision_fn)
    first = results[0]

    assert first.person_bbox is None
    assert first.chart_bbox is not None


# ---------------------------------------------------------------------------
# classify_from_regions
# ---------------------------------------------------------------------------


def test_classify_wide_chart_is_split():
    """A chart 0.66 wide next to a person classifies as SPLIT_CHART_PERSON."""
    wide_chart = BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0)
    person = BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95)
    regions = SceneRegions(scene_id="s0", chart_bbox=wide_chart, person_bbox=person)

    result = classify_from_regions(regions)

    assert result.layout == LayoutKind.SPLIT_CHART_PERSON
    assert result.confidence > 0.5


def test_classify_narrow_chart_not_split():
    """A chart only 0.1 wide must not trigger the split layout."""
    narrow_chart = BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4)
    person = BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95)
    regions = SceneRegions(scene_id="s0", chart_bbox=narrow_chart, person_bbox=person)

    result = classify_from_regions(regions)

    # chart width (0.1) is below the split threshold -> not split
    assert result.layout != LayoutKind.SPLIT_CHART_PERSON


def test_classify_wide_person_is_zoom_call():
    """With no chart, a person box 0.8 wide classifies as ZOOM_CALL_CENTER."""
    wide_person = BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98)
    regions = SceneRegions(scene_id="s0", person_bbox=wide_person)

    result = classify_from_regions(regions)

    assert result.layout == LayoutKind.ZOOM_CALL_CENTER


def test_classify_small_person_is_sit_center():
    """With no chart, a person box 0.2 wide classifies as SIT_CENTER."""
    small_person = BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8)
    regions = SceneRegions(scene_id="s0", person_bbox=small_person)

    result = classify_from_regions(regions)

    assert result.layout == LayoutKind.SIT_CENTER


def test_classify_nothing_detected_defaults_sit_center_low_conf():
    """No bboxes at all falls back to SIT_CENTER with confidence at most 0.5."""
    empty_regions = SceneRegions(scene_id="s0", raw_reason="model returned null")

    result = classify_from_regions(empty_regions)

    assert result.layout == LayoutKind.SIT_CENTER
    assert result.confidence <= 0.5


def test_chart_threshold_is_exported():
    """The chart-width split threshold is importable and lies strictly inside (0, 1)."""
    # guard against the tuning constant silently being removed
    threshold = _CHART_WIDTH_SPLIT_THRESHOLD
    assert threshold > 0.0 and threshold < 1.0


# ---------------------------------------------------------------------------
# layout_instruction_from_regions
# ---------------------------------------------------------------------------


def test_layout_instruction_from_regions_split():
    """For a split scene, the instruction's x-anchors are taken from the bboxes."""
    chart = BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0)
    person = BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95)
    regions = SceneRegions(scene_id="s0", chart_bbox=chart, person_bbox=person)

    classification = classify_from_regions(regions)
    instruction = layout_instruction_from_regions(regions, classification)

    assert instruction.layout == LayoutKind.SPLIT_CHART_PERSON
    # person_x_norm = center of (0.72, 0.99) = 0.855
    assert instruction.person_x_norm == pytest.approx(0.855, rel=1e-3)
    # chart_x_norm = left edge = 0.0
    assert instruction.chart_x_norm == pytest.approx(0.0)


def test_layout_instruction_defaults_when_no_regions():
    """Without bboxes the instruction uses person_x_norm=0.5 and chart_x_norm=0.0."""
    empty_regions = SceneRegions(scene_id="s0")
    # Hand-built classification so this test does not depend on the classifier.
    fallback = SceneClassification(
        scene_id="s0",
        layout=LayoutKind.SIT_CENTER,
        confidence=0.3,
        reason="default",
    )

    instruction = layout_instruction_from_regions(empty_regions, fallback)

    assert instruction.person_x_norm == 0.5
    assert instruction.chart_x_norm == 0.0


def test_classify_scenes_with_vision_llm_returns_pairs():
    """The end-to-end helper returns one ``(regions, classification)`` pair per scene."""
    # Solo subject filling most of the frame; the chart is explicitly null.
    payload = {
        "person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95},
        "chart_bbox": None,
        "ocr_text": "",
        "reason": "solo subject",
    }

    def vision_fn(*_args) -> str:
        return json.dumps(payload)

    results = classify_scenes_with_vision_llm([_scene(0)], vision_fn)

    assert len(results) == 1
    regions, classification = results[0]
    assert regions.person_bbox is not None
    assert classification.layout == LayoutKind.ZOOM_CALL_CENTER