File size: 7,600 Bytes
eda316b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
"""Tests for the scene-change + vision-LLM + OCR bbox primitive.



Covers:

* happy path: well-formed JSON -> populated ``SceneRegions``.

* bad JSON: degrade to empty regions + raw_reason, never raise.

* bad bbox: one malformed bbox does not take down the whole scene record.

* classification dispatch: chart width -> SPLIT; wide person -> ZOOM; else SIT.

* layout instruction derivation: ``person_x_norm`` / ``chart_x_norm`` come

  from the bboxes when present, defaults when not.

"""

import json

import pytest

from humeo_core.primitives.vision import (
    _CHART_WIDTH_SPLIT_THRESHOLD,
    classify_from_regions,
    classify_scenes_with_vision_llm,
    detect_regions_with_llm,
    layout_instruction_from_regions,
)
from humeo_core.schemas import (
    BoundingBox,
    LayoutKind,
    Scene,
    SceneClassification,
    SceneRegions,
)


# ---------------------------------------------------------------------------
# Schema
# ---------------------------------------------------------------------------


def test_bounding_box_requires_x2_gt_x1():
    """Ordered corners build fine; an inverted x or y range raises ``ValueError``.

    Despite the name, both axes are exercised: one case flips the x range
    and the other flips the y range.
    """
    # A well-formed box must construct without raising.
    BoundingBox(x1=0.1, y1=0.1, x2=0.2, y2=0.2)

    inverted_corners = (
        {"x1": 0.2, "y1": 0.1, "x2": 0.1, "y2": 0.2},  # x2 < x1
        {"x1": 0.1, "y1": 0.2, "x2": 0.2, "y2": 0.1},  # y2 < y1
    )
    for corners in inverted_corners:
        with pytest.raises(ValueError):
            BoundingBox(**corners)


def test_bounding_box_center_and_width():
    """Check the derived ``center_x``, ``center_y`` and ``width`` of a box."""
    box = BoundingBox(x1=0.2, y1=0.4, x2=0.6, y2=0.9)

    # Horizontal midpoint of (0.2, 0.6).
    mid_x = box.center_x
    assert mid_x == pytest.approx(0.4)

    # Vertical midpoint of (0.4, 0.9).
    mid_y = box.center_y
    assert mid_y == pytest.approx(0.65)

    # Horizontal extent: 0.6 - 0.2.
    span = box.width
    assert span == pytest.approx(0.4)


# ---------------------------------------------------------------------------
# detect_regions_with_llm
# ---------------------------------------------------------------------------


def _scene(i: int, kf: str | None = "/tmp/x.jpg") -> Scene:
    """Build a one-second test ``Scene`` with id ``s<i>`` starting at ``i``.

    Args:
        i: Index used for both the scene id and the start time.
        kf: Keyframe path to attach; pass ``None`` for a scene without one.
    """
    begin = float(i)
    return Scene(
        scene_id=f"s{i}",
        start_time=begin,
        end_time=begin + 1.0,
        keyframe_path=kf,
    )


def test_detect_regions_happy_path():
    """Well-formed JSON from the vision stub yields a populated region record."""
    payload = {
        "person_bbox": {"x1": 0.7, "y1": 0.1, "x2": 0.98, "y2": 0.9, "confidence": 0.9},
        "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95, "confidence": 0.8},
        "ocr_text": "Inflation YoY",
        "reason": "explainer layout",
    }

    def fake_vision(_img: str, _prompt: str) -> str:
        # Stand-in for the vision model: always answers with the same JSON.
        return json.dumps(payload)

    results = detect_regions_with_llm([_scene(0)], fake_vision)
    assert len(results) == 1

    regions = results[0]
    assert regions.scene_id == "s0"

    # Person box sits on the right-hand side of the frame.
    assert regions.person_bbox
    assert regions.person_bbox.center_x > 0.8

    # Chart box covers most of the frame width.
    assert regions.chart_bbox
    assert regions.chart_bbox.width > 0.6

    assert "Inflation" in regions.ocr_text


def test_detect_regions_bad_json_is_safe():
    """Unparseable model output gives empty regions and a 'parse error' reason."""

    def garbage_vision(*_args) -> str:
        # Deliberately not JSON.
        return "not json"

    results = detect_regions_with_llm([_scene(0)], garbage_vision)
    record = results[0]

    assert record.person_bbox is None
    assert record.chart_bbox is None
    assert "parse error" in record.raw_reason.lower()


def test_detect_regions_missing_keyframe_is_safe():
    """A scene without a keyframe is reported as such and the model is not called."""

    def forbidden_vision(*_args) -> str:  # pragma: no cover - should not be called
        raise AssertionError("vision_fn must not be called without a keyframe")

    keyframeless = _scene(0, kf=None)
    results = detect_regions_with_llm([keyframeless], forbidden_vision)
    record = results[0]

    assert record.person_bbox is None
    assert "no keyframe" in record.raw_reason.lower()


def test_detect_regions_bad_bbox_degrades_gracefully():
    """An inverted person bbox is dropped while the valid chart bbox is kept."""
    response = {
        # x2 < x1: this box is deliberately malformed.
        "person_bbox": {"x1": 0.5, "y1": 0.1, "x2": 0.3, "y2": 0.9},
        "chart_bbox": {"x1": 0.02, "y1": 0.05, "x2": 0.65, "y2": 0.95},
        "ocr_text": "",
        "reason": "person bbox inverted",
    }

    def fake_vision(*_args) -> str:
        return json.dumps(response)

    results = detect_regions_with_llm([_scene(0)], fake_vision)
    record = results[0]

    assert record.person_bbox is None
    assert record.chart_bbox is not None


# ---------------------------------------------------------------------------
# classify_from_regions
# ---------------------------------------------------------------------------


def test_classify_wide_chart_is_split():
    """A chart 0.66 wide next to a person is expected to classify as SPLIT."""
    wide_chart = BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0)
    side_person = BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95)
    regions = SceneRegions(
        scene_id="s0",
        chart_bbox=wide_chart,
        person_bbox=side_person,
    )

    verdict = classify_from_regions(regions)

    assert verdict.layout == LayoutKind.SPLIT_CHART_PERSON
    assert verdict.confidence > 0.5


def test_classify_narrow_chart_not_split():
    """A chart only 0.1 wide must not produce the SPLIT layout."""
    narrow_chart = BoundingBox(x1=0.4, y1=0.2, x2=0.5, y2=0.4)
    person = BoundingBox(x1=0.3, y1=0.1, x2=0.85, y2=0.95)
    regions = SceneRegions(
        scene_id="s0",
        chart_bbox=narrow_chart,
        person_bbox=person,
    )

    verdict = classify_from_regions(regions)

    # The chart width (0.1) is below the split threshold, so any layout
    # other than SPLIT is acceptable here.
    assert verdict.layout != LayoutKind.SPLIT_CHART_PERSON


def test_classify_wide_person_is_zoom_call():
    """A person box spanning 0.1-0.9 with no chart is expected to be ZOOM_CALL_CENTER."""
    wide_person = BoundingBox(x1=0.1, y1=0.05, x2=0.9, y2=0.98)
    regions = SceneRegions(scene_id="s0", person_bbox=wide_person)

    verdict = classify_from_regions(regions)

    assert verdict.layout == LayoutKind.ZOOM_CALL_CENTER


def test_classify_small_person_is_sit_center():
    """A person box spanning 0.4-0.6 with no chart is expected to be SIT_CENTER."""
    small_person = BoundingBox(x1=0.4, y1=0.2, x2=0.6, y2=0.8)
    regions = SceneRegions(scene_id="s0", person_bbox=small_person)

    verdict = classify_from_regions(regions)

    assert verdict.layout == LayoutKind.SIT_CENTER


def test_classify_nothing_detected_defaults_sit_center_low_conf():
    """With no boxes at all, expect SIT_CENTER at a confidence of at most 0.5."""
    empty_regions = SceneRegions(scene_id="s0", raw_reason="model returned null")

    verdict = classify_from_regions(empty_regions)

    assert verdict.layout == LayoutKind.SIT_CENTER
    assert verdict.confidence <= 0.5


def test_chart_threshold_is_exported():
    """Guard against the tuning constant silently being removed.

    The threshold must also lie strictly between 0 and 1.
    """
    threshold = _CHART_WIDTH_SPLIT_THRESHOLD
    assert 0.0 < threshold and threshold < 1.0


# ---------------------------------------------------------------------------
# layout_instruction_from_regions
# ---------------------------------------------------------------------------


def test_layout_instruction_from_regions_split():
    """For a SPLIT scene the instruction's x offsets are taken from the boxes."""
    chart = BoundingBox(x1=0.0, y1=0.0, x2=0.66, y2=1.0)
    person = BoundingBox(x1=0.72, y1=0.1, x2=0.99, y2=0.95)
    regions = SceneRegions(scene_id="s0", chart_bbox=chart, person_bbox=person)

    classification = classify_from_regions(regions)
    instruction = layout_instruction_from_regions(regions, classification)

    assert instruction.layout == LayoutKind.SPLIT_CHART_PERSON
    # person_x_norm is the centre of (0.72, 0.99), i.e. 0.855.
    assert instruction.person_x_norm == pytest.approx(0.855, rel=1e-3)
    # chart_x_norm is the chart's left edge, i.e. 0.0.
    assert instruction.chart_x_norm == pytest.approx(0.0)


def test_layout_instruction_defaults_when_no_regions():
    """Without any boxes the instruction uses 0.5 / 0.0 for person / chart x."""
    empty_regions = SceneRegions(scene_id="s0")
    fallback = SceneClassification(
        scene_id="s0",
        layout=LayoutKind.SIT_CENTER,
        confidence=0.3,
        reason="default",
    )

    instruction = layout_instruction_from_regions(empty_regions, fallback)

    assert instruction.person_x_norm == 0.5
    assert instruction.chart_x_norm == 0.0


def test_classify_scenes_with_vision_llm_returns_pairs():
    """The end-to-end helper returns one ``(regions, classification)`` pair per scene."""
    response = {
        "person_bbox": {"x1": 0.1, "y1": 0.1, "x2": 0.95, "y2": 0.95},
        "chart_bbox": None,
        "ocr_text": "",
        "reason": "solo subject",
    }

    def fake_vision(*_args) -> str:
        # Stand-in for the vision model: a wide solo subject and no chart.
        return json.dumps(response)

    pairs = classify_scenes_with_vision_llm([_scene(0)], fake_vision)
    assert len(pairs) == 1

    regions, classification = pairs[0]
    assert regions.person_bbox is not None
    assert classification.layout == LayoutKind.ZOOM_CALL_CENTER