File size: 3,247 Bytes
03815d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""Tests for the v1-vs-v2 archived-response toggle (`server/demo_v1_v2.py`)."""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from server.demo_v1_v2 import (
    list_scenario_choices,
    load_archived_data,
    render_summary_banner,
    render_toggle_view,
)


# Two directory levels above this test file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Archived v1/v2 responses file under the repo's data directory.
ARCHIVE_PATH = REPO_ROOT.joinpath("data", "v1_v2_archived_responses.json")
# Bench v0 scenarios, one JSON document per line.
BENCH_PATH = REPO_ROOT.joinpath("data", "chakravyuh-bench-v0", "scenarios.jsonl")


@pytest.mark.unit
def test_archive_file_loads() -> None:
    """The archive loads with a provenance note and at least five scenarios."""
    archive = load_archived_data()

    # The provenance block must exist and contain the "honest_note" key.
    assert "_provenance" in archive
    provenance = archive["_provenance"]
    assert "honest_note" in provenance

    # Scenarios must be a list holding five or more entries.
    scenarios = archive["scenarios"]
    assert isinstance(scenarios, list)
    assert len(scenarios) >= 5


@pytest.mark.unit
def test_each_archived_response_has_required_fields() -> None:
    """Every scenario has the expected keys, as do its `v1` and `v2` entries."""
    scenario_keys = {
        "id",
        "label",
        "ground_truth",
        "category",
        "difficulty",
        "prompt",
        "v1",
        "v2",
        "verdict",
    }
    response_keys = {"score", "flagged", "signals", "explanation"}

    archive = load_archived_data()
    for scenario in archive["scenarios"]:
        # Top-level keys first; `.get` keeps the message usable if "id" is absent.
        absent = scenario_keys - scenario.keys()
        assert not absent, (
            f"scenario {scenario.get('id')!r} missing fields: {absent}"
        )
        # Then the per-version response payloads.
        for version in ("v1", "v2"):
            absent = response_keys - scenario[version].keys()
            assert not absent, (
                f"scenario {scenario['id']} {version} missing fields: {absent}"
            )


@pytest.mark.unit
def test_archived_ground_truth_matches_bench() -> None:
    """Cross-check: each archived scenario's `ground_truth` matches bench v0.

    Builds an ``id -> is_scam`` lookup from the bench JSONL file, then checks
    that every archived scenario exists in the bench and that its
    ``ground_truth == "scam"`` flag equals the bench's ``is_scam`` value.
    """
    bench = {}
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with BENCH_PATH.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not line.strip():
                continue
            row = json.loads(line)
            bench[row["id"]] = row["ground_truth"]["is_scam"]
    archive = load_archived_data()
    for s in archive["scenarios"]:
        sid = s["id"]
        assert sid in bench, f"archived scenario {sid} not in bench"
        # The archive stores a label string; the bench stores `is_scam`.
        archived_is_scam = s["ground_truth"] == "scam"
        assert archived_is_scam == bench[sid], (
            f"{sid} archive says {s['ground_truth']!r} but bench is_scam={bench[sid]}"
        )


@pytest.mark.unit
def test_list_scenario_choices_returns_label_id_pairs() -> None:
    """`list_scenario_choices` returns a list of 2-tuples of strings."""
    pairs = list_scenario_choices()
    assert isinstance(pairs, list)

    # Shape first: the unpacking below relies on every entry being a 2-tuple.
    shapes_ok = [isinstance(pair, tuple) and len(pair) == 2 for pair in pairs]
    assert all(shapes_ok)

    # Then element types: both members of each pair must be strings.
    types_ok = [
        isinstance(label, str) and isinstance(sid, str) for label, sid in pairs
    ]
    assert all(types_ok)


@pytest.mark.unit
def test_render_toggle_view_returns_four_strings() -> None:
    """Rendering the first archived scenario yields four strings.

    The v1 string must mention "v1" or "reward-hacked" and the v2 string must
    mention "v2" or "principled" (case-insensitive).
    """
    first_scenario = load_archived_data()["scenarios"][0]
    prompt, v1, v2, asymmetry = render_toggle_view(first_scenario["id"])
    for part in (prompt, v1, v2, asymmetry):
        assert isinstance(part, str)

    v1_text = v1.lower()
    assert "v1" in v1_text or "reward-hacked" in v1_text
    v2_text = v2.lower()
    assert "v2" in v2_text or "principled" in v2_text


@pytest.mark.unit
def test_render_summary_banner_includes_archived_disclosure() -> None:
    """The banner contains "archived" and "not a live re-run" (case-insensitive)."""
    text = render_summary_banner()
    upper_text = text.upper()
    assert "ARCHIVED" in upper_text
    lower_text = text.lower()
    assert "not a live re-run" in lower_text


@pytest.mark.unit
def test_render_toggle_view_unknown_id_does_not_crash() -> None:
    """A made-up scenario id still yields four strings rather than raising."""
    rendered = render_toggle_view("modec_NONEXISTENT")
    # Unpacking enforces that exactly four values come back.
    prompt, v1, v2, asymmetry = rendered
    for part in (prompt, v1, v2, asymmetry):
        assert isinstance(part, str)