File size: 4,440 Bytes
df97e68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""
tests/test_phase2_graders.py
Phase 2: graders.py — deterministic scoring for all three tasks
Run: pytest tests/test_phase2_graders.py -v
"""
import pytest
from app.env import GovWorkflowEnv
from app.graders import grade_episode
from app.models import ActionModel, ActionType


def run_episode_to_end(task_id: str, seed: int, max_steps: int = 500) -> GovWorkflowEnv:
    """Step a freshly reset environment with ADVANCE_TIME until it stops.

    Args:
        task_id: Task identifier passed to ``GovWorkflowEnv``.
        seed: Seed passed to ``env.reset``.
        max_steps: Upper bound on the number of ``env.step`` calls.

    Returns:
        The environment after stepping, so callers can grade ``env.state()``.
    """
    env = GovWorkflowEnv(task_id=task_id)
    env.reset(seed=seed)
    advance_time = ActionModel(action_type=ActionType.ADVANCE_TIME)
    for _step in range(max_steps):
        # step() is unpacked as a 5-tuple; the 3rd and 4th items are used as
        # stop flags (presumably terminated / truncated -- confirm in app.env).
        _obs, _reward, terminated, truncated, _info = env.step(advance_time)
        if not terminated and not truncated:
            continue
        break
    return env


class TestGraderEasy:
    """Grader checks for the ``district_backlog_easy`` task run with seed 42."""

    @staticmethod
    def _grade_fresh_episode():
        """Run a new easy episode (seed 42) to its end and return its grade."""
        finished_env = run_episode_to_end("district_backlog_easy", 42)
        return grade_episode(finished_env.state())

    def test_grade_returns_result(self):
        """Grading produces a result object rather than ``None``."""
        graded = self._grade_fresh_episode()
        assert graded is not None

    def test_grade_score_in_range(self):
        """The score lies in the closed interval [0, 1]."""
        graded = self._grade_fresh_episode()
        assert 0.0 <= graded.score <= 1.0

    def test_grade_has_grader_name(self):
        """The result carries a non-empty string grader name."""
        graded = self._grade_fresh_episode()
        assert isinstance(graded.grader_name, str)
        assert len(graded.grader_name) > 0

    def test_grade_metrics_dict_nonempty(self):
        """The result exposes a non-empty ``dict`` of metrics."""
        graded = self._grade_fresh_episode()
        assert isinstance(graded.metrics, dict)
        assert len(graded.metrics) > 0

    def test_grade_deterministic_same_seed(self):
        """Two runs with the same seed score the same (within 1e-6)."""
        # Both episodes are run before either is graded.
        envs = [run_episode_to_end("district_backlog_easy", 42) for _ in range(2)]
        first, second = (grade_episode(env.state()) for env in envs)
        assert abs(first.score - second.score) < 1e-6

    def test_grade_metrics_all_floats(self):
        """Every metric value is an ``int`` or ``float``."""
        graded = self._grade_fresh_episode()
        numeric_types = (int, float)
        for k, v in graded.metrics.items():
            assert isinstance(v, numeric_types), f"Metric {k} is not numeric: {v}"


class TestGraderMedium:
    """Grader checks for the ``mixed_urgency_medium`` task run with seed 123."""

    def test_grade_score_in_range(self):
        """The medium-task score lies in the closed interval [0, 1]."""
        env = run_episode_to_end("mixed_urgency_medium", 123)
        result = grade_episode(env.state())
        assert 0.0 <= result.score <= 1.0

    def test_grade_different_grader_than_easy(self):
        """Both the easy and the medium grade report a string grader name."""
        env_easy = run_episode_to_end("district_backlog_easy", 42)
        env_med = run_episode_to_end("mixed_urgency_medium", 123)
        r_easy = grade_episode(env_easy.state())
        r_med = grade_episode(env_med.state())
        # Different tasks may (but need not) have different grader names, so
        # the two names are not compared. Both results are checked, so the
        # easy grade is no longer computed and then ignored.
        assert isinstance(r_easy.grader_name, str)
        assert isinstance(r_med.grader_name, str)


class TestGraderHard:
    """Grader checks for ``cross_department_hard`` (seed 999, up to 800 steps)."""

    def test_grade_score_in_range(self):
        """The hard-task score lies in the closed interval [0, 1]."""
        finished_env = run_episode_to_end("cross_department_hard", 999, max_steps=800)
        result = grade_episode(finished_env.state())
        assert 0.0 <= result.score <= 1.0

    def test_grade_has_fairness_metric(self):
        """At least one metric key contains "fair", ignoring case."""
        finished_env = run_episode_to_end("cross_department_hard", 999, max_steps=800)
        result = grade_episode(finished_env.state())
        # Lower-case every key first, then look for a fairness-related name.
        lowered_names = [name.lower() for name in result.metrics.keys()]
        fairness_names = [name for name in lowered_names if "fair" in name]
        assert fairness_names, f"Hard grader missing fairness metric. Keys: {result.metrics.keys()}"


class TestGraderScoreBounds:
    """Score-range checks across all three tasks and for partial episodes."""

    # (task_id, seed) pairs graded after running with the helper's default step cap.
    _FULL_EPISODE_CASES = [
        ("district_backlog_easy", 42),
        ("mixed_urgency_medium", 123),
        ("cross_department_hard", 999),
    ]

    # (task_id, seed) pairs graded after only a handful of steps.
    _PARTIAL_EPISODE_CASES = [
        ("district_backlog_easy", 1),
        ("district_backlog_easy", 2),
        ("district_backlog_easy", 3),
    ]

    @pytest.mark.parametrize("task_id,seed", _FULL_EPISODE_CASES)
    def test_score_always_in_zero_one(self, task_id, seed):
        """Each task's score lies in [0, 1]."""
        result = grade_episode(run_episode_to_end(task_id, seed).state())
        assert 0.0 <= result.score <= 1.0, (
            f"{task_id}: score {result.score} out of [0, 1]"
        )

    @pytest.mark.parametrize("task_id,seed", _PARTIAL_EPISODE_CASES)
    def test_partial_episode_grades_without_error(self, task_id, seed):
        """Grading after just five ADVANCE_TIME steps still gives a score in [0, 1]."""
        env = GovWorkflowEnv(task_id=task_id)
        env.reset(seed=seed)
        advance_time = ActionModel(action_type=ActionType.ADVANCE_TIME)
        steps_before_grading = 5
        for _step in range(steps_before_grading):
            env.step(advance_time)
        partial_grade = grade_episode(env.state())
        assert 0.0 <= partial_grade.score <= 1.0