File size: 6,706 Bytes
0ee66d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
Tests for graders β€” determinism and range validation.
"""

import pytest
from env.graders import get_grader
from env.tasks.registry import get_task


# ── Determinism Tests ────────────────────────────────────────────────────────

def _make_dummy_attempts(n=2, tests_passed=3, tests_total=8):
    """Create dummy attempt data for testing."""
    return [
        {
            "attempt_number": i + 1,
            "code_submitted": "def dummy(): pass",
            "hypothesis": "The bug is in the loop condition",
            "execution_output": f"{tests_passed} passed, {tests_total - tests_passed} failed",
            "tests_passed": tests_passed,
            "tests_total": tests_total,
            "execution_time_ms": 100,
            "timed_out": False,
        }
        for i in range(n)
    ]


def test_easy_grader_deterministic():
    """Same input to easy grader must produce same output."""
    grader = get_grader("easy")
    task = get_task("easy")
    attempts = _make_dummy_attempts(2, tests_passed=7, tests_total=8)
    hypotheses = ["The off by one error in the loop condition"]

    # Score the identical input twice; any divergence means nondeterminism.
    first, second = (
        grader.score(task, attempts, 7, 8, 2, 5, hypotheses) for _ in range(2)
    )
    assert first == second, f"Easy grader not deterministic: {first} != {second}"


def test_medium_grader_deterministic():
    """Same input to medium grader must produce same output."""
    grader = get_grader("medium")
    task = get_task("medium")
    attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)
    hypotheses = ["Bug is in hash_password bytes conversion"]

    # Score the identical input twice; any divergence means nondeterminism.
    first, second = (
        grader.score(task, attempts, 6, 10, 3, 7, hypotheses) for _ in range(2)
    )
    assert first == second, f"Medium grader not deterministic: {first} != {second}"


def test_hard_grader_deterministic():
    """Same input to hard grader must produce same output (excluding concurrent test randomness)."""
    grader = get_grader("hard")
    task = get_task("hard")
    # NOTE(review): the original comment said "use buggy code so the concurrent
    # test deterministically fails", but these attempts report 8/8 passing —
    # confirm which fixture was intended.
    attempts = _make_dummy_attempts(2, tests_passed=8, tests_total=8)
    hypotheses = ["race condition in increment"]

    first, second = (
        grader.score(task, attempts, 8, 8, 2, 10, hypotheses) for _ in range(2)
    )
    assert first == second, f"Hard grader not deterministic: {first} != {second}"


# ── Range Tests ──────────────────────────────────────────────────────────────

@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_zero_attempts(task_id):
    """Grader with zero attempts should return a score in [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)

    # Degenerate case: no attempts made at all.
    result = grader.score(
        task, [], 0, task["tests_total"], 0, task["max_attempts"], []
    )
    assert 0.0 <= result <= 1.0, f"{task_id} grader out of range: {result}"


@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_perfect_score(task_id):
    """Grader with all tests passing should return a score in [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)
    total = task["tests_total"]
    perfect = _make_dummy_attempts(1, tests_passed=total, tests_total=total)
    guesses = ["off by one", "hash_password bytes", "race condition atomic lock"]

    result = grader.score(
        task, perfect, total, total, 1, task["max_attempts"], guesses
    )
    assert 0.0 <= result <= 1.0, f"{task_id} grader out of range: {result}"


@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_all_failures(task_id):
    """Grader with no tests passing should return a score in [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)
    total = task["tests_total"]
    max_tries = task["max_attempts"]
    failures = _make_dummy_attempts(max_tries, tests_passed=0, tests_total=total)

    result = grader.score(task, failures, 0, total, max_tries, max_tries, [])
    assert 0.0 <= result <= 1.0, f"{task_id} grader out of range: {result}"


# ── Variance Tests (dummy vs perfect agents) ────────────────────────────────

def test_easy_dummy_agent_low_score():
    """A dummy agent submitting 'pass' should score < 0.15."""
    grader = get_grader("easy")
    task = get_task("easy")

    # Five no-effort attempts: trivial code, useless hypothesis, zero passes.
    attempts = []
    for attempt_no in range(1, 6):
        attempts.append({
            "attempt_number": attempt_no,
            "code_submitted": "pass",
            "hypothesis": "I don't know",
            "execution_output": "0 passed, 8 failed",
            "tests_passed": 0,
            "tests_total": 8,
            "execution_time_ms": 50,
            "timed_out": False,
        })

    result = grader.score(task, attempts, 0, 8, 5, 5, ["I don't know"] * 5)
    assert result < 0.15, f"Dummy agent scored too high on easy: {result}"


def test_easy_perfect_agent_high_score():
    """A perfect agent should score > 0.85 on easy."""
    grader = get_grader("easy")
    task = get_task("easy")

    # One attempt using the ground-truth fix and the correct hypothesis.
    hypothesis = "The off by one error: should be left <= right"
    perfect_attempt = {
        "attempt_number": 1,
        "code_submitted": task["ground_truth"]["fixed_code"],
        "hypothesis": hypothesis,
        "execution_output": "8 passed, 0 failed",
        "tests_passed": 8,
        "tests_total": 8,
        "execution_time_ms": 50,
        "timed_out": False,
    }

    result = grader.score(task, [perfect_attempt], 8, 8, 1, 5, [hypothesis])
    assert result > 0.85, f"Perfect agent scored too low on easy: {result}"


def test_medium_red_herring_low_score():
    """An agent chasing the authenticate_user red herring should score < 0.60.

    (The assertion bounds the total score, not just the hypothesis component.)
    """
    grader = get_grader("medium")
    task = get_task("medium")
    attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)

    # All three hypotheses blame the decoy function instead of the real bug.
    hypotheses = [
        "The bug is in authenticate_user, it's not checking credentials correctly",
        "authenticate_user should handle the case differently",
        "Fix authenticate_user to return True for valid users",
    ]

    result = grader.score(task, attempts, 6, 10, 3, 7, hypotheses)
    # With only 6/10 tests and red herring hypotheses, score should be modest
    assert result < 0.60, f"Red herring agent scored too high on medium: {result}"