File size: 6,706 Bytes
0ee66d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
Tests for graders β€” determinism and range validation.
"""

import pytest
from env.graders import get_grader
from env.tasks.registry import get_task


# ── Determinism Tests ────────────────────────────────────────────────────────

def _make_dummy_attempts(n=2, tests_passed=3, tests_total=8):
    """Create dummy attempt data for testing."""
    return [
        {
            "attempt_number": i + 1,
            "code_submitted": "def dummy(): pass",
            "hypothesis": "The bug is in the loop condition",
            "execution_output": f"{tests_passed} passed, {tests_total - tests_passed} failed",
            "tests_passed": tests_passed,
            "tests_total": tests_total,
            "execution_time_ms": 100,
            "timed_out": False,
        }
        for i in range(n)
    ]


def test_easy_grader_deterministic():
    """Same input to easy grader must produce same output."""
    grader = get_grader("easy")
    task = get_task("easy")
    attempts = _make_dummy_attempts(2, tests_passed=7, tests_total=8)
    hypotheses = ["The off by one error in the loop condition"]

    # Score the identical input twice; any divergence means nondeterminism.
    first, second = (
        grader.score(task, attempts, 7, 8, 2, 5, hypotheses) for _ in range(2)
    )
    assert first == second, f"Easy grader not deterministic: {first} != {second}"


def test_medium_grader_deterministic():
    """Same input to medium grader must produce same output."""
    grader = get_grader("medium")
    task = get_task("medium")
    attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)
    hypotheses = ["Bug is in hash_password bytes conversion"]

    # Score the identical input twice; any divergence means nondeterminism.
    first, second = (
        grader.score(task, attempts, 6, 10, 3, 7, hypotheses) for _ in range(2)
    )
    assert first == second, f"Medium grader not deterministic: {first} != {second}"


def test_hard_grader_deterministic():
    """Same input to hard grader must produce same output (excluding concurrent test randomness)."""
    grader = get_grader("hard")
    task = get_task("hard")
    # NOTE(review): the original comment said "use buggy code so the concurrent
    # test deterministically fails", but these attempts report 8/8 passing —
    # confirm which fixture was intended.
    attempts = _make_dummy_attempts(2, tests_passed=8, tests_total=8)
    hypotheses = ["race condition in increment"]

    first, second = (
        grader.score(task, attempts, 8, 8, 2, 10, hypotheses) for _ in range(2)
    )
    assert first == second, f"Hard grader not deterministic: {first} != {second}"


# ── Range Tests ──────────────────────────────────────────────────────────────

@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_zero_attempts(task_id):
    """Grader with zero attempts should return a score in [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)

    # Degenerate case: no attempts made at all.
    result = grader.score(
        task, [], 0, task["tests_total"], 0, task["max_attempts"], []
    )
    assert 0.0 <= result <= 1.0, f"{task_id} grader out of range: {result}"


@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_perfect_score(task_id):
    """Grader with all tests passing should return a score in [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)
    total = task["tests_total"]
    perfect = _make_dummy_attempts(1, tests_passed=total, tests_total=total)
    guesses = ["off by one", "hash_password bytes", "race condition atomic lock"]

    result = grader.score(
        task, perfect, total, total, 1, task["max_attempts"], guesses
    )
    assert 0.0 <= result <= 1.0, f"{task_id} grader out of range: {result}"


@pytest.mark.parametrize("task_id", ["easy", "medium", "hard"])
def test_grader_range_with_all_failures(task_id):
    """Grader with no tests passing should return a score in [0.0, 1.0]."""
    grader = get_grader(task_id)
    task = get_task(task_id)
    total = task["tests_total"]
    max_tries = task["max_attempts"]
    failures = _make_dummy_attempts(max_tries, tests_passed=0, tests_total=total)

    result = grader.score(task, failures, 0, total, max_tries, max_tries, [])
    assert 0.0 <= result <= 1.0, f"{task_id} grader out of range: {result}"


# ── Variance Tests (dummy vs perfect agents) ────────────────────────────────

def test_easy_dummy_agent_low_score():
    """A dummy agent submitting 'pass' should score < 0.15."""
    grader = get_grader("easy")
    task = get_task("easy")

    # Five no-effort attempts: trivial code, useless hypothesis, zero passes.
    attempts = []
    for attempt_no in range(1, 6):
        attempts.append({
            "attempt_number": attempt_no,
            "code_submitted": "pass",
            "hypothesis": "I don't know",
            "execution_output": "0 passed, 8 failed",
            "tests_passed": 0,
            "tests_total": 8,
            "execution_time_ms": 50,
            "timed_out": False,
        })

    result = grader.score(task, attempts, 0, 8, 5, 5, ["I don't know"] * 5)
    assert result < 0.15, f"Dummy agent scored too high on easy: {result}"


def test_easy_perfect_agent_high_score():
    """A perfect agent should score > 0.85 on easy."""
    grader = get_grader("easy")
    task = get_task("easy")

    # One attempt using the ground-truth fix and the correct hypothesis.
    hypothesis = "The off by one error: should be left <= right"
    perfect_attempt = {
        "attempt_number": 1,
        "code_submitted": task["ground_truth"]["fixed_code"],
        "hypothesis": hypothesis,
        "execution_output": "8 passed, 0 failed",
        "tests_passed": 8,
        "tests_total": 8,
        "execution_time_ms": 50,
        "timed_out": False,
    }

    result = grader.score(task, [perfect_attempt], 8, 8, 1, 5, [hypothesis])
    assert result > 0.85, f"Perfect agent scored too low on easy: {result}"


def test_medium_red_herring_low_score():
    """An agent chasing the authenticate_user red herring should score < 0.60.

    (The assertion bounds the total score, not just the hypothesis component.)
    """
    grader = get_grader("medium")
    task = get_task("medium")
    attempts = _make_dummy_attempts(3, tests_passed=6, tests_total=10)

    # All three hypotheses blame the decoy function instead of the real bug.
    hypotheses = [
        "The bug is in authenticate_user, it's not checking credentials correctly",
        "authenticate_user should handle the case differently",
        "Fix authenticate_user to return True for valid users",
    ]

    result = grader.score(task, attempts, 6, 10, 3, 7, hypotheses)
    # With only 6/10 tests and red herring hypotheses, score should be modest
    assert result < 0.60, f"Red herring agent scored too high on medium: {result}"