Spaces:

anugrah55
/

opensleuth-env-gemini-cli

Paused

File size: 12,488 Bytes

31715b5

"""Unit tests for the OpenSleuth env + verifier.

Run with `pytest -q` from the env/ directory.
"""

from __future__ import annotations

import pytest

from opensleuth_env import (
    BLACK_BOX_FUNCTIONS,
    OpenSleuthEnv,
    ProbeAction,
    SubmitAction,
)
from opensleuth_env.env import _bucket_of, NEW_BUCKET_BONUS, NEW_OUTPUT_BONUS, PROBE_STEP_COST
from opensleuth_env.verifier import (
    calculate_complexity_penalty,
    generate_fuzz_inputs,
    get_edge_inputs,
    verify_submission,
    _looks_like_reference_import,
)


# ---------- env transitions ------------------------------------------------


def test_reset_returns_episode_id_and_signature():
    """A freshly reset episode exposes its id, target metadata and empty state."""
    first_obs = OpenSleuthEnv().reset("fibonacci")

    # Identity of the episode and of the function under investigation.
    assert first_obs.episode_id
    assert first_obs.target_function_name == "fibonacci"
    assert "fibonacci" in first_obs.target_function_signature

    # No probes have been issued yet.
    assert first_obs.probe_history == []
    assert first_obs.steps_taken == 0

    # Metadata fields added in v0.3.
    assert first_obs.difficulty == "easy"
    assert first_obs.coverage_buckets_seen == 0


def test_unknown_target_raises():
    """Resetting onto an unrecognised target name must raise ValueError."""
    sleuth = OpenSleuthEnv()
    bogus_target = "not_a_real_function"
    with pytest.raises(ValueError):
        sleuth.reset(bogus_target)


def test_probe_with_int_input_records_output():
    """Probing fibonacci with 10 logs output 55 and pays the first-probe bonuses."""
    sleuth = OpenSleuthEnv()
    episode = sleuth.reset("fibonacci").episode_id
    outcome = sleuth.step(episode, ProbeAction(input_repr="10"))
    assert outcome.done is False

    latest_probe = outcome.observation.probe_history[-1]
    assert latest_probe.is_error is False
    assert latest_probe.output_repr == "55"

    # The very first successful probe earns both novelty bonuses on top of
    # the per-step cost.
    first_probe_reward = NEW_OUTPUT_BONUS + NEW_BUCKET_BONUS + PROBE_STEP_COST
    assert outcome.reward == pytest.approx(first_probe_reward)

    details = outcome.info
    assert details["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert details["bucket"] == "int:medium"

    after = outcome.observation
    assert after.coverage_buckets_seen == 1
    assert after.seen_outputs_count == 1


def test_probe_with_invalid_literal_returns_parse_error():
    """An unparseable probe input is logged as a ParseError without ending the episode."""
    sleuth = OpenSleuthEnv()
    episode = sleuth.reset("fibonacci").episode_id
    garbage_probe = ProbeAction(input_repr="not a literal")
    outcome = sleuth.step(episode, garbage_probe)
    assert outcome.done is False
    last_entry = outcome.observation.probe_history[-1]
    assert last_entry.error_type == "ParseError"


def test_repeated_output_only_pays_intrinsic_once():
    """Re-issuing the same probe earns strictly less: only the per-step cost."""
    sleuth = OpenSleuthEnv()
    episode = sleuth.reset("fibonacci").episode_id
    first, repeat = (
        sleuth.step(episode, ProbeAction(input_repr="10")) for _ in range(2)
    )
    assert first.reward > repeat.reward
    # Same bucket and same output the second time round, so no novelty bonus
    # applies and only the per-step cost remains.
    assert repeat.reward == pytest.approx(PROBE_STEP_COST)


def test_step_limit_terminates_episode():
    """With max_steps=2 the second probe must report the episode as done."""
    sleuth = OpenSleuthEnv()
    episode = sleuth.reset("fibonacci", max_steps=2).episode_id
    sleuth.step(episode, ProbeAction(input_repr="1"))
    final = sleuth.step(episode, ProbeAction(input_repr="2"))
    assert final.done is True


def test_unknown_episode_id_raises():
    """Stepping with an episode id that was never issued raises KeyError."""
    sleuth = OpenSleuthEnv()
    probe = ProbeAction(input_repr="1")
    with pytest.raises(KeyError):
        sleuth.step("does-not-exist", probe)


# ---------- coverage bucketing (CovRL-Fuzz inspired) -----------------------


def test_bucket_of_distinguishes_qualitative_input_classes():
    """Each representative value maps onto its expected coverage bucket label."""
    # A list of pairs rather than a dict: True == 1 and 0 == False would
    # collide as dict keys.
    expectations = [
        (0, "int:zero"),
        (-1, "int:negative"),
        (5, "int:small"),
        (50, "int:medium"),
        (5000, "int:large"),
        (50_000, "int:huge"),
        ("", "str:empty"),
        ("a", "str:singleton"),
        ([], "list:empty"),
        ((1, 2), "tuple:short"),
        (True, "bool:True"),  # bool is kept separate from int
        (None, "none"),
    ]
    for sample, bucket in expectations:
        assert _bucket_of(sample) == bucket


def test_probe_distinct_buckets_each_pay_coverage_bonus():
    """The coverage bonus is paid once per new bucket, not once per probe."""
    sleuth = OpenSleuthEnv()
    episode = sleuth.reset("fibonacci").episode_id
    # Probe order: 1 lands in "small", 50 in "medium", 5 in "small" again.
    small, medium, small_again = (
        sleuth.step(episode, ProbeAction(input_repr=literal))
        for literal in ("1", "50", "5")
    )
    assert small.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert medium.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert small_again.info["coverage_bonus"] == pytest.approx(0.0)
    assert small_again.observation.coverage_buckets_seen == 2


# ---------- verifier -------------------------------------------------------


def test_verifier_perfect_score_on_reference_impl():
    """A faithful fibonacci re-implementation matches every fuzz and edge input."""
    fuzz_count = 30
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    source_lines = (
        "def fibonacci(n):",
        "    if not isinstance(n, int) or n <= 0 or n > 90:",
        "        raise ValueError('bad')",
        "    a, b = 0, 1",
        "    for _ in range(n - 1):",
        "        a, b = b, a + b",
        "    return b",
    )
    code = "\n".join(source_lines) + "\n"
    fuzz_inputs = generate_fuzz_inputs(spec, count=fuzz_count, seed=0)
    edge_inputs = get_edge_inputs(spec)
    verdict = verify_submission(
        code,
        spec.fn,
        fuzz_inputs,
        target_name="fibonacci",
        edge_inputs=edge_inputs,
    )
    # Every random and every edge input must agree with the reference.
    assert verdict.matches == fuzz_count + len(edge_inputs)
    assert verdict.execution_reward == pytest.approx(100.0)
    assert verdict.edge_pass_rate == pytest.approx(1.0)
    # A perfect submission incurs no penalties.
    assert verdict.floor_penalty == 0.0
    assert verdict.reward_hack_penalty == 0.0


def test_verifier_partial_score_on_buggy_impl():
    """An off-by-one fibonacci matches nothing and incurs the floor penalty."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    source_lines = (
        "def fibonacci(n):",
        "    if not isinstance(n, int) or n <= 0 or n > 90:",
        "        raise ValueError('bad')",
        "    a, b = 0, 1",
        "    for _ in range(n - 1):",
        "        a, b = b, a + b",
        "    return b + 1",  # the deliberate bug: every result is one too large
    )
    buggy = "\n".join(source_lines) + "\n"
    fuzz_inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    verdict = verify_submission(buggy, spec.fn, fuzz_inputs, target_name="fibonacci")
    assert verdict.execution_reward == pytest.approx(0.0)
    assert verdict.matches == 0
    # A match rate below 50% triggers the hard floor.
    assert verdict.floor_penalty == 25.0


def test_verifier_syntax_error_returns_define_error_and_full_penalty():
    """Unparseable code yields a define_error, zero reward and the 50/25 penalties."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    broken_source = "def fib(:\n  pass"
    fuzz_inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    verdict = verify_submission(
        broken_source, spec.fn, fuzz_inputs, target_name="fibonacci"
    )
    assert verdict.define_error is not None
    assert verdict.execution_reward == 0.0
    assert verdict.complexity_penalty == 50.0
    assert verdict.floor_penalty == 25.0


def test_verifier_missing_target_returns_error():
    """Code that never defines the target function is a define_error with zero reward."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    wrong_name_source = "def other(x): return x"
    fuzz_inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    verdict = verify_submission(
        wrong_name_source, spec.fn, fuzz_inputs, target_name="fibonacci"
    )
    assert verdict.define_error is not None
    assert verdict.execution_reward == 0.0


def test_complexity_penalty_low_for_simple_fn():
    """A one-line identity function is penalised by less than 1.0."""
    penalty = calculate_complexity_penalty("def f(x): return x\n")
    assert penalty < 1.0


def test_complexity_penalty_high_for_branchy_fn():
    """A function made of 100 consecutive `if` branches is penalised by more than 5.0."""
    branches = [f"    if x == {i}: return {i}" for i in range(100)]
    source_lines = ["def f(x):", *branches, "    return -1"]
    code = "\n".join(source_lines) + "\n"
    penalty = calculate_complexity_penalty(code)
    assert penalty > 5.0


# ---------- anti-reward-hacking --------------------------------------------


def test_sandbox_blocks_import_of_reference_module():
    """Regression guard against importing the reference implementation.

    An agent used to be able to submit::

        from opensleuth_env.black_box import _fibonacci
        def fibonacci(n): return _fibonacci(n)

    and obtain a perfect score by delegating to the hidden function. The
    hardened sandbox has to stop that from paying off.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    hack = "".join(
        (
            "def fibonacci(n):\n",
            "    from opensleuth_env.black_box import _fibonacci\n",
            "    return _fibonacci(n)\n",
        )
    )
    fuzz_inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    verdict = verify_submission(hack, spec.fn, fuzz_inputs, target_name="fibonacci")
    # Whether the failure happens at definition time (no __import__) or on
    # each call, the submission must stay below half of the execution reward.
    assert verdict.execution_reward < 50.0
    # The static detector must also have flagged the import attempt.
    assert verdict.reward_hack_penalty >= 25.0


def test_static_detector_flags_opensleuth_import():
    """The static check flags `import opensleuth_env` and passes import-free code."""
    clean = "def f(x): return x\n"
    tainted = "import opensleuth_env\n" + clean
    assert _looks_like_reference_import(tainted) is True
    assert _looks_like_reference_import(clean) is False


def test_constant_function_collapse_is_penalised():
    """A submission that always returns one value earns a reward-hack penalty.

    The penalty must apply even when the constant happens to be right for
    some inputs (for `digit_sum`, `lambda x: 0` is only correct at x=0).
    """
    spec = BLACK_BOX_FUNCTIONS["digit_sum"]
    constant_source = "def digit_sum(n):\n    return 0\n"
    distinct_inputs = [*range(1, 11), 100, 999]
    verdict = verify_submission(
        constant_source, spec.fn, distinct_inputs, target_name="digit_sum"
    )
    # The submission yields a single output signature across all inputs,
    # whereas the reference yields many different ones.
    assert verdict.reward_hack_penalty >= 15.0


def test_sandbox_blocks_open_and_eval():
    """Submissions that call `open` or `eval` must not earn a passing score.

    Each forbidden builtin is submitted in its own otherwise-trivial
    `fibonacci` body, so a failure message identifies which one slipped
    through. The bound on the reward is deliberately loose: the sandbox may
    reject the code at definition time, or each call may fail with a
    NameError at run time; either way the execution reward has to stay low.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    inputs = generate_fuzz_inputs(spec, count=5, seed=0)
    forbidden_calls = {
        "open": "    open('/tmp/x', 'w')\n",
        "eval": "    eval('1 + 1')\n",
    }
    for builtin_name, call_line in forbidden_calls.items():
        bad = "def fibonacci(n):\n" + call_line + "    return 0\n"
        result = verify_submission(bad, spec.fn, inputs, target_name="fibonacci")
        assert result.execution_reward < 50.0, (
            f"submission using {builtin_name}() scored {result.execution_reward!r}"
        )


# ---------- stratified scoring (edge vs random) ----------------------------


def test_edge_cases_are_always_evaluated():
    """Edge inputs are scored separately, so a single edge failure is visible."""
    spec = BLACK_BOX_FUNCTIONS["reverse_string"]
    # This submission is correct for every non-empty string but deliberately
    # wrong for the empty-string edge case.
    code = "".join(
        (
            "def reverse_string(s):\n",
            "    if s == '':\n",
            "        return 'OOPS'\n",
            "    return s[::-1]\n",
        )
    )
    fuzz_inputs = generate_fuzz_inputs(spec, count=20, seed=0)
    edge_inputs = get_edge_inputs(spec)
    assert "" in edge_inputs
    verdict = verify_submission(
        code,
        spec.fn,
        fuzz_inputs,
        target_name="reverse_string",
        edge_inputs=edge_inputs,
    )
    per_category = verdict.matches_by_category
    # Exactly one edge case (the empty string) is expected to fail.
    assert per_category["edge"] == len(edge_inputs) - 1
    assert verdict.edge_pass_rate < 1.0
    # Random inputs very rarely roll an empty string, so nearly all match.
    assert per_category["random"] >= 18


# ---------- end-to-end submission via env ----------------------------------


def test_env_submit_reference_implementation_gives_high_reward():
    """Submitting a correct reverse_string ends the episode with reward above 140."""
    sleuth = OpenSleuthEnv(fuzz_count=20)
    episode = sleuth.reset("reverse_string").episode_id
    submission = SubmitAction(code="def reverse_string(s):\n    return s[::-1]\n")
    outcome = sleuth.step(episode, submission)
    assert outcome.done is True
    # Expected total: 100 execution reward, minus a tiny complexity penalty,
    # plus the 50-point perfect bonus.
    assert outcome.reward > 140.0

    breakdown = outcome.info
    assert breakdown["execution_reward"] == pytest.approx(100.0)
    assert breakdown["edge_pass_rate"] == pytest.approx(1.0)
    assert breakdown["floor_penalty"] == 0.0
    assert breakdown["reward_hack_penalty"] == 0.0
    assert breakdown["perfect_bonus"] == 50.0


def test_env_submit_buggy_function_lands_clearly_negative():
    """A wrong submission must receive a negative total reward.

    This keeps the trainer's GRPO advantage from rewarding a policy of
    'just emit any function'.
    """
    sleuth = OpenSleuthEnv(fuzz_count=10)
    episode = sleuth.reset("digit_sum").episode_id
    wrong_source = "def digit_sum(n):\n    return -1\n"
    outcome = sleuth.step(episode, SubmitAction(code=wrong_source))
    assert outcome.done is True
    assert outcome.info["execution_reward"] < 50.0
    assert outcome.reward < 0.0
    assert outcome.info["floor_penalty"] == 25.0


def test_env_submit_import_hack_scores_clearly_negative():
    """Importing the reference inside a submission yields a negative reward and a hack penalty."""
    sleuth = OpenSleuthEnv(fuzz_count=10)
    episode = sleuth.reset("fibonacci").episode_id
    hack_source = "".join(
        (
            "def fibonacci(n):\n",
            "    from opensleuth_env.black_box import _fibonacci\n",
            "    return _fibonacci(n)\n",
        )
    )
    outcome = sleuth.step(episode, SubmitAction(code=hack_source))
    assert outcome.done is True
    assert outcome.reward < 0.0
    assert outcome.info["reward_hack_penalty"] >= 25.0


# ---------- spec metadata --------------------------------------------------


def test_all_specs_have_difficulty_and_edge_cases():
    """Every registered spec declares a valid difficulty and at least 3 edge cases."""
    valid = {"easy", "medium", "hard"}
    # Guard against a vacuous pass: with an empty registry the loop below
    # would never execute a single assertion.
    assert BLACK_BOX_FUNCTIONS, "BLACK_BOX_FUNCTIONS registry is empty"
    for name, spec in BLACK_BOX_FUNCTIONS.items():
        assert spec.difficulty in valid, f"{name} has invalid difficulty {spec.difficulty!r}"
        assert isinstance(spec.edge_cases, list), (
            f"{name} edge_cases should be a list, got {type(spec.edge_cases).__name__}"
        )
        assert len(spec.edge_cases) >= 3, f"{name} should declare >=3 edge cases for robust scoring"