| """Unit tests for the OpenSleuth env + verifier. |
| |
| Run with `pytest -q` from the env/ directory. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import pytest |
|
|
| from opensleuth_env import ( |
| BLACK_BOX_FUNCTIONS, |
| OpenSleuthEnv, |
| ProbeAction, |
| SubmitAction, |
| ) |
| from opensleuth_env.env import _bucket_of, NEW_BUCKET_BONUS, NEW_OUTPUT_BONUS, PROBE_STEP_COST |
| from opensleuth_env.verifier import ( |
| calculate_complexity_penalty, |
| generate_fuzz_inputs, |
| get_edge_inputs, |
| verify_submission, |
| _looks_like_reference_import, |
| ) |
|
|
|
|
| |
|
|
|
|
def test_reset_returns_episode_id_and_signature():
    """Resetting to a known target yields a fresh, fully-populated observation."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    assert observation.episode_id
    assert observation.target_function_name == "fibonacci"
    assert "fibonacci" in observation.target_function_signature
    assert observation.probe_history == []
    assert observation.steps_taken == 0
    # Target metadata is exposed up front, before any probing happens.
    assert observation.difficulty == "easy"
    assert observation.coverage_buckets_seen == 0
|
|
|
|
def test_unknown_target_raises():
    """Requesting a target that does not exist must fail loudly with ValueError."""
    environment = OpenSleuthEnv()
    with pytest.raises(ValueError):
        environment.reset("not_a_real_function")
|
|
|
|
def test_probe_with_int_input_records_output():
    """A successful probe records its output and pays both novelty bonuses."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    response = environment.step(observation.episode_id, ProbeAction(input_repr="10"))
    latest_probe = response.observation.probe_history[-1]
    assert response.done is False
    assert latest_probe.is_error is False
    assert latest_probe.output_repr == "55"
    # First probe ever: new output bonus + new bucket bonus + per-step cost.
    expected_reward = NEW_OUTPUT_BONUS + NEW_BUCKET_BONUS + PROBE_STEP_COST
    assert response.reward == pytest.approx(expected_reward)
    assert response.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert response.info["bucket"] == "int:medium"
    assert response.observation.coverage_buckets_seen == 1
    assert response.observation.seen_outputs_count == 1
|
|
|
|
def test_probe_with_invalid_literal_returns_parse_error():
    """Unparseable probe input is reported as a ParseError, not a crash."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    response = environment.step(observation.episode_id, ProbeAction(input_repr="not a literal"))
    assert response.done is False
    assert response.observation.probe_history[-1].error_type == "ParseError"
|
|
|
|
def test_repeated_output_only_pays_intrinsic_once():
    """Replaying an identical probe must not re-pay the novelty bonuses."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    first = environment.step(observation.episode_id, ProbeAction(input_repr="10"))
    second = environment.step(observation.episode_id, ProbeAction(input_repr="10"))
    assert first.reward > second.reward
    # The duplicate earns nothing beyond the flat per-step cost.
    assert second.reward == pytest.approx(PROBE_STEP_COST)
|
|
|
|
def test_step_limit_terminates_episode():
    """The episode ends once max_steps probes have been spent."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci", max_steps=2)
    environment.step(observation.episode_id, ProbeAction(input_repr="1"))
    final = environment.step(observation.episode_id, ProbeAction(input_repr="2"))
    assert final.done is True
|
|
|
|
def test_unknown_episode_id_raises():
    """Stepping an episode id that was never issued must raise KeyError."""
    environment = OpenSleuthEnv()
    with pytest.raises(KeyError):
        environment.step("does-not-exist", ProbeAction(input_repr="1"))
|
|
|
|
| |
|
|
|
|
def test_bucket_of_distinguishes_qualitative_input_classes():
    """_bucket_of maps representative values to distinct qualitative buckets."""
    expectations = [
        (0, "int:zero"),
        (-1, "int:negative"),
        (5, "int:small"),
        (50, "int:medium"),
        (5000, "int:large"),
        (50_000, "int:huge"),
        ("", "str:empty"),
        ("a", "str:singleton"),
        ([], "list:empty"),
        ((1, 2), "tuple:short"),
        (True, "bool:True"),
        (None, "none"),
    ]
    for value, expected_bucket in expectations:
        assert _bucket_of(value) == expected_bucket, (
            f"{value!r} should bucket as {expected_bucket}"
        )
|
|
|
|
def test_probe_distinct_buckets_each_pay_coverage_bonus():
    """Only the first probe that lands in a bucket earns the coverage bonus."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    # 1 and 50 land in different int buckets; 5 revisits the bucket of 1.
    first = environment.step(observation.episode_id, ProbeAction(input_repr="1"))
    second = environment.step(observation.episode_id, ProbeAction(input_repr="50"))
    third = environment.step(observation.episode_id, ProbeAction(input_repr="5"))
    assert first.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert second.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert third.info["coverage_bonus"] == pytest.approx(0.0)
    assert third.observation.coverage_buckets_seen == 2
|
|
|
|
| |
|
|
|
|
def test_verifier_perfect_score_on_reference_impl():
    """A behaviorally-correct reimplementation earns the maximum execution score.

    The submitted code mirrors the reference fibonacci contract, including the
    input validation (ints in 1..90 only), so every fuzz and edge input must
    match and no penalty may apply.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    code = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    edges = get_edge_inputs(spec)
    result = verify_submission(code, spec.fn, inputs, target_name="fibonacci", edge_inputs=edges)
    # All 30 random inputs plus every declared edge input must agree.
    assert result.matches == 30 + len(edges)
    assert result.execution_reward == pytest.approx(100.0)
    assert result.edge_pass_rate == pytest.approx(1.0)
    # No penalties: the submission is neither degenerate nor a reward hack.
    assert result.floor_penalty == 0.0
    assert result.reward_hack_penalty == 0.0
|
|
|
|
def test_verifier_partial_score_on_buggy_impl():
    """An off-by-one implementation (returns b + 1) matches nothing and is floored."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    # Identical to the reference except for the final `+ 1`, so every
    # non-erroring input produces a wrong value.
    buggy = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b + 1\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    result = verify_submission(buggy, spec.fn, inputs, target_name="fibonacci")
    assert result.execution_reward == pytest.approx(0.0)
    assert result.matches == 0
    # Zero matches additionally triggers the flat floor penalty.
    assert result.floor_penalty == 25.0
|
|
|
|
def test_verifier_syntax_error_returns_define_error_and_full_penalty():
    """Code that cannot even be compiled gets zero reward and full penalties."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    fuzz = generate_fuzz_inputs(spec, count=10, seed=0)
    outcome = verify_submission("def fib(:\n pass", spec.fn, fuzz, target_name="fibonacci")
    assert outcome.define_error is not None
    assert outcome.execution_reward == 0.0
    assert outcome.complexity_penalty == 50.0
    assert outcome.floor_penalty == 25.0
|
|
|
|
def test_verifier_missing_target_returns_error():
    """Code that never defines the target function is a define error."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    fuzz = generate_fuzz_inputs(spec, count=10, seed=0)
    outcome = verify_submission("def other(x): return x", spec.fn, fuzz, target_name="fibonacci")
    assert outcome.define_error is not None
    assert outcome.execution_reward == 0.0
|
|
|
|
def test_complexity_penalty_low_for_simple_fn():
    """A trivial one-liner should incur (almost) no complexity penalty."""
    simple_source = "def f(x): return x\n"
    assert calculate_complexity_penalty(simple_source) < 1.0
|
|
|
|
def test_complexity_penalty_high_for_branchy_fn():
    """A 100-branch lookup-table function must be penalised as over-complex."""
    branches = "\n    ".join(f"if x == {i}: return {i}" for i in range(100))
    branchy_source = f"def f(x):\n    {branches}\n    return -1\n"
    assert calculate_complexity_penalty(branchy_source) > 5.0
|
|
|
|
| |
|
|
|
|
def test_sandbox_blocks_import_of_reference_module():
    """Critical regression: previously an agent could write::

        from opensleuth_env.black_box import _fibonacci
        def fibonacci(n): return _fibonacci(n)

    and reward-hack to a perfect score. The hardened sandbox must block this.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    hack = (
        "def fibonacci(n):\n"
        "    from opensleuth_env.black_box import _fibonacci\n"
        "    return _fibonacci(n)\n"
    )
    inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    result = verify_submission(hack, spec.fn, inputs, target_name="fibonacci")
    # The import must fail inside the sandbox, so the calls cannot all match...
    assert result.execution_reward < 50.0
    # ...and the reward-hack detection must additionally levy its penalty.
    assert result.reward_hack_penalty >= 25.0
|
|
|
|
def test_static_detector_flags_opensleuth_import():
    """The static scan flags any import of the env package itself."""
    hacked_code = "import opensleuth_env\ndef f(x): return x\n"
    clean_code = "def f(x): return x\n"
    assert _looks_like_reference_import(hacked_code) is True
    assert _looks_like_reference_import(clean_code) is False
|
|
|
|
def test_constant_function_collapse_is_penalised():
    """A submission that always returns the same constant must be penalised,
    even if a few inputs happen to match (e.g. for `digit_sum`, returning 0
    is only correct at n == 0)."""
    spec = BLACK_BOX_FUNCTIONS["digit_sum"]
    constant_code = "def digit_sum(n):\n    return 0\n"
    probe_inputs = list(range(1, 11)) + [100, 999]
    result = verify_submission(constant_code, spec.fn, probe_inputs, target_name="digit_sum")
    assert result.reward_hack_penalty >= 15.0
|
|
|
|
def test_sandbox_blocks_open_and_eval():
    """A submission that touches the filesystem must not be able to score well."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    bad = (
        "def fibonacci(n):\n"
        "    open('/tmp/x', 'w')\n"
        "    return 0\n"
    )
    inputs = generate_fuzz_inputs(spec, count=5, seed=0)
    result = verify_submission(bad, spec.fn, inputs, target_name="fibonacci")
    # `open` should be blocked in the sandbox, so the submission cannot
    # come close to a passing execution score.
    assert result.execution_reward < 50.0
|
|
|
|
| |
|
|
|
|
def test_edge_cases_are_always_evaluated():
    """Declared edge inputs are scored even when random fuzzing might miss the bug."""
    spec = BLACK_BOX_FUNCTIONS["reverse_string"]
    # Correct everywhere except on the empty string -- an edge-case-only bug.
    code = (
        "def reverse_string(s):\n"
        "    if s == '':\n"
        "        return 'OOPS'\n"
        "    return s[::-1]\n"
    )
    inputs = generate_fuzz_inputs(spec, count=20, seed=0)
    edges = get_edge_inputs(spec)
    assert "" in edges
    result = verify_submission(
        code, spec.fn, inputs, target_name="reverse_string", edge_inputs=edges
    )
    # Exactly the empty-string edge input fails; every other edge passes.
    assert result.matches_by_category["edge"] == len(edges) - 1
    assert result.edge_pass_rate < 1.0
    # Random fuzzing may occasionally draw "" too, hence >= 18 rather than == 20.
    assert result.matches_by_category["random"] >= 18
|
|
|
|
| |
|
|
|
|
def test_env_submit_reference_implementation_gives_high_reward():
    """A correct submission ends the episode with the full score plus bonus."""
    environment = OpenSleuthEnv(fuzz_count=20)
    observation = environment.reset("reverse_string")
    submission = "def reverse_string(s):\n    return s[::-1]\n"
    response = environment.step(observation.episode_id, SubmitAction(code=submission))
    assert response.done is True
    # Full execution reward plus the perfect bonus should exceed 140.
    assert response.reward > 140.0
    assert response.info["execution_reward"] == pytest.approx(100.0)
    assert response.info["edge_pass_rate"] == pytest.approx(1.0)
    assert response.info["floor_penalty"] == 0.0
    assert response.info["reward_hack_penalty"] == 0.0
    assert response.info["perfect_bonus"] == 50.0
|
|
|
|
def test_env_submit_buggy_function_lands_clearly_negative():
    """Wrong submissions must end up clearly negative so the trainer's GRPO
    advantage penalises 'just emit any function'."""
    environment = OpenSleuthEnv(fuzz_count=10)
    observation = environment.reset("digit_sum")
    submission = "def digit_sum(n):\n    return -1\n"
    response = environment.step(observation.episode_id, SubmitAction(code=submission))
    assert response.done is True
    assert response.info["execution_reward"] < 50.0
    assert response.reward < 0.0
    assert response.info["floor_penalty"] == 25.0
|
|
|
|
def test_env_submit_import_hack_scores_clearly_negative():
    """End-to-end: the import reward hack must yield a negative episode reward.

    Exercises the same hack as the verifier-level sandbox test, but routed
    through the env's submit path so the episode-level reward is checked too.
    """
    env = OpenSleuthEnv(fuzz_count=10)
    obs = env.reset("fibonacci")
    code = (
        "def fibonacci(n):\n"
        "    from opensleuth_env.black_box import _fibonacci\n"
        "    return _fibonacci(n)\n"
    )
    resp = env.step(obs.episode_id, SubmitAction(code=code))
    assert resp.done is True
    assert resp.reward < 0.0
    assert resp.info["reward_hack_penalty"] >= 25.0
|
|
|
|
| |
|
|
|
|
def test_all_specs_have_difficulty_and_edge_cases():
    """Every registered spec declares a valid difficulty and enough edge cases."""
    allowed_difficulties = {"easy", "medium", "hard"}
    for name, spec in BLACK_BOX_FUNCTIONS.items():
        assert spec.difficulty in allowed_difficulties, (
            f"{name} has invalid difficulty {spec.difficulty!r}"
        )
        assert isinstance(spec.edge_cases, list)
        assert len(spec.edge_cases) >= 3, (
            f"{name} should declare >=3 edge cases for robust scoring"
        )
|
|