"""Unit tests for the OpenSleuth env + verifier.

Run with `pytest -q` from the env/ directory.
"""

from __future__ import annotations

import pytest

from opensleuth_env import (
    BLACK_BOX_FUNCTIONS,
    OpenSleuthEnv,
    ProbeAction,
    SubmitAction,
)
from opensleuth_env.env import (
    _bucket_of,
    NEW_BUCKET_BONUS,
    NEW_OUTPUT_BONUS,
    PROBE_STEP_COST,
)
from opensleuth_env.verifier import (
    calculate_complexity_penalty,
    generate_fuzz_inputs,
    get_edge_inputs,
    verify_submission,
    _looks_like_reference_import,
)


# ---------- env transitions ------------------------------------------------


def test_reset_returns_episode_id_and_signature():
    """reset() yields a fresh observation describing the requested target."""
    env = OpenSleuthEnv()
    obs = env.reset("fibonacci")
    assert obs.episode_id
    assert obs.target_function_name == "fibonacci"
    assert "fibonacci" in obs.target_function_signature
    assert obs.probe_history == []
    assert obs.steps_taken == 0
    # New v0.3 metadata.
    assert obs.difficulty == "easy"
    assert obs.coverage_buckets_seen == 0


def test_unknown_target_raises():
    """Resetting onto a target name that is not registered raises ValueError."""
    env = OpenSleuthEnv()
    with pytest.raises(ValueError):
        env.reset("not_a_real_function")


def test_probe_with_int_input_records_output():
    """A successful int probe is recorded and pays both first-time bonuses."""
    env = OpenSleuthEnv()
    obs = env.reset("fibonacci")
    resp = env.step(obs.episode_id, ProbeAction(input_repr="10"))
    assert resp.done is False
    assert resp.observation.probe_history[-1].is_error is False
    assert resp.observation.probe_history[-1].output_repr == "55"
    # First successful probe = NEW_OUTPUT_BONUS + NEW_BUCKET_BONUS + PROBE_STEP_COST.
    expected = NEW_OUTPUT_BONUS + NEW_BUCKET_BONUS + PROBE_STEP_COST
    assert resp.reward == pytest.approx(expected)
    assert resp.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert resp.info["bucket"] == "int:medium"
    assert resp.observation.coverage_buckets_seen == 1
    assert resp.observation.seen_outputs_count == 1


def test_probe_with_invalid_literal_returns_parse_error():
    """An unparsable probe input is recorded as a ParseError, not a crash."""
    env = OpenSleuthEnv()
    obs = env.reset("fibonacci")
    resp = env.step(obs.episode_id, ProbeAction(input_repr="not a literal"))
    assert resp.done is False
    assert resp.observation.probe_history[-1].error_type == "ParseError"


def test_repeated_output_only_pays_intrinsic_once():
    """Repeating the exact same probe earns nothing beyond the step cost."""
    env = OpenSleuthEnv()
    obs = env.reset("fibonacci")
    r1 = env.step(obs.episode_id, ProbeAction(input_repr="10"))
    r2 = env.step(obs.episode_id, ProbeAction(input_repr="10"))
    assert r1.reward > r2.reward
    # Second hit on the same bucket+output: just the per-step cost.
    assert r2.reward == pytest.approx(PROBE_STEP_COST)


def test_step_limit_terminates_episode():
    """The step that reaches max_steps reports done=True."""
    env = OpenSleuthEnv()
    obs = env.reset("fibonacci", max_steps=2)
    env.step(obs.episode_id, ProbeAction(input_repr="1"))
    resp = env.step(obs.episode_id, ProbeAction(input_repr="2"))
    assert resp.done is True


def test_unknown_episode_id_raises():
    """Stepping an episode id that was never issued raises KeyError."""
    env = OpenSleuthEnv()
    with pytest.raises(KeyError):
        env.step("does-not-exist", ProbeAction(input_repr="1"))


# ---------- coverage bucketing (CovRL-Fuzz inspired) -----------------------


def test_bucket_of_distinguishes_qualitative_input_classes():
    """_bucket_of maps representative values onto the expected bucket labels."""
    assert _bucket_of(0) == "int:zero"
    assert _bucket_of(-1) == "int:negative"
    assert _bucket_of(5) == "int:small"
    assert _bucket_of(50) == "int:medium"
    assert _bucket_of(5000) == "int:large"
    assert _bucket_of(50_000) == "int:huge"
    assert _bucket_of("") == "str:empty"
    assert _bucket_of("a") == "str:singleton"
    assert _bucket_of([]) == "list:empty"
    assert _bucket_of((1, 2)) == "tuple:short"
    assert _bucket_of(True) == "bool:True"  # bool isolated from int
    assert _bucket_of(None) == "none"


def test_probe_distinct_buckets_each_pay_coverage_bonus():
    """Only the first probe landing in a given bucket earns the coverage bonus."""
    env = OpenSleuthEnv()
    obs = env.reset("fibonacci")
    # 1 (small), 50 (medium), 5 (already small)
    r1 = env.step(obs.episode_id, ProbeAction(input_repr="1"))
    r2 = env.step(obs.episode_id, ProbeAction(input_repr="50"))
    r3 = env.step(obs.episode_id, ProbeAction(input_repr="5"))
    assert r1.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert r2.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert r3.info["coverage_bonus"] == pytest.approx(0.0)
    assert r3.observation.coverage_buckets_seen == 2


# ---------- verifier -------------------------------------------------------


def test_verifier_perfect_score_on_reference_impl():
    """A correct re-implementation matches on every fuzz and edge input."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    # The nested lines must be indented deeper than their `if`/`for` headers,
    # otherwise the snippet fails to compile and never reaches the comparison.
    code = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    edges = get_edge_inputs(spec)
    result = verify_submission(
        code, spec.fn, inputs, target_name="fibonacci", edge_inputs=edges
    )
    assert result.matches == 30 + len(edges)
    assert result.execution_reward == pytest.approx(100.0)
    assert result.edge_pass_rate == pytest.approx(1.0)
    assert result.floor_penalty == 0.0
    assert result.reward_hack_penalty == 0.0


def test_verifier_partial_score_on_buggy_impl():
    """An off-by-one implementation matches nothing and hits the hard floor."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    # Syntactically valid on purpose: the mismatch must come from `b + 1`,
    # not from a definition-time error.
    buggy = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b + 1\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    result = verify_submission(buggy, spec.fn, inputs, target_name="fibonacci")
    assert result.execution_reward == pytest.approx(0.0)
    assert result.matches == 0
    # Sub-50% match rate triggers the hard floor.
    assert result.floor_penalty == 25.0


def test_verifier_syntax_error_returns_define_error_and_full_penalty():
    """Uncompilable code reports a define_error plus the maximum penalties."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    result = verify_submission(
        "def fib(:\n pass", spec.fn, inputs, target_name="fibonacci"
    )
    assert result.define_error is not None
    assert result.execution_reward == 0.0
    assert result.complexity_penalty == 50.0
    assert result.floor_penalty == 25.0


def test_verifier_missing_target_returns_error():
    """Code that never defines the target function reports a define_error."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    result = verify_submission(
        "def other(x): return x", spec.fn, inputs, target_name="fibonacci"
    )
    assert result.define_error is not None
    assert result.execution_reward == 0.0


def test_complexity_penalty_low_for_simple_fn():
    """A one-line identity function incurs a penalty below 1.0."""
    code = "def f(x): return x\n"
    assert calculate_complexity_penalty(code) < 1.0


def test_complexity_penalty_high_for_branchy_fn():
    """A function made of 100 `if` branches incurs a penalty above 5.0."""
    body = "\n ".join(f"if x == {i}: return {i}" for i in range(100))
    code = f"def f(x):\n {body}\n return -1\n"
    assert calculate_complexity_penalty(code) > 5.0


# ---------- anti-reward-hacking --------------------------------------------


def test_sandbox_blocks_import_of_reference_module():
    """Critical regression: previously an agent could write::

        from opensleuth_env.black_box import _fibonacci
        def fibonacci(n): return _fibonacci(n)

    and reward-hack to a perfect score. The hardened sandbox must block this.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    hack = (
        "def fibonacci(n):\n"
        " from opensleuth_env.black_box import _fibonacci\n"
        " return _fibonacci(n)\n"
    )
    inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    result = verify_submission(hack, spec.fn, inputs, target_name="fibonacci")
    # Either definition fails (no __import__) or per-call fails. Either way
    # the agent must NOT score positively.
    assert result.execution_reward < 50.0
    # Static detector flagged the import attempt.
    assert result.reward_hack_penalty >= 25.0


def test_static_detector_flags_opensleuth_import():
    """The static detector flags an opensleuth_env import and ignores clean code."""
    code = "import opensleuth_env\ndef f(x): return x\n"
    assert _looks_like_reference_import(code) is True
    assert _looks_like_reference_import("def f(x): return x\n") is False


def test_constant_function_collapse_is_penalised():
    """An agent that learns to always return the same value should be
    penalised even if some random inputs happen to match (e.g. for
    `digit_sum`, `lambda x: 0` matches only x=0)."""
    spec = BLACK_BOX_FUNCTIONS["digit_sum"]
    code = "def digit_sum(n):\n return 0\n"
    inputs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100, 999]
    result = verify_submission(code, spec.fn, inputs, target_name="digit_sum")
    # All distinct inputs return 0 (one signature) while ref produces many.
    assert result.reward_hack_penalty >= 15.0


def test_sandbox_blocks_open_and_eval():
    """Submissions that reach for `open` or `eval` must not earn a passing score."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    uses_open = (
        "def fibonacci(n):\n"
        " open('/tmp/x', 'w')\n"
        " return 0\n"
    )
    # Covers the `eval` half of the test name, which had no snippet before.
    uses_eval = (
        "def fibonacci(n):\n"
        "    return eval('0')\n"
    )
    inputs = generate_fuzz_inputs(spec, count=5, seed=0)
    for bad in (uses_open, uses_eval):
        result = verify_submission(bad, spec.fn, inputs, target_name="fibonacci")
        # Either the per-call NameError on the blocked builtin makes everything
        # mismatch, or it raises at definition time. Either way, low reward.
        assert result.execution_reward < 50.0, bad


# ---------- stratified scoring (edge vs random) ----------------------------


def test_edge_cases_are_always_evaluated():
    """Edge inputs are scored separately, so one failing edge case is visible."""
    spec = BLACK_BOX_FUNCTIONS["reverse_string"]
    # Submission that fails the empty-string edge case but works for non-empty.
    code = (
        "def reverse_string(s):\n"
        "    if s == '':\n"
        "        return 'OOPS'\n"
        "    return s[::-1]\n"
    )
    inputs = generate_fuzz_inputs(spec, count=20, seed=0)
    edges = get_edge_inputs(spec)
    assert "" in edges
    result = verify_submission(
        code, spec.fn, inputs, target_name="reverse_string", edge_inputs=edges
    )
    # Should pass most random + most edge except the empty-string edge case.
    assert result.matches_by_category["edge"] == len(edges) - 1
    assert result.edge_pass_rate < 1.0
    assert result.matches_by_category["random"] >= 18  # very rare to roll empty


# ---------- end-to-end submission via env ----------------------------------


def test_env_submit_reference_implementation_gives_high_reward():
    """Submitting a correct implementation ends the episode with the full bonus."""
    env = OpenSleuthEnv(fuzz_count=20)
    obs = env.reset("reverse_string")
    code = "def reverse_string(s):\n return s[::-1]\n"
    resp = env.step(obs.episode_id, SubmitAction(code=code))
    assert resp.done is True
    # 100 - tiny complexity penalty + 50 perfect bonus.
    assert resp.reward > 140.0
    assert resp.info["execution_reward"] == pytest.approx(100.0)
    assert resp.info["edge_pass_rate"] == pytest.approx(1.0)
    assert resp.info["floor_penalty"] == 0.0
    assert resp.info["reward_hack_penalty"] == 0.0
    assert resp.info["perfect_bonus"] == 50.0


def test_env_submit_buggy_function_lands_clearly_negative():
    """Wrong submissions must end up clearly negative so the trainer's GRPO
    advantage penalises 'just emit any function'."""
    env = OpenSleuthEnv(fuzz_count=10)
    obs = env.reset("digit_sum")
    code = "def digit_sum(n):\n return -1\n"
    resp = env.step(obs.episode_id, SubmitAction(code=code))
    assert resp.done is True
    assert resp.info["execution_reward"] < 50.0
    assert resp.reward < 0.0
    assert resp.info["floor_penalty"] == 25.0


def test_env_submit_import_hack_scores_clearly_negative():
    """Importing the reference implementation through the env scores negative."""
    env = OpenSleuthEnv(fuzz_count=10)
    obs = env.reset("fibonacci")
    code = (
        "def fibonacci(n):\n"
        " from opensleuth_env.black_box import _fibonacci\n"
        " return _fibonacci(n)\n"
    )
    resp = env.step(obs.episode_id, SubmitAction(code=code))
    assert resp.done is True
    assert resp.reward < 0.0
    assert resp.info["reward_hack_penalty"] >= 25.0


# ---------- spec metadata --------------------------------------------------


def test_all_specs_have_difficulty_and_edge_cases():
    """Every registered spec has a valid difficulty and at least 3 edge cases."""
    valid = {"easy", "medium", "hard"}
    for name, spec in BLACK_BOX_FUNCTIONS.items():
        assert spec.difficulty in valid, (
            f"{name} has invalid difficulty {spec.difficulty!r}"
        )
        assert isinstance(spec.edge_cases, list)
        assert len(spec.edge_cases) >= 3, (
            f"{name} should declare >=3 edge cases for robust scoring"
        )