| """Unit tests for the OpenSleuth env + verifier. |
| |
| Run with `pytest -q` from the env/ directory. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import pytest |
|
|
| from opensleuth_env import ( |
| BLACK_BOX_FUNCTIONS, |
| OpenSleuthEnv, |
| ProbeAction, |
| SubmitAction, |
| ) |
| from opensleuth_env.env import _bucket_of, NEW_BUCKET_BONUS, NEW_OUTPUT_BONUS, PROBE_STEP_COST |
| from opensleuth_env.verifier import ( |
| calculate_complexity_penalty, |
| generate_fuzz_inputs, |
| get_edge_inputs, |
| verify_submission, |
| _looks_like_reference_import, |
| ) |
|
|
|
|
| |
|
|
|
|
def test_reset_returns_episode_id_and_signature():
    """Resetting to a known target yields a fresh, fully-populated observation."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    assert observation.episode_id
    assert observation.target_function_name == "fibonacci"
    assert "fibonacci" in observation.target_function_signature
    assert observation.probe_history == []
    assert observation.steps_taken == 0
    # Target metadata is exposed up front, before any probing happens.
    assert observation.difficulty == "easy"
    assert observation.coverage_buckets_seen == 0
|
|
|
|
def test_unknown_target_raises():
    """Requesting a target that does not exist must fail loudly with ValueError."""
    environment = OpenSleuthEnv()
    with pytest.raises(ValueError):
        environment.reset("not_a_real_function")
|
|
|
|
def test_probe_with_int_input_records_output():
    """A successful probe records its output and pays both novelty bonuses."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    response = environment.step(observation.episode_id, ProbeAction(input_repr="10"))
    latest_probe = response.observation.probe_history[-1]
    assert response.done is False
    assert latest_probe.is_error is False
    assert latest_probe.output_repr == "55"
    # First probe ever: new output bonus + new bucket bonus + per-step cost.
    expected_reward = NEW_OUTPUT_BONUS + NEW_BUCKET_BONUS + PROBE_STEP_COST
    assert response.reward == pytest.approx(expected_reward)
    assert response.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert response.info["bucket"] == "int:medium"
    assert response.observation.coverage_buckets_seen == 1
    assert response.observation.seen_outputs_count == 1
|
|
|
|
def test_probe_with_invalid_literal_returns_parse_error():
    """Unparseable probe input is reported as a ParseError, not a crash."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    response = environment.step(observation.episode_id, ProbeAction(input_repr="not a literal"))
    assert response.done is False
    assert response.observation.probe_history[-1].error_type == "ParseError"
|
|
|
|
def test_repeated_output_only_pays_intrinsic_once():
    """Replaying an identical probe must not re-pay the novelty bonuses."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    first = environment.step(observation.episode_id, ProbeAction(input_repr="10"))
    second = environment.step(observation.episode_id, ProbeAction(input_repr="10"))
    assert first.reward > second.reward
    # The duplicate earns nothing beyond the flat per-step cost.
    assert second.reward == pytest.approx(PROBE_STEP_COST)
|
|
|
|
def test_step_limit_terminates_episode():
    """The episode ends once max_steps probes have been spent."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci", max_steps=2)
    environment.step(observation.episode_id, ProbeAction(input_repr="1"))
    final = environment.step(observation.episode_id, ProbeAction(input_repr="2"))
    assert final.done is True
|
|
|
|
def test_unknown_episode_id_raises():
    """Stepping an episode id that was never issued must raise KeyError."""
    environment = OpenSleuthEnv()
    with pytest.raises(KeyError):
        environment.step("does-not-exist", ProbeAction(input_repr="1"))
|
|
|
|
| |
|
|
|
|
def test_bucket_of_distinguishes_qualitative_input_classes():
    """_bucket_of maps representative values to distinct qualitative buckets."""
    expectations = [
        (0, "int:zero"),
        (-1, "int:negative"),
        (5, "int:small"),
        (50, "int:medium"),
        (5000, "int:large"),
        (50_000, "int:huge"),
        ("", "str:empty"),
        ("a", "str:singleton"),
        ([], "list:empty"),
        ((1, 2), "tuple:short"),
        (True, "bool:True"),
        (None, "none"),
    ]
    for value, expected_bucket in expectations:
        assert _bucket_of(value) == expected_bucket, (
            f"{value!r} should bucket as {expected_bucket}"
        )
|
|
|
|
def test_probe_distinct_buckets_each_pay_coverage_bonus():
    """Only the first probe that lands in a bucket earns the coverage bonus."""
    environment = OpenSleuthEnv()
    observation = environment.reset("fibonacci")
    # 1 and 50 land in different int buckets; 5 revisits the bucket of 1.
    first = environment.step(observation.episode_id, ProbeAction(input_repr="1"))
    second = environment.step(observation.episode_id, ProbeAction(input_repr="50"))
    third = environment.step(observation.episode_id, ProbeAction(input_repr="5"))
    assert first.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert second.info["coverage_bonus"] == pytest.approx(NEW_BUCKET_BONUS)
    assert third.info["coverage_bonus"] == pytest.approx(0.0)
    assert third.observation.coverage_buckets_seen == 2
|
|
|
|
| |
|
|
|
|
def test_verifier_perfect_score_on_reference_impl():
    """A behaviorally-correct reimplementation earns the maximum execution score.

    The submitted code mirrors the reference fibonacci contract, including the
    input validation (ints in 1..90 only), so every fuzz and edge input must
    match and no penalty may apply.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    code = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    edges = get_edge_inputs(spec)
    result = verify_submission(code, spec.fn, inputs, target_name="fibonacci", edge_inputs=edges)
    # All 30 random inputs plus every declared edge input must agree.
    assert result.matches == 30 + len(edges)
    assert result.execution_reward == pytest.approx(100.0)
    assert result.edge_pass_rate == pytest.approx(1.0)
    # No penalties: the submission is neither degenerate nor a reward hack.
    assert result.floor_penalty == 0.0
    assert result.reward_hack_penalty == 0.0
|
|
|
|
def test_verifier_partial_score_on_buggy_impl():
    """An off-by-one implementation (returns b + 1) matches nothing and is floored."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    # Identical to the reference except for the final `+ 1`, so every
    # non-erroring input produces a wrong value.
    buggy = (
        "def fibonacci(n):\n"
        "    if not isinstance(n, int) or n <= 0 or n > 90:\n"
        "        raise ValueError('bad')\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n - 1):\n"
        "        a, b = b, a + b\n"
        "    return b + 1\n"
    )
    inputs = generate_fuzz_inputs(spec, count=30, seed=0)
    result = verify_submission(buggy, spec.fn, inputs, target_name="fibonacci")
    assert result.execution_reward == pytest.approx(0.0)
    assert result.matches == 0
    # Zero matches additionally triggers the flat floor penalty.
    assert result.floor_penalty == 25.0
|
|
|
|
def test_verifier_syntax_error_returns_define_error_and_full_penalty():
    """Code that cannot even be compiled gets zero reward and full penalties."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    fuzz = generate_fuzz_inputs(spec, count=10, seed=0)
    outcome = verify_submission("def fib(:\n pass", spec.fn, fuzz, target_name="fibonacci")
    assert outcome.define_error is not None
    assert outcome.execution_reward == 0.0
    assert outcome.complexity_penalty == 50.0
    assert outcome.floor_penalty == 25.0
|
|
|
|
def test_verifier_missing_target_returns_error():
    """Code that never defines the target function is a define error."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    fuzz = generate_fuzz_inputs(spec, count=10, seed=0)
    outcome = verify_submission("def other(x): return x", spec.fn, fuzz, target_name="fibonacci")
    assert outcome.define_error is not None
    assert outcome.execution_reward == 0.0
|
|
|
|
def test_complexity_penalty_low_for_simple_fn():
    """A trivial one-liner should incur (almost) no complexity penalty."""
    simple_source = "def f(x): return x\n"
    assert calculate_complexity_penalty(simple_source) < 1.0
|
|
|
|
def test_complexity_penalty_high_for_branchy_fn():
    """A 100-branch lookup-table function must be penalised as over-complex."""
    branches = "\n    ".join(f"if x == {i}: return {i}" for i in range(100))
    branchy_source = f"def f(x):\n    {branches}\n    return -1\n"
    assert calculate_complexity_penalty(branchy_source) > 5.0
|
|
|
|
| |
|
|
|
|
def test_sandbox_blocks_import_of_reference_module():
    """Critical regression: previously an agent could write::

        from opensleuth_env.black_box import _fibonacci
        def fibonacci(n): return _fibonacci(n)

    and reward-hack to a perfect score. The hardened sandbox must block this.
    """
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    hack = (
        "def fibonacci(n):\n"
        "    from opensleuth_env.black_box import _fibonacci\n"
        "    return _fibonacci(n)\n"
    )
    inputs = generate_fuzz_inputs(spec, count=10, seed=0)
    result = verify_submission(hack, spec.fn, inputs, target_name="fibonacci")
    # The import must fail inside the sandbox, so the calls cannot all match...
    assert result.execution_reward < 50.0
    # ...and the reward-hack detection must additionally levy its penalty.
    assert result.reward_hack_penalty >= 25.0
|
|
|
|
def test_static_detector_flags_opensleuth_import():
    """The static scan flags any import of the env package itself."""
    hacked_code = "import opensleuth_env\ndef f(x): return x\n"
    clean_code = "def f(x): return x\n"
    assert _looks_like_reference_import(hacked_code) is True
    assert _looks_like_reference_import(clean_code) is False
|
|
|
|
def test_constant_function_collapse_is_penalised():
    """A submission that always returns the same constant must be penalised,
    even if a few inputs happen to match (e.g. for `digit_sum`, returning 0
    is only correct at n == 0)."""
    spec = BLACK_BOX_FUNCTIONS["digit_sum"]
    constant_code = "def digit_sum(n):\n    return 0\n"
    probe_inputs = list(range(1, 11)) + [100, 999]
    result = verify_submission(constant_code, spec.fn, probe_inputs, target_name="digit_sum")
    assert result.reward_hack_penalty >= 15.0
|
|
|
|
def test_sandbox_blocks_open_and_eval():
    """A submission that touches the filesystem must not be able to score well."""
    spec = BLACK_BOX_FUNCTIONS["fibonacci"]
    bad = (
        "def fibonacci(n):\n"
        "    open('/tmp/x', 'w')\n"
        "    return 0\n"
    )
    inputs = generate_fuzz_inputs(spec, count=5, seed=0)
    result = verify_submission(bad, spec.fn, inputs, target_name="fibonacci")
    # `open` should be blocked in the sandbox, so the submission cannot
    # come close to a passing execution score.
    assert result.execution_reward < 50.0
|
|
|
|
| |
|
|
|
|
def test_edge_cases_are_always_evaluated():
    """Declared edge inputs are scored even when random fuzzing might miss the bug."""
    spec = BLACK_BOX_FUNCTIONS["reverse_string"]
    # Correct everywhere except on the empty string -- an edge-case-only bug.
    code = (
        "def reverse_string(s):\n"
        "    if s == '':\n"
        "        return 'OOPS'\n"
        "    return s[::-1]\n"
    )
    inputs = generate_fuzz_inputs(spec, count=20, seed=0)
    edges = get_edge_inputs(spec)
    assert "" in edges
    result = verify_submission(
        code, spec.fn, inputs, target_name="reverse_string", edge_inputs=edges
    )
    # Exactly the empty-string edge input fails; every other edge passes.
    assert result.matches_by_category["edge"] == len(edges) - 1
    assert result.edge_pass_rate < 1.0
    # Random fuzzing may occasionally draw "" too, hence >= 18 rather than == 20.
    assert result.matches_by_category["random"] >= 18
|
|
|
|
| |
|
|
|
|
def test_env_submit_reference_implementation_gives_high_reward():
    """A correct submission ends the episode with the full score plus bonus."""
    environment = OpenSleuthEnv(fuzz_count=20)
    observation = environment.reset("reverse_string")
    submission = "def reverse_string(s):\n    return s[::-1]\n"
    response = environment.step(observation.episode_id, SubmitAction(code=submission))
    assert response.done is True
    # Full execution reward plus the perfect bonus should exceed 140.
    assert response.reward > 140.0
    assert response.info["execution_reward"] == pytest.approx(100.0)
    assert response.info["edge_pass_rate"] == pytest.approx(1.0)
    assert response.info["floor_penalty"] == 0.0
    assert response.info["reward_hack_penalty"] == 0.0
    assert response.info["perfect_bonus"] == 50.0
|
|
|
|
def test_env_submit_buggy_function_lands_clearly_negative():
    """Wrong submissions must end up clearly negative so the trainer's GRPO
    advantage penalises 'just emit any function'."""
    environment = OpenSleuthEnv(fuzz_count=10)
    observation = environment.reset("digit_sum")
    submission = "def digit_sum(n):\n    return -1\n"
    response = environment.step(observation.episode_id, SubmitAction(code=submission))
    assert response.done is True
    assert response.info["execution_reward"] < 50.0
    assert response.reward < 0.0
    assert response.info["floor_penalty"] == 25.0
|
|
|
|
def test_env_submit_import_hack_scores_clearly_negative():
    """End-to-end: the import reward hack must yield a negative episode reward.

    Exercises the same hack as the verifier-level sandbox test, but routed
    through the env's submit path so the episode-level reward is checked too.
    """
    env = OpenSleuthEnv(fuzz_count=10)
    obs = env.reset("fibonacci")
    code = (
        "def fibonacci(n):\n"
        "    from opensleuth_env.black_box import _fibonacci\n"
        "    return _fibonacci(n)\n"
    )
    resp = env.step(obs.episode_id, SubmitAction(code=code))
    assert resp.done is True
    assert resp.reward < 0.0
    assert resp.info["reward_hack_penalty"] >= 25.0
|
|
|
|
| |
|
|
|
|
def test_all_specs_have_difficulty_and_edge_cases():
    """Every registered spec declares a valid difficulty and enough edge cases."""
    allowed_difficulties = {"easy", "medium", "hard"}
    for name, spec in BLACK_BOX_FUNCTIONS.items():
        assert spec.difficulty in allowed_difficulties, (
            f"{name} has invalid difficulty {spec.difficulty!r}"
        )
        assert isinstance(spec.edge_cases, list)
        assert len(spec.edge_cases) >= 3, (
            f"{name} should declare >=3 edge cases for robust scoring"
        )
|
|