| """Tests for visible verifier, held-out evaluator, and R-Zero reward functions.""" | |
| from forgeenv.tasks.models import ExecutionResult, Task | |
| from forgeenv.training.reward_functions import ( | |
| compute_alignment_score, | |
| compute_drift_gen_reward, | |
| compute_repetition_penalty, | |
| compute_uncertainty_reward, | |
| ) | |
| from forgeenv.verifier.held_out_evaluator import compute_held_out_scores | |
| from forgeenv.verifier.visible_verifier import compute_visible_reward | |
# Shared fixture: a minimal three-line HF-Trainer training script wrapped in a Task,
# reused by the visible-reward and held-out-evaluator tests below.
SAMPLE_TASK = Task(
    task_id="test_001",
    description="Test task",
    difficulty="easy",
    script_content=(
        "from transformers import Trainer\n"
        "trainer = Trainer()\n"
        "trainer.train()\n"
    ),
)
def test_visible_reward_success():
    """A clean run — exit 0, decreasing loss, checkpoint present — earns a positive reward."""
    clean_run = ExecutionResult(
        exit_code=0,
        stdout="step=1 loss=3.5\nstep=2 loss=2.1\nTRAINING_COMPLETE",
        stderr="",
        wall_time_ms=1000,
        checkpoint_exists=True,
        script_content=SAMPLE_TASK.script_content,
    )

    total, parts = compute_visible_reward(clean_run, SAMPLE_TASK)

    assert total > 0, f"Successful run should have positive reward, got {total}"
    assert parts["script_executes"] == 1.0
    assert parts["loss_decreased"] > 0
def test_visible_reward_failure():
    """A crashed run (non-zero exit) scores zero on execution and non-positive overall."""
    crashed_run = ExecutionResult(
        exit_code=1,
        stdout="",
        stderr="Error",
        wall_time_ms=100,
        script_content=SAMPLE_TASK.script_content,
    )

    total, parts = compute_visible_reward(crashed_run, SAMPLE_TASK)

    assert parts["script_executes"] == 0.0
    assert total <= 0.0
def test_held_out_success():
    """Held-out scoring gives full marks to a clean run that also reports eval metrics."""
    log_lines = [
        "step=1 loss=3.5",
        "step=2 loss=2.1",
        "eval_accuracy=0.78",
        "TRAINING_COMPLETE",
    ]
    clean_run = ExecutionResult(
        exit_code=0,
        stdout="\n".join(log_lines),
        stderr="",
        wall_time_ms=1000,
        checkpoint_exists=True,
        script_content=SAMPLE_TASK.script_content,
    )

    scores = compute_held_out_scores(clean_run, SAMPLE_TASK)

    # These axes should all be at their maximum for a fully clean run.
    for axis in ("executed_cleanly", "hidden_tests_passed", "intent_preserved"):
        assert scores[axis] == 1.0
    assert scores["loss_decreased"] > 0
def test_held_out_workaround_detection():
    """Bare except wrapping all code should reduce no_forbidden_workarounds."""
    # Script "succeeds" only because a bare except smothers every error.
    masked_run = ExecutionResult(
        exit_code=0,
        stdout="TRAINING_COMPLETE",
        stderr="",
        wall_time_ms=100,
        checkpoint_exists=True,
        script_content="try:\n bad()\nexcept:\n pass\n",
    )

    scores = compute_held_out_scores(masked_run, SAMPLE_TASK)

    assert scores["no_forbidden_workarounds"] < 1.0
def test_uncertainty_peaks_at_half():
    """Uncertainty reward is maximal at a 50% pass rate and near zero at the extremes."""
    mixed = compute_uncertainty_reward([True, False] * 3)
    unanimous_pass = compute_uncertainty_reward([True] * 4)
    unanimous_fail = compute_uncertainty_reward([False] * 4)

    assert mixed > unanimous_pass
    assert mixed > unanimous_fail
    # Unanimous outcomes carry no learning signal, so the reward collapses to ~0.
    assert abs(unanimous_pass) < 0.01
    assert abs(unanimous_fail) < 0.01
def test_uncertainty_handles_empty():
    """An empty outcome list yields a neutral 0.0 reward rather than raising."""
    assert compute_uncertainty_reward([]) == 0.0
def test_repetition_penalty_higher_for_duplicates():
    """A drift repeated three times in the batch is penalized at least as much as a unique one."""
    duplicated = "rename evaluate to eval_model"
    singleton = "change import path for trainer"
    batch = [duplicated, duplicated, duplicated, singleton]

    assert compute_repetition_penalty(duplicated, batch) >= compute_repetition_penalty(
        singleton, batch
    )
def test_drift_gen_reward_combines_signals():
    """Composite reward should rise with uncertainty and fall with repetition."""
    # Same 50/50 outcome split for both candidates; only repetition differs.
    split_outcomes = [True, False, True, False]

    novel = compute_drift_gen_reward(
        "unique unique unique tokens",
        split_outcomes,
        ["totally different a b c", "unique unique unique tokens"],
    )
    repeated_text = "same same same same"
    repeated = compute_drift_gen_reward(
        repeated_text,
        split_outcomes,
        [repeated_text] * 3,
    )

    assert novel >= repeated
def test_alignment_score_perfect_correlation():
    """Identical visible and held-out score vectors correlate at ~+1."""
    # 0.0, 0.25, 0.5, 0.75, 1.0 — exact binary fractions, no float surprises.
    ramp = [step / 4 for step in range(5)]
    assert compute_alignment_score(ramp, list(ramp)) > 0.99
def test_alignment_score_anti_correlation():
    """Opposite visible and held-out score orderings correlate at ~-1."""
    ascending = [0.0, 0.5, 1.0]
    assert compute_alignment_score(list(reversed(ascending)), ascending) < -0.99
def test_alignment_score_constant_returns_zero():
    """No variance in either array → no signal → 0.0."""
    flat = [0.5] * 3
    assert compute_alignment_score(flat, [0.1, 0.9, 0.4]) == 0.0