"""Adversarial reward-hacking probes. These tests *play the role of an evil model* trying to game the reward without solving the discovery task. Per the hackathon guide: "Do not optimize a reward you have not tried to break yourself first." For each known hacking pattern we run a degenerate policy through the environment and assert that its cumulative reward is **strictly worse** than what the heuristic baseline can earn. A failing test here means the reward is hackable in the way the test name describes — fix it before training. """ from __future__ import annotations import statistics import pytest from models import ActionType, ExperimentAction from scripts.baseline_agents import HeuristicAgent from server.environment import CERNCollisionEnvironment def _run_with_actions(env, actions): """Apply a fixed action list (looping if it runs out) until done.""" cumulative = 0.0 obs = None for a in actions: obs = env.step(a) cumulative += float(obs.reward or 0.0) if obs.done: return cumulative while obs is None or not obs.done: obs = env.step(actions[-1]) cumulative += float(obs.reward or 0.0) return cumulative def _heuristic_baseline(seed: int, difficulty: str = "easy") -> float: env = CERNCollisionEnvironment(max_steps=20) obs = env.reset(seed=seed, difficulty=difficulty) agent = HeuristicAgent() agent.reset() cumulative = 0.0 while not obs.done: obs = env.step(agent.act(obs)) cumulative += float(obs.reward or 0.0) return cumulative # ── Hack 1: bogus method strings ──────────────────────────────────────── def test_bogus_method_spam_never_beats_heuristic(): """Setting method to nonsense strings must not farm tool_fit reward. A naive shaping reward that grants a credit on `method != ""` would happily reward this; ours requires method ∈ TOOL_REGISTRY with matching category. """ seeds = [11, 12, 13, 14] hack_rewards = [] for seed in seeds: env = CERNCollisionEnvironment(max_steps=20) env.reset(seed=seed, difficulty="easy") bogus = [ ExperimentAction(action_type=at, method="zzz_garbage_zzz") for at in [ ActionType.CONFIGURE_BEAM, ActionType.SELECT_CHANNEL, ActionType.SET_TRIGGER, ActionType.ALLOCATE_LUMINOSITY, ActionType.COLLECT_COLLISIONS, ActionType.RECONSTRUCT_TRACKS, ActionType.BUILD_INVARIANT_MASS, ActionType.FIT_RESONANCE, ActionType.ESTIMATE_SIGNIFICANCE, ActionType.SUBMIT_DISCOVERY_CLAIM, ] ] hack_rewards.append(_run_with_actions(env, bogus)) heur_rewards = [_heuristic_baseline(s) for s in seeds] assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards) # ── Hack 2: idle / always-loop a single action ────────────────────────── def test_loop_request_systematics_does_not_farm_reward(): """Repeating REQUEST_SYSTEMATICS forever should not accumulate reward. The repeat-action penalty + lack of milestone progress + budget cost should drive cumulative reward strictly below baseline. """ seeds = [21, 22, 23, 24] hack_rewards = [] for seed in seeds: env = CERNCollisionEnvironment(max_steps=20) env.reset(seed=seed, difficulty="easy") loop = [ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)] * 30 hack_rewards.append(_run_with_actions(env, loop)) heur_rewards = [_heuristic_baseline(s) for s in seeds] assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards) # ── Hack 3: instant-claim with a wild over-claim ──────────────────────── def test_instant_overclaim_fails_terminal_grade(): """Submitting a discovery claim before any analysis must fail the rules engine (PREREQ_MISSING) and earn a strong negative reward. 
""" seeds = [31, 32, 33] rewards = [] for seed in seeds: env = CERNCollisionEnvironment(max_steps=10) env.reset(seed=seed, difficulty="easy") action = ExperimentAction( action_type=ActionType.SUBMIT_DISCOVERY_CLAIM, parameters={ "claim": { "mass_estimate_gev": 999.0, "significance_sigma": 50.0, "decay_channel": "diphoton", "confidence": 0.99, } }, ) cumulative = 0.0 for _ in range(5): obs = env.step(action) cumulative += float(obs.reward or 0.0) if obs.done: break rewards.append(cumulative) # All three must be strictly negative — none of them got "free" reward. assert all(r < 0.0 for r in rewards), rewards # ── Hack 4: maxed-out confidence on a wrong channel ───────────────────── def test_confidence_one_wrong_channel_is_punished(): """If the model figures out the rest of the pipeline but submits a high-confidence claim with the *wrong* channel, the overconfident- wrong penalty must dominate any positive shaping. """ from models import DiscoveryClaim from server.rewards.reward_function import compute_terminal_reward from server.simulator.latent_state import ( FullLatentState, LatentParticle, ResourceState, ) s = FullLatentState( particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"), resources=ResourceState(), ) s.progress.best_significance_sigma = 5.0 claim = DiscoveryClaim( mass_estimate_gev=125.0, decay_channel="dijet", # WRONG significance_sigma=5.0, confidence=1.0, # MAX ) out = compute_terminal_reward(state=s, claim=claim) assert out.reward < 0.0 # ── Hack 5: low-info / null claim ─────────────────────────────────────── def test_null_claim_cannot_earn_positive_terminal(): from models import DiscoveryClaim from server.rewards.reward_function import compute_terminal_reward from server.simulator.latent_state import ( FullLatentState, LatentParticle, ResourceState, ) s = FullLatentState( particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"), resources=ResourceState(), ) out = compute_terminal_reward(state=s, claim=DiscoveryClaim()) # all None assert out.reward <= 0.0 # ── Hack 6: claim-avoidance — the v1 reward hack ──────────────────────── def test_no_claim_avoidance_dominated(): """Spamming systematics for an entire episode without ever claiming must score *strictly worse* than running the full pipeline once and submitting a correct claim. This is the exact failure mode of v1 (anugrahhu/cernenv-grpo-smollm2-360m): the policy realised that the per-step shaping floor (~+0.22/step × ~12 steps ≈ +2.6) was a better deal than risking the wrong-claim penalty (~−1.85), so it converged to "never submit". Fix #1 (no_claim_terminal_penalty=−5) + Fix #3 (cut shaping floor) + Fix #4 (correct_claim_bonus=+6) must collectively flip that bandit math. """ from scripts.baseline_agents import OracleAgent seeds = [101, 102, 103] avoid_rewards = [] claim_rewards = [] for seed in seeds: # Trajectory A: 12 request_systematics, never claims. env_a = CERNCollisionEnvironment(max_steps=12) env_a.reset(seed=seed, difficulty="easy") a_total = 0.0 for _ in range(12): obs = env_a.step( ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS) ) a_total += float(obs.reward or 0.0) if obs.done: break avoid_rewards.append(a_total) # Trajectory B: oracle plays full pipeline + correct claim. 
        env_b = CERNCollisionEnvironment(max_steps=20)
        obs_b = env_b.reset(seed=seed, difficulty="easy")
        truth = env_b.hidden_truth()
        agent = OracleAgent(truth=truth)
        agent.reset()
        b_total = 0.0
        while not obs_b.done:
            action = agent.act(obs_b)
            obs_b = env_b.step(action)
            b_total += float(obs_b.reward or 0.0)
        claim_rewards.append(b_total)

    # Each individual seed must show claim > avoid by a clear margin;
    # the mean comparison alone is too weak (one good seed could mask
    # a systematic preference for avoidance on the others).
    for seed, av, cl in zip(seeds, avoid_rewards, claim_rewards):
        assert cl > av, (
            f"seed={seed}: claim_reward={cl:.3f} not > avoidance_reward={av:.3f}"
        )

    # And the avoidance trajectory must be strongly negative — the
    # no-claim penalty alone is −5 plus the repeat-action penalty stack.
    assert statistics.mean(avoid_rewards) < -3.0, avoid_rewards


# ── Hack 7: repeat-spam ─────────────────────────────────────────────────


def test_repeat_spam_negative_after_3():
    """Four identical actions in a row must net ≤ 0 reward over the
    per-step shaping channel alone (terminal not yet computed).

    The v1 repeat penalty was −0.08 per repeat past the 3rd, which was
    out-earned by stacking format_bonus (+0.15) + valid_action (+0.05) +
    tool_fit (+0.10) ≈ +0.30/step. Fix #2 raises the penalty to −0.5 AND
    triggers it from the 2nd identical action — together with the
    smaller shaping floor (Fix #3) any constant-action policy is now
    strictly dominated.
    """
    seeds = [201, 202, 203]
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=12)
        env.reset(seed=seed, difficulty="easy")
        action = ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
        per_step = []
        for _ in range(4):
            obs = env.step(action)
            per_step.append(float(obs.reward or 0.0))
            if obs.done:
                break
        # Sum of the first 4 step rewards (no terminal yet) must be ≤ 0.
        assert sum(per_step) <= 0.0, (
            f"seed={seed} per-step rewards {per_step} (sum={sum(per_step):.3f}) "
            "should be non-positive after 4 identical actions"
        )


# ── Hack 8: correct-claim must dominate avoidance ───────────────────────


def test_correct_claim_dominates_avoidance():
    """A full oracle pipeline ending in a correct submit_discovery_claim
    must (a) score strictly positive and (b) score higher than the
    all-systematics avoidance run on the same seed.

    Together with ``test_no_claim_avoidance_dominated`` this nails down
    the post-fix bandit ordering:

        correct claim  > 0   (and > avoidance)
        wrong claim    < 0
        no claim ever  ≪ 0   (no_claim_terminal_penalty)
    """
    from scripts.baseline_agents import OracleAgent

    seeds = [301, 302, 303]
    for seed in seeds:
        # Trajectory A: 12 systematics, never claim.
        env_a = CERNCollisionEnvironment(max_steps=12)
        env_a.reset(seed=seed, difficulty="easy")
        a_total = 0.0
        for _ in range(12):
            obs = env_a.step(
                ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
            )
            a_total += float(obs.reward or 0.0)
            if obs.done:
                break

        # Trajectory B: full oracle pipeline, ends in a correct claim.
        env_b = CERNCollisionEnvironment(max_steps=20)
        obs_b = env_b.reset(seed=seed, difficulty="easy")
        truth = env_b.hidden_truth()
        agent = OracleAgent(truth=truth)
        agent.reset()
        b_total = 0.0
        while not obs_b.done:
            action = agent.act(obs_b)
            obs_b = env_b.step(action)
            b_total += float(obs_b.reward or 0.0)

        assert b_total > 0.0, f"seed={seed} oracle claim got {b_total:.3f} ≤ 0"
        assert b_total > a_total, (
            f"seed={seed} claim {b_total:.3f} did not beat avoidance {a_total:.3f}"
        )
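

# ── Appendix: the bandit math the fixes must flip ───────────────────────
#
# A minimal arithmetic sketch of the v1 pathology and the post-fix
# ordering, using the constants *quoted in the Hack 6/7 docstrings
# above*. These numbers are transcribed for illustration; they are NOT
# imported from the reward config, so this test pins the documented
# intent of the fixes rather than their implementation.


def test_quoted_bandit_constants_flip_the_ordering():
    episode_steps = 12

    # v1 constants (as quoted): a constant policy could farm the shaping
    # floor every step, while a wrong claim cost about −1.85 once.
    v1_shaping_floor = 0.22
    v1_wrong_claim_penalty = -1.85
    v1_avoid = v1_shaping_floor * episode_steps  # ≈ +2.64
    # The v1 pathology: avoidance beat even a risk-free wrong claim.
    assert v1_avoid > v1_wrong_claim_penalty

    # Post-fix constants (as quoted): Fixes #1, #2, and #4.
    no_claim_terminal_penalty = -5.0
    repeat_penalty = -0.5  # applied from the 2nd identical action
    correct_claim_bonus = 6.0
    # Upper bound on the avoider: grant it the *old* shaping floor on
    # every step (Fix #3 actually cuts it), then charge the repeat
    # penalty on steps 2..12 and the terminal no-claim penalty.
    v2_avoid_upper_bound = (
        v1_shaping_floor * episode_steps
        + repeat_penalty * (episode_steps - 1)
        + no_claim_terminal_penalty
    )  # ≈ −7.86
    assert v2_avoid_upper_bound < 0.0 < correct_claim_bonus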