"""Adversarial reward-hacking probes.

These tests *play the role of an evil model* trying to game the reward
without solving the discovery task. Per the hackathon guide:

    "Do not optimize a reward you have not tried to break yourself first."

For each known hacking pattern we run a degenerate policy through the
environment and assert that its cumulative reward is **strictly worse**
than what the heuristic baseline can earn. A failing test here means the
reward is hackable in the way the test name describes — fix it before
training.
"""

from __future__ import annotations

import statistics

import pytest

from models import ActionType, ExperimentAction
from scripts.baseline_agents import HeuristicAgent
from server.environment import CERNCollisionEnvironment


def _run_with_actions(env, actions):
    """Apply a fixed, non-empty action list, then repeat its *last* action
    until the episode is done. Returns the cumulative reward."""
    cumulative = 0.0
    obs = None
    for a in actions:
        obs = env.step(a)
        cumulative += float(obs.reward or 0.0)
        if obs.done:
            return cumulative
    # Episode still running: keep replaying the final action until done.
    while obs is None or not obs.done:
        obs = env.step(actions[-1])
        cumulative += float(obs.reward or 0.0)
    return cumulative
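

# Usage sketch (hypothetical seed and policy, not a test of record): a
# single-element list degenerates into replaying that one action every
# step until the episode terminates.
#
#   env = CERNCollisionEnvironment(max_steps=20)
#   env.reset(seed=0, difficulty="easy")
#   total = _run_with_actions(
#       env, [ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)]
#   )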


def _heuristic_baseline(seed: int, difficulty: str = "easy") -> float:
    """Cumulative reward the scripted heuristic agent earns on this seed."""
    env = CERNCollisionEnvironment(max_steps=20)
    obs = env.reset(seed=seed, difficulty=difficulty)
    agent = HeuristicAgent()
    agent.reset()
    cumulative = 0.0
    while not obs.done:
        obs = env.step(agent.act(obs))
        cumulative += float(obs.reward or 0.0)
    return cumulative
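

# A sketch of the comparison every probe below repeats: run one degenerate
# policy across several seeds and require its mean cumulative reward to sit
# strictly below the heuristic baseline's. The helper name and signature
# are ours (hypothetical), not repo canon; it only uses APIs already
# imported above.
def _assert_hack_loses(seeds, make_actions, max_steps: int = 20) -> None:
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=max_steps)
        env.reset(seed=seed, difficulty="easy")
        hack_rewards.append(_run_with_actions(env, make_actions()))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)

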
# ── Hack 1: bogus method strings ────────────────────────────────────────
def test_bogus_method_spam_never_beats_heuristic():
    """Setting method to nonsense strings must not farm tool_fit reward.

    A naive shaping reward that grants credit on `method != ""` would
    happily reward this; ours requires method ∈ TOOL_REGISTRY with a
    matching category.
    """
    seeds = [11, 12, 13, 14]
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=20)
        env.reset(seed=seed, difficulty="easy")
        bogus = [
            ExperimentAction(action_type=at, method="zzz_garbage_zzz")
            for at in [
                ActionType.CONFIGURE_BEAM,
                ActionType.SELECT_CHANNEL,
                ActionType.SET_TRIGGER,
                ActionType.ALLOCATE_LUMINOSITY,
                ActionType.COLLECT_COLLISIONS,
                ActionType.RECONSTRUCT_TRACKS,
                ActionType.BUILD_INVARIANT_MASS,
                ActionType.FIT_RESONANCE,
                ActionType.ESTIMATE_SIGNIFICANCE,
                ActionType.SUBMIT_DISCOVERY_CLAIM,
            ]
        ]
        hack_rewards.append(_run_with_actions(env, bogus))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)


# ── Hack 2: idle / always-loop a single action ──────────────────────────
def test_loop_request_systematics_does_not_farm_reward():
    """Repeating REQUEST_SYSTEMATICS forever should not accumulate reward.

    The repeat-action penalty + lack of milestone progress + budget cost
    should drive cumulative reward strictly below baseline.
    """
    seeds = [21, 22, 23, 24]
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=20)
        env.reset(seed=seed, difficulty="easy")
        loop = [ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)] * 30
        hack_rewards.append(_run_with_actions(env, loop))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)
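

# Follow-up probe (a sketch, not repo canon): alternating two actions
# defeats a naive *exact-repeat* penalty, so this checks that the budget
# cost and missing milestones carry weight on their own. Assumption:
# pairing REQUEST_SYSTEMATICS with ESTIMATE_SIGNIFICANCE is still a
# no-progress policy; both ActionType members exist above, but the
# "cheapness" of the pair is our guess.
def test_alternating_idle_actions_do_not_farm_reward():
    seeds = [21, 22, 23, 24]
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=20)
        env.reset(seed=seed, difficulty="easy")
        pair = [
            ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS),
            ExperimentAction(action_type=ActionType.ESTIMATE_SIGNIFICANCE),
        ]
        hack_rewards.append(_run_with_actions(env, pair * 15))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)

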
# ── Hack 3: instant-claim with a wild over-claim ────────────────────────
def test_instant_overclaim_fails_terminal_grade():
    """Submitting a discovery claim before any analysis must fail the
    rules engine (PREREQ_MISSING) and earn a strong negative reward.
    """
    seeds = [31, 32, 33]
    rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=10)
        env.reset(seed=seed, difficulty="easy")
        action = ExperimentAction(
            action_type=ActionType.SUBMIT_DISCOVERY_CLAIM,
            parameters={
                "claim": {
                    "mass_estimate_gev": 999.0,
                    "significance_sigma": 50.0,
                    "decay_channel": "diphoton",
                    "confidence": 0.99,
                }
            },
        )
        cumulative = 0.0
        for _ in range(5):
            obs = env.step(action)
            cumulative += float(obs.reward or 0.0)
            if obs.done:
                break
        rewards.append(cumulative)
    # All three must be strictly negative — none of them got "free" reward.
    assert all(r < 0.0 for r in rewards), rewards


# ── Hack 4: maxed-out confidence on a wrong channel ─────────────────────
def test_confidence_one_wrong_channel_is_punished():
    """If the model figures out the rest of the pipeline but submits a
    high-confidence claim with the *wrong* channel, the overconfident-wrong
    penalty must dominate any positive shaping.
    """
    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import (
        FullLatentState, LatentParticle, ResourceState,
    )

    s = FullLatentState(
        particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"),
        resources=ResourceState(),
    )
    s.progress.best_significance_sigma = 5.0
    claim = DiscoveryClaim(
        mass_estimate_gev=125.0,
        decay_channel="dijet",  # WRONG
        significance_sigma=5.0,
        confidence=1.0,  # MAX
    )
    out = compute_terminal_reward(state=s, claim=claim)
    assert out.reward < 0.0
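

# Control sketch for the probe above (a hypothetical test using the same
# repo APIs): the identical claim graded with the *correct* channel and a
# calibrated confidence should score strictly higher, and positively.
# Assumption: the terminal grader pays out > 0 for a fully correct 5-sigma
# claim; if this control fails, the penalty above may be punishing
# something other than the wrong channel.
def test_correct_channel_outscores_wrong_channel():
    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import (
        FullLatentState, LatentParticle, ResourceState,
    )

    s = FullLatentState(
        particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"),
        resources=ResourceState(),
    )
    s.progress.best_significance_sigma = 5.0

    def grade(channel: str, confidence: float) -> float:
        claim = DiscoveryClaim(
            mass_estimate_gev=125.0,
            decay_channel=channel,
            significance_sigma=5.0,
            confidence=confidence,
        )
        return compute_terminal_reward(state=s, claim=claim).reward

    assert grade("diphoton", 0.95) > grade("dijet", 1.0)
    assert grade("diphoton", 0.95) > 0.0  # assumption: grader can pay out

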
# ── Hack 5: low-info / null claim ───────────────────────────────────────
def test_null_claim_cannot_earn_positive_terminal():
    """A claim with every field left as None must not earn positive
    terminal reward."""
    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import (
        FullLatentState, LatentParticle, ResourceState,
    )

    s = FullLatentState(
        particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"),
        resources=ResourceState(),
    )
    out = compute_terminal_reward(state=s, claim=DiscoveryClaim())  # all None
    assert out.reward <= 0.0