"""Adversarial reward-hacking probes.

These tests *play the role of an evil model* trying to game the reward
without solving the discovery task. Per the hackathon guide:
"Do not optimize a reward you have not tried to break yourself first."
For each known hacking pattern we run a degenerate policy through the
environment and assert that its cumulative reward is **strictly worse**
than what the heuristic baseline can earn. A failing test here means the
reward is hackable in the way the test name describes — fix it before
training.
"""
from __future__ import annotations
import statistics
import pytest
from models import ActionType, ExperimentAction
from scripts.baseline_agents import HeuristicAgent
from server.environment import CERNCollisionEnvironment
def _run_with_actions(env, actions):
"""Apply a fixed action list (looping if it runs out) until done."""
cumulative = 0.0
obs = None
for a in actions:
obs = env.step(a)
cumulative += float(obs.reward or 0.0)
if obs.done:
return cumulative
while obs is None or not obs.done:
obs = env.step(actions[-1])
cumulative += float(obs.reward or 0.0)
return cumulative
def _heuristic_baseline(seed: int, difficulty: str = "easy") -> float:
env = CERNCollisionEnvironment(max_steps=20)
obs = env.reset(seed=seed, difficulty=difficulty)
agent = HeuristicAgent()
agent.reset()
cumulative = 0.0
while not obs.done:
obs = env.step(agent.act(obs))
cumulative += float(obs.reward or 0.0)
return cumulative
# ── Hack 1: bogus method strings ────────────────────────────────────────
def test_bogus_method_spam_never_beats_heuristic():
    """Setting method to nonsense strings must not farm tool_fit reward.

    A naive shaping reward that grants credit whenever ``method != ""``
    would happily reward this; ours requires method ∈ TOOL_REGISTRY with
    a matching category.
    """
    seeds = [11, 12, 13, 14]
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=20)
        env.reset(seed=seed, difficulty="easy")
        bogus = [
            ExperimentAction(action_type=at, method="zzz_garbage_zzz")
            for at in [
                ActionType.CONFIGURE_BEAM,
                ActionType.SELECT_CHANNEL,
                ActionType.SET_TRIGGER,
                ActionType.ALLOCATE_LUMINOSITY,
                ActionType.COLLECT_COLLISIONS,
                ActionType.RECONSTRUCT_TRACKS,
                ActionType.BUILD_INVARIANT_MASS,
                ActionType.FIT_RESONANCE,
                ActionType.ESTIMATE_SIGNIFICANCE,
                ActionType.SUBMIT_DISCOVERY_CLAIM,
            ]
        ]
        hack_rewards.append(_run_with_actions(env, bogus))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)


# ── Hack 2: idle / always-loop a single action ──────────────────────────
def test_loop_request_systematics_does_not_farm_reward():
    """Repeating REQUEST_SYSTEMATICS forever should not accumulate reward.

    The repeat-action penalty + lack of milestone progress + budget cost
    should drive cumulative reward strictly below baseline.
    """
    seeds = [21, 22, 23, 24]
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=20)
        env.reset(seed=seed, difficulty="easy")
        loop = [ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)] * 30
        hack_rewards.append(_run_with_actions(env, loop))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)


# ── Hack 3: instant-claim with a wild over-claim ────────────────────────
def test_instant_overclaim_fails_terminal_grade():
    """Submitting a discovery claim before any analysis must fail the
    rules engine (PREREQ_MISSING) and earn a strongly negative reward.
    """
    seeds = [31, 32, 33]
    rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=10)
        env.reset(seed=seed, difficulty="easy")
        action = ExperimentAction(
            action_type=ActionType.SUBMIT_DISCOVERY_CLAIM,
            parameters={
                "claim": {
                    "mass_estimate_gev": 999.0,
                    "significance_sigma": 50.0,
                    "decay_channel": "diphoton",
                    "confidence": 0.99,
                }
            },
        )
        cumulative = 0.0
        for _ in range(5):
            obs = env.step(action)
            cumulative += float(obs.reward or 0.0)
            if obs.done:
                break
        rewards.append(cumulative)
    # All three must be strictly negative — none of them got "free" reward.
    assert all(r < 0.0 for r in rewards), rewards


# ── Hack 4: maxed-out confidence on a wrong channel ─────────────────────
def test_confidence_one_wrong_channel_is_punished():
    """If the model figures out the rest of the pipeline but submits a
    high-confidence claim with the *wrong* channel, the overconfident-
    wrong penalty must dominate any positive shaping.
    """
    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import (
        FullLatentState, LatentParticle, ResourceState,
    )

    s = FullLatentState(
        particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"),
        resources=ResourceState(),
    )
    s.progress.best_significance_sigma = 5.0
    claim = DiscoveryClaim(
        mass_estimate_gev=125.0,
        decay_channel="dijet",  # WRONG
        significance_sigma=5.0,
        confidence=1.0,  # MAX
    )
    out = compute_terminal_reward(state=s, claim=claim)
    assert out.reward < 0.0


# ── Hack 5: low-info / null claim ───────────────────────────────────────
def test_null_claim_cannot_earn_positive_terminal():
    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import (
        FullLatentState, LatentParticle, ResourceState,
    )

    s = FullLatentState(
        particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"),
        resources=ResourceState(),
    )
    out = compute_terminal_reward(state=s, claim=DiscoveryClaim())  # all None
    assert out.reward <= 0.0


# ── Hack 6: claim-avoidance — the v1 reward hack ────────────────────────
def test_no_claim_avoidance_dominated():
    """Spamming systematics for an entire episode without ever claiming
    must score *strictly worse* than running the full pipeline once and
    submitting a correct claim.

    This is the exact failure mode of v1
    (anugrahhu/cernenv-grpo-smollm2-360m): the policy realised that the
    per-step shaping floor (~+0.22/step × ~12 steps ≈ +2.6) was a
    better deal than risking the wrong-claim penalty (~−1.85), so it
    converged to "never submit". Fix #1 (no_claim_terminal_penalty=−5)
    + Fix #3 (cut shaping floor) + Fix #4 (correct_claim_bonus=+6)
    must collectively flip that bandit math.
    """
    from scripts.baseline_agents import OracleAgent

    seeds = [101, 102, 103]
    avoid_rewards = []
    claim_rewards = []
    for seed in seeds:
        # Trajectory A: 12 request_systematics, never claims.
        env_a = CERNCollisionEnvironment(max_steps=12)
        env_a.reset(seed=seed, difficulty="easy")
        a_total = 0.0
        for _ in range(12):
            obs = env_a.step(
                ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
            )
            a_total += float(obs.reward or 0.0)
            if obs.done:
                break
        avoid_rewards.append(a_total)

        # Trajectory B: oracle plays full pipeline + correct claim.
        env_b = CERNCollisionEnvironment(max_steps=20)
        obs_b = env_b.reset(seed=seed, difficulty="easy")
        truth = env_b.hidden_truth()
        agent = OracleAgent(truth=truth)
        agent.reset()
        b_total = 0.0
        while not obs_b.done:
            action = agent.act(obs_b)
            obs_b = env_b.step(action)
            b_total += float(obs_b.reward or 0.0)
        claim_rewards.append(b_total)

    # Each individual seed must show claim > avoid by a clear margin;
    # the mean comparison alone is too weak (one good seed could mask
    # a systematic preference for avoidance on the others).
    for seed, av, cl in zip(seeds, avoid_rewards, claim_rewards):
        assert cl > av, (
            f"seed={seed}: claim_reward={cl:.3f} not > avoidance_reward={av:.3f}"
        )

    # And the avoidance trajectory must be strongly negative — the
    # no-claim penalty alone is −5 plus the repeat-action penalty stack.
    assert statistics.mean(avoid_rewards) < -3.0, avoid_rewards


# ── Hack 7: repeat-spam ─────────────────────────────────────────────────
def test_repeat_spam_negative_after_3():
    """Four identical actions in a row must net ≤ 0 reward over the
    per-step shaping channel alone (terminal not yet computed).

    The v1 repeat penalty was −0.08 per repeat past the 3rd, which was
    out-earned by stacking format_bonus (+0.15) + valid_action (+0.05)
    + tool_fit (+0.10) ≈ +0.30/step. Fix #2 raises the penalty to −0.5
    AND triggers it from the 2nd identical action — together with the
    smaller shaping floor (Fix #3) any constant-action policy is now
    strictly dominated.
    """
    seeds = [201, 202, 203]
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=12)
        env.reset(seed=seed, difficulty="easy")
        action = ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
        per_step = []
        for _ in range(4):
            obs = env.step(action)
            per_step.append(float(obs.reward or 0.0))
            if obs.done:
                break
        # Sum of the first 4 step rewards (no terminal yet) must be ≤ 0.
        assert sum(per_step) <= 0.0, (
            f"seed={seed} per-step rewards {per_step} (sum={sum(per_step):.3f}) "
            "should be non-positive after 4 identical actions"
        )


# ── Hack 8: correct-claim must dominate avoidance ───────────────────────
def test_correct_claim_dominates_avoidance():
    """An oracle pipeline ending in a correct submit_discovery_claim must
    (a) score strictly positive and (b) score higher than an
    all-systematics avoidance run on the same seed.

    Together with ``test_no_claim_avoidance_dominated`` this nails down
    the post-fix bandit ordering:

        correct claim  > 0  (and > avoidance)
        wrong claim    < 0
        no claim ever  ≪ 0  (no_claim_terminal_penalty)
    """
    from scripts.baseline_agents import OracleAgent

    seeds = [301, 302, 303]
    for seed in seeds:
        # Trajectory A: 12 systematics, never claim.
        env_a = CERNCollisionEnvironment(max_steps=12)
        env_a.reset(seed=seed, difficulty="easy")
        a_total = 0.0
        for _ in range(12):
            obs = env_a.step(
                ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
            )
            a_total += float(obs.reward or 0.0)
            if obs.done:
                break

        # Trajectory B: full oracle pipeline, ends in a correct claim.
        env_b = CERNCollisionEnvironment(max_steps=20)
        obs_b = env_b.reset(seed=seed, difficulty="easy")
        truth = env_b.hidden_truth()
        agent = OracleAgent(truth=truth)
        agent.reset()
        b_total = 0.0
        while not obs_b.done:
            action = agent.act(obs_b)
            obs_b = env_b.step(action)
            b_total += float(obs_b.reward or 0.0)

        assert b_total > 0.0, f"seed={seed} oracle claim got {b_total:.3f} ≤ 0"
        assert b_total > a_total, (
            f"seed={seed} claim {b_total:.3f} did not beat avoidance {a_total:.3f}"
        )