"""Adversarial reward-hacking probes.

These tests *play the role of an evil model* trying to game the reward
without solving the discovery task. Per the hackathon guide:
"Do not optimize a reward you have not tried to break yourself first."
For each known hacking pattern we run a degenerate policy through the
environment and assert that its cumulative reward is **strictly worse**
than what the heuristic baseline can earn. A failing test here means the
reward is hackable in the way the test name describes — fix it before
training.
"""
from __future__ import annotations
import statistics
import pytest
from models import ActionType, ExperimentAction
from scripts.baseline_agents import HeuristicAgent
from server.environment import CERNCollisionEnvironment
def _run_with_actions(env, actions):
"""Apply a fixed action list (looping if it runs out) until done."""
cumulative = 0.0
obs = None
for a in actions:
obs = env.step(a)
cumulative += float(obs.reward or 0.0)
if obs.done:
return cumulative
while obs is None or not obs.done:
obs = env.step(actions[-1])
cumulative += float(obs.reward or 0.0)
return cumulative
def _heuristic_baseline(seed: int, difficulty: str = "easy") -> float:
env = CERNCollisionEnvironment(max_steps=20)
obs = env.reset(seed=seed, difficulty=difficulty)
agent = HeuristicAgent()
agent.reset()
cumulative = 0.0
while not obs.done:
obs = env.step(agent.act(obs))
cumulative += float(obs.reward or 0.0)
return cumulative
# ── Hack 1: bogus method strings ────────────────────────────────────────
def test_bogus_method_spam_never_beats_heuristic():
    """Setting method to nonsense strings must not farm tool_fit reward.

    A naive shaping reward that grants credit whenever ``method != ""``
    would happily reward this; ours requires method ∈ TOOL_REGISTRY with
    a matching category.
    """
    seeds = [11, 12, 13, 14]
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=20)
        env.reset(seed=seed, difficulty="easy")
        bogus = [
            ExperimentAction(action_type=at, method="zzz_garbage_zzz")
            for at in [
                ActionType.CONFIGURE_BEAM,
                ActionType.SELECT_CHANNEL,
                ActionType.SET_TRIGGER,
                ActionType.ALLOCATE_LUMINOSITY,
                ActionType.COLLECT_COLLISIONS,
                ActionType.RECONSTRUCT_TRACKS,
                ActionType.BUILD_INVARIANT_MASS,
                ActionType.FIT_RESONANCE,
                ActionType.ESTIMATE_SIGNIFICANCE,
                ActionType.SUBMIT_DISCOVERY_CLAIM,
            ]
        ]
        hack_rewards.append(_run_with_actions(env, bogus))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)


# ── Hack 2: idle / always-loop a single action ──────────────────────────
def test_loop_request_systematics_does_not_farm_reward():
    """Repeating REQUEST_SYSTEMATICS forever should not accumulate reward.

    The repeat-action penalty + lack of milestone progress + budget cost
    should drive cumulative reward strictly below baseline.
    """
    seeds = [21, 22, 23, 24]
    hack_rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=20)
        env.reset(seed=seed, difficulty="easy")
        loop = [ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)] * 30
        hack_rewards.append(_run_with_actions(env, loop))
    heur_rewards = [_heuristic_baseline(s) for s in seeds]
    assert statistics.mean(hack_rewards) < statistics.mean(heur_rewards)


# ── Hack 3: instant-claim with a wild over-claim ────────────────────────
def test_instant_overclaim_fails_terminal_grade():
    """Submitting a discovery claim before any analysis must fail the
    rules engine (PREREQ_MISSING) and earn a strongly negative reward.
    """
    seeds = [31, 32, 33]
    rewards = []
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=10)
        env.reset(seed=seed, difficulty="easy")
        action = ExperimentAction(
            action_type=ActionType.SUBMIT_DISCOVERY_CLAIM,
            parameters={
                "claim": {
                    "mass_estimate_gev": 999.0,
                    "significance_sigma": 50.0,
                    "decay_channel": "diphoton",
                    "confidence": 0.99,
                }
            },
        )
        cumulative = 0.0
        for _ in range(5):
            obs = env.step(action)
            cumulative += float(obs.reward or 0.0)
            if obs.done:
                break
        rewards.append(cumulative)
    # All three must be strictly negative — none of them got "free" reward.
    assert all(r < 0.0 for r in rewards), rewards


# ── Hack 4: maxed-out confidence on a wrong channel ─────────────────────
def test_confidence_one_wrong_channel_is_punished():
    """If the model figures out the rest of the pipeline but submits a
    high-confidence claim with the *wrong* channel, the overconfident-
    wrong penalty must dominate any positive shaping.
    """
    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import (
        FullLatentState, LatentParticle, ResourceState,
    )

    s = FullLatentState(
        particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"),
        resources=ResourceState(),
    )
    s.progress.best_significance_sigma = 5.0
    claim = DiscoveryClaim(
        mass_estimate_gev=125.0,
        decay_channel="dijet",  # WRONG
        significance_sigma=5.0,
        confidence=1.0,  # MAX
    )
    out = compute_terminal_reward(state=s, claim=claim)
    assert out.reward < 0.0


# ── Hack 5: low-info / null claim ───────────────────────────────────────
def test_null_claim_cannot_earn_positive_terminal():
    from models import DiscoveryClaim
    from server.rewards.reward_function import compute_terminal_reward
    from server.simulator.latent_state import (
        FullLatentState, LatentParticle, ResourceState,
    )

    s = FullLatentState(
        particle=LatentParticle(mass_gev=125.0, primary_channel="diphoton"),
        resources=ResourceState(),
    )
    out = compute_terminal_reward(state=s, claim=DiscoveryClaim())  # all None
    assert out.reward <= 0.0


# ── Hack 6: claim-avoidance — the v1 reward hack ────────────────────────
def test_no_claim_avoidance_dominated():
    """Spamming systematics for an entire episode without ever claiming
    must score *strictly worse* than running the full pipeline once and
    submitting a correct claim.

    This is the exact failure mode of v1
    (anugrahhu/cernenv-grpo-smollm2-360m): the policy realised that the
    per-step shaping floor (~+0.22/step × ~12 steps ≈ +2.6) was a
    better deal than risking the wrong-claim penalty (~−1.85), so it
    converged to "never submit". Fix #1 (no_claim_terminal_penalty=−5)
    + Fix #3 (cut shaping floor) + Fix #4 (correct_claim_bonus=+6)
    must collectively flip that bandit math.
    """
    from scripts.baseline_agents import OracleAgent

    seeds = [101, 102, 103]
    avoid_rewards = []
    claim_rewards = []
    for seed in seeds:
        # Trajectory A: 12 request_systematics, never claims.
        env_a = CERNCollisionEnvironment(max_steps=12)
        env_a.reset(seed=seed, difficulty="easy")
        a_total = 0.0
        for _ in range(12):
            obs = env_a.step(
                ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
            )
            a_total += float(obs.reward or 0.0)
            if obs.done:
                break
        avoid_rewards.append(a_total)

        # Trajectory B: oracle plays full pipeline + correct claim.
        env_b = CERNCollisionEnvironment(max_steps=20)
        obs_b = env_b.reset(seed=seed, difficulty="easy")
        truth = env_b.hidden_truth()
        agent = OracleAgent(truth=truth)
        agent.reset()
        b_total = 0.0
        while not obs_b.done:
            action = agent.act(obs_b)
            obs_b = env_b.step(action)
            b_total += float(obs_b.reward or 0.0)
        claim_rewards.append(b_total)

    # Each individual seed must show claim > avoid by a clear margin;
    # the mean comparison alone is too weak (one good seed could mask
    # a systematic preference for avoidance on the others).
    for seed, av, cl in zip(seeds, avoid_rewards, claim_rewards):
        assert cl > av, (
            f"seed={seed}: claim_reward={cl:.3f} not > avoidance_reward={av:.3f}"
        )

    # And the avoidance trajectory must be strongly negative — the
    # no-claim penalty alone is −5 plus the repeat-action penalty stack.
    assert statistics.mean(avoid_rewards) < -3.0, avoid_rewards


# ── Hack 7: repeat-spam ─────────────────────────────────────────────────
def test_repeat_spam_negative_after_3():
    """Four identical actions in a row must net ≤ 0 reward over the
    per-step shaping channel alone (terminal not yet computed).

    The v1 repeat penalty was −0.08 per repeat past the 3rd, which was
    out-earned by stacking format_bonus (+0.15) + valid_action (+0.05)
    + tool_fit (+0.10) ≈ +0.30/step. Fix #2 raises the penalty to −0.5
    AND triggers it from the 2nd identical action — together with the
    smaller shaping floor (Fix #3) any constant-action policy is now
    strictly dominated.
    """
    seeds = [201, 202, 203]
    for seed in seeds:
        env = CERNCollisionEnvironment(max_steps=12)
        env.reset(seed=seed, difficulty="easy")
        action = ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
        per_step = []
        for _ in range(4):
            obs = env.step(action)
            per_step.append(float(obs.reward or 0.0))
            if obs.done:
                break
        # Sum of the first 4 step rewards (no terminal yet) must be ≤ 0.
        assert sum(per_step) <= 0.0, (
            f"seed={seed} per-step rewards {per_step} (sum={sum(per_step):.3f}) "
            "should be non-positive after 4 identical actions"
        )


# ── Hack 8: correct-claim must dominate avoidance ───────────────────────
def test_correct_claim_dominates_avoidance():
    """An oracle pipeline ending in a correct submit_discovery_claim must
    (a) score strictly positive and (b) score higher than an
    all-systematics avoidance run on the same seed.

    Together with ``test_no_claim_avoidance_dominated`` this nails down
    the post-fix bandit ordering:

        correct claim  > 0  (and > avoidance)
        wrong claim    < 0
        no claim ever  ≪ 0  (no_claim_terminal_penalty)
    """
    from scripts.baseline_agents import OracleAgent

    seeds = [301, 302, 303]
    for seed in seeds:
        # Trajectory A: 12 systematics, never claim.
        env_a = CERNCollisionEnvironment(max_steps=12)
        env_a.reset(seed=seed, difficulty="easy")
        a_total = 0.0
        for _ in range(12):
            obs = env_a.step(
                ExperimentAction(action_type=ActionType.REQUEST_SYSTEMATICS)
            )
            a_total += float(obs.reward or 0.0)
            if obs.done:
                break

        # Trajectory B: full oracle pipeline, ends in a correct claim.
        env_b = CERNCollisionEnvironment(max_steps=20)
        obs_b = env_b.reset(seed=seed, difficulty="easy")
        truth = env_b.hidden_truth()
        agent = OracleAgent(truth=truth)
        agent.reset()
        b_total = 0.0
        while not obs_b.done:
            action = agent.act(obs_b)
            obs_b = env_b.step(action)
            b_total += float(obs_b.reward or 0.0)

        assert b_total > 0.0, f"seed={seed} oracle claim got {b_total:.3f} ≤ 0"
        assert b_total > a_total, (
            f"seed={seed} claim {b_total:.3f} did not beat avoidance {a_total:.3f}"
        )