anugrahhu committed · Commit d91fe20 · verified · 1 parent: 7df4308

sft+reward-fix: server/rewards/reward_function.py

Files changed (1):
  1. server/rewards/reward_function.py (+66 −9)
server/rewards/reward_function.py CHANGED

@@ -61,12 +61,19 @@ class RewardWeights:
     valid_action: float = 0.05
     progress_milestone: float = 0.25
     evidence_quality: float = 0.20
-    tool_fit: float = 0.10               # paid only on a method ∈ TOOL_REGISTRY
+    # Cut to ~1/3 of original (was 0.10) to lower the per-step shaping
+    # floor. Combined with a smaller step_reward_clip and a heavier
+    # repeat-action penalty this prevents the agent from farming
+    # +0.20+/step by cycling well-formed-but-inert tool calls.
+    tool_fit: float = 0.033              # paid only on a method ∈ TOOL_REGISTRY
                                          # whose category matches the action.
     bogus_method_penalty: float = -0.05  # penalises method strings outside
                                          # TOOL_REGISTRY (anti-string-spam).
-    repeat_action_penalty: float = -0.08 # per consecutive repeat beyond the
-                                         # second identical action_type in a row.
+    # Was -0.08; bumped to -0.5 because the previous value was easily out-
+    # earned by stacking format_bonus + valid_action + tool_fit. The
+    # gating in compute_step_reward also now triggers from the *2nd*
+    # consecutive identical action_type instead of the 3rd.
+    repeat_action_penalty: float = -0.5
     soft_violation: float = -0.05
     hard_violation: float = -0.50
     redundancy: float = -0.10
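Taken together, the two re-weighted fields flip the sign of the looping strategy those comments describe. A quick worked check: `loop_step_value` and the 0.12 `other_shaping` base are illustrative assumptions; only the weight values and gating thresholds come from this commit.

```python
# Illustrative only: reproduces the loop-farming arithmetic cited in the
# comments above. `other_shaping` stands in for format_bonus + valid_action
# and friends, whose exact values are not shown in this diff.
def loop_step_value(n_in_row: int, *, tool_fit: float, repeat_penalty: float,
                    gate: int, other_shaping: float = 0.12) -> float:
    """Shaping earned by the n-th identical action in a row."""
    repeats = n_in_row - 1                      # prior identical actions
    over = repeats - gate + 1                   # escalation multiplier
    penalty = repeat_penalty * over if repeats >= gate else 0.0
    return other_shaping + tool_fit + penalty

old = [loop_step_value(n, tool_fit=0.10, repeat_penalty=-0.08, gate=2) for n in range(1, 6)]
new = [loop_step_value(n, tool_fit=0.033, repeat_penalty=-0.5, gate=1) for n in range(1, 6)]
# old ≈ [0.22, 0.22, 0.14, 0.06, -0.02]: looping stays profitable for four steps
# new ≈ [0.153, -0.347, -0.847, -1.347, -1.847]: the second repeat already loses money
```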
@@ -76,7 +83,9 @@ class RewardWeights:
     # Hard cap on what a single shaping step can earn. Without this a
     # policy could in principle stack milestone + evidence_quality +
     # tool_fit + valid_action and approach the terminal reward magnitude.
-    step_reward_clip: float = 0.75
+    # Cut from 0.75 → 0.25 so the per-step shaping floor cannot exceed
+    # ~1/3 of the wrong-claim terminal penalty.
+    step_reward_clip: float = 0.25
 
     # ── terminal grading ────────────────────────────────────────
     terminal_scale: float = 5.0          # multiplied with the convex sum below
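The stacking scenario the cap guards against can be checked directly from the weights above. The clipping call site is outside this diff; applying the cap with `min` is an assumption about how it is used.

```python
# Best possible un-clipped stack on a single step, using the new weights:
stack = 0.25 + 0.20 + 0.033 + 0.05   # milestone + evidence + tool_fit + valid_action
# stack == 0.533, still about twice the new cap, so the clip is what binds:
step_reward = min(stack, 0.25)       # step_reward_clip
```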
@@ -92,6 +101,18 @@ class RewardWeights:
     overconfident_wrong_penalty: float = 4.0    # subtracted from terminal
     overclaim_significance_penalty: float = 1.5 # claim_sigma >> measured_sigma
 
+    # Big bonus for getting BOTH mass and channel right, on top of the
+    # terminal grade. Makes the bandit math strictly favour attempting a
+    # claim when uncertain rather than running out the clock: a correct
+    # claim now returns ~+10–12, a wrong one ~−1.85, no claim ~−5.
+    correct_claim_bonus: float = 6.0
+
+    # Penalty applied at episode end when the trajectory never even
+    # *attempted* a SUBMIT_DISCOVERY_CLAIM. Defeats the "hide forever and
+    # farm shaping" reward hack we observed in v1 (mean +0.22/step over
+    # ~12 steps was a better deal than risking the wrong-claim penalty).
+    no_claim_terminal_penalty: float = -5.0
+
 
 # ── Outputs ──────────────────────────────────────────────────────────────
 
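A sanity check on the bandit math those comments quote, taking +11 as a representative point in the +10–12 correct-claim range. `ev_submit` and the probability framing are illustrative, not part of the file.

```python
# Expected value of submitting a claim the agent believes with probability p,
# versus never claiming. Payoffs are the figures quoted in the comments above.
def ev_submit(p: float, correct: float = 11.0, wrong: float = -1.85) -> float:
    return p * correct + (1.0 - p) * wrong

EV_NEVER_CLAIM = -5.0
# ev_submit(0.0) == -1.85 > EV_NEVER_CLAIM: even a claim known to be wrong
# beats running out the clock, so submitting dominates for every p in [0, 1].
```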
@@ -233,13 +254,16 @@ def compute_step_reward(
     breakdown.add("soft_violation", weights.soft_violation * soft_other)
 
     # ── consecutive-repeat penalty (catches loop hacks) ─────────────
-    # Two-in-a-row is mildly OK (sometimes you re-collect data); three
-    # or more identical action_types in a row earns escalating penalty.
+    # Triggers from the *2nd* identical action in a row (previously
+    # only kicked in at the 3rd). The escalating multiplier scales with
+    # the run length so that 4-in-a-row gets 3× the base penalty —
+    # important because v1 found that a tiny -0.08 was easily out-earned
+    # by the +0.22/step shaping floor.
     repeats = _consecutive_repeat_count(history or [], action.action_type)
-    if repeats >= 2:
+    if repeats >= 1:
         breakdown.add(
             "repeat_action",
-            weights.repeat_action_penalty * (repeats - 1),
+            weights.repeat_action_penalty * repeats,
         )
 
     # ── resource overspend ──────────────────────────────────────────
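`_consecutive_repeat_count` is used but never shown in this diff; below is a minimal sketch consistent with the semantics both comment versions imply (N identical actions in a row yields N − 1 repeats, so the new `repeats >= 1` gate fires on the 2nd and the old `repeats >= 2` gate fired on the 3rd).

```python
# Assumed implementation, inferred from usage: count the trailing history
# entries whose action_type matches the action about to be scored.
def _consecutive_repeat_count(history: list, action_type: str) -> int:
    count = 0
    for past in reversed(history):
        if past.action_type != action_type:
            break
        count += 1
    return count
```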
@@ -328,10 +352,34 @@ def _efficiency_bonus(state: FullLatentState) -> float:
 def compute_terminal_reward(
     *,
     state: FullLatentState,
-    claim: DiscoveryClaim,
+    claim: Optional[DiscoveryClaim],
     weights: RewardWeights = RewardWeights(),
 ) -> TerminalReward:
+    """Grade the end-of-episode submission.
+
+    ``claim`` is ``None`` when the episode terminated for *any* reason
+    other than a ``submit_discovery_claim`` action (max_steps, budget
+    exhausted, time exhausted) AND the trajectory never attempted to
+    submit a claim. In that case we return a flat
+    ``no_claim_terminal_penalty`` so the bandit math always favours
+    *attempting* a claim over hiding forever to farm per-step shaping.
+    See: v1 (anugrahhu/cernenv-grpo-smollm2-360m), which exploited this
+    exact gap by spamming request_systematics for ~+0.22/step instead
+    of risking the wrong-claim penalty (~−1.85).
+    """
     breakdown = RewardBreakdown()
+
+    if claim is None:
+        breakdown.add("no_claim_terminal_penalty", weights.no_claim_terminal_penalty)
+        return TerminalReward(
+            reward=float(weights.no_claim_terminal_penalty),
+            breakdown=breakdown,
+            discovered=False,
+            correct_mass=False,
+            correct_channel=False,
+            correct_spin=False,
+        )
+
     truth = state.particle
 
     mass_score = _mass_score(truth.mass_gev, claim.mass_estimate_gev, claim.mass_uncertainty_gev)
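The new signature leans on `Optional`, which has to be imported from `typing` somewhere above this hunk (the import block is not part of the diff). A usage sketch of the no-claim path; `final_state` is a placeholder name.

```python
from typing import Optional  # required by the new signature

# Episode ended via max_steps / budget / time without ever submitting:
terminal = compute_terminal_reward(state=final_state, claim=None)
assert terminal.reward == -5.0        # flat no_claim_terminal_penalty
assert terminal.discovered is False   # nothing was graded
```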
@@ -366,6 +414,15 @@
 
     raw = breakdown.total * weights.terminal_scale
 
+    # Asymmetric claim cost (Fix #4). When the claim gets BOTH the mass
+    # and the decay channel right, add a flat bonus on top of the graded
+    # terminal so that a correct attempt is worth substantially more
+    # than the no-claim penalty (-5) and the wrong-claim penalty (~-1.85).
+    # This makes the bandit math: correct +10–12 ≫ no-claim −5 > wrong −2.
+    if mass_score >= 0.5 and channel_ok:
+        raw += weights.correct_claim_bonus
+        breakdown.add("correct_claim_bonus", weights.correct_claim_bonus)
+
     # Overconfident-wrong penalty: high confidence but wrong channel & far mass
     if claim.confidence >= 0.8 and (mass_score < 0.2 or not channel_ok):
         raw -= weights.overconfident_wrong_penalty
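Rough composition of a fully correct claim under these weights, confirming the +10–12 band quoted throughout. The 0.95 convex grade is an assumed near-perfect score; the grading terms themselves sit outside this diff.

```python
graded = 0.95 * 5.0    # breakdown.total (convex sum, assumed ~0.95) * terminal_scale
total = graded + 6.0   # plus correct_claim_bonus when mass AND channel are right
# total == 10.75, inside the "+10–12" range the weight comments advertise
```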
 