Spaces:

anugrahhu
/

cernenv-trainer

Paused

anugrahhu committed on 12 days ago

Commit

2b97998

verified ·

1 Parent(s): a8d4d87

sft+reward-fix: training/training_script.py

Files changed (1) hide show

training/training_script.py CHANGED Viewed

@@ -208,17 +208,19 @@ class RewardComponentAccumulator:
         }
-FORMAT_BONUS_VALID = 0.15
-FORMAT_BONUS_INVALID = -0.20
 def _format_validity_bonus(completion_text: str) -> float:
     """Small ± nudge for emitting a structured action.
     Kept intentionally small (≪ terminal_scale) so the policy can't be
-    dominated by a "spam well-formed JSON" objective. The negative branch
-    is slightly larger than the positive branch so unparseable garbage
-    is dispreferred without crowding out the actual task reward.
     """
     return FORMAT_BONUS_VALID if parse_action(completion_text) is not None else FORMAT_BONUS_INVALID

         }
+FORMAT_BONUS_VALID = 0.05      # was 0.15 — Fix #3 (lower per-step floor)
+FORMAT_BONUS_INVALID = -0.20    # kept punitive so unparseable completions still hurt
 def _format_validity_bonus(completion_text: str) -> float:
     """Small ± nudge for emitting a structured action.
     Kept intentionally small (≪ terminal_scale) so the policy can't be
+    dominated by a "spam well-formed JSON" objective. After Fix #3 the
+    positive branch is 1/3 of its v1 value (0.05 vs 0.15) — combined
+    with the lower step_reward_clip and the heavier repeat-action
+    penalty, this means a model can no longer farm ~+0.22/step by
+    looping a single well-formed action.
     """
     return FORMAT_BONUS_VALID if parse_action(completion_text) is not None else FORMAT_BONUS_INVALID