Spaces:
Paused
Paused
sft+reward-fix: training/training_script.py
Browse files
training/training_script.py
CHANGED
|
@@ -208,17 +208,19 @@ class RewardComponentAccumulator:
|
|
| 208 |
}
|
| 209 |
|
| 210 |
|
| 211 |
-
FORMAT_BONUS_VALID = 0.15
|
| 212 |
-
FORMAT_BONUS_INVALID = -0.20
|
| 213 |
|
| 214 |
|
| 215 |
def _format_validity_bonus(completion_text: str) -> float:
|
| 216 |
"""Small ± nudge for emitting a structured action.
|
| 217 |
|
| 218 |
Kept intentionally small (≪ terminal_scale) so the policy can't be
|
| 219 |
-
dominated by a "spam well-formed JSON" objective.
|
| 220 |
-
is
|
| 221 |
-
|
|
|
|
|
|
|
| 222 |
"""
|
| 223 |
return FORMAT_BONUS_VALID if parse_action(completion_text) is not None else FORMAT_BONUS_INVALID
|
| 224 |
|
|
|
|
| 208 |
}
|
| 209 |
|
| 210 |
|
| 211 |
+
FORMAT_BONUS_VALID = 0.05 # was 0.15 — Fix #3 (lower per-step floor)
|
| 212 |
+
FORMAT_BONUS_INVALID = -0.20 # kept punitive so unparseable completions still hurt
|
| 213 |
|
| 214 |
|
| 215 |
def _format_validity_bonus(completion_text: str) -> float:
|
| 216 |
"""Small ± nudge for emitting a structured action.
|
| 217 |
|
| 218 |
Kept intentionally small (≪ terminal_scale) so the policy can't be
|
| 219 |
+
dominated by a "spam well-formed JSON" objective. After Fix #3 the
|
| 220 |
+
positive branch is 1/3 of its v1 value (0.05 vs 0.15) — combined
|
| 221 |
+
with the lower step_reward_clip and the heavier repeat-action
|
| 222 |
+
penalty, this means a model can no longer farm roughly +0.22/step by
|
| 223 |
+
looping a single well-formed action.
|
| 224 |
"""
|
| 225 |
return FORMAT_BONUS_VALID if parse_action(completion_text) is not None else FORMAT_BONUS_INVALID
|
| 226 |
|