anugrahhu committed on
Commit
2b97998
·
verified ·
1 Parent(s): a8d4d87

sft+reward-fix: training/training_script.py

Browse files
Files changed (1) hide show
  1. training/training_script.py +7 -5
training/training_script.py CHANGED
@@ -208,17 +208,19 @@ class RewardComponentAccumulator:
208
  }
209
 
210
 
211
- FORMAT_BONUS_VALID = 0.15
212
- FORMAT_BONUS_INVALID = -0.20
213
 
214
 
215
  def _format_validity_bonus(completion_text: str) -> float:
216
  """Small ± nudge for emitting a structured action.
217
 
218
  Kept intentionally small (≪ terminal_scale) so the policy can't be
219
- dominated by a "spam well-formed JSON" objective. The negative branch
220
- is slightly larger than the positive branch so unparseable garbage
221
- is dispreferred without crowding out the actual task reward.
 
 
222
  """
223
  return FORMAT_BONUS_VALID if parse_action(completion_text) is not None else FORMAT_BONUS_INVALID
224
 
 
208
  }
209
 
210
 
211
+ FORMAT_BONUS_VALID = 0.05 # was 0.15 — Fix #3 (lower per-step floor)
212
+ FORMAT_BONUS_INVALID = -0.20 # kept punitive so unparseable completions still hurt
213
 
214
 
215
  def _format_validity_bonus(completion_text: str) -> float:
216
  """Small ± nudge for emitting a structured action.
217
 
218
  Kept intentionally small (≪ terminal_scale) so the policy can't be
219
+ dominated by a "spam well-formed JSON" objective. After Fix #3 the
220
+ positive branch is 1/3 of its v1 value (0.05 vs 0.15) — combined
221
+ with the lower step_reward_clip and the heavier repeat-action
222
+ penalty, this means a model can no longer farm ~+0.22/step by
223
+ looping a single well-formed action.
224
  """
225
  return FORMAT_BONUS_VALID if parse_action(completion_text) is not None else FORMAT_BONUS_INVALID
226