anugrahhu committed on
Commit
70b06db
·
verified ·
1 Parent(s): d91fe20

sft+reward-fix: server/environment.py

Browse files
Files changed (1) hide show
  1. server/environment.py +22 -0
server/environment.py CHANGED
@@ -258,6 +258,28 @@ class CERNCollisionEnvironment(Environment[ExperimentAction, CollisionObservatio
258
  self._state.correct_mass = term.correct_mass
259
  self._state.correct_channel = term.correct_channel
260
  self._state.correct_spin = term.correct_spin
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  done = terminal_now or time_up
263
  if done:
 
258
  self._state.correct_mass = term.correct_mass
259
  self._state.correct_channel = term.correct_channel
260
  self._state.correct_spin = term.correct_spin
261
+ elif time_up:
262
+ # Fix #1: if the episode runs out of steps/budget/time and the
263
+ # agent never even *attempted* a SUBMIT_DISCOVERY_CLAIM, levy a
264
+ # flat no-claim penalty so claim-avoidance can no longer
265
+ # dominate the per-step shaping reward (the v1 reward hack).
266
+ ever_claimed = any(
267
+ rec.action_type == ActionType.SUBMIT_DISCOVERY_CLAIM
268
+ for rec in self._history
269
+ )
270
+ if not ever_claimed:
271
+ term = compute_terminal_reward(
272
+ state=self._latent,
273
+ claim=None,
274
+ weights=self.reward_weights,
275
+ )
276
+ terminal_reward_value = term.reward
277
+ self._state.cumulative_reward += terminal_reward_value
278
+ self._state.terminal_reward = terminal_reward_value
279
+ self._state.discovered = term.discovered
280
+ self._state.correct_mass = term.correct_mass
281
+ self._state.correct_channel = term.correct_channel
282
+ self._state.correct_spin = term.correct_spin
283
 
284
  done = terminal_now or time_up
285
  if done: