Spaces:
Paused
Paused
sft+reward-fix: server/environment.py
Browse files - server/environment.py +22 -0
server/environment.py
CHANGED
|
@@ -258,6 +258,28 @@ class CERNCollisionEnvironment(Environment[ExperimentAction, CollisionObservatio
|
|
| 258 |
self._state.correct_mass = term.correct_mass
|
| 259 |
self._state.correct_channel = term.correct_channel
|
| 260 |
self._state.correct_spin = term.correct_spin
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
done = terminal_now or time_up
|
| 263 |
if done:
|
|
|
|
| 258 |
self._state.correct_mass = term.correct_mass
|
| 259 |
self._state.correct_channel = term.correct_channel
|
| 260 |
self._state.correct_spin = term.correct_spin
|
| 261 |
+
elif time_up:
|
| 262 |
+
# Fix #1: if the episode runs out of steps/budget/time and the
|
| 263 |
+
# agent never even *attempted* a SUBMIT_DISCOVERY_CLAIM, levy a
|
| 264 |
+
# flat no-claim penalty so claim-avoidance can no longer
|
| 265 |
+
# dominate the per-step shaping reward (the v1 reward hack).
|
| 266 |
+
ever_claimed = any(
|
| 267 |
+
rec.action_type == ActionType.SUBMIT_DISCOVERY_CLAIM
|
| 268 |
+
for rec in self._history
|
| 269 |
+
)
|
| 270 |
+
if not ever_claimed:
|
| 271 |
+
term = compute_terminal_reward(
|
| 272 |
+
state=self._latent,
|
| 273 |
+
claim=None,
|
| 274 |
+
weights=self.reward_weights,
|
| 275 |
+
)
|
| 276 |
+
terminal_reward_value = term.reward
|
| 277 |
+
self._state.cumulative_reward += terminal_reward_value
|
| 278 |
+
self._state.terminal_reward = terminal_reward_value
|
| 279 |
+
self._state.discovered = term.discovered
|
| 280 |
+
self._state.correct_mass = term.correct_mass
|
| 281 |
+
self._state.correct_channel = term.correct_channel
|
| 282 |
+
self._state.correct_spin = term.correct_spin
|
| 283 |
|
| 284 |
done = terminal_now or time_up
|
| 285 |
if done:
|