Imaginephoenix committed on
Commit
636d085
·
verified ·
1 Parent(s): 7d127dd

Update environment.py

Browse files
Files changed (1) hide show
  1. environment.py +40 -2
environment.py CHANGED
@@ -152,7 +152,9 @@ class EmailTriageEnv:
152
 
153
  base_result = self._grade_current_step(validated_action)
154
  base_score = base_result.score
155
-
 
 
156
  truth_for_step = (
157
  self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
158
  if self._ground_truth
@@ -166,8 +168,9 @@ class EmailTriageEnv:
166
 
167
  penalties = self._compute_penalties(validated_action)
168
  trajectory_bonus = self._compute_trajectory_bonus()
 
169
  final_reward = self._clip_reward(
170
- base_score - (self._current_step * 0.01) + trajectory_bonus - penalties
171
  )
172
 
173
  self._reward_history.append(final_reward)
@@ -193,6 +196,8 @@ class EmailTriageEnv:
193
  "emails_processed": min(self._current_index, len(self._emails)),
194
  "emails_remaining": max(len(self._emails) - self._current_index, 0),
195
  "base_score": round(base_score, 4),
 
 
196
  "penalties": round(penalties, 4),
197
  "trajectory_bonus": round(trajectory_bonus, 4),
198
  "grading_feedback": base_result.feedback,
@@ -382,6 +387,39 @@ class EmailTriageEnv:
382
 
383
  return penalty_total
384
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
  def _compute_trajectory_bonus(self) -> float:
386
  """Return trajectory bonus when episode completion quality is high.
387
 
 
152
 
153
  base_result = self._grade_current_step(validated_action)
154
  base_score = base_result.score
155
+ previous_base_score = self._base_score_history[-1] if self._base_score_history else None
156
+ progress_signal = self._compute_progress_signal(base_score, previous_base_score)
157
+
158
  truth_for_step = (
159
  self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
160
  if self._ground_truth
 
168
 
169
  penalties = self._compute_penalties(validated_action)
170
  trajectory_bonus = self._compute_trajectory_bonus()
171
+ step_cost = self._compute_step_cost()
172
  final_reward = self._clip_reward(
173
+ base_score + progress_signal + trajectory_bonus - penalties - step_cost
174
  )
175
 
176
  self._reward_history.append(final_reward)
 
196
  "emails_processed": min(self._current_index, len(self._emails)),
197
  "emails_remaining": max(len(self._emails) - self._current_index, 0),
198
  "base_score": round(base_score, 4),
199
+ "progress_signal": round(progress_signal, 4),
200
+ "step_cost": round(step_cost, 4),
201
  "penalties": round(penalties, 4),
202
  "trajectory_bonus": round(trajectory_bonus, 4),
203
  "grading_feedback": base_result.feedback,
 
387
 
388
  return penalty_total
389
 
390
+ def _compute_progress_signal(
391
+ self,
392
+ base_score: float,
393
+ previous_base_score: float | None,
394
+ ) -> float:
395
+ """Compute dense partial-progress reward independent of final completion.
396
+
397
+ Args:
398
+ base_score: Current-step base grade in [0.0, 1.0].
399
+ previous_base_score: Previous step base grade when available.
400
+
401
+ Returns:
402
+ Small positive/negative signal reflecting progress and quality trend.
403
+ """
404
+ total_emails = max(len(self._emails), 1)
405
+ progress_ratio = min(1.0, (self._current_index + 1) / total_emails)
406
+
407
+ completion_signal = 0.05 * progress_ratio
408
+ quality_signal = 0.05 * self._clip_reward(base_score)
409
+
410
+ trend_signal = 0.0
411
+ if previous_base_score is not None:
412
+ delta = base_score - previous_base_score
413
+ trend_signal = max(-0.02, min(0.03, delta * 0.1))
414
+
415
+ return completion_signal + quality_signal + trend_signal
416
+
417
+ def _compute_step_cost(self) -> float:
418
+ """Return a gentle efficiency cost that grows with episode length."""
419
+ normalized_step = self._current_step / max(self._max_steps, 1)
420
+ return 0.005 + (0.01 * normalized_step)
421
+
422
+
423
  def _compute_trajectory_bonus(self) -> float:
424
  """Return trajectory bonus when episode completion quality is high.
425