Spaces:
Runtime error
Runtime error
Update environment.py
Browse files- environment.py +40 -2
environment.py
CHANGED
|
@@ -152,7 +152,9 @@ class EmailTriageEnv:
|
|
| 152 |
|
| 153 |
base_result = self._grade_current_step(validated_action)
|
| 154 |
base_score = base_result.score
|
| 155 |
-
|
|
|
|
|
|
|
| 156 |
truth_for_step = (
|
| 157 |
self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
|
| 158 |
if self._ground_truth
|
|
@@ -166,8 +168,9 @@ class EmailTriageEnv:
|
|
| 166 |
|
| 167 |
penalties = self._compute_penalties(validated_action)
|
| 168 |
trajectory_bonus = self._compute_trajectory_bonus()
|
|
|
|
| 169 |
final_reward = self._clip_reward(
|
| 170 |
-
base_score
|
| 171 |
)
|
| 172 |
|
| 173 |
self._reward_history.append(final_reward)
|
|
@@ -193,6 +196,8 @@ class EmailTriageEnv:
|
|
| 193 |
"emails_processed": min(self._current_index, len(self._emails)),
|
| 194 |
"emails_remaining": max(len(self._emails) - self._current_index, 0),
|
| 195 |
"base_score": round(base_score, 4),
|
|
|
|
|
|
|
| 196 |
"penalties": round(penalties, 4),
|
| 197 |
"trajectory_bonus": round(trajectory_bonus, 4),
|
| 198 |
"grading_feedback": base_result.feedback,
|
|
@@ -382,6 +387,39 @@ class EmailTriageEnv:
|
|
| 382 |
|
| 383 |
return penalty_total
|
| 384 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
def _compute_trajectory_bonus(self) -> float:
|
| 386 |
"""Return trajectory bonus when episode completion quality is high.
|
| 387 |
|
|
|
|
| 152 |
|
| 153 |
base_result = self._grade_current_step(validated_action)
|
| 154 |
base_score = base_result.score
|
| 155 |
+
previous_base_score = self._base_score_history[-1] if self._base_score_history else None
|
| 156 |
+
progress_signal = self._compute_progress_signal(base_score, previous_base_score)
|
| 157 |
+
|
| 158 |
truth_for_step = (
|
| 159 |
self._ground_truth[min(self._current_index, len(self._ground_truth) - 1)]
|
| 160 |
if self._ground_truth
|
|
|
|
| 168 |
|
| 169 |
penalties = self._compute_penalties(validated_action)
|
| 170 |
trajectory_bonus = self._compute_trajectory_bonus()
|
| 171 |
+
step_cost = self._compute_step_cost()
|
| 172 |
final_reward = self._clip_reward(
|
| 173 |
+
base_score + progress_signal + trajectory_bonus - penalties - step_cost
|
| 174 |
)
|
| 175 |
|
| 176 |
self._reward_history.append(final_reward)
|
|
|
|
| 196 |
"emails_processed": min(self._current_index, len(self._emails)),
|
| 197 |
"emails_remaining": max(len(self._emails) - self._current_index, 0),
|
| 198 |
"base_score": round(base_score, 4),
|
| 199 |
+
"progress_signal": round(progress_signal, 4),
|
| 200 |
+
"step_cost": round(step_cost, 4),
|
| 201 |
"penalties": round(penalties, 4),
|
| 202 |
"trajectory_bonus": round(trajectory_bonus, 4),
|
| 203 |
"grading_feedback": base_result.feedback,
|
|
|
|
| 387 |
|
| 388 |
return penalty_total
|
| 389 |
|
| 390 |
+
def _compute_progress_signal(
|
| 391 |
+
self,
|
| 392 |
+
base_score: float,
|
| 393 |
+
previous_base_score: float | None,
|
| 394 |
+
) -> float:
|
| 395 |
+
"""Compute dense partial-progress reward independent of final completion.
|
| 396 |
+
|
| 397 |
+
Args:
|
| 398 |
+
base_score: Current-step base grade in [0.0, 1.0].
|
| 399 |
+
previous_base_score: Previous step base grade when available.
|
| 400 |
+
|
| 401 |
+
Returns:
|
| 402 |
+
Small positive/negative signal reflecting progress and quality trend.
|
| 403 |
+
"""
|
| 404 |
+
total_emails = max(len(self._emails), 1)
|
| 405 |
+
progress_ratio = min(1.0, (self._current_index + 1) / total_emails)
|
| 406 |
+
|
| 407 |
+
completion_signal = 0.05 * progress_ratio
|
| 408 |
+
quality_signal = 0.05 * self._clip_reward(base_score)
|
| 409 |
+
|
| 410 |
+
trend_signal = 0.0
|
| 411 |
+
if previous_base_score is not None:
|
| 412 |
+
delta = base_score - previous_base_score
|
| 413 |
+
trend_signal = max(-0.02, min(0.03, delta * 0.1))
|
| 414 |
+
|
| 415 |
+
return completion_signal + quality_signal + trend_signal
|
| 416 |
+
|
| 417 |
+
def _compute_step_cost(self) -> float:
    """Return a gentle efficiency cost that grows with episode length.

    The cost is ``0.005`` plus ``0.01`` times the fraction of the step
    budget consumed (``_current_step / _max_steps``). A step budget below 1
    is treated as 1 so the division is always defined, and the fraction is
    capped at ``1.0`` so the cost never exceeds ``0.015`` even if the step
    counter runs past the budget.

    Returns:
        Per-step cost, at most ``0.015``.
    """
    step_budget = max(self._max_steps, 1)
    normalized_step = min(1.0, self._current_step / step_budget)
    return 0.005 + (0.01 * normalized_step)
|
| 421 |
+
|
| 422 |
+
|
| 423 |
def _compute_trajectory_bonus(self) -> float:
|
| 424 |
"""Return trajectory bonus when episode completion quality is high.
|
| 425 |
|