Spaces:
Sleeping
Sleeping
| """ | |
| Shaped reward function for the SQL Query Optimizer environment. | |
| Design: | |
| - Partial credit every step based on grader improvement delta | |
| - Completion bonus when agent signals is_done and score ≥ threshold | |
| - Step penalty for unnecessary steps beyond task minimum | |
| - Invalid action penalty for empty / unparseable queries | |
| """ | |
| from __future__ import annotations | |
| import math | |
| _COMPLETION_THRESHOLD = 0.80 | |
| _COMPLETION_BONUS = 0.50 | |
| _STEP_PENALTY = 0.02 | |
| _INVALID_PENALTY = 0.10 | |
| _DELTA_WEIGHT = 0.50 # weight for grader improvement delta in step reward | |
| def compute_step_reward( | |
| *, | |
| grader_score: float, | |
| prev_grader_score: float, | |
| step_number: int, | |
| max_steps: int, | |
| is_done: bool, | |
| is_invalid: bool, | |
| ) -> float: | |
| """ | |
| Returns a reward in [-0.10, 1.0] for a single step. | |
| Components (all summed then clamped to [0, 1]): | |
| 1. delta_reward = _DELTA_WEIGHT * max(0, grader_score - prev_grader_score) | |
| 2. completion_bonus (only if is_done and grader_score >= threshold) | |
| 3. step_penalty (only if step > min_steps_expected and not done-early) | |
| 4. invalid_penalty (if query is empty / not parseable) | |
| """ | |
| if is_invalid: | |
| return -_INVALID_PENALTY | |
| delta = max(0.0, grader_score - prev_grader_score) | |
| reward = _DELTA_WEIGHT * delta | |
| if is_done: | |
| if grader_score >= _COMPLETION_THRESHOLD: | |
| reward += _COMPLETION_BONUS | |
| # proportional partial completion signal even without bonus | |
| reward += grader_score * 0.30 | |
| # Step penalty starts after half of max_steps used | |
| halfway = math.ceil(max_steps / 2) | |
| if step_number > halfway and not is_done: | |
| reward -= _STEP_PENALTY | |
| return round(min(max(reward, -_INVALID_PENALTY), 1.0), 4) | |