sql-query-optimizer / env /reward.py
Param20h's picture
Upload folder using huggingface_hub
210535c verified
"""
Shaped reward function for the SQL Query Optimizer environment.
Design:
- Partial credit every step based on grader improvement delta
- Completion bonus when agent signals is_done and score ≥ threshold
- Step penalty for each non-final step taken after half of max_steps has been used
- Invalid action penalty for empty / unparseable queries
"""
from __future__ import annotations
import math
# Reward-shaping constants. All values are in reward units.
_COMPLETION_THRESHOLD = 0.80  # minimum grader score that earns the completion bonus
_COMPLETION_BONUS = 0.50  # flat bonus for finishing at or above the threshold
_STEP_PENALTY = 0.02  # deducted per non-terminal step past the halfway point
_INVALID_PENALTY = 0.10  # magnitude of the (negative) reward for an invalid action
_DELTA_WEIGHT = 0.50  # weight for grader improvement delta in step reward
_FINAL_SCORE_WEIGHT = 0.30  # weight of the grader score added on the terminal step


def compute_step_reward(
    *,
    grader_score: float,
    prev_grader_score: float,
    step_number: int,
    max_steps: int,
    is_done: bool,
    is_invalid: bool,
) -> float:
    """Return the shaped reward for a single environment step.

    An invalid action short-circuits everything else and yields exactly
    ``-_INVALID_PENALTY``. Otherwise the reward is the sum of the components
    below, clamped to ``[-_INVALID_PENALTY, 1.0]`` and rounded to 4 decimals:

    1. Improvement: ``_DELTA_WEIGHT * max(0, grader_score - prev_grader_score)``.
       A score regression contributes 0; it is never negative.
    2. Completion bonus: ``_COMPLETION_BONUS``, only when ``is_done`` and
       ``grader_score >= _COMPLETION_THRESHOLD``.
    3. Terminal score: ``_FINAL_SCORE_WEIGHT * grader_score``, whenever
       ``is_done`` (with or without the completion bonus).
    4. Step penalty: ``-_STEP_PENALTY``, only when
       ``step_number > ceil(max_steps / 2)`` and the step is not terminal.

    Args:
        grader_score: Grader score for the query produced on this step.
        prev_grader_score: Grader score from the previous step.
        step_number: Index of the current step; compared against
            ``ceil(max_steps / 2)`` to decide whether the step penalty applies.
        max_steps: Step budget used to derive the halfway point.
        is_done: Whether the agent signalled completion on this step.
        is_invalid: Whether the action was invalid (e.g. empty or
            unparseable query).

    Returns:
        The step reward, a float in ``[-_INVALID_PENALTY, 1.0]``.
    """
    # Invalid actions get a fixed penalty and no partial credit at all.
    if is_invalid:
        return -_INVALID_PENALTY

    # Only improvements are rewarded; regressions are floored at zero.
    improvement = max(0.0, grader_score - prev_grader_score)
    reward = _DELTA_WEIGHT * improvement

    if is_done:
        if grader_score >= _COMPLETION_THRESHOLD:
            reward += _COMPLETION_BONUS
        # Proportional partial completion signal even without the bonus.
        reward += _FINAL_SCORE_WEIGHT * grader_score

    # The step penalty starts once more than half of max_steps has been used,
    # and is never charged on the terminal step.
    halfway = math.ceil(max_steps / 2)
    if step_number > halfway and not is_done:
        reward -= _STEP_PENALTY

    # Clamp so a single step can never be worse than an invalid action nor
    # exceed 1.0, then round to keep logged rewards compact.
    clamped = min(max(reward, -_INVALID_PENALTY), 1.0)
    return round(clamped, 4)