Spaces:

Param20h
/

sql-query-optimizer

Sleeping

App Files Files Community

sql-query-optimizer / env /reward.py

Param20h

Upload folder using huggingface_hub

210535c verified 15 days ago

raw

history blame contribute delete

1.76 kB

	"""
	Shaped reward function for the SQL Query Optimizer environment.

	Design:
	- Partial credit every step based on grader improvement delta
	- Completion bonus when agent signals is_done and score ≥ threshold
	- Step penalty for each non-terminal step taken after half of max_steps has been used
	- Invalid action penalty for empty / unparseable queries
	"""
	from __future__ import annotations

	import math


	_COMPLETION_THRESHOLD = 0.80
	_COMPLETION_BONUS = 0.50
	_STEP_PENALTY = 0.02
	_INVALID_PENALTY = 0.10
	_DELTA_WEIGHT = 0.50 # weight for grader improvement delta in step reward


	def compute_step_reward(
	*,
	grader_score: float,
	prev_grader_score: float,
	step_number: int,
	max_steps: int,
	is_done: bool,
	is_invalid: bool,
	) -> float:
	"""
	Returns a reward in [-0.10, 1.0] for a single step.

	Components (all summed then clamped to [0, 1]):
	1. delta_reward = _DELTA_WEIGHT * max(0, grader_score - prev_grader_score)
	2. completion_bonus (only if is_done and grader_score >= threshold)
	3. step_penalty (only if step > min_steps_expected and not done-early)
	4. invalid_penalty (if query is empty / not parseable)
	"""
	if is_invalid:
	return -_INVALID_PENALTY

	delta = max(0.0, grader_score - prev_grader_score)
	reward = _DELTA_WEIGHT * delta

	if is_done:
	if grader_score >= _COMPLETION_THRESHOLD:
	reward += _COMPLETION_BONUS
	# proportional partial completion signal even without bonus
	reward += grader_score * 0.30

	# Step penalty starts after half of max_steps used
	halfway = math.ceil(max_steps / 2)
	if step_number > halfway and not is_done:
	reward -= _STEP_PENALTY

	return round(min(max(reward, -_INVALID_PENALTY), 1.0), 4)