# Scrape residue (not part of the module): Spaces status "Sleeping"; file size 5,767 bytes; ref 611be05.
# environment/graders.py
# Deterministic grading system for SQL Data Analyst environment
# Implements type-agnostic normalization and SQL evaluation
from typing import Any, Tuple, Optional
import re
def normalize_value(value: Any) -> str:
    """
    Normalize a value into a canonical string for comparison.

    Type-Agnostic Normalization:
    - ``None`` becomes the empty string
    - Leading/trailing whitespace is stripped and internal whitespace runs
      are collapsed to a single space
    - Text is lowercased
    - Anything ``float()`` can parse is rounded to 2 decimal places and
      rendered through ``str(float)`` (so ``5`` becomes ``"5.0"``)

    Args:
        value: Any value to normalize

    Returns:
        str: Normalized string representation
    """
    if value is None:
        return ""

    # Work on the textual form so ints, floats and strings share one path.
    text = re.sub(r"\s+", " ", str(value).strip().lower())

    try:
        number = float(text)
    except (ValueError, TypeError):
        # Not numeric: the cleaned-up text is the normalized form.
        return text

    # Adding 0.0 folds IEEE-754 negative zero into positive zero, so small
    # negatives that round to zero (e.g. -0.001) yield "0.0" instead of
    # "-0.0" and compare equal to a plain zero.
    return str(round(number, 2) + 0.0)
def extract_numeric(value: str) -> Optional[float]:
    """
    Parse a number out of a loosely formatted string.

    The input is passed through ``str`` and stripped, then every dollar sign
    and comma is removed before the text is handed to ``float``.

    Args:
        value: String that may contain a number

    Returns:
        Optional[float]: The parsed number, or None when the cleaned text is
        not something ``float`` accepts
    """
    text = str(value).strip()

    # Currency symbols and thousands separators would make float() reject
    # otherwise valid numbers such as "$1,234.50".
    for symbol in ("$", ","):
        text = text.replace(symbol, "")

    try:
        number = float(text)
    except (ValueError, TypeError):
        number = None
    return number
def compare_values(submitted: Any, ground_truth: Any) -> Tuple[bool, float]:
    """
    Compare submitted answer to ground truth.

    Checks are applied in order; the first one that decides wins:
    1. Equality of the two values after ``normalize_value``.
    2. For ``int``/``float`` ground truths, numeric comparison of the whole
       submission (via ``extract_numeric``): within 0.01 is correct, within
       10% relative error earns partial credit but is still incorrect.
    3. Containment: the submission mentions the ground truth. Numeric truths
       are matched against whole numbers found in the submission (so a truth
       of 5 is not "found" inside 15 or -5); textual truths use a substring
       test. An empty ground truth never matches by containment.

    Args:
        submitted: The agent's submitted answer
        ground_truth: The expected correct answer

    Returns:
        Tuple[bool, float]: (is_correct, score)
        - is_correct: True if answer matches
        - score: Value strictly between 0.0 and 1.0
          (0.99 correct, 0.05 near miss, 0.01 otherwise)
    """
    norm_submitted = normalize_value(submitted)
    norm_truth = normalize_value(ground_truth)

    # Direct string comparison after normalization
    if norm_submitted == norm_truth:
        return True, 0.99

    # Numeric comparison of the whole submission for numeric ground truths
    if isinstance(ground_truth, (int, float)):
        submitted_num = extract_numeric(submitted)
        if submitted_num is not None:
            truth_num = float(ground_truth)

            # Allow small floating point tolerance
            if abs(submitted_num - truth_num) < 0.01:
                return True, 0.99

            # Partial credit for being close (within 10%); relative error is
            # undefined for a zero truth, so that case is skipped.
            if truth_num != 0:
                error_pct = abs(submitted_num - truth_num) / abs(truth_num)
                if error_pct < 0.1:
                    return False, 0.05

    # An empty truth is a substring of every string; without this guard any
    # non-empty submission would be graded correct.
    if not norm_truth:
        return False, 0.01

    # normalize_value renders numbers via str(float): "5.0", "-5.0", "1e+20".
    truth_is_numeric = (
        re.fullmatch(r"-?\d+(?:\.\d+)?(?:e[-+]?\d+)?", norm_truth) is not None
    )

    if not truth_is_numeric:
        # Textual truth: plain containment in the normalized submission.
        if norm_truth in norm_submitted:
            return True, 0.99
        return False, 0.01

    # Numeric truth: compare against each complete number in the submission
    # rather than raw substrings, so "5.0" does not match inside "15.0".
    # The lookbehind keeps a match from starting mid-word or mid-decimal; a
    # leading "-" counts as a sign only when it is not glued to a word.
    number_token = (
        r"(?<![\w.])-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[-+]?\d+)?"
    )
    for token in re.findall(number_token, norm_submitted):
        if normalize_value(token.replace(",", "")) == norm_truth:
            return True, 0.99

    return False, 0.01
def _is_separator_row(line: str) -> bool:
    """Return True when *line* is a table divider such as ``|---|---|``."""
    stripped = line.strip()
    if stripped.startswith("|---"):
        return True
    # Padded or alignment-marked dividers, e.g. "| --- | :---: |": the row
    # starts with a pipe and holds nothing but pipes, dashes, colons, spaces.
    return (
        stripped.startswith("|")
        and "---" in stripped
        and set(stripped) <= set("|-: ")
    )


def grade_sql_result(
    query_result: str, ground_truth: Any, is_error: bool
) -> Tuple[bool, float]:
    """
    Grade a SQL query result against ground truth.

    The result text is read as a pipe-delimited table: blank lines and
    divider rows are discarded, the first remaining line is skipped as the
    header, and only the next line (the first data row) is graded. Each
    non-empty cell of that row is passed to ``compare_values``; the query is
    correct as soon as one cell is judged correct. Partial-credit scores from
    ``compare_values`` are not propagated.

    NOTE(review): the exact table format produced by the database engine is
    not visible here - confirm it matches this parsing.

    Args:
        query_result: The result string from executing the SQL query
        ground_truth: The expected correct answer
        is_error: Whether the query execution resulted in an error

    Returns:
        Tuple[bool, float]: (is_correct, score) - score strictly between 0.0 and 1.0
    """
    if is_error:
        return False, 0.01

    data_lines = [
        line
        for line in query_result.strip().split("\n")
        if line.strip() and not _is_separator_row(line)
    ]

    # Need a header line plus at least one data row.
    if len(data_lines) < 2:
        return False, 0.01

    first_row = data_lines[1]
    values = [cell.strip() for cell in first_row.split("|") if cell.strip()]

    for value in values:
        is_correct, score = compare_values(value, ground_truth)
        if is_correct:
            return True, score

    # No cell matched (this also covers a row with no non-empty cells).
    return False, 0.01
def grade_answer(
    submitted_answer: str, ground_truth: Any, db_engine: Any = None
) -> Tuple[bool, float]:
    """
    Grade the agent's submitted answer.

    A submission is treated as a SQL query only when it contains ``SELECT``
    as a whole word (case-insensitive) and a database engine is supplied; it
    is then executed and the result graded with ``grade_sql_result``. Every
    other non-empty submission is compared directly to the ground truth with
    ``compare_values``.

    Matching on the whole word ``SELECT`` keeps ordinary prose that merely
    contains letters such as "from", "where", "order" or "group"
    (e.g. "I am from Paris", "Total orders: 5") from being sent to the
    database and scored as a failed query.

    Args:
        submitted_answer: The agent's submitted answer string
        ground_truth: The expected correct answer
        db_engine: Optional database engine for SQL evaluation; must provide
            ``execute_query(sql)`` returning ``(result, is_error)``

    Returns:
        Tuple[bool, float]: (is_correct, score) - score strictly between 0.0 and 1.0
    """
    # Missing or whitespace-only submissions get the minimum score.
    if not submitted_answer or not submitted_answer.strip():
        return False, 0.01

    submitted = submitted_answer.strip()

    looks_like_sql = (
        re.search(r"\bSELECT\b", submitted, re.IGNORECASE) is not None
    )

    if looks_like_sql and db_engine is not None:
        result, is_error = db_engine.execute_query(submitted)
        return grade_sql_result(result, ground_truth, is_error)

    return compare_values(submitted, ground_truth)
def calculate_final_score(
    is_correct: bool, total_steps: int, max_steps: int = 15
) -> float:
    """
    Calculate the final score for a task.

    A correct answer earns a base of 0.7 plus an efficiency bonus of up to
    0.3 that shrinks linearly as ``total_steps`` approaches ``max_steps`` and
    is zero at or beyond it. An incorrect answer always scores 0.01.

    Args:
        is_correct: Whether the answer was correct
        total_steps: Number of steps taken
        max_steps: Maximum allowed steps; when it is zero or negative no
            efficiency bonus is awarded

    Returns:
        float: Final score strictly between 0.0 and 1.0
    """
    if not is_correct:
        return 0.01

    base_score = 0.7

    if max_steps > 0:
        efficiency_ratio = 1.0 - (total_steps / max_steps)
        efficiency_bonus = max(0.0, efficiency_ratio * 0.3)
    else:
        # No step budget to measure against; avoid dividing by zero.
        efficiency_bonus = 0.0

    final_score = base_score + efficiency_bonus

    # Clamp into [0.01, 0.99] so the score stays strictly inside (0.0, 1.0).
    return min(0.99, max(0.01, final_score))
|