# environment/graders.py
# Deterministic grading system for the SQL Data Analyst environment.
# Implements type-agnostic normalization and SQL result evaluation.

import re
from typing import Any, Optional, Tuple

def normalize_value(value: Any) -> str:
    """
    Normalize a value for comparison.

    Type-agnostic normalization:
    - Strip leading/trailing whitespace and collapse internal runs
    - Lowercase strings
    - Round numeric values to 2 decimal places

    Args:
        value: Any value to normalize

    Returns:
        str: Normalized string representation
    """
    if value is None:
        return ""
    # Convert to string first
    str_value = str(value).strip().lower()
    # Collapse runs of whitespace into single spaces
    str_value = re.sub(r'\s+', ' ', str_value)
    # Numeric strings normalize through float, so "42" and "42.0" compare equal
    try:
        float_val = float(str_value)
        # Round to 2 decimal places for comparison
        return str(round(float_val, 2))
    except (ValueError, TypeError):
        pass
    return str_value
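
# Illustrative behavior (values hand-traced through the logic above):
#   normalize_value("  Hello   World ") -> "hello world"
#   normalize_value("42")               -> "42.0"  (numeric strings round-trip through float)
#   normalize_value(3.14159)            -> "3.14"  (rounded to 2 decimal places)
#   normalize_value(None)               -> ""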

def extract_numeric(value: str) -> Optional[float]:
    """
    Extract a numeric value from a string.

    Args:
        value: String that may contain a number

    Returns:
        Optional[float]: Extracted number, or None if none is found
    """
    # Strip common formatting such as dollar signs and thousands separators
    cleaned = re.sub(r'[$,]', '', str(value).strip())
    try:
        return float(cleaned)
    except (ValueError, TypeError):
        return None
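
# Illustrative behavior (hand-checked against the regex and float() call):
#   extract_numeric("$1,234.50") -> 1234.5
#   extract_numeric("  42 ")     -> 42.0
#   extract_numeric("n/a")       -> None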

def compare_values(submitted: Any, ground_truth: Any) -> Tuple[bool, float]:
    """
    Compare a submitted answer to the ground truth.

    Args:
        submitted: The agent's submitted answer
        ground_truth: The expected correct answer

    Returns:
        Tuple[bool, float]: (is_correct, score)
        - is_correct: True if the answer matches
        - score: Value between 0.0 and 1.0; a near-miss numeric answer
          (within 10%) earns partial credit of 0.5 while still counting
          as incorrect
    """
    # Normalize both values
    norm_submitted = normalize_value(submitted)
    norm_truth = normalize_value(ground_truth)
    # Direct string comparison after normalization
    if norm_submitted == norm_truth:
        return True, 1.0
    # Numeric comparison for numeric ground truths
    if isinstance(ground_truth, (int, float)):
        submitted_num = extract_numeric(submitted)
        if submitted_num is not None:
            truth_num = float(ground_truth)
            # Allow a small floating-point tolerance
            if abs(submitted_num - truth_num) < 0.01:
                return True, 1.0
            # Partial credit for being close (within 10% relative error)
            if truth_num != 0:
                error_pct = abs(submitted_num - truth_num) / abs(truth_num)
                if error_pct < 0.1:
                    return False, 0.5
    # Substring match: accept if the submitted answer contains the ground
    # truth. Guard against an empty normalized truth, which would otherwise
    # match any submission.
    if norm_truth and norm_truth in norm_submitted:
        return True, 1.0
    return False, 0.0
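
# Illustrative behavior (hand-traced through the branches above):
#   compare_values("42", 42)         -> (True, 1.0)   exact match after normalization
#   compare_values("105", 100)       -> (False, 0.5)  within 10%, partial credit
#   compare_values("Paris", "paris") -> (True, 1.0)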

def grade_sql_result(
    query_result: str,
    ground_truth: Any,
    is_error: bool
) -> Tuple[bool, float]:
    """
    Grade a SQL query result against the ground truth.

    If the agent submits a SQL query as the final answer, this function
    evaluates the query result.

    Args:
        query_result: The result string from executing the SQL query
        ground_truth: The expected correct answer
        is_error: Whether the query execution resulted in an error

    Returns:
        Tuple[bool, float]: (is_correct, score)
    """
    if is_error:
        return False, 0.0
    # Parse the query result, which is a markdown table: | col1 | col2 |
    lines = query_result.strip().split('\n')
    # Drop blank lines and the separator row (e.g. |---|---| or | --- | --- |)
    data_lines = [
        l for l in lines
        if l.strip() and not re.fullmatch(r'[|\s:-]+', l.strip())
    ]
    if len(data_lines) < 2:  # Need at least a header plus one data row
        return False, 0.0
    # The first data row follows the header
    data_row = data_lines[1]
    # Extract the cell values from the row
    values = [v.strip() for v in data_row.split('|') if v.strip()]
    if not values:
        return False, 0.0
    # For single-value answers, the first value usually matches;
    # for multi-column results, try each value
    for value in values:
        is_correct, score = compare_values(value, ground_truth)
        if is_correct:
            return True, score
    return False, 0.0
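
# Illustrative behavior, assuming the engine renders results as a markdown
# table (the result string below is hypothetical):
#   result = "| total |\n| --- |\n| 42 |"
#   grade_sql_result(result, 42, is_error=False) -> (True, 1.0)
#   grade_sql_result("error", 42, is_error=True) -> (False, 0.0)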

def grade_answer(
    submitted_answer: str,
    ground_truth: Any,
    db_engine: Any = None
) -> Tuple[bool, float]:
    """
    Grade the agent's submitted answer.

    This is the main grading function called by the environment.

    Args:
        submitted_answer: The agent's submitted answer string
        ground_truth: The expected correct answer
        db_engine: Optional database engine for SQL evaluation

    Returns:
        Tuple[bool, float]: (is_correct, score)
        - is_correct: True if the answer is correct
        - score: Value between 0.0 and 1.0 inclusive
    """
    if not submitted_answer or not submitted_answer.strip():
        return False, 0.0
    submitted = submitted_answer.strip()
    # Heuristic SQL detection: require SELECT ... FROM as whole words, so
    # prose answers containing words like "from" or "order" are not
    # misclassified as queries
    is_sql_query = bool(
        re.search(r'\bSELECT\b[\s\S]+\bFROM\b', submitted, re.IGNORECASE)
    )
    if is_sql_query and db_engine is not None:
        # Execute the SQL and grade the result
        result, is_error = db_engine.execute_query(submitted)
        return grade_sql_result(result, ground_truth, is_error)
    # Direct answer comparison
    return compare_values(submitted, ground_truth)
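
# Illustrative behavior: with no db_engine, answers are compared directly;
# with an engine exposing execute_query(sql) -> (result_str, is_error), the
# interface assumed by the call above, SQL answers are executed and graded.
#   grade_answer("Paris", "paris") -> (True, 1.0)
#   grade_answer("", "anything")   -> (False, 0.0)
#   grade_answer("SELECT COUNT(*) FROM users", 42, db_engine=engine)
#       -> grades the rendered query result against 42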

def calculate_final_score(
    is_correct: bool,
    total_steps: int,
    max_steps: int = 15
) -> float:
    """
    Calculate the final score for a task.

    Scoring factors:
    - Correctness is primary (0.0 if incorrect)
    - Efficiency bonus for fewer steps

    Args:
        is_correct: Whether the answer was correct
        total_steps: Number of steps taken
        max_steps: Maximum allowed steps

    Returns:
        float: Final score between 0.0 and 1.0
    """
    if not is_correct:
        return 0.0
    # Base score for a correct answer
    base_score = 0.7
    # Efficiency bonus (up to 0.3): fewer steps earn a higher bonus
    efficiency_ratio = 1.0 - (total_steps / max_steps)
    efficiency_bonus = max(0.0, efficiency_ratio * 0.3)
    final_score = base_score + efficiency_bonus
    # Clamp the score to the [0.0, 1.0] range
    return min(1.0, max(0.0, final_score))
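
# A minimal self-check, runnable as `python graders.py`. Added for
# illustration; the asserted values are hand-derived from the functions
# above, not part of the original grading API.
if __name__ == "__main__":
    assert normalize_value("  Hello   World ") == "hello world"
    assert extract_numeric("$1,234.50") == 1234.5
    assert compare_values("42", 42) == (True, 1.0)
    assert compare_values("105", 100) == (False, 0.5)
    assert grade_sql_result("| total |\n| --- |\n| 42 |", 42, False) == (True, 1.0)
    # 5 of 15 steps: 0.7 + (1 - 5/15) * 0.3 = 0.9
    assert abs(calculate_final_score(True, 5) - 0.9) < 1e-9
    print("All grader self-checks passed.")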