Spaces:

YashashMathur
/

sql_data_analyst

Sleeping

File size: 5,767 Bytes

611be05

# environment/graders.py
# Deterministic grading system for SQL Data Analyst environment
# Implements type-agnostic normalization and SQL evaluation

from typing import Any, Tuple, Optional
import re


def normalize_value(value: Any) -> str:
    """Normalize a value into a canonical string for comparison.

    The value is converted with ``str()``, stripped, lowercased, and runs of
    whitespace are collapsed to a single space. If the resulting text parses
    as a float, it is rounded to 2 decimal places and rendered via ``str()``,
    so ``5``, ``"5"`` and ``"5.00"`` all normalize to ``"5.0"``.

    Args:
        value: Any value to normalize. ``None`` normalizes to ``""``.

    Returns:
        The normalized string representation.
    """
    if value is None:
        return ""

    # Canonical text form: trimmed, lowercase, single-spaced.
    text = re.sub(r"\s+", " ", str(value).strip().lower())

    try:
        number = float(text)
    except (ValueError, TypeError):
        # Not numeric: the cleaned-up text is the normalized form.
        return text

    rounded = round(number, 2)
    # Tiny negative values round to -0.0, which str() renders as "-0.0" and
    # would therefore not compare equal to "0.0". Collapse signed zero.
    if rounded == 0:
        rounded = 0.0
    return str(rounded)


def extract_numeric(value: str) -> Optional[float]:
    """Parse a number out of a loosely formatted string.

    Dollar signs and thousands separators (commas) are dropped before the
    text is handed to ``float()``.

    Args:
        value: Text that may represent a number, e.g. ``"$1,234.50"``.
            Non-string input is converted with ``str()`` first.

    Returns:
        The parsed float, or ``None`` when the text is not a number.
    """
    text = str(value).strip()

    # Drop currency / grouping symbols that float() would reject.
    for symbol in ("$", ","):
        text = text.replace(symbol, "")

    try:
        number = float(text)
    except (ValueError, TypeError):
        return None
    else:
        return number


def _contains_answer(haystack: str, needle: str) -> bool:
    """Return True if *needle* occurs in *haystack* as a complete value.

    Plain substring search, except that a needle beginning or ending with a
    digit must not be glued to further numeric characters. This stops
    ``"5.0"`` from matching inside ``"15.0"``, ``"5.05"`` or ``"-5.0"``.
    An empty needle never matches.
    """
    if not needle:
        return False

    pattern = re.escape(needle)
    if needle[0].isdigit():
        # No digit, decimal point or sign directly in front of the number.
        pattern = r"(?<![\d.\-])" + pattern
    if needle[-1].isdigit():
        # No further digits (or a fractional part) directly after it.
        pattern = pattern + r"(?!\d|\.\d)"
    return re.search(pattern, haystack) is not None


def compare_values(submitted: Any, ground_truth: Any) -> Tuple[bool, float]:
    """Compare a submitted answer to the ground truth.

    Checks are applied in order:

    1. Equality of the normalized strings.
    2. For ``int``/``float`` ground truths, numeric comparison of the
       submitted value (after stripping ``$`` and ``,``) with an absolute
       tolerance of 0.01; a miss within 10% relative error earns partial
       credit.
    3. Containment: the normalized ground truth appears inside the
       normalized submission as a complete value.

    Args:
        submitted: The agent's submitted answer.
        ground_truth: The expected correct answer.

    Returns:
        Tuple[bool, float]: ``(is_correct, score)`` where score is 0.99 for a
        correct answer, 0.05 for a numeric near miss, and 0.01 otherwise.
    """
    norm_submitted = normalize_value(submitted)
    norm_truth = normalize_value(ground_truth)

    # Direct string comparison after normalization.
    if norm_submitted == norm_truth:
        return True, 0.99

    # Numeric comparison for numeric ground truths.
    if isinstance(ground_truth, (int, float)):
        submitted_num = extract_numeric(submitted)
        if submitted_num is not None:
            truth_num = float(ground_truth)
            difference = abs(submitted_num - truth_num)
            # Allow a small floating point tolerance.
            if difference < 0.01:
                return True, 0.99
            # Partial credit for being close (within 10%); relative error is
            # undefined for a zero ground truth, so skip it there.
            if truth_num != 0 and difference / abs(truth_num) < 0.1:
                return False, 0.05

    # Containment check. A raw ``in`` test is wrong in two ways: an empty
    # ground truth is a substring of every string, and "5.0" is a substring
    # of "15.0". _contains_answer guards against both.
    if _contains_answer(norm_submitted, norm_truth):
        return True, 0.99

    return False, 0.01


def grade_sql_result(
    query_result: str, ground_truth: Any, is_error: bool
) -> Tuple[bool, float]:
    """Grade a SQL query result against ground truth.

    The result text is treated as a pipe-delimited table: blank lines and
    separator/border lines are discarded, the first remaining line is taken
    to be the header, and the second is the data row that gets graded. Each
    non-empty cell of that row is compared to the ground truth and the first
    matching cell wins. Only the first data row is inspected, and
    partial-credit scores from ``compare_values`` are not propagated.

    Args:
        query_result: The result string from executing the SQL query.
        ground_truth: The expected correct answer.
        is_error: Whether the query execution resulted in an error.

    Returns:
        Tuple[bool, float]: ``(is_correct, score)``; ``(False, 0.01)`` when
        the query failed, the result is empty, has no data row, or no cell
        matches.
    """
    # A failed query or a missing/empty result cannot contain the answer.
    if is_error or not query_result:
        return False, 0.01

    def is_separator(line: str) -> bool:
        """Detect table rule lines such as ``|---|---|``, ``| :--- |``, ``+----+``."""
        if line.startswith("|---"):
            return True
        stripped = line.strip()
        return "---" in stripped and all(ch in "|+-: \t" for ch in stripped)

    data_lines = [
        line
        for line in str(query_result).strip().split("\n")
        if line.strip() and not is_separator(line)
    ]

    # Need at least a header line plus one data row.
    if len(data_lines) < 2:
        return False, 0.01

    cells = [cell.strip() for cell in data_lines[1].split("|") if cell.strip()]

    for cell in cells:
        is_correct, score = compare_values(cell, ground_truth)
        if is_correct:
            return True, score

    return False, 0.01


def grade_answer(
    submitted_answer: str, ground_truth: Any, db_engine: Any = None
) -> Tuple[bool, float]:
    """Grade the agent's submitted answer.

    If the submission contains a SQL keyword as a whole word and a database
    engine is supplied, the submission is executed via
    ``db_engine.execute_query`` (which must return ``(result, is_error)``)
    and the query result is graded. Otherwise the submission text itself is
    compared to the ground truth.

    Args:
        submitted_answer: The agent's submitted answer. Non-string values are
            converted with ``str()``.
        ground_truth: The expected correct answer.
        db_engine: Optional database engine for SQL evaluation.

    Returns:
        Tuple[bool, float]: ``(is_correct, score)``; ``(False, 0.01)`` for a
        missing or blank submission.
    """
    if submitted_answer is None:
        return False, 0.01

    submitted = str(submitted_answer).strip()
    if not submitted:
        return False, 0.01

    # Match keywords as whole words only: a plain substring test would treat
    # ordinary answers such as "10 orders", "nowhere" or "groups" as SQL and
    # send them to the database engine.
    is_sql_query = (
        re.search(r"\b(?:SELECT|FROM|WHERE|JOIN|GROUP|ORDER)\b", submitted.upper())
        is not None
    )

    if is_sql_query and db_engine is not None:
        result, is_error = db_engine.execute_query(submitted)
        return grade_sql_result(result, ground_truth, is_error)

    return compare_values(submitted, ground_truth)


def calculate_final_score(
    is_correct: bool, total_steps: int, max_steps: int = 15
) -> float:
    """Calculate the final score for a task.

    A correct answer earns a base score of 0.7 plus an efficiency bonus of up
    to 0.3 that shrinks linearly as ``total_steps`` approaches ``max_steps``.
    The result is clamped to the range [0.01, 0.99].

    Args:
        is_correct: Whether the answer was correct.
        total_steps: Number of steps taken.
        max_steps: Maximum allowed steps. A non-positive value disables the
            efficiency bonus instead of dividing by zero.

    Returns:
        float: 0.01 for an incorrect answer, otherwise a score between 0.7
        and 0.99.
    """
    if not is_correct:
        return 0.01

    base_score = 0.7
    max_bonus = 0.3

    if max_steps > 0:
        efficiency_ratio = 1.0 - (total_steps / max_steps)
        # Using more than max_steps must not reduce the score below base.
        efficiency_bonus = max(0.0, efficiency_ratio * max_bonus)
    else:
        # No meaningful step budget: efficiency cannot be measured.
        efficiency_bonus = 0.0

    final_score = base_score + efficiency_bonus

    # Keep the score strictly inside (0.0, 1.0).
    return min(0.99, max(0.01, final_score))