# environment/graders.py
# Deterministic grading system for SQL Data Analyst environment
# Implements type-agnostic normalization and SQL evaluation

from typing import Any, Tuple, Optional
import re


def normalize_value(value: Any) -> str:
    """
    Normalize a value for comparison.

    Type-Agnostic Normalization:
    - Strip whitespace
    - Lowercase strings
    - Handle numeric conversions

    Args:
        value: Any value to normalize

    Returns:
        str: Normalized string representation
    """
    if value is None:
        return ""

    # Convert to string first
    str_value = str(value).strip().lower()

    # Remove extra whitespace
    str_value = re.sub(r"\s+", " ", str_value)

    # Try to normalize numeric values
    try:
        # Try float first
        float_val = float(str_value)
        # Round to 2 decimal places for comparison
        return str(round(float_val, 2))
    except (ValueError, TypeError):
        pass

    return str_value


def extract_numeric(value: str) -> Optional[float]:
    """
    Extract a numeric value from a string.

    Args:
        value: String that may contain a number

    Returns:
        Optional[float]: Extracted number or None
    """
    # Remove common formatting
    cleaned = re.sub(r"[$,]", "", str(value).strip())

    try:
        return float(cleaned)
    except (ValueError, TypeError):
        return None


def compare_values(submitted: Any, ground_truth: Any) -> Tuple[bool, float]:
    """
    Compare submitted answer to ground truth.

    Args:
        submitted: The agent's submitted answer
        ground_truth: The expected correct answer

    Returns:
        Tuple[bool, float]: (is_correct, score)
            - is_correct: True if answer matches
            - score: Value strictly between 0.0 and 1.0
    """
    # Normalize both values
    norm_submitted = normalize_value(submitted)
    norm_truth = normalize_value(ground_truth)

    # Direct string comparison after normalization
    if norm_submitted == norm_truth:
        return True, 0.99

    # Try numeric comparison for numeric ground truths
    if isinstance(ground_truth, (int, float)):
        submitted_num = extract_numeric(submitted)
        if submitted_num is not None:
            truth_num = float(ground_truth)
            # Allow small floating point tolerance
            if abs(submitted_num - truth_num) < 0.01:
                return True, 0.99
            # Partial credit for being close (within 10%)
            if truth_num != 0:
                error_pct = abs(submitted_num - truth_num) / abs(truth_num)
                if error_pct < 0.1:
                    return False, 0.05

    # Check if submitted answer contains the ground truth
    if norm_truth in norm_submitted:
        return True, 0.99

    return False, 0.01


def grade_sql_result(
    query_result: str, ground_truth: Any, is_error: bool
) -> Tuple[bool, float]:
    """
    Grade a SQL query result against ground truth.

    Args:
        query_result: The result string from executing the SQL query
        ground_truth: The expected correct answer
        is_error: Whether the query execution resulted in an error

    Returns:
        Tuple[bool, float]: (is_correct, score) - score strictly between 0.0 and 1.0
    """
    if is_error:
        return False, 0.01

    lines = query_result.strip().split("\n")

    data_lines = [l for l in lines if l.strip() and not l.startswith("|---")]

    if len(data_lines) < 2:
        return False, 0.01

    data_row = data_lines[1] if len(data_lines) > 1 else ""

    values = [v.strip() for v in data_row.split("|") if v.strip()]

    if not values:
        return False, 0.01

    for value in values:
        is_correct, score = compare_values(value, ground_truth)
        if is_correct:
            return True, score

    return False, 0.01


def grade_answer(
    submitted_answer: str, ground_truth: Any, db_engine: Any = None
) -> Tuple[bool, float]:
    """
    Grade the agent's submitted answer.

    Args:
        submitted_answer: The agent's submitted answer string
        ground_truth: The expected correct answer
        db_engine: Optional database engine for SQL evaluation

    Returns:
        Tuple[bool, float]: (is_correct, score) - score strictly between 0.0 and 1.0
    """
    if not submitted_answer or not submitted_answer.strip():
        return False, 0.01

    submitted = submitted_answer.strip()

    sql_keywords = ["SELECT", "FROM", "WHERE", "JOIN", "GROUP", "ORDER"]
    is_sql_query = any(keyword in submitted.upper() for keyword in sql_keywords)

    if is_sql_query and db_engine is not None:
        result, is_error = db_engine.execute_query(submitted)
        return grade_sql_result(result, ground_truth, is_error)

    return compare_values(submitted, ground_truth)


def calculate_final_score(
    is_correct: bool, total_steps: int, max_steps: int = 15
) -> float:
    """
    Calculate the final score for a task.

    Args:
        is_correct: Whether the answer was correct
        total_steps: Number of steps taken
        max_steps: Maximum allowed steps

    Returns:
        float: Final score strictly between 0.0 and 1.0
    """
    if not is_correct:
        return 0.01

    base_score = 0.7

    efficiency_ratio = 1.0 - (total_steps / max_steps)
    efficiency_bonus = max(0.0, efficiency_ratio * 0.3)

    final_score = base_score + efficiency_bonus

    # Ensure score is strictly between 0.0 and 1.0
    # Use 0.99 as max to stay strictly under 1.0
    return min(0.99, max(0.01, final_score))