# environment/graders.py
# Deterministic grading system for SQL Data Analyst environment
# Implements type-agnostic normalization and SQL evaluation

import re
from typing import Any, Optional, Tuple


def normalize_value(value: Any) -> str:
    """
    Normalize a value for comparison.

    Type-agnostic normalization:
    - Strip and collapse whitespace
    - Lowercase strings
    - Round numeric values to 2 decimal places

    Args:
        value: Any value to normalize

    Returns:
        str: Normalized string representation
    """
    if value is None:
        return ""

    # Convert to string, trim, and lowercase
    str_value = str(value).strip().lower()

    # Collapse runs of internal whitespace to single spaces
    str_value = re.sub(r'\s+', ' ', str_value)

    # If the value parses as a number, canonicalize it so that
    # "42", "42.0", and "42.004" all normalize to "42.0"
    try:
        float_val = float(str_value)
        return str(round(float_val, 2))
    except (ValueError, TypeError):
        pass

    return str_value


def extract_numeric(value: str) -> Optional[float]:
    """
    Extract a numeric value from a string.

    Args:
        value: String that may contain a number

    Returns:
        Optional[float]: Extracted number, or None if parsing fails
    """
    # Remove common formatting such as currency symbols and
    # thousands separators before parsing
    cleaned = re.sub(r'[$,]', '', str(value).strip())
    try:
        return float(cleaned)
    except (ValueError, TypeError):
        return None


def compare_values(submitted: Any, ground_truth: Any) -> Tuple[bool, float]:
    """
    Compare a submitted answer to the ground truth.

    Args:
        submitted: The agent's submitted answer
        ground_truth: The expected correct answer

    Returns:
        Tuple[bool, float]: (is_correct, score)
            - is_correct: True if the answer matches
            - score: Value between 0.0 and 1.0
    """
    # Normalize both values
    norm_submitted = normalize_value(submitted)
    norm_truth = normalize_value(ground_truth)

    # Direct string comparison after normalization
    if norm_submitted == norm_truth:
        return True, 1.0

    # Numeric comparison for numeric ground truths
    if isinstance(ground_truth, (int, float)):
        submitted_num = extract_numeric(submitted)
        if submitted_num is not None:
            truth_num = float(ground_truth)
            # Allow a small floating-point tolerance
            if abs(submitted_num - truth_num) < 0.01:
                return True, 1.0
            # Partial credit for being close (within 10%)
            if truth_num != 0:
                error_pct = abs(submitted_num - truth_num) / abs(truth_num)
                if error_pct < 0.1:
                    return False, 0.5

    # Accept answers that contain the ground truth as a substring;
    # guard against an empty norm_truth, which would match anything
    if norm_truth and norm_truth in norm_submitted:
        return True, 1.0

    return False, 0.0
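
# Worked examples for the normalization and comparison helpers above.
# The inputs are invented for illustration; the examples are written
# doctest-style for readability, though they live in comments and so
# are not collected by doctest:
#
#   >>> normalize_value("  Widget A  ")      # trimmed and lowercased
#   'widget a'
#   >>> normalize_value("42.004")            # numbers rounded to 2 places
#   '42.0'
#   >>> compare_values("$1,234.50", 1234.5)  # "$" and "," stripped, then matched
#   (True, 1.0)
#   >>> compare_values("95", 100)            # within 10% -> partial credit
#   (False, 0.5)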

def grade_sql_result(
    query_result: str,
    ground_truth: Any,
    is_error: bool
) -> Tuple[bool, float]:
    """
    Grade a SQL query result against the ground truth.

    If the agent submits a SQL query as the final answer, this
    function evaluates the query's result.

    Args:
        query_result: The result string from executing the SQL query
        ground_truth: The expected correct answer
        is_error: Whether the query execution resulted in an error

    Returns:
        Tuple[bool, float]: (is_correct, score)
    """
    if is_error:
        return False, 0.0

    # Parse the query result, which is formatted as a markdown
    # table: | col1 | col2 |
    lines = query_result.strip().split('\n')

    # Drop blank lines and the markdown separator row; separator rows
    # contain only pipes, dashes, colons, and spaces, in any spacing
    # variant (e.g. |---|---| or | --- | --- |)
    data_lines = [
        line for line in lines
        if line.strip() and not set(line.strip()) <= set('|-: ')
    ]

    if len(data_lines) < 2:
        # Need at least a header row plus one data row
        return False, 0.0

    # The first remaining line is the header; take the first data row
    data_row = data_lines[1]

    # Extract cell values from the row
    values = [v.strip() for v in data_row.split('|') if v.strip()]

    if not values:
        return False, 0.0

    # For single-value answers this compares the only cell;
    # for multi-column results, try each cell in turn
    for value in values:
        is_correct, score = compare_values(value, ground_truth)
        if is_correct:
            return True, score

    return False, 0.0


def grade_answer(
    submitted_answer: str,
    ground_truth: Any,
    db_engine: Any = None
) -> Tuple[bool, float]:
    """
    Grade the agent's submitted answer.

    This is the main grading function called by the environment.

    Args:
        submitted_answer: The agent's submitted answer string
        ground_truth: The expected correct answer
        db_engine: Optional database engine for SQL evaluation

    Returns:
        Tuple[bool, float]: (is_correct, score)
            - is_correct: True if the answer is correct
            - score: Value between 0.0 and 1.0
    """
    if not submitted_answer or not submitted_answer.strip():
        return False, 0.0

    submitted = submitted_answer.strip()

    # Heuristic SQL detection: require SELECT plus at least one other
    # query keyword, so prose answers that merely mention words like
    # "group" or "order" are not misclassified as SQL
    upper = submitted.upper()
    is_sql_query = 'SELECT' in upper and any(
        keyword in upper
        for keyword in ('FROM', 'WHERE', 'JOIN', 'GROUP BY', 'ORDER BY')
    )

    if is_sql_query and db_engine is not None:
        # Execute the SQL and grade the query result
        result, is_error = db_engine.execute_query(submitted)
        return grade_sql_result(result, ground_truth, is_error)

    # Otherwise, compare the answer directly
    return compare_values(submitted, ground_truth)


def calculate_final_score(
    is_correct: bool,
    total_steps: int,
    max_steps: int = 15
) -> float:
    """
    Calculate the final score for a task.

    Scoring factors:
    - Correctness is primary (0.0 if incorrect)
    - Efficiency bonus for finishing in fewer steps

    Args:
        is_correct: Whether the answer was correct
        total_steps: Number of steps taken
        max_steps: Maximum allowed steps

    Returns:
        float: Final score between 0.0 and 1.0
    """
    if not is_correct:
        return 0.0

    # Base score for a correct answer
    base_score = 0.7

    # Efficiency bonus (up to 0.3): fewer steps earn a larger bonus
    efficiency_ratio = 1.0 - (total_steps / max_steps)
    efficiency_bonus = max(0.0, efficiency_ratio * 0.3)

    final_score = base_score + efficiency_bonus

    # Clamp the score to the [0.0, 1.0] range
    return min(1.0, max(0.0, final_score))
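
# A minimal smoke test, kept behind a __main__ guard so importing this
# module stays side-effect free. The fake table below only mimics the
# markdown layout grade_sql_result expects; it is invented data, and the
# expected outputs in the comments assume the defaults defined above.
if __name__ == "__main__":
    fake_result = (
        "| region | total_sales |\n"
        "|---|---|\n"
        "| west | 1234.50 |"
    )
    print(grade_sql_result(fake_result, 1234.5, is_error=False))  # (True, 1.0)
    print(grade_answer("1,234.50", 1234.5))                       # (True, 1.0)
    print(calculate_final_score(True, total_steps=5))             # ~0.9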