""" Base Grader — Abstract base class for all graders. """ from abc import ABC, abstractmethod from typing import List, Dict, Any class BaseGrader(ABC): """Abstract base grader. All graders must implement score().""" @abstractmethod def score( self, task_config: dict, attempts: List[Dict[str, Any]], best_tests_passed: int, tests_total: int, attempts_used: int, max_attempts: int, hypotheses: List[str], ) -> float: """ Score an episode. Must return a float in [0.0, 1.0]. Must be deterministic: same inputs → same output. Args: task_config: The full task config dict attempts: List of attempt dicts with code_submitted, hypothesis, tests_passed, etc. best_tests_passed: Best test pass count across all attempts tests_total: Total tests in the suite attempts_used: Number of fix attempts used max_attempts: Maximum allowed attempts hypotheses: All hypotheses submitted Returns: float in [0.0, 1.0] """ pass def _check_hypothesis_keywords( self, hypothesis: str, keywords: List[str], mode: str = "any" ) -> bool: """Check if a hypothesis matches any/all of the ground truth keywords.""" hypothesis_lower = hypothesis.lower() if mode == "any": return any(kw.lower() in hypothesis_lower for kw in keywords) elif mode == "all": return all(kw.lower() in hypothesis_lower for kw in keywords) return False def _clamp(self, value: float) -> float: """Clamp a value to (0.0, 1.0).""" return max(0.01, min(0.99, value))