"""
Base Grader — Abstract base class for all graders.
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any
class BaseGrader(ABC):
    """Abstract base grader. All graders must implement score()."""

    @abstractmethod
    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        """
        Score an episode. Must return a float in [0.0, 1.0].

        Must be deterministic: same inputs → same output.

        Args:
            task_config: The full task config dict
            attempts: List of attempt dicts with code_submitted, hypothesis,
                tests_passed, etc.
            best_tests_passed: Best test pass count across all attempts
            tests_total: Total tests in the suite
            attempts_used: Number of fix attempts used
            max_attempts: Maximum allowed attempts
            hypotheses: All hypotheses submitted

        Returns:
            float in [0.0, 1.0]
        """

    def _check_hypothesis_keywords(
        self, hypothesis: str, keywords: List[str], mode: str = "any"
    ) -> bool:
        """Check if a hypothesis matches any/all of the ground truth keywords.

        Matching is a case-insensitive substring test of each keyword against
        the hypothesis text.

        Args:
            hypothesis: Free-text hypothesis to inspect. A non-string value
                (for example ``None`` when no hypothesis was submitted) is
                treated as matching nothing.
            keywords: Ground truth keywords to look for.
            mode: ``"any"`` requires at least one keyword to occur,
                ``"all"`` requires every keyword to occur.

        Returns:
            True when the hypothesis satisfies the requested mode; False
            otherwise, including for an unrecognised ``mode``. With an empty
            ``keywords`` list, ``"any"`` yields False and ``"all"`` yields
            True.
        """
        # A missing hypothesis cannot match anything; returning False here
        # avoids an AttributeError on .lower() aborting the whole grading run.
        if not isinstance(hypothesis, str):
            return False

        hypothesis_lower = hypothesis.lower()
        if mode == "any":
            return any(kw.lower() in hypothesis_lower for kw in keywords)
        if mode == "all":
            return all(kw.lower() in hypothesis_lower for kw in keywords)
        # Unknown modes are deliberately a non-match rather than an error.
        return False

    def _clamp(self, value: float) -> float:
        """Clamp a value to the closed range [0.01, 0.99].

        The result therefore always lies strictly inside (0.0, 1.0).

        Args:
            value: Raw score to clamp.

        Returns:
            ``value`` limited to [0.01, 0.99]. NaN is mapped to 0.01.
        """
        # NaN is the only value that is not equal to itself. Without this
        # guard min(0.99, nan) evaluates to 0.99, so an invalid score would
        # silently become the best possible score.
        if value != value:
            return 0.01
        return max(0.01, min(0.99, value))
|