# shank
# Fix: Changed scores range
# 0769caa
"""
Base Grader — Abstract base class for all graders.
"""
from abc import ABC, abstractmethod
from typing import List, Dict, Any
class BaseGrader(ABC):
    """Abstract base grader. All graders must implement score().

    Besides the abstract ``score`` contract, this class provides two small
    helpers for subclasses: keyword matching against a hypothesis string and
    clamping of a raw score into the reportable range.
    """

    @abstractmethod
    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        """Score an episode.

        Must return a float in [0.0, 1.0] and must be deterministic: the
        same inputs always produce the same output. Note that a score passed
        through ``_clamp`` ends up in the narrower range [0.01, 0.99].

        Args:
            task_config: The full task config dict.
            attempts: List of attempt dicts with code_submitted, hypothesis,
                tests_passed, etc.
            best_tests_passed: Best test pass count across all attempts.
            tests_total: Total tests in the suite.
            attempts_used: Number of fix attempts used.
            max_attempts: Maximum allowed attempts.
            hypotheses: All hypotheses submitted.

        Returns:
            The episode score as a float in [0.0, 1.0].
        """

    def _check_hypothesis_keywords(
        self, hypothesis: str, keywords: List[str], mode: str = "any"
    ) -> bool:
        """Check if a hypothesis matches any/all of the ground truth keywords.

        Matching is a case-insensitive substring test of each keyword against
        the hypothesis text.

        Args:
            hypothesis: The hypothesis text. ``None`` is tolerated and never
                matches.
            keywords: Ground truth keywords to look for.
            mode: ``"any"`` requires at least one keyword to be present,
                ``"all"`` requires every keyword to be present.

        Returns:
            True when the hypothesis satisfies the requested mode. An
            unrecognised mode yields False. With an empty keyword list,
            ``"any"`` yields False and ``"all"`` yields True (vacuous match).
        """
        # A missing hypothesis cannot match anything; without this guard the
        # .lower() call below would raise AttributeError.
        if hypothesis is None:
            return False

        hypothesis_lower = hypothesis.lower()
        if mode == "any":
            return any(kw.lower() in hypothesis_lower for kw in keywords)
        if mode == "all":
            return all(kw.lower() in hypothesis_lower for kw in keywords)
        return False

    def _clamp(self, value: float) -> float:
        """Clamp a value to the closed interval [0.01, 0.99].

        The result therefore always lies strictly inside (0.0, 1.0).

        Args:
            value: The raw score to clamp.

        Returns:
            ``value`` limited to [0.01, 0.99]. NaN maps to the lower bound,
            0.01.
        """
        # NaN is the only float that differs from itself. Without this guard
        # min(0.99, nan) returns 0.99 (every comparison with NaN is false),
        # so an undefined score would be reported as a near-perfect one.
        if value != value:
            return 0.01
        return max(0.01, min(0.99, value))