File size: 1,771 Bytes
0ee66d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0769caa
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
"""
Base Grader — Abstract base class for all graders.
"""

from abc import ABC, abstractmethod
from typing import List, Dict, Any


class BaseGrader(ABC):
    """Abstract base grader.

    Concrete graders must implement :meth:`score`. The class also provides
    two small helpers for subclasses: keyword matching against a hypothesis
    string and clamping a raw score into a bounded range.
    """

    # Bounds applied by _clamp(); the result never reaches exactly 0.0 or 1.0.
    _SCORE_FLOOR = 0.01
    _SCORE_CEILING = 0.99

    @abstractmethod
    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        """
        Score an episode. Must return a float in [0.0, 1.0].
        Must be deterministic: same inputs → same output.

        Args:
            task_config: The full task config dict
            attempts: List of attempt dicts with code_submitted, hypothesis, tests_passed, etc.
            best_tests_passed: Best test pass count across all attempts
            tests_total: Total tests in the suite
            attempts_used: Number of fix attempts used
            max_attempts: Maximum allowed attempts
            hypotheses: All hypotheses submitted

        Returns:
            float in [0.0, 1.0]
        """

    def _check_hypothesis_keywords(
        self, hypothesis: str, keywords: List[str], mode: str = "any"
    ) -> bool:
        """Check a hypothesis against the ground truth keywords.

        Matching is a case-insensitive substring test.

        Args:
            hypothesis: Free-text hypothesis to inspect. A value that is not
                a string (e.g. ``None``) never matches.
            keywords: Keywords to look for inside the hypothesis.
            mode: ``"any"`` requires at least one keyword to be present,
                ``"all"`` requires every keyword to be present. Note that
                ``"all"`` with an empty keyword list is vacuously True.

        Returns:
            True when the hypothesis satisfies the requested mode; False
            otherwise, including for an unrecognised ``mode``.
        """
        # A missing hypothesis cannot match anything; returning False here
        # avoids an AttributeError on ``None.lower()``.
        if not isinstance(hypothesis, str):
            return False

        haystack = hypothesis.lower()
        hits = (kw.lower() in haystack for kw in keywords)
        if mode == "any":
            return any(hits)
        if mode == "all":
            return all(hits)
        return False

    def _clamp(self, value: float) -> float:
        """Clamp a value to the closed range [0.01, 0.99].

        Args:
            value: Raw score to bound.

        Returns:
            ``value`` limited to [0.01, 0.99]. NaN is mapped to the lower
            bound so that an invalid computation cannot earn a high score.
        """
        # NaN is the only float that is not equal to itself. Without this
        # guard, min(0.99, nan) evaluates to 0.99 and NaN would be rewarded
        # with the maximum score.
        if value != value:
            return self._SCORE_FLOOR if self is not None else 0.01
        return max(0.01, min(0.99, value))