File size: 1,771 Bytes
0ee66d2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | """
Grader Easy — Standard scoring formula for the binary search task.
Formula: score = 0.60 * test_pass_ratio + 0.20 * efficiency + 0.15 * hypothesis_ratio + 0.05 * early_solve
"""
import math
from typing import List, Dict, Any
from env.graders.base_grader import BaseGrader
class EasyGrader(BaseGrader):
    """Grade an episode as a weighted sum of four components.

    Weights: 0.60 test pass ratio, 0.20 efficiency, 0.15 hypothesis
    accuracy, and a flat 0.05 early-solve bonus.
    """

    def score(
        self,
        task_config: dict,
        attempts: List[Dict[str, Any]],
        best_tests_passed: int,
        tests_total: int,
        attempts_used: int,
        max_attempts: int,
        hypotheses: List[str],
    ) -> float:
        """Return the weighted score for one episode.

        Args:
            task_config: Must contain ``task_config["ground_truth"]
                ["hypothesis_keywords"]``; the keywords are forwarded to
                ``self._check_hypothesis_keywords``.
            attempts: Accepted for interface compatibility; this grader
                does not read it.
            best_tests_passed: Highest number of tests passed.
            tests_total: Total number of tests. When it is 0 the test
                component is 0.0 and no early-solve bonus is given.
            attempts_used: Number of attempts consumed.
            max_attempts: Attempt budget. When it is 0 the efficiency
                component is 0.0.
            hypotheses: Hypothesis strings; each one is checked against
                the keywords in ``"any"`` mode.

        Returns:
            The sum of the four weighted components, passed through
            ``self._clamp``.

        Raises:
            KeyError: If ``"ground_truth"`` or ``"hypothesis_keywords"``
                is missing from ``task_config``.
        """
        ground_truth = task_config["ground_truth"]
        keywords = ground_truth["hypothesis_keywords"]

        # 1. Test pass ratio (weight: 0.60)
        test_pass_ratio = (
            best_tests_passed / tests_total if tests_total > 0 else 0.0
        )
        test_score = test_pass_ratio * 0.60

        # 2. Efficiency bonus (weight: 0.20): fraction of the attempt
        #    budget left unused, floored at 0 if the budget was exceeded.
        if max_attempts > 0:
            efficiency = max(
                0.0, (max_attempts - attempts_used) / max_attempts
            )
        else:
            efficiency = 0.0
        efficiency_score = efficiency * 0.20

        # 3. Hypothesis accuracy (weight: 0.15): share of hypotheses that
        #    the base-class keyword check accepts in "any" mode.
        if hypotheses:
            matches = sum(
                1
                for h in hypotheses
                if self._check_hypothesis_keywords(h, keywords, "any")
            )
            hypothesis_ratio = matches / len(hypotheses)
        else:
            hypothesis_ratio = 0.0
        hypothesis_score = hypothesis_ratio * 0.15

        # 4. Early solve bonus (weight: 0.05): all tests pass within the
        #    first third (rounded up) of the attempt budget.
        early_threshold = math.ceil(max_attempts / 3)
        # Require at least one test: otherwise 0 == 0 would count as a
        # full pass and hand out the bonus for an episode with no tests.
        all_pass = tests_total > 0 and best_tests_passed == tests_total
        if all_pass and attempts_used <= early_threshold:
            early_solve_score = 0.05
        else:
            early_solve_score = 0.0

        total = (
            test_score + efficiency_score + hypothesis_score + early_solve_score
        )
        return self._clamp(total)
|