ajaxwin
refactor: Update task configurations and grading logic for improved scoring and consistency
dccaaac
"""
grader.py (Task 2 – Property Discovery)
-----------------------------------------
Deterministic scorer for natural-language property submissions.
One submission attempt per episode.
Grade range: 0.001 – 0.999 (match score combined with a cost-decay term, then clamped).
"""
from typing import Tuple
from utils import SemanticMatcher
class Task2Grader:
    """
    Grades a Task 2 property submission.

    Parameters
    ----------
    function_name : name of the target function
    property      : the 'property' field from the target function's data
    n             : step allowance used for the cost penalty; spending up to
                    ``n + 2`` steps' worth of the average per-step cost is
                    not penalised
    """

    def __init__(self, function_name: str, property: str, n: int) -> None:
        # `property` shadows the builtin, but it is part of the public
        # signature (callers may pass it by keyword), so the name is kept.
        self.function_name = function_name
        self.property = property
        self.n = n
        # Base of the exponential penalty applied per unit of cost overage.
        self._decay = 0.75

    def _clamp(self, reward: float) -> float:
        """Clamp *reward* into the closed interval [0.001, 0.999]."""
        return max(0.001, min(0.999, reward))

    def _cost_factor(self, steps: int, cummulative_cost: int) -> float:
        """Return the cost term ``decay ** overage`` for an episode.

        The free budget is the average cost per step multiplied by
        ``n + 2``; ``overage`` is how far ``cummulative_cost`` exceeds that
        budget (never negative).  With no overage the factor is 1.0.

        When ``steps`` is zero or negative there is no per-step average to
        derive a budget from, so the free budget is taken as 0 instead of
        dividing by zero.
        """
        if steps <= 0:
            free_budget = 0.0
        else:
            free_budget = (cummulative_cost / steps) * (self.n + 2)
        overage = max(0, cummulative_cost - free_budget)
        return self._decay ** overage

    def grade(self, submitted: str, steps: int, cummulative_cost: int) -> Tuple[float, str]:
        """Deterministic grade strictly in (0, 1).

        Parameters
        ----------
        submitted        : the submitted natural-language property text
        steps            : number of steps taken in the episode
        cummulative_cost : total cost accumulated over those steps

        Returns
        -------
        (score, label) : ``score`` is clamped to [0.001, 0.999].  ``label``
            is ``"no_match"`` for an empty or whitespace-only submission,
            otherwise the value returned by ``SemanticMatcher.confidence()``.
        """
        # Empty or whitespace-only submissions get the minimum score and
        # never reach the matcher.
        if not submitted or not submitted.strip():
            return 0.001, "no_match"

        matcher = SemanticMatcher()
        match_score = matcher.matchscore(self.property, submitted)

        # NOTE(review): the cost factor is 1.0 whenever there is no overage,
        # so for a non-negative match_score the sum below is >= 1.0 and
        # clamps to 0.999 regardless of match quality.  Confirm whether the
        # cost term was meant to be weighted (e.g. by 0.5) as well.
        final_score = (match_score * 0.5) + self._cost_factor(steps, cummulative_cost)
        return self._clamp(final_score), matcher.confidence()