Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

shank

Fix: Changed scores range

0769caa about 1 month ago

1.77 kB

	"""
	Base Grader — Abstract base class for all graders.
	"""

	from abc import ABC, abstractmethod
	from typing import List, Dict, Any


	class BaseGrader(ABC):
	    """Abstract base grader. All graders must implement score().

	    Besides the abstract scoring hook, the class provides two helpers:
	    case-insensitive keyword matching on a hypothesis string, and clamping
	    of a raw score into the closed range [0.01, 0.99].
	    """

	    @abstractmethod
	    def score(
	        self,
	        task_config: dict,
	        attempts: List[Dict[str, Any]],
	        best_tests_passed: int,
	        tests_total: int,
	        attempts_used: int,
	        max_attempts: int,
	        hypotheses: List[str],
	    ) -> float:
	        """
	        Score an episode. Must return a float in [0.0, 1.0].
	        Must be deterministic: same inputs → same output.

	        A raw score passed through ``_clamp()`` ends up in [0.01, 0.99],
	        which satisfies the range requirement above.

	        Args:
	            task_config: The full task config dict
	            attempts: List of attempt dicts with code_submitted, hypothesis,
	                tests_passed, etc.
	            best_tests_passed: Best test pass count across all attempts
	            tests_total: Total tests in the suite
	            attempts_used: Number of fix attempts used
	            max_attempts: Maximum allowed attempts
	            hypotheses: All hypotheses submitted

	        Returns:
	            float in [0.0, 1.0]
	        """

	    def _check_hypothesis_keywords(
	        self, hypothesis: str, keywords: List[str], mode: str = "any"
	    ) -> bool:
	        """Check if a hypothesis matches any/all of the ground truth keywords.

	        Matching is a case-insensitive substring test.

	        Args:
	            hypothesis: Hypothesis text. ``None`` or ``""`` never matches.
	            keywords: Ground truth keywords. Falsy entries (``None``, ``""``)
	                are ignored.
	            mode: ``"any"`` requires at least one keyword to occur in the
	                hypothesis; ``"all"`` requires every keyword to occur.

	        Returns:
	            True on a match. False when nothing matched, when no usable
	            keyword is left, or when ``mode`` is neither "any" nor "all".
	        """
	        if not hypothesis:
	            return False

	        # Drop empty keywords: "" is a substring of every string and all([])
	        # is True, so either would report a match without any evidence.
	        needles = [kw.lower() for kw in (keywords or []) if kw]
	        if not needles:
	            return False

	        haystack = hypothesis.lower()
	        if mode == "any":
	            return any(needle in haystack for needle in needles)
	        if mode == "all":
	            return all(needle in haystack for needle in needles)
	        return False

	    def _clamp(self, value: float) -> float:
	        """Clamp a value to [0.01, 0.99], i.e. strictly inside (0.0, 1.0).

	        Args:
	            value: Raw score; may be out of range, infinite, or NaN.

	        Returns:
	            ``value`` limited to [0.01, 0.99]. NaN yields 0.01.
	        """
	        # NaN is the only float that differs from itself. Without this guard
	        # min(0.99, nan) evaluates to 0.99, which would turn an undefined
	        # score into the best possible one.
	        if value != value:
	            return 0.01
	        return max(0.01, min(0.99, value))