Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

AgentDebugger-training-v3 / env /graders /grader_medium.py

shank

fix: score floor for medium grader, add root and tasks endpoints

b658e10 about 1 month ago

3.07 kB

	"""
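	Grader Medium — Scoring with red herring detection.
	Same base formula as easy, but with special hypothesis logic:
	- Hypothesis mentioning "hash_password" AND at least 1 other keyword scores 1.0 (full marks)
	- Hypothesis mentioning "hash_password" but no other keyword scores 0.5
	- Hypothesis mentioning the red herring ("authenticate_user" by default) without "hash_password" scores 0.0
	- Any other hypothesis scores 0.1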
	"""

	import math
	from typing import List, Dict, Any
	from env.graders.base_grader import BaseGrader


	class MediumGrader(BaseGrader):
	    """Grade a debugging episode, penalising hypotheses that follow the red herring.

	    The final score is a weighted sum of four components:

	    * test pass ratio (0.60),
	    * efficiency, i.e. the fraction of attempts left unused (0.20),
	    * mean hypothesis accuracy with red-herring detection (0.15),
	    * a flat early-solve bonus (0.05).

	    The sum is passed through ``self._clamp`` before being returned
	    (``_clamp`` is not defined in this class; presumably inherited from
	    ``BaseGrader`` -- confirm there for the exact bounds).
	    """

	    # Component weights used by ``score``; they add up to 1.0.
	    _TEST_WEIGHT = 0.60
	    _EFFICIENCY_WEIGHT = 0.20
	    _HYPOTHESIS_WEIGHT = 0.15
	    _EARLY_SOLVE_BONUS = 0.05

	    # Keyword a hypothesis must mention to earn more than token credit.
	    _PRIMARY_KEYWORD = "hash_password"

	    def _score_hypothesis(self, hypothesis: str, ground_truth: dict) -> float:
	        """Score a single hypothesis with red herring detection.

	        Matching is a case-insensitive substring search on ``hypothesis``.

	        Args:
	            hypothesis: Free-text hypothesis to score.
	            ground_truth: Must contain ``"hypothesis_keywords"`` (an iterable
	                of keyword strings). May contain ``"red_herring_keyword"``;
	                it defaults to ``"authenticate_user"``.

	        Returns:
	            1.0 if "hash_password" and at least one other keyword are
	            mentioned, 0.5 if only "hash_password" is mentioned, 0.0 if the
	            red herring is mentioned without "hash_password", else 0.1.

	        Raises:
	            KeyError: If ``ground_truth`` has no ``"hypothesis_keywords"``.
	        """
	        text = hypothesis.lower()
	        keywords = ground_truth["hypothesis_keywords"]
	        red_herring = ground_truth.get("red_herring_keyword", "authenticate_user")
	        primary = self._PRIMARY_KEYWORD

	        if primary in text:
	            # The primary keyword itself does not count as supporting detail.
	            mentions_other = any(
	                kw.lower() in text for kw in keywords if kw.lower() != primary
	            )
	            return 1.0 if mentions_other else 0.5  # Full vs. partial credit

	        # From here on the primary keyword is known to be absent.
	        if red_herring.lower() in text:
	            return 0.0  # Red herring was followed
	        return 0.1  # Generic hypothesis

	    def score(
	        self,
	        task_config: dict,
	        attempts: List[Dict[str, Any]],
	        best_tests_passed: int,
	        tests_total: int,
	        attempts_used: int,
	        max_attempts: int,
	        hypotheses: List[str],
	    ) -> float:
	        """Compute the overall episode score.

	        Args:
	            task_config: Task definition; ``task_config["ground_truth"]`` is
	                handed to ``_score_hypothesis``.
	            attempts: Attempt records. The highest ``"tests_passed"`` value
	                among them (missing key counts as 0) drives the test ratio.
	            best_tests_passed: Best passing-test count; used only to decide
	                whether the early-solve bonus applies.
	            tests_total: Number of tests in the task.
	            attempts_used: Number of attempts consumed.
	            max_attempts: Attempt budget for the task.
	            hypotheses: Hypotheses submitted during the episode.

	        Returns:
	            The weighted sum of the four components after ``self._clamp``.

	        Raises:
	            KeyError: If ``task_config`` has no ``"ground_truth"`` entry.
	        """
	        ground_truth = task_config["ground_truth"]

	        # 1. Test pass ratio (weight: 0.60)
	        if attempts:
	            agent_best = max(a.get("tests_passed", 0) for a in attempts)
	        else:
	            agent_best = 0
	        test_pass_ratio = (agent_best / tests_total) if tests_total > 0 else 0.0
	        test_score = test_pass_ratio * self._TEST_WEIGHT

	        # 2. Efficiency bonus (weight: 0.20)
	        if max_attempts > 0:
	            efficiency = max(0.0, (max_attempts - attempts_used) / max_attempts)
	        else:
	            efficiency = 0.0
	        efficiency_score = efficiency * self._EFFICIENCY_WEIGHT

	        # 3. Hypothesis accuracy with red herring detection (weight: 0.15)
	        if hypotheses:
	            h_scores = [self._score_hypothesis(h, ground_truth) for h in hypotheses]
	            hypothesis_ratio = sum(h_scores) / len(h_scores)
	        else:
	            hypothesis_ratio = 0.0
	        hypothesis_score = hypothesis_ratio * self._HYPOTHESIS_WEIGHT

	        # 4. Early solve bonus (weight: 0.05)
	        early_threshold = math.ceil(max_attempts / 3)
	        # Require at least one test: with tests_total == 0 the equality below
	        # would hold vacuously and hand out the bonus for an empty task.
	        all_pass = tests_total > 0 and best_tests_passed == tests_total
	        solved_early = all_pass and attempts_used <= early_threshold
	        early_solve_score = self._EARLY_SOLVE_BONUS if solved_early else 0.0

	        total = test_score + efficiency_score + hypothesis_score + early_solve_score
	        return self._clamp(total)