# Source: OpenEnv — openenv/core/grader.py
# Author: mahammadaftab — "Initial OpenEnv Email Triage Submission" (commit 4b77608)
"""
Task Graders for OpenEnv Email Triage
Implements agent graders for three difficulty levels (easy, medium, hard)
with scoring from 0.0 to 1.0 based on criteria like accuracy and critical safety.
"""
import numpy as np
from typing import Dict, Any, List, Tuple
from dataclasses import dataclass
@dataclass
class GradingCriteria:
    """A single weighted grading criterion with its current score."""

    # Criterion identifier, e.g. 'accuracy' or 'critical_safety'
    # (matched by name in TaskGrader.compute_scores).
    name: str
    # Relative weight used when combining criteria into the final score.
    weight: float
    # Latest computed score in [0.0, 1.0]; written by TaskGrader.compute_scores,
    # zeroed by TaskGrader.reset.
    score: float = 0.0
class TaskGrader:
    """Weighted, criteria-based grader for an email-triage episode.

    Tracks simple episode counters (steps, correct/incorrect actions,
    critical failures), scores each configured criterion from those
    counters, and combines the criterion scores into a weighted final
    score in [0.0, 1.0].

    Config keys used:
        criteria: list of {'name': str, 'weight': float} dicts.
        success_threshold: pass/fail cutoff for the report (default 0.7).
    """

    def __init__(self, config: Dict[str, Any]):
        """Build criteria from *config* and zero the episode counters."""
        self.config = config
        self.criteria = []
        self._initialize_criteria()
        # reset() is the single owner of the episode_data layout, so
        # __init__ and reset cannot drift apart (the dict literal used to
        # be duplicated in both places).
        self.episode_data = {}
        self.reset()

    def _initialize_criteria(self) -> None:
        """Instantiate one GradingCriteria per configured criterion."""
        for criterion_config in self.config.get('criteria', []):
            self.criteria.append(GradingCriteria(
                name=criterion_config['name'],
                weight=criterion_config['weight'],
            ))

    def reset(self) -> None:
        """Zero all episode counters and criterion scores for a new episode."""
        self.episode_data = {
            'steps': 0,
            'correct_actions': 0,
            'incorrect_actions': 0,
            'critical_failures': 0,
        }
        for criterion in self.criteria:
            criterion.score = 0.0

    def update(self, **kwargs) -> None:
        """Overwrite known episode counters; unknown keys are ignored.

        Note: values are *assigned*, not accumulated — callers pass
        running totals, not per-step deltas.
        """
        for key, value in kwargs.items():
            if key in self.episode_data:
                self.episode_data[key] = value

    def _find_criterion(self, name: str):
        """Return the configured criterion called *name*, or None."""
        return next((c for c in self.criteria if c.name == name), None)

    def compute_scores(self) -> Dict[str, float]:
        """Recompute per-criterion scores from the episode counters.

        Returns a dict always containing 'accuracy' and 'critical_safety';
        a key maps to 0.0 when that criterion is not configured.
        """
        scores = {}

        # Accuracy: fraction of graded actions that were correct.
        acc_criterion = self._find_criterion('accuracy')
        if acc_criterion:
            total_actions = (self.episode_data['correct_actions']
                             + self.episode_data['incorrect_actions'])
            if total_actions > 0:
                acc_criterion.score = float(self.episode_data['correct_actions']) / total_actions
            else:
                # Nothing graded yet: score 0 rather than divide by zero.
                acc_criterion.score = 0.0
        scores['accuracy'] = acc_criterion.score if acc_criterion else 0.0

        # Critical safety: perfect with no failures; each failure costs a
        # flat 0.3, floored at 0.
        safety_criterion = self._find_criterion('critical_safety')
        if safety_criterion:
            failures = self.episode_data['critical_failures']
            if failures == 0:
                safety_criterion.score = 1.0
            else:
                safety_criterion.score = max(0.0, 1.0 - (failures * 0.3))  # Penalty per failure
        scores['critical_safety'] = safety_criterion.score if safety_criterion else 0.0

        return scores

    def get_final_score(self) -> float:
        """Return the weight-averaged criterion score, clipped to [0.0, 1.0].

        Returns a plain ``float`` (not a NumPy scalar) so the grade report
        stays JSON-serializable. Returns 0.0 when no criteria carry weight.
        """
        self.compute_scores()
        total_weight = sum(c.weight for c in self.criteria)
        if total_weight > 0:
            final_score = sum(c.score * c.weight for c in self.criteria) / total_weight
        else:
            final_score = 0.0
        # float() keeps np.float64 from leaking into callers/reports.
        return float(np.clip(final_score, 0.0, 1.0))

    def get_grade_report(self) -> Dict[str, Any]:
        """Build the full report: final score, pass/fail against the
        configured threshold, per-criterion scores, a copy of the raw
        counters, and human-readable feedback."""
        scores = self.compute_scores()
        final_score = self.get_final_score()
        threshold = self.config.get('success_threshold', 0.7)
        return {
            'final_score': final_score,
            'success_threshold': threshold,
            'passed': final_score >= threshold,
            'criteria_scores': {c.name: c.score for c in self.criteria},
            'episode_data': self.episode_data.copy(),
            'feedback': self._generate_feedback(scores),
        }

    def _generate_feedback(self, scores: Dict[str, float]) -> str:
        """Turn the score dict into short ' | '-joined feedback sentences."""
        feedback = []
        if scores.get('accuracy', 0) < 0.7:
            feedback.append("Triage Accuracy needs improvement.")
        else:
            feedback.append("Good triage accuracy.")
        if scores.get('critical_safety', 1.0) < 1.0:
            feedback.append("Critical safety failures occurred (e.g. ignored urgent email).")
        return " | ".join(feedback)
class EasyGrader(TaskGrader):
    """Grader for easy-level tasks; adds no overrides, so all behavior
    comes from TaskGrader and the supplied config."""
    pass
class MediumGrader(TaskGrader):
    """Grader for medium-level tasks; adds no overrides, so all behavior
    comes from TaskGrader and the supplied config."""
    pass
class HardGrader(TaskGrader):
    """Grader for hard-level tasks; adds no overrides, so all behavior
    comes from TaskGrader and the supplied config."""
    pass
def create_grader(task_level: str, config: Dict[str, Any]) -> TaskGrader:
    """Factory for difficulty-specific graders.

    Args:
        task_level: one of 'easy', 'medium', or 'hard'.
        config: grader configuration passed through to the constructor.

    Returns:
        A grader instance of the class matching *task_level*.

    Raises:
        ValueError: if *task_level* is not a known difficulty.
    """
    level_to_class = {
        'easy': EasyGrader,
        'medium': MediumGrader,
        'hard': HardGrader,
    }
    grader_cls = level_to_class.get(task_level)
    if grader_cls is None:
        raise ValueError(f"Unknown task level: {task_level}")
    return grader_cls(config)