HyperBrickCaseOps / graders.py
modelbuilderhq's picture
Upload folder using huggingface_hub
4f129c9 verified
raw
history blame
5.5 kB
"""Deterministic graders and reward helpers for SupportDesk."""
from __future__ import annotations
import re
from dataclasses import dataclass
from models import SupportCaseProgress
from tasks import SupportTaskSpec, get_task
STRICT_SCORE_EPSILON = 0.01
@dataclass(frozen=True)
class GradeBreakdown:
"""A scored view of how close a case is to the gold solution."""
total_score: float
queue_score: float
priority_score: float
issue_type_score: float
requested_fields_score: float
reply_score: float
note_score: float
status_score: float
resolution_score: float
completed_milestones: tuple[str, ...]
def _normalize(text: str | None) -> str:
if not text:
return ""
normalized = text.lower().replace("-", " ")
return re.sub(r"[^a-z0-9\s]", " ", normalized)
def _marker_group_score(text: str | None, marker_groups: tuple[tuple[str, ...], ...]) -> float:
if not marker_groups:
return 1.0
normalized = _normalize(text)
if not normalized:
return 0.0
matches = 0
for group in marker_groups:
if any(_normalize(marker) in normalized for marker in group):
matches += 1
return matches / len(marker_groups)
def _requested_fields_score(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
required = set(task.required_requested_fields)
requested = set(case.requested_fields)
if not required:
return 1.0 if not requested else 0.0
if not requested:
return 0.0
matched = len(required.intersection(requested))
extras = len(requested.difference(required))
raw = matched / len(required)
penalty = min(0.25, extras * 0.05)
return max(0.0, raw - penalty)
def _reply_penalty(case: SupportCaseProgress, task: SupportTaskSpec) -> float:
text = _normalize(case.reply)
if not text:
return 0.0
return 0.0 if not any(_normalize(marker) in text for marker in task.forbidden_reply_markers) else 0.5
def _strict_open_unit_interval(score: float) -> float:
"""Keep final task scores strictly within (0, 1) for evaluator compatibility."""
return min(1.0 - STRICT_SCORE_EPSILON, max(STRICT_SCORE_EPSILON, score))
def grade_case(task: SupportTaskSpec, case: SupportCaseProgress) -> GradeBreakdown:
"""Score a case deterministically with total_score strictly inside (0, 1)."""
queue_score = 1.0 if case.queue == task.gold_queue else 0.0
priority_score = 1.0 if case.priority == task.gold_priority else 0.0
issue_type_score = 1.0 if case.issue_type == task.gold_issue_type else 0.0
requested_fields_score = _requested_fields_score(case, task)
reply_score = max(0.0, _marker_group_score(case.reply, task.required_reply_markers) - _reply_penalty(case, task))
note_score = _marker_group_score(case.internal_note, task.required_note_markers)
status_score = 1.0 if case.status == task.gold_status else 0.0
resolution_score = 1.0 if case.resolution_code == task.gold_resolution_code else 0.0
weighted_total = (
queue_score * 0.15
+ priority_score * 0.10
+ issue_type_score * 0.10
+ requested_fields_score * 0.15
+ reply_score * 0.25
+ note_score * 0.10
+ status_score * 0.10
+ resolution_score * 0.05
)
milestones: list[str] = []
if queue_score:
milestones.append("queue")
if priority_score:
milestones.append("priority")
if issue_type_score:
milestones.append("issue_type")
if requested_fields_score >= 0.99:
milestones.append("requested_fields")
if reply_score >= 0.99:
milestones.append("reply")
if note_score >= 0.99:
milestones.append("internal_note")
if status_score:
milestones.append("status")
if resolution_score:
milestones.append("resolution_code")
return GradeBreakdown(
total_score=round(_strict_open_unit_interval(weighted_total), 4),
queue_score=queue_score,
priority_score=priority_score,
issue_type_score=issue_type_score,
requested_fields_score=round(requested_fields_score, 4),
reply_score=round(reply_score, 4),
note_score=round(note_score, 4),
status_score=status_score,
resolution_score=resolution_score,
completed_milestones=tuple(milestones),
)
def grade_task_id(task_id: str, case: SupportCaseProgress) -> GradeBreakdown:
"""Convenience wrapper used by tests and evaluation scripts."""
return grade_case(get_task(task_id), case)
class _TaskSpecificGrader:
"""Importable task-specific grader wrapper for validator task discovery."""
task_id: str = ""
def grade(self, case: SupportCaseProgress) -> float:
return grade_task_id(self.task_id, case).total_score
def __call__(self, case: SupportCaseProgress) -> float:
return self.grade(case)
class BillingRefundEasyGrader(_TaskSpecificGrader):
task_id = "billing_refund_easy"
class AccountTakeoverMediumGrader(_TaskSpecificGrader):
task_id = "account_takeover_medium"
class ApiIncidentHardGrader(_TaskSpecificGrader):
task_id = "api_incident_hard"
class RegulatedExportExceptionHardGrader(_TaskSpecificGrader):
task_id = "regulated_export_exception_hard"
__all__ = [
"AccountTakeoverMediumGrader",
"ApiIncidentHardGrader",
"BillingRefundEasyGrader",
"GradeBreakdown",
"RegulatedExportExceptionHardGrader",
"grade_case",
"grade_task_id",
]