Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from typing import Any | |
| def _keyword_score(reply: str | None, keywords: list[str]) -> float: | |
| if not reply or not keywords: | |
| return 0.0 | |
| lowered = reply.lower() | |
| hits = sum(1 for keyword in keywords if keyword.lower() in lowered) | |
| return hits / len(keywords) | |
def grade_task(
    task: dict[str, Any],
    collected_evidence: list[str],
    drafted_reply: str | None,
    resolution_code: str | None,
    step_count: int,
    repeat_action_count: int,
) -> dict[str, float]:
    """Grade one task attempt and return its component and final scores.

    The final score is a weighted sum of four components: resolution (0.5),
    evidence (0.25), reply keywords (0.2) and efficiency (0.05).

    Args:
        task: Task definition. Reads ``"required_evidence"``,
            ``"expected_resolution_code"`` and ``"reply_keywords"``, plus the
            optional ``"optimal_steps"`` (defaults to the number of required
            evidence items plus 2).
        collected_evidence: Evidence keys gathered during the attempt.
        drafted_reply: The reply text, or ``None`` if none was drafted.
        resolution_code: The resolution code chosen, or ``None``.
        step_count: Number of steps taken.
        repeat_action_count: Number of repeated actions.

    Returns:
        A dict with ``"resolution_score"``, ``"evidence_score"``,
        ``"reply_score"``, ``"efficiency_score"`` and ``"final_score"``,
        each rounded to 3 decimal places. The final score is clamped to
        ``[0.0, 1.0]``.

    Raises:
        KeyError: If *task* lacks one of the three required keys.
    """
    required_evidence = task["required_evidence"]
    if required_evidence:
        evidence_hits = sum(
            1 for key in required_evidence if key in collected_evidence
        )
        evidence_score = evidence_hits / len(required_evidence)
    else:
        # A task with no required evidence used to crash with
        # ZeroDivisionError. Score it 0.0, following the same
        # "nothing to match scores 0.0" rule that _keyword_score applies
        # to an empty keyword list.
        evidence_score = 0.0

    resolution_score = (
        1.0 if resolution_code == task["expected_resolution_code"] else 0.0
    )
    reply_score = _keyword_score(drafted_reply, task["reply_keywords"])

    # Only steps beyond the task's optimum are penalised; the combined
    # penalty for extra steps and repeated actions is capped at 0.25.
    optimal_steps = task.get("optimal_steps", len(required_evidence) + 2)
    extra_steps = max(0, step_count - optimal_steps)
    efficiency_penalty = min(
        0.25, (extra_steps * 0.05) + (repeat_action_count * 0.04)
    )
    efficiency_score = max(0.0, 1.0 - efficiency_penalty)

    final_score = (
        0.5 * resolution_score
        + 0.25 * evidence_score
        + 0.2 * reply_score
        + 0.05 * efficiency_score
    )
    return {
        "resolution_score": round(resolution_score, 3),
        "evidence_score": round(evidence_score, 3),
        "reply_score": round(reply_score, 3),
        "efficiency_score": round(efficiency_score, 3),
        "final_score": round(min(max(final_score, 0.0), 1.0), 3),
    }