from __future__ import annotations

from typing import Any

# Weights of each component in the final score.
_RESOLUTION_WEIGHT = 0.5
_EVIDENCE_WEIGHT = 0.25
_REPLY_WEIGHT = 0.2
_EFFICIENCY_WEIGHT = 0.05

# Efficiency penalty parameters.
_EXTRA_STEP_PENALTY = 0.05
_REPEAT_ACTION_PENALTY = 0.04
_MAX_EFFICIENCY_PENALTY = 0.25


def _keyword_score(reply: str | None, keywords: list[str]) -> float:
    """Return the fraction of *keywords* that occur in *reply*.

    Matching is a case-insensitive substring test. Each entry of
    *keywords* is counted on its own, so duplicates count more than once.

    Returns:
        A value in ``[0.0, 1.0]``; ``0.0`` when *reply* is empty/``None``
        or when *keywords* is empty.
    """
    if not reply or not keywords:
        return 0.0
    lowered = reply.lower()
    hits = sum(1 for keyword in keywords if keyword.lower() in lowered)
    return hits / len(keywords)


def grade_task(
    task: dict[str, Any],
    collected_evidence: list[str],
    drafted_reply: str | None,
    resolution_code: str | None,
    step_count: int,
    repeat_action_count: int,
) -> dict[str, float]:
    """Grade one task attempt and return the component and final scores.

    Args:
        task: Task definition. Must contain ``"required_evidence"``,
            ``"expected_resolution_code"`` and ``"reply_keywords"``; may
            contain ``"optimal_steps"`` (defaults to the number of
            required evidence items plus 2).
        collected_evidence: Evidence keys gathered during the attempt.
        drafted_reply: Reply text that was drafted, or ``None``.
        resolution_code: Resolution code that was chosen, or ``None``.
        step_count: Number of steps taken.
        repeat_action_count: Number of repeated actions.

    Returns:
        A dict with ``"resolution_score"``, ``"evidence_score"``,
        ``"reply_score"``, ``"efficiency_score"`` and ``"final_score"``,
        each rounded to three decimals. ``"final_score"`` is clamped to
        ``[0.0, 1.0]``.

    Raises:
        KeyError: If *task* lacks one of the mandatory keys.
    """
    required_evidence = task["required_evidence"]
    if required_evidence:
        # Build the lookup set once rather than scanning the list per key.
        collected = set(collected_evidence)
        evidence_hits = sum(1 for key in required_evidence if key in collected)
        evidence_score = evidence_hits / len(required_evidence)
    else:
        # Nothing was required, so the requirement is trivially met; this
        # also avoids dividing by zero.
        evidence_score = 1.0

    resolution_score = (
        1.0 if resolution_code == task["expected_resolution_code"] else 0.0
    )
    reply_score = _keyword_score(drafted_reply, task["reply_keywords"])

    optimal_steps = task.get("optimal_steps", len(required_evidence) + 2)
    extra_steps = max(0, step_count - optimal_steps)
    efficiency_penalty = min(
        _MAX_EFFICIENCY_PENALTY,
        (extra_steps * _EXTRA_STEP_PENALTY)
        + (repeat_action_count * _REPEAT_ACTION_PENALTY),
    )
    efficiency_score = max(0.0, 1.0 - efficiency_penalty)

    final_score = (
        _RESOLUTION_WEIGHT * resolution_score
        + _EVIDENCE_WEIGHT * evidence_score
        + _REPLY_WEIGHT * reply_score
        + _EFFICIENCY_WEIGHT * efficiency_score
    )

    return {
        "resolution_score": round(resolution_score, 3),
        "evidence_score": round(evidence_score, 3),
        "reply_score": round(reply_score, 3),
        "efficiency_score": round(efficiency_score, 3),
        "final_score": round(min(max(final_score, 0.0), 1.0), 3),
    }