# Spaces: Sleeping | File size: 1,614 Bytes | 18feac5
from __future__ import annotations
from typing import Any
def _keyword_score(reply: str | None, keywords: list[str]) -> float:
if not reply or not keywords:
return 0.0
lowered = reply.lower()
hits = sum(1 for keyword in keywords if keyword.lower() in lowered)
return hits / len(keywords)
def grade_task(
    task: dict[str, Any],
    collected_evidence: list[str],
    drafted_reply: str | None,
    resolution_code: str | None,
    step_count: int,
    repeat_action_count: int,
) -> dict[str, float]:
    """Score one task attempt against the expectations stored in *task*.

    The final score is a weighted sum of four components:
    resolution (0.5), evidence (0.25), reply (0.2) and efficiency (0.05).

    Args:
        task: Task definition. The keys ``required_evidence``,
            ``expected_resolution_code`` and ``reply_keywords`` are read
            unconditionally; ``optimal_steps`` is optional and defaults to
            ``len(required_evidence) + 2``.
        collected_evidence: Evidence keys gathered during the attempt.
        drafted_reply: Reply text, or ``None`` when no reply was drafted.
        resolution_code: Resolution code that was chosen, or ``None``.
        step_count: Number of steps the attempt took.
        repeat_action_count: Number of repeated actions in the attempt.

    Returns:
        A dict with the keys ``resolution_score``, ``evidence_score``,
        ``reply_score``, ``efficiency_score`` and ``final_score``, each
        rounded to three decimals. ``final_score`` is clamped to [0, 1].

    Raises:
        KeyError: If *task* lacks one of the unconditionally read keys.
    """
    required_evidence = task["required_evidence"]
    if required_evidence:
        evidence_hits = sum(
            1 for key in required_evidence if key in collected_evidence
        )
        evidence_score = evidence_hits / len(required_evidence)
    else:
        # Nothing is required, so the requirement is trivially met. Without
        # this guard the division above raises ZeroDivisionError.
        evidence_score = 1.0

    resolution_score = (
        1.0 if resolution_code == task["expected_resolution_code"] else 0.0
    )
    reply_score = _keyword_score(drafted_reply, task["reply_keywords"])

    optimal_steps = task.get("optimal_steps", len(required_evidence) + 2)
    # Only steps beyond the optimum are penalised; finishing early earns
    # no bonus.
    extra_steps = max(0, step_count - optimal_steps)
    # 0.05 per extra step plus 0.04 per repeated action, capped at 0.25.
    efficiency_penalty = min(
        0.25, (extra_steps * 0.05) + (repeat_action_count * 0.04)
    )
    efficiency_score = max(0.0, 1.0 - efficiency_penalty)

    final_score = (
        0.5 * resolution_score
        + 0.25 * evidence_score
        + 0.2 * reply_score
        + 0.05 * efficiency_score
    )
    return {
        "resolution_score": round(resolution_score, 3),
        "evidence_score": round(evidence_score, 3),
        "reply_score": round(reply_score, 3),
        "efficiency_score": round(efficiency_score, 3),
        "final_score": round(min(max(final_score, 0.0), 1.0), 3),
    }
|