Spaces:
Sleeping
Sleeping
| """Strict semantic evaluation math for ``dataops-gym``.""" | |
| from __future__ import annotations | |
| from typing import Any, Dict, Mapping, MutableMapping, Optional, Tuple | |
def grade_step_details(
    state: MutableMapping[str, Any],
    action: Mapping[str, Any],
    result: MutableMapping[str, Any],
) -> Tuple[float, Dict[str, Any]]:
    """Score a single step from the outcome flags recorded in ``result``.

    The score is the plain sum of every component that fires; nothing is
    clamped here.

    Args:
        state: Accepted for interface compatibility; not read by this function.
        action: The submitted action. Only ``"confidence"`` (default ``0.0``)
            and ``"action_type"`` (default ``""``) are consulted.
        result: Outcome flags for the step (e.g. ``"correct_fix"``,
            ``"hallucinated_fix"``) plus the optional integer
            ``"extra_fields_modified"``.

    Returns:
        A ``(score, components)`` pair where ``components`` maps each reward
        rule that fired to its signed contribution and ``score`` is their sum.
    """
    confidence = float(action.get("confidence", 0.0))
    is_detect = str(action.get("action_type", "")) == "detect_issue"
    flag = result.get

    # Contributions are recorded in the order the rules are evaluated; the
    # final score is accumulated from this mapping in that same order.
    components: Dict[str, float] = {}

    # Classification: a bare detect action earns less than fix/cannot_determine.
    if flag("classification_correct"):
        components["classification"] = 0.1 if is_detect else 0.2
    elif flag("classification_incorrect"):
        components["classification"] = -0.2

    # Issue detection: same detect-vs-other split on the positive side.
    if flag("correct_issue_detected"):
        components["issue_detection"] = 0.05 if is_detect else 0.15
    elif flag("missed_issue"):
        components["issue_detection"] = -0.15
    elif flag("false_issue"):
        components["issue_detection"] = -0.05

    # Decision quality: only the first matching flag counts.
    decision_rules = (
        ("correct_fix", 0.25),
        ("correct_cannot_determine", 0.25),
        ("hallucinated_fix", -0.5),
        ("wrong_fix", -0.4),
        ("wrong_cannot_determine", -0.2),
    )
    for key, delta in decision_rules:
        if flag(key):
            components["decision"] = delta
            break

    if flag("passive_penalty"):
        components["passive_penalty"] = -0.05

    if flag("repeated_detection"):
        components["repeated_detection_penalty"] = -0.1

    # Each field modified beyond what was needed costs a flat 0.05.
    extra_field_count = int(flag("extra_fields_modified", 0))
    if extra_field_count > 0:
        components["overcorrection"] = -0.05 * extra_field_count

    # Cross-record consistency: first matching flag counts.
    consistency_rules = (
        ("consistent_handling", 0.2),
        ("inconsistent_handling", -0.3),
    )
    for key, delta in consistency_rules:
        if flag(key):
            components["cross_record_consistency"] = delta
            break

    # Confidence calibration: high confidence is rewarded when anything was
    # right, otherwise penalised when anything was wrong.
    if confidence > 0.7:
        any_correct = any(
            flag(key)
            for key in (
                "classification_correct",
                "correct_fix",
                "correct_cannot_determine",
                "correct_issue_detected",
            )
        )
        if any_correct:
            components["confidence"] = 0.05
        else:
            any_wrong = any(
                flag(key)
                for key in (
                    "classification_incorrect",
                    "wrong_fix",
                    "hallucinated_fix",
                    "wrong_cannot_determine",
                    "false_issue",
                )
            )
            if any_wrong:
                components["confidence"] = -0.1

    # A very confident hallucination is penalised on top of everything else.
    if flag("hallucinated_fix") and confidence > 0.8:
        components["confident_hallucination_amplification"] = -0.2

    if flag("resolved_detected_issue"):
        components["resolution_reward"] = 0.15

    # Explicit left-to-right accumulation (not ``sum``) keeps the float
    # result independent of the interpreter's summation strategy.
    score = 0.0
    for delta in components.values():
        score += delta
    return score, components
def grade_task_result(
    task_definition: Mapping[str, Any],
    table: Any,
    state: Optional[Mapping[str, Any]] = None,
) -> float:
    """Combine episode-level metrics from ``state`` into one task score.

    The score is a weighted blend: 50% mean per-record score (rescaled with
    ``(x + 1) / 2`` and clamped to [0, 1]), 20% ``1 - hallucination_rate``,
    15% ``uncertainty_accuracy`` and 15% ``consistency_score``.

    Args:
        task_definition: Accepted for interface compatibility; not used.
        table: Accepted for interface compatibility; not used.
        state: Episode state. Missing keys default to an empty
            ``per_record_scores``, ``hallucination_rate=0.0``,
            ``uncertainty_accuracy=0.0`` and ``consistency_score=1.0``.

    Returns:
        The blended score, clamped to ``[1e-6, 1 - 1e-6]`` so it never sits
        exactly on 0 or 1.
    """
    # Neither argument takes part in the formula.
    _ = task_definition
    _ = table

    metrics = state if state else {}

    record_scores = dict(metrics.get("per_record_scores", {}))
    record_count = len(record_scores) or 1  # guards the empty-episode division
    mean_record_score = sum(float(v) for v in record_scores.values()) / record_count

    # Rescale the mean to the unit interval, then clip.
    unit_record_score = (mean_record_score + 1.0) / 2.0
    unit_record_score = max(0.0, min(1.0, unit_record_score))

    hallucination_rate = float(metrics.get("hallucination_rate", 0.0))
    uncertainty_accuracy = float(metrics.get("uncertainty_accuracy", 0.0))
    consistency_score = float(metrics.get("consistency_score", 1.0))

    record_term = 0.5 * unit_record_score
    hallucination_term = 0.2 * (1.0 - hallucination_rate)
    uncertainty_term = 0.15 * uncertainty_accuracy
    consistency_term = 0.15 * consistency_score
    blended = record_term + hallucination_term + uncertainty_term + consistency_term

    margin = 1e-6
    lower_bound = margin
    upper_bound = 1.0 - margin
    return max(lower_bound, min(upper_bound, blended))
def task_failure_messages(
    task_definition: Mapping[str, Any],
    table: Any,
    state: Optional[Mapping[str, Any]] = None,
) -> list[str]:
    """Collect the non-empty ``details`` texts from ``state["failure_logs"]``.

    Args:
        task_definition: Accepted for interface compatibility; not used.
        table: Accepted for interface compatibility; not used.
        state: Episode state; ``None`` or a missing ``"failure_logs"`` key
            yields an empty list.

    Returns:
        One string per log entry whose ``"details"`` value is truthy, in log
        order, each converted with ``str``.
    """
    # Neither argument is consulted; messages come from the state alone.
    _ = task_definition
    _ = table

    log_entries = (state or {}).get("failure_logs", [])

    messages = []
    for entry in log_entries:
        if not entry.get("details"):
            continue  # skip entries with no usable explanation
        messages.append(str(entry.get("details", "")))
    return messages
| __all__ = ["grade_step_details", "grade_task_result", "task_failure_messages"] | |