| """ | |
| server/calibration_grader.py | |
| DebateFloor — Calibrated Uncertainty Training Environment | |
| Core innovation: rewards agents that know when they don't know. | |
| Based on CoCA framework: arXiv:2603.05881 | |
| "Co-optimizing Confidence and Accuracy via Segment-Specific GRPO Rewards" | |
| CRITICAL: This file implements the CALIBRATION reward only. | |
| The TRAINING reward (simple scalar) is also here. | |
| NEVER use eval_reward() for GRPO training — use training_reward(). | |
| """ | |
from typing import Optional

# ─────────────────────────────────────────────────────────────
# THE 3×2 CALIBRATION MATRIX
# This is the core innovation. Read this before editing anything.
#
# Philosophy:
#   HIGH confidence + CORRECT = best outcome  (1.0)  — decisive and right
#   HIGH confidence + WRONG   = worst outcome (-0.8) — confident and wrong
#   MED  confidence + CORRECT = good          (0.6)  — right but cautious
#   MED  confidence + WRONG   = ok            (-0.2) — wrong but knew it
#   LOW  confidence + CORRECT = weak          (0.1)  — right, wasted escalation
#   LOW  confidence + WRONG   = neutral       (0.0)  — at least it knew
# ─────────────────────────────────────────────────────────────
CALIBRATION_MATRIX: dict[tuple[str, bool], float] = {
    ("HIGH", True): 1.0,
    ("HIGH", False): -0.8,
    ("MED", True): 0.6,
    ("MED", False): -0.2,
    ("LOW", True): 0.1,
    ("LOW", False): 0.0,
}

# Anti-gaming thresholds
LOW_CONFIDENCE_GAMING_THRESHOLD = 0.70   # >70% LOW declarations = gaming
HIGH_CONFIDENCE_GAMING_THRESHOLD = 0.80  # >80% HIGH declarations = overconfidence
MIN_HISTORY_FOR_GAMING_DETECTION = 10    # need at least 10 episodes
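
# Illustrative lookups (not part of the grader API): the matrix is keyed by
# (declared confidence, correctness), so a confident wrong answer costs far
# more than a hedged wrong one:
#
#   >>> CALIBRATION_MATRIX[("HIGH", False)]
#   -0.8
#   >>> CALIBRATION_MATRIX[("MED", False)]
#   -0.2
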
def detect_confidence_gaming(episode_history: list[dict]) -> float:
    """
    Detects and penalises systematic confidence manipulation.

    An agent cannot game the calibration reward by always declaring LOW
    confidence (to avoid the HIGH+WRONG penalty) or always declaring HIGH
    confidence (to maximise the HIGH+CORRECT reward).

    Args:
        episode_history: List of dicts with a "confidence" key per episode.
            Example: [{"confidence": "LOW"}, {"confidence": "HIGH"}, ...]

    Returns:
        float: Penalty to subtract from the reward. Always >= 0.
            Returns 0.0 if the history is too short to detect gaming.
    """
    if len(episode_history) < MIN_HISTORY_FOR_GAMING_DETECTION:
        return 0.0

    total = len(episode_history)
    low_count = sum(1 for e in episode_history if e.get("confidence") == "LOW")
    high_count = sum(1 for e in episode_history if e.get("confidence") == "HIGH")
    low_rate = low_count / total
    high_rate = high_count / total

    penalty = 0.0
    # Penalise systematic under-confidence (always say LOW to avoid punishment)
    if low_rate > LOW_CONFIDENCE_GAMING_THRESHOLD:
        penalty += (low_rate - LOW_CONFIDENCE_GAMING_THRESHOLD) * 2.0
    # Penalise systematic over-confidence (always say HIGH to maximise reward)
    if high_rate > HIGH_CONFIDENCE_GAMING_THRESHOLD:
        penalty += (high_rate - HIGH_CONFIDENCE_GAMING_THRESHOLD) * 1.5

    return min(penalty, 1.0)  # cap total penalty at 1.0
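
# Worked example (illustrative): ten episodes, all declared LOW.
# low_rate = 1.0 > 0.70, so the penalty is (1.0 - 0.70) * 2.0 = 0.6:
#
#   >>> round(detect_confidence_gaming([{"confidence": "LOW"}] * 10), 3)
#   0.6
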
def calibration_reward(
    decision: str,
    confidence: str,
    ground_truth: str,
    episode_history: Optional[list[dict]] = None,
) -> float:
    """
    Core calibration reward. Used in the EVALUATION reward composition.

    Args:
        decision: Agent's decision ("approve_claim", "deny_claim", "escalate_to_human")
        confidence: Agent's declared confidence ("HIGH", "MED", "LOW")
        ground_truth: Correct decision for this episode
        episode_history: List of past episode results for gaming detection

    Returns:
        float: Calibration reward in [-1.0, 1.0]
    """
    if confidence not in ("HIGH", "MED", "LOW"):
        raise ValueError(f"Invalid confidence: {confidence}. Must be HIGH, MED, or LOW.")

    is_correct = (decision == ground_truth)
    base_reward = CALIBRATION_MATRIX[(confidence, is_correct)]

    # Apply the anti-gaming penalty if we have enough history
    gaming_penalty = 0.0
    if episode_history:
        gaming_penalty = detect_confidence_gaming(episode_history)

    result = base_reward - gaming_penalty
    # Always clamp to the valid range
    return max(-1.0, min(1.0, result))
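
# Illustrative calls (no gaming history supplied, so no penalty applies):
#
#   >>> calibration_reward("approve_claim", "HIGH", "approve_claim")
#   1.0
#   >>> calibration_reward("approve_claim", "HIGH", "deny_claim")
#   -0.8
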
def escalation_reward(
    decision: str,
    confidence: str,
    ambiguity_score: float,
) -> float:
    """
    Rewards appropriate escalation behaviour.

    An agent should escalate when genuinely uncertain (high ambiguity).
    Escalating on obvious cases wastes resources and is penalised.

    Args:
        decision: Agent's decision
        confidence: Agent's declared confidence
        ambiguity_score: How genuinely ambiguous this task is
            (0.0 = obvious, 1.0 = very ambiguous)

    Returns:
        float: Escalation reward in [-0.3, 0.7]
    """
    is_escalation = (decision == "escalate_to_human")
    is_genuinely_ambiguous = ambiguity_score > 0.6
    is_obviously_clear = ambiguity_score < 0.3

    if is_escalation and is_genuinely_ambiguous and confidence == "LOW":
        return 0.7   # Perfect: uncertain + ambiguous task + escalated
    elif is_escalation and is_obviously_clear:
        return -0.3  # Bad: escalated on an easy/obvious task
    elif is_escalation and confidence == "HIGH":
        return -0.2  # Bad: escalated but was confident (contradictory)
    else:
        return 0.0   # Neutral: didn't escalate
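
# Illustrative calls:
#
#   >>> escalation_reward("escalate_to_human", "LOW", 0.9)   # uncertain + ambiguous
#   0.7
#   >>> escalation_reward("escalate_to_human", "HIGH", 0.1)  # needless escalation
#   -0.3
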
def training_reward(
    decision: Optional[str],
    confidence: Optional[str],
    ground_truth: str,
    legitimate_flags: int,
    step_num: int,
    done: bool,
) -> float:
    """
    SIMPLE shaped scalar reward for GRPO training stability.

    ⚠️ USE THIS FOR GRPO TRAINING — NOT eval_reward().
    Complex compound rewards cause gradient instability in GRPO.
    This function provides a clear, stable learning signal.

    Args:
        decision: Agent's terminal decision (or None if non-terminal)
        confidence: Agent's declared confidence (None for non-terminal steps)
        ground_truth: Correct decision for this episode
        legitimate_flags: Number of correctly identified fraud signals this episode
        step_num: Current step number
        done: Whether the episode is complete

    Returns:
        float: Training reward (negative at each step, positive signal on completion)
    """
    # Step penalty — encourages efficiency
    r = -0.05

    if done and decision is not None:
        is_correct = (decision == ground_truth)
        # Decision accuracy (main signal)
        r += 1.0 if is_correct else -0.5
        # Legitimate fraud signal detection (partial credit)
        r += 0.3 * min(legitimate_flags, 3)  # cap at 3 flags
        # Calibration bonus (weighted at 50% of the calibration matrix)
        if confidence in ("HIGH", "MED", "LOW"):
            r += 0.5 * CALIBRATION_MATRIX[(confidence, is_correct)]

    return float(r)
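
# Worked example (illustrative): a correct terminal decision with HIGH
# confidence and two verified fraud flags scores
# -0.05 (step) + 1.0 (correct) + 0.3 * 2 (flags) + 0.5 * 1.0 (calibration):
#
#   >>> round(training_reward("deny_claim", "HIGH", "deny_claim",
#   ...                       legitimate_flags=2, step_num=4, done=True), 2)
#   2.05
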
def eval_reward(
    decision: str,
    confidence: str,
    ground_truth: str,
    ambiguity_score: float,
    evidence_quality: float,
    efficiency_score: float,
    episode_history: Optional[list[dict]] = None,
) -> float:
    """
    FULL 5-component evaluation reward. Used for REPORTING and DEMO only.

    ⚠️ DO NOT USE FOR GRPO TRAINING. Use training_reward() instead.

    Components:
        35% calibration_reward — confidence accuracy matrix
        25% escalation_reward — appropriate uncertainty escalation
        20% evidence_quality — specificity of fraud signal citations
        10% efficiency_score — step efficiency (inherited from Round 1)
        10% gaming_penalty pool — anti-gaming deductions

    Args:
        decision: Agent's terminal decision
        confidence: Agent's declared confidence
        ground_truth: Correct decision
        ambiguity_score: Task ambiguity (0.0 = obvious, 1.0 = very ambiguous)
        evidence_quality: Quality of fraud signal evidence (0.0–1.0)
        efficiency_score: Step efficiency from environment (0.0–1.0)
        episode_history: For gaming detection

    Returns:
        float: Composite evaluation score in [0.0, 1.0]
    """
    calib_r = calibration_reward(decision, confidence, ground_truth, episode_history)
    escal_r = escalation_reward(decision, confidence, ambiguity_score)
    gaming_p = detect_confidence_gaming(episode_history) if episode_history else 0.0

    raw = (
        0.35 * calib_r
        + 0.25 * escal_r
        + 0.20 * evidence_quality
        + 0.10 * efficiency_score
        - 0.10 * gaming_p
    )

    # Normalise to [0.0, 1.0] for evaluation reporting.
    # The raw composite lies in roughly [-0.53, 0.83]; shift and scale it
    # into the unit interval, then clamp.
    normalised = (raw + 0.8) / 1.8
    return max(0.0, min(1.0, normalised))
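
# Minimal smoke test (a sketch; the decisions and scores below are
# illustrative inputs, not values produced by the environment):
if __name__ == "__main__":
    # Training path: simple scalar, safe for GRPO.
    print(training_reward("deny_claim", "HIGH", "deny_claim",
                          legitimate_flags=2, step_num=4, done=True))
    # Evaluation path: composite score in [0.0, 1.0], reporting only.
    print(eval_reward("escalate_to_human", "LOW", "escalate_to_human",
                      ambiguity_score=0.9, evidence_quality=0.8,
                      efficiency_score=0.7))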