""" server/calibration_grader.py DebateFloor — Calibrated Uncertainty Training Environment Core innovation: rewards agents that know when they don't know. Based on CoCA framework: arXiv:2603.05881 "Co-optimizing Confidence and Accuracy via Segment-Specific GRPO Rewards" CRITICAL: This file implements the CALIBRATION reward only. The TRAINING reward (simple scalar) is also here. NEVER use eval_reward() for GRPO training — use training_reward(). """ from typing import Optional # ───────────────────────────────────────────────────────────── # THE 3×2 CALIBRATION MATRIX # This is the core innovation. Read this before editing anything. # # Philosophy: # HIGH confidence + CORRECT = best outcome (1.0) — decisive and right # HIGH confidence + WRONG = worst outcome (-0.8) — confident and wrong # MED confidence + CORRECT = good (0.6) — right but cautious # MED confidence + WRONG = ok (-0.2) — wrong but knew it # LOW confidence + CORRECT = weak (0.1) — right, wasted escalation # LOW confidence + WRONG = neutral (0.0) — at least it knew # ───────────────────────────────────────────────────────────── CALIBRATION_MATRIX: dict[tuple[str, bool], float] = { ("HIGH", True): 1.0, ("HIGH", False): -0.8, ("MED", True): 0.6, ("MED", False): -0.2, ("LOW", True): 0.1, ("LOW", False): 0.0, } # Anti-gaming thresholds LOW_CONFIDENCE_GAMING_THRESHOLD = 0.70 # >70% LOW = gaming HIGH_CONFIDENCE_GAMING_THRESHOLD = 0.80 # >80% HIGH = overconfidence MIN_HISTORY_FOR_GAMING_DETECTION = 10 # need at least 10 episodes def detect_confidence_gaming(episode_history: list[dict]) -> float: """ Detects and penalises systematic confidence manipulation. An agent cannot game the calibration reward by always declaring LOW confidence (to avoid HIGH+WRONG penalty) or always declaring HIGH confidence (to maximise HIGH+CORRECT reward). Args: episode_history: List of dicts with "confidence" key per episode. Example: [{"confidence": "LOW"}, {"confidence": "HIGH"}, ...] Returns: float: Penalty to subtract from reward. Always >= 0. Returns 0.0 if history is too short to detect gaming. """ if len(episode_history) < MIN_HISTORY_FOR_GAMING_DETECTION: return 0.0 total = len(episode_history) low_count = sum(1 for e in episode_history if e.get("confidence") == "LOW") high_count = sum(1 for e in episode_history if e.get("confidence") == "HIGH") low_rate = low_count / total high_rate = high_count / total penalty = 0.0 # Penalise systematic under-confidence (always say LOW to avoid punishment) if low_rate > LOW_CONFIDENCE_GAMING_THRESHOLD: penalty += (low_rate - LOW_CONFIDENCE_GAMING_THRESHOLD) * 2.0 # Penalise systematic over-confidence (always say HIGH to maximise reward) if high_rate > HIGH_CONFIDENCE_GAMING_THRESHOLD: penalty += (high_rate - HIGH_CONFIDENCE_GAMING_THRESHOLD) * 1.5 return min(penalty, 1.0) # cap total penalty at 1.0 def calibration_reward( decision: str, confidence: str, ground_truth: str, episode_history: Optional[list[dict]] = None, ) -> float: """ Core calibration reward. Used in EVALUATION reward composition. Args: decision: Agent's decision ("approve_claim", "deny_claim", "escalate_to_human") confidence: Agent's declared confidence ("HIGH", "MED", "LOW") ground_truth: Correct decision for this episode episode_history: List of past episode results for gaming detection Returns: float: Calibration reward in [-1.0, 1.0] """ if confidence not in ("HIGH", "MED", "LOW"): raise ValueError(f"Invalid confidence: {confidence}. 
def calibration_reward(
    decision: str,
    confidence: str,
    ground_truth: str,
    episode_history: Optional[list[dict]] = None,
) -> float:
    """
    Core calibration reward. Used in the EVALUATION reward composition.

    Args:
        decision: Agent's decision ("approve_claim", "deny_claim",
            "escalate_to_human")
        confidence: Agent's declared confidence ("HIGH", "MED", "LOW")
        ground_truth: Correct decision for this episode
        episode_history: List of past episode results, for gaming detection

    Returns:
        float: Calibration reward in [-1.0, 1.0]
    """
    if confidence not in ("HIGH", "MED", "LOW"):
        raise ValueError(
            f"Invalid confidence: {confidence}. Must be HIGH, MED, or LOW."
        )

    is_correct = (decision == ground_truth)
    base_reward = CALIBRATION_MATRIX[(confidence, is_correct)]

    # Apply the anti-gaming penalty if we have enough history
    gaming_penalty = 0.0
    if episode_history:
        gaming_penalty = detect_confidence_gaming(episode_history)

    result = base_reward - gaming_penalty

    # Always clamp to the valid range
    return max(-1.0, min(1.0, result))


def escalation_reward(
    decision: str,
    confidence: str,
    ambiguity_score: float,
) -> float:
    """
    Rewards appropriate escalation behaviour.

    An agent should escalate when genuinely uncertain (high ambiguity).
    Escalating on obvious cases wastes resources and is penalised.

    Args:
        decision: Agent's decision
        confidence: Agent's declared confidence
        ambiguity_score: How genuinely ambiguous this task is
            (0.0 = obvious, 1.0 = very ambiguous)

    Returns:
        float: Escalation reward in [-0.3, 0.7]
    """
    is_escalation = (decision == "escalate_to_human")
    is_genuinely_ambiguous = ambiguity_score > 0.6
    is_obviously_clear = ambiguity_score < 0.3

    if is_escalation and is_genuinely_ambiguous and confidence == "LOW":
        return 0.7   # Perfect: uncertain + ambiguous task + escalated
    elif is_escalation and is_obviously_clear:
        return -0.3  # Bad: escalated on an easy/obvious task
    elif is_escalation and confidence == "HIGH":
        return -0.2  # Bad: escalated but was confident (contradictory)
    else:
        return 0.0   # Neutral: didn't escalate


def training_reward(
    decision: Optional[str],
    confidence: Optional[str],
    ground_truth: str,
    legitimate_flags: int,
    step_num: int,
    done: bool,
) -> float:
    """
    SIMPLE shaped scalar reward for GRPO training stability.

    ⚠️ USE THIS FOR GRPO TRAINING — NOT eval_reward().

    Complex compound rewards cause gradient instability in GRPO. This
    function provides a clear, stable learning signal.

    Args:
        decision: Agent's terminal decision (None on non-terminal steps)
        confidence: Agent's declared confidence (None on non-terminal steps)
        ground_truth: Correct decision for this episode
        legitimate_flags: Number of correctly identified fraud signals this episode
        step_num: Current step number
        done: Whether the episode is complete

    Returns:
        float: Training reward (small negative at each step, larger
            positive or negative signal on completion)
    """
    # Step penalty — encourages efficiency
    r = -0.05

    if done and decision is not None:
        is_correct = (decision == ground_truth)

        # Decision accuracy (main signal)
        r += 1.0 if is_correct else -0.5

        # Legitimate fraud-signal detection (partial credit)
        r += 0.3 * min(legitimate_flags, 3)  # cap at 3 flags

        # Calibration bonus (weighted at 50% of the calibration matrix)
        if confidence in ("HIGH", "MED", "LOW"):
            r += 0.5 * CALIBRATION_MATRIX[(confidence, is_correct)]

    return float(r)
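# A minimal sketch of how training_reward() feeds a rollout loop; the call
# below uses hypothetical inputs, not real episode data:
#
#     r = training_reward(
#         decision="approve_claim",      # terminal decision from the agent
#         confidence="HIGH",
#         ground_truth="approve_claim",
#         legitimate_flags=2,
#         step_num=step,                 # hypothetical loop variable
#         done=True,
#     )
#
# For this correct HIGH-confidence terminal step with 2 legitimate flags:
#     r = -0.05 + 1.0 + 0.3 * 2 + 0.5 * 1.0 = 2.05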
def eval_reward(
    decision: str,
    confidence: str,
    ground_truth: str,
    ambiguity_score: float,
    evidence_quality: float,
    efficiency_score: float,
    episode_history: Optional[list[dict]] = None,
) -> float:
    """
    FULL five-component evaluation reward. Used for REPORTING and DEMO only.

    ⚠️ DO NOT USE FOR GRPO TRAINING. Use training_reward() instead.

    Components:
        35% calibration_reward — confidence accuracy matrix
        25% escalation_reward  — appropriate uncertainty escalation
        20% evidence_quality   — specificity of fraud-signal citations
        10% efficiency_score   — step efficiency (inherited from Round 1)
        10% gaming penalty     — anti-gaming deduction (subtracted)

    Args:
        decision: Agent's terminal decision
        confidence: Agent's declared confidence
        ground_truth: Correct decision
        ambiguity_score: Task ambiguity (0.0 = obvious, 1.0 = very ambiguous)
        evidence_quality: Quality of fraud-signal evidence (0.0–1.0)
        efficiency_score: Step efficiency from the environment (0.0–1.0)
        episode_history: Past episodes, for gaming detection

    Returns:
        float: Composite evaluation score in [0.0, 1.0]
    """
    calib_r = calibration_reward(decision, confidence, ground_truth, episode_history)
    escal_r = escalation_reward(decision, confidence, ambiguity_score)
    gaming_p = detect_confidence_gaming(episode_history) if episode_history else 0.0

    raw = (
        0.35 * calib_r
        + 0.25 * escal_r
        + 0.20 * evidence_quality
        + 0.10 * efficiency_score
        - 0.10 * gaming_p
    )

    # Normalise to [0.0, 1.0] for evaluation reporting. Given the component
    # ranges, raw lies within roughly [-0.53, 0.83]; we shift and scale
    # against the conservative bounds [-0.8, 1.0], then clamp.
    normalised = (raw + 0.8) / 1.8
    return max(0.0, min(1.0, normalised))
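
if __name__ == "__main__":
    # Smoke-test demo with hypothetical inputs (not real episode data):
    #     python server/calibration_grader.py
    history = [{"confidence": "MED"}] * 12

    print("calibration:", calibration_reward(
        "approve_claim", "HIGH", "approve_claim", history))          # 1.0
    print("escalation: ", escalation_reward(
        "escalate_to_human", "LOW", ambiguity_score=0.9))            # 0.7
    print("training:   ", training_reward(
        "approve_claim", "HIGH", "approve_claim",
        legitimate_flags=2, step_num=7, done=True))                  # ~2.05
    print("evaluation: ", eval_reward(
        "approve_claim", "HIGH", "approve_claim",
        ambiguity_score=0.2, evidence_quality=0.8,
        efficiency_score=0.9, episode_history=history))              # ~0.778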