"""
server/calibration_grader.py
DebateFloor — Calibrated Uncertainty Training Environment
Core innovation: rewards agents that know when they don't know.
Based on CoCA framework: arXiv:2603.05881
"Co-optimizing Confidence and Accuracy via Segment-Specific GRPO Rewards"
CRITICAL: This file implements both the CALIBRATION reward (used for
evaluation) and the simple scalar TRAINING reward.
NEVER use eval_reward() for GRPO training; use training_reward() instead.
"""
from typing import Optional
# ─────────────────────────────────────────────────────────────
# THE 3×2 CALIBRATION MATRIX
# This is the core innovation. Read this before editing anything.
#
# Philosophy:
# HIGH confidence + CORRECT = best outcome (1.0) — decisive and right
# HIGH confidence + WRONG = worst outcome (-0.8) — confident and wrong
# MED confidence + CORRECT = good (0.6) — right but cautious
# MED confidence + WRONG = tolerable (-0.2) — wrong, but hedged appropriately
# LOW confidence + CORRECT = weak (0.1) — right, wasted escalation
# LOW confidence + WRONG = neutral (0.0) — at least it knew
# ─────────────────────────────────────────────────────────────
CALIBRATION_MATRIX: dict[tuple[str, bool], float] = {
("HIGH", True): 1.0,
("HIGH", False): -0.8,
("MED", True): 0.6,
("MED", False): -0.2,
("LOW", True): 0.1,
("LOW", False): 0.0,
}
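# Quick expected-value check on the incentives the matrix creates (derived
# from the values above, not taken from the paper). At accuracy p:
#   E[HIGH] = 1.0*p - 0.8*(1-p),  E[MED] = 0.6*p - 0.2*(1-p),  E[LOW] = 0.1*p
#   HIGH beats MED when p > 0.60;  MED beats LOW when p > ~0.29
# So reward is maximised by declaring HIGH only above ~60% accuracy and MED
# above ~29%, which is exactly the calibrated behaviour we want to train.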
# Anti-gaming thresholds
LOW_CONFIDENCE_GAMING_THRESHOLD = 0.70 # >70% LOW = gaming
HIGH_CONFIDENCE_GAMING_THRESHOLD = 0.80 # >80% HIGH = overconfidence
MIN_HISTORY_FOR_GAMING_DETECTION = 10 # need at least 10 episodes
def detect_confidence_gaming(episode_history: list[dict]) -> float:
"""
Detects and penalises systematic confidence manipulation.
An agent cannot game the calibration reward by always declaring LOW
confidence (to avoid HIGH+WRONG penalty) or always declaring HIGH
confidence (to maximise HIGH+CORRECT reward).
Args:
episode_history: List of dicts with "confidence" key per episode.
Example: [{"confidence": "LOW"}, {"confidence": "HIGH"}, ...]
Returns:
float: Penalty to subtract from reward. Always >= 0.
Returns 0.0 if history is too short to detect gaming.
"""
if len(episode_history) < MIN_HISTORY_FOR_GAMING_DETECTION:
return 0.0
total = len(episode_history)
low_count = sum(1 for e in episode_history if e.get("confidence") == "LOW")
high_count = sum(1 for e in episode_history if e.get("confidence") == "HIGH")
low_rate = low_count / total
high_rate = high_count / total
penalty = 0.0
# Penalise systematic under-confidence (always say LOW to avoid punishment)
if low_rate > LOW_CONFIDENCE_GAMING_THRESHOLD:
penalty += (low_rate - LOW_CONFIDENCE_GAMING_THRESHOLD) * 2.0
# Penalise systematic over-confidence (always say HIGH to maximise reward)
if high_rate > HIGH_CONFIDENCE_GAMING_THRESHOLD:
penalty += (high_rate - HIGH_CONFIDENCE_GAMING_THRESHOLD) * 1.5
return min(penalty, 1.0) # cap total penalty at 1.0
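# Illustrative detect_confidence_gaming() call (history shape as in the
# docstring above; the numbers follow directly from the thresholds):
#   history = [{"confidence": "LOW"}] * 9 + [{"confidence": "HIGH"}]
#   detect_confidence_gaming(history)
#   -> low_rate = 0.9, penalty = (0.9 - 0.70) * 2.0 = 0.4
# With fewer than MIN_HISTORY_FOR_GAMING_DETECTION episodes it returns 0.0.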
def calibration_reward(
decision: str,
confidence: str,
ground_truth: str,
episode_history: Optional[list[dict]] = None,
) -> float:
"""
Core calibration reward. Used in EVALUATION reward composition.
Args:
decision: Agent's decision ("approve_claim", "deny_claim", "escalate_to_human")
confidence: Agent's declared confidence ("HIGH", "MED", "LOW")
ground_truth: Correct decision for this episode
episode_history: List of past episode results for gaming detection
Returns:
float: Calibration reward in [-1.0, 1.0]
"""
if confidence not in ("HIGH", "MED", "LOW"):
raise ValueError(f"Invalid confidence: {confidence}. Must be HIGH, MED, or LOW.")
is_correct = (decision == ground_truth)
base_reward = CALIBRATION_MATRIX[(confidence, is_correct)]
# Apply anti-gaming penalty if we have enough history
gaming_penalty = 0.0
if episode_history:
gaming_penalty = detect_confidence_gaming(episode_history)
result = base_reward - gaming_penalty
# Always clamp to valid range
return max(-1.0, min(1.0, result))
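# calibration_reward() examples (no episode_history, so no gaming penalty;
# values read straight off CALIBRATION_MATRIX):
#   calibration_reward("approve_claim", "HIGH", "approve_claim")  ->  1.0
#   calibration_reward("approve_claim", "HIGH", "deny_claim")     -> -0.8
#   calibration_reward("approve_claim", "LOW",  "deny_claim")     ->  0.0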
def escalation_reward(
decision: str,
confidence: str,
ambiguity_score: float,
) -> float:
"""
Rewards appropriate escalation behaviour.
An agent should escalate when genuinely uncertain (high ambiguity).
Escalating on obvious cases wastes resources and is penalised.
Args:
decision: Agent's decision
confidence: Agent's declared confidence
ambiguity_score: How genuinely ambiguous this task is (0.0=obvious, 1.0=very ambiguous)
Returns:
float: Escalation reward in [-0.3, 0.7]
"""
is_escalation = (decision == "escalate_to_human")
is_genuinely_ambiguous = ambiguity_score > 0.6
is_obviously_clear = ambiguity_score < 0.3
if is_escalation and is_genuinely_ambiguous and confidence == "LOW":
return 0.7 # Perfect: uncertain + ambiguous task + escalated
elif is_escalation and is_obviously_clear:
return -0.3 # Bad: escalated on an easy/obvious task
elif is_escalation and confidence == "HIGH":
return -0.2 # Bad: escalated but was confident (contradictory)
else:
return 0.0 # Neutral: didn't escalate
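# escalation_reward() examples (ambiguity thresholds: >0.6 ambiguous, <0.3 obvious):
#   escalation_reward("escalate_to_human", "LOW",  0.8)  ->  0.7   justified escalation
#   escalation_reward("escalate_to_human", "HIGH", 0.1)  -> -0.3   escalated an obvious case
#   escalation_reward("approve_claim",     "HIGH", 0.1)  ->  0.0   no escalation, neutral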
def training_reward(
decision: Optional[str],
confidence: Optional[str],
ground_truth: str,
legitimate_flags: int,
step_num: int,
done: bool,
) -> float:
"""
SIMPLE shaped scalar reward for GRPO training stability.
⚠️ USE THIS FOR GRPO TRAINING — NOT eval_reward().
Complex compound rewards cause gradient instability in GRPO.
This function provides a clear, stable learning signal.
Args:
decision: Agent's terminal decision (or None if non-terminal)
confidence: Agent's declared confidence (None for non-terminal steps)
ground_truth: Correct decision for this episode
legitimate_flags: Number of correctly identified fraud signals this episode
step_num: Current step number (not used in the reward computation)
done: Whether episode is complete
Returns:
float: Training reward (-0.05 step penalty each step, plus accuracy, flag, and calibration terms on the terminal step)
"""
# Step penalty — encourages efficiency
r = -0.05
if done and decision is not None:
is_correct = (decision == ground_truth)
# Decision accuracy (main signal)
r += 1.0 if is_correct else -0.5
# Legitimate fraud signal detection (partial credit)
r += 0.3 * min(legitimate_flags, 3) # cap at 3 flags
# Calibration bonus (weighted 50% of calibration matrix)
if confidence in ("HIGH", "MED", "LOW"):
calib_value = CALIBRATION_MATRIX.get((confidence, is_correct), 0.0)
r += 0.5 * calib_value
return float(r)
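# Worked training_reward() example (hypothetical episode values; correct
# decision declared with HIGH confidence):
#   training_reward("approve_claim", "HIGH", "approve_claim",
#                   legitimate_flags=2, step_num=4, done=True)
#   = -0.05 (step) + 1.0 (correct) + 0.3*2 (flags) + 0.5*1.0 (calibration) = 2.05
# Non-terminal steps (done=False) simply return the -0.05 step penalty.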
def eval_reward(
decision: str,
confidence: str,
ground_truth: str,
ambiguity_score: float,
evidence_quality: float,
efficiency_score: float,
episode_history: Optional[list[dict]] = None,
) -> float:
"""
FULL 6-component evaluation reward. Used for REPORTING and DEMO only.
⚠️ DO NOT USE FOR GRPO TRAINING. Use training_reward() instead.
Components:
35% calibration_reward — confidence accuracy matrix
25% escalation_reward — appropriate uncertainty escalation
20% evidence_quality — specificity of fraud signal citations
10% efficiency_score — step efficiency (inherited from Round 1)
10% gaming_penalty pool — anti-gaming deductions
Args:
decision: Agent's terminal decision
confidence: Agent's declared confidence
ground_truth: Correct decision
ambiguity_score: Task ambiguity (0.0=obvious, 1.0=very ambiguous)
evidence_quality: Quality of fraud signal evidence (0.0–1.0)
efficiency_score: Step efficiency from environment (0.0–1.0)
episode_history: For gaming detection
Returns:
float: Composite evaluation score in [0.0, 1.0]
"""
calib_r = calibration_reward(decision, confidence, ground_truth, episode_history)
escal_r = escalation_reward(decision, confidence, ambiguity_score)
gaming_p = detect_confidence_gaming(episode_history) if episode_history else 0.0
raw = (
0.35 * calib_r +
0.25 * escal_r +
0.20 * evidence_quality +
0.10 * efficiency_score -
0.10 * gaming_p
)
# Normalise to [0.0, 1.0] for evaluation reporting.
# The weighted raw score falls roughly in [-0.53, 0.83]; shifting by 0.8 and
# dividing by 1.8 keeps every achievable score inside [0, 1] before clamping.
normalised = (raw + 0.8) / 1.8
return max(0.0, min(1.0, normalised))
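if __name__ == "__main__":
    # Minimal illustrative demo (hypothetical episode values): score the same
    # episode with the simple GRPO training signal and the composite eval
    # score, to show why the two functions must not be interchanged.
    history = [{"confidence": "HIGH"}, {"confidence": "MED"}, {"confidence": "LOW"}]
    train_r = training_reward(
        decision="deny_claim",
        confidence="MED",
        ground_truth="deny_claim",
        legitimate_flags=1,
        step_num=6,
        done=True,
    )
    eval_r = eval_reward(
        decision="deny_claim",
        confidence="MED",
        ground_truth="deny_claim",
        ambiguity_score=0.5,
        evidence_quality=0.7,
        efficiency_score=0.8,
        episode_history=history,
    )
    print(f"training_reward: {train_r:.3f}")  # simple scalar used for GRPO
    print(f"eval_reward:     {eval_r:.3f}")   # normalised composite in [0, 1]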