"""
server/calibration_grader.py
DebateFloor — Calibrated Uncertainty Training Environment
Core innovation: rewards agents that know when they don't know.
Based on CoCA framework: arXiv:2603.05881
"Co-optimizing Confidence and Accuracy via Segment-Specific GRPO Rewards"
CRITICAL: This file implements both the CALIBRATION reward (used for
evaluation) and the simple scalar TRAINING reward.
NEVER use eval_reward() for GRPO training; use training_reward() instead.
"""
from typing import Optional
# ─────────────────────────────────────────────────────────────
# THE 3×2 CALIBRATION MATRIX
# This is the core innovation. Read this before editing anything.
#
# Philosophy:
# HIGH confidence + CORRECT = best outcome (1.0) — decisive and right
# HIGH confidence + WRONG = worst outcome (-0.8) — confident and wrong
# MED confidence + CORRECT = good (0.6) — right but cautious
# MED confidence + WRONG = tolerable (-0.2) — wrong, but hedged appropriately
# LOW confidence + CORRECT = weak (0.1) — right, wasted escalation
# LOW confidence + WRONG = neutral (0.0) — at least it knew
# ─────────────────────────────────────────────────────────────
CALIBRATION_MATRIX: dict[tuple[str, bool], float] = {
("HIGH", True): 1.0,
("HIGH", False): -0.8,
("MED", True): 0.6,
("MED", False): -0.2,
("LOW", True): 0.1,
("LOW", False): 0.0,
}
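# Quick expected-value check on the incentives the matrix creates (derived
# from the values above, not taken from the paper). At accuracy p:
#   E[HIGH] = 1.0*p - 0.8*(1-p),  E[MED] = 0.6*p - 0.2*(1-p),  E[LOW] = 0.1*p
#   HIGH beats MED when p > 0.60;  MED beats LOW when p > ~0.29
# So reward is maximised by declaring HIGH only above ~60% accuracy and MED
# above ~29%, which is exactly the calibrated behaviour we want to train.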
# Anti-gaming thresholds
LOW_CONFIDENCE_GAMING_THRESHOLD = 0.70 # >70% LOW = gaming
HIGH_CONFIDENCE_GAMING_THRESHOLD = 0.80 # >80% HIGH = overconfidence
MIN_HISTORY_FOR_GAMING_DETECTION = 10 # need at least 10 episodes
def detect_confidence_gaming(episode_history: list[dict]) -> float:
"""
Detects and penalises systematic confidence manipulation.
An agent cannot game the calibration reward by always declaring LOW
confidence (to avoid HIGH+WRONG penalty) or always declaring HIGH
confidence (to maximise HIGH+CORRECT reward).
Args:
episode_history: List of dicts with "confidence" key per episode.
Example: [{"confidence": "LOW"}, {"confidence": "HIGH"}, ...]
Returns:
float: Penalty to subtract from reward. Always >= 0.
Returns 0.0 if history is too short to detect gaming.
"""
if len(episode_history) < MIN_HISTORY_FOR_GAMING_DETECTION:
return 0.0
total = len(episode_history)
low_count = sum(1 for e in episode_history if e.get("confidence") == "LOW")
high_count = sum(1 for e in episode_history if e.get("confidence") == "HIGH")
low_rate = low_count / total
high_rate = high_count / total
penalty = 0.0
# Penalise systematic under-confidence (always say LOW to avoid punishment)
if low_rate > LOW_CONFIDENCE_GAMING_THRESHOLD:
penalty += (low_rate - LOW_CONFIDENCE_GAMING_THRESHOLD) * 2.0
# Penalise systematic over-confidence (always say HIGH to maximise reward)
if high_rate > HIGH_CONFIDENCE_GAMING_THRESHOLD:
penalty += (high_rate - HIGH_CONFIDENCE_GAMING_THRESHOLD) * 1.5
return min(penalty, 1.0) # cap total penalty at 1.0
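# Illustrative detect_confidence_gaming() call (history shape as in the
# docstring above; the numbers follow directly from the thresholds):
#   history = [{"confidence": "LOW"}] * 9 + [{"confidence": "HIGH"}]
#   detect_confidence_gaming(history)
#   -> low_rate = 0.9, penalty = (0.9 - 0.70) * 2.0 = 0.4
# With fewer than MIN_HISTORY_FOR_GAMING_DETECTION episodes it returns 0.0.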
def calibration_reward(
decision: str,
confidence: str,
ground_truth: str,
episode_history: Optional[list[dict]] = None,
) -> float:
"""
Core calibration reward. Used in EVALUATION reward composition.
Args:
decision: Agent's decision ("approve_claim", "deny_claim", "escalate_to_human")
confidence: Agent's declared confidence ("HIGH", "MED", "LOW")
ground_truth: Correct decision for this episode
episode_history: List of past episode results for gaming detection
Returns:
float: Calibration reward in [-1.0, 1.0]
"""
if confidence not in ("HIGH", "MED", "LOW"):
raise ValueError(f"Invalid confidence: {confidence}. Must be HIGH, MED, or LOW.")
is_correct = (decision == ground_truth)
base_reward = CALIBRATION_MATRIX[(confidence, is_correct)]
# Apply anti-gaming penalty if we have enough history
gaming_penalty = 0.0
if episode_history:
gaming_penalty = detect_confidence_gaming(episode_history)
result = base_reward - gaming_penalty
# Always clamp to valid range
return max(-1.0, min(1.0, result))
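# calibration_reward() examples (no episode_history, so no gaming penalty;
# values read straight off CALIBRATION_MATRIX):
#   calibration_reward("approve_claim", "HIGH", "approve_claim")  ->  1.0
#   calibration_reward("approve_claim", "HIGH", "deny_claim")     -> -0.8
#   calibration_reward("approve_claim", "LOW",  "deny_claim")     ->  0.0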
def escalation_reward(
decision: str,
confidence: str,
ambiguity_score: float,
) -> float:
"""
Rewards appropriate escalation behaviour.
An agent should escalate when genuinely uncertain (high ambiguity).
Escalating on obvious cases wastes resources and is penalised.
Args:
decision: Agent's decision
confidence: Agent's declared confidence
ambiguity_score: How genuinely ambiguous this task is (0.0=obvious, 1.0=very ambiguous)
Returns:
float: Escalation reward in [-0.3, 0.7]
"""
is_escalation = (decision == "escalate_to_human")
is_genuinely_ambiguous = ambiguity_score > 0.6
is_obviously_clear = ambiguity_score < 0.3
if is_escalation and is_genuinely_ambiguous and confidence == "LOW":
return 0.7 # Perfect: uncertain + ambiguous task + escalated
elif is_escalation and is_obviously_clear:
return -0.3 # Bad: escalated on an easy/obvious task
elif is_escalation and confidence == "HIGH":
return -0.2 # Bad: escalated but was confident (contradictory)
else:
return 0.0 # Neutral: didn't escalate
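# escalation_reward() examples (ambiguity thresholds: >0.6 ambiguous, <0.3 obvious):
#   escalation_reward("escalate_to_human", "LOW",  0.8)  ->  0.7   justified escalation
#   escalation_reward("escalate_to_human", "HIGH", 0.1)  -> -0.3   escalated an obvious case
#   escalation_reward("approve_claim",     "HIGH", 0.1)  ->  0.0   no escalation, neutral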
def training_reward(
decision: Optional[str],
confidence: Optional[str],
ground_truth: str,
legitimate_flags: int,
step_num: int,
done: bool,
) -> float:
"""
SIMPLE shaped scalar reward for GRPO training stability.
⚠️ USE THIS FOR GRPO TRAINING — NOT eval_reward().
Complex compound rewards cause gradient instability in GRPO.
This function provides a clear, stable learning signal.
Args:
decision: Agent's terminal decision (or None if non-terminal)
confidence: Agent's declared confidence (None for non-terminal steps)
ground_truth: Correct decision for this episode
legitimate_flags: Number of correctly identified fraud signals this episode
step_num: Current step number (not used in the reward computation)
done: Whether episode is complete
Returns:
float: Training reward (-0.05 step penalty each step, plus accuracy, flag, and calibration terms on the terminal step)
"""
# Step penalty — encourages efficiency
r = -0.05
if done and decision is not None:
is_correct = (decision == ground_truth)
# Decision accuracy (main signal)
r += 1.0 if is_correct else -0.5
# Legitimate fraud signal detection (partial credit)
r += 0.3 * min(legitimate_flags, 3) # cap at 3 flags
# Calibration bonus (weighted 50% of calibration matrix)
if confidence in ("HIGH", "MED", "LOW"):
calib_value = CALIBRATION_MATRIX.get((confidence, is_correct), 0.0)
r += 0.5 * calib_value
return float(r)
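# Worked training_reward() example (hypothetical episode values; correct
# decision declared with HIGH confidence):
#   training_reward("approve_claim", "HIGH", "approve_claim",
#                   legitimate_flags=2, step_num=4, done=True)
#   = -0.05 (step) + 1.0 (correct) + 0.3*2 (flags) + 0.5*1.0 (calibration) = 2.05
# Non-terminal steps (done=False) simply return the -0.05 step penalty.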
def eval_reward(
decision: str,
confidence: str,
ground_truth: str,
ambiguity_score: float,
evidence_quality: float,
efficiency_score: float,
episode_history: Optional[list[dict]] = None,
) -> float:
"""
FULL 6-component evaluation reward. Used for REPORTING and DEMO only.
⚠️ DO NOT USE FOR GRPO TRAINING. Use training_reward() instead.
Components:
35% calibration_reward — confidence accuracy matrix
25% escalation_reward — appropriate uncertainty escalation
20% evidence_quality — specificity of fraud signal citations
10% efficiency_score — step efficiency (inherited from Round 1)
10% gaming_penalty pool — anti-gaming deductions
Args:
decision: Agent's terminal decision
confidence: Agent's declared confidence
ground_truth: Correct decision
ambiguity_score: Task ambiguity (0.0=obvious, 1.0=very ambiguous)
evidence_quality: Quality of fraud signal evidence (0.0–1.0)
efficiency_score: Step efficiency from environment (0.0–1.0)
episode_history: For gaming detection
Returns:
float: Composite evaluation score in [0.0, 1.0]
"""
calib_r = calibration_reward(decision, confidence, ground_truth, episode_history)
escal_r = escalation_reward(decision, confidence, ambiguity_score)
gaming_p = detect_confidence_gaming(episode_history) if episode_history else 0.0
raw = (
0.35 * calib_r +
0.25 * escal_r +
0.20 * evidence_quality +
0.10 * efficiency_score -
0.10 * gaming_p
)
# Normalise to [0.0, 1.0] for evaluation reporting.
# The weighted raw score falls roughly in [-0.53, 0.83]; shifting by 0.8 and
# dividing by 1.8 keeps every achievable score inside [0, 1] before clamping.
normalised = (raw + 0.8) / 1.8
return max(0.0, min(1.0, normalised))
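if __name__ == "__main__":
    # Minimal illustrative demo (hypothetical episode values): score the same
    # episode with the simple GRPO training signal and the composite eval
    # score, to show why the two functions must not be interchanged.
    history = [{"confidence": "HIGH"}, {"confidence": "MED"}, {"confidence": "LOW"}]
    train_r = training_reward(
        decision="deny_claim",
        confidence="MED",
        ground_truth="deny_claim",
        legitimate_flags=1,
        step_num=6,
        done=True,
    )
    eval_r = eval_reward(
        decision="deny_claim",
        confidence="MED",
        ground_truth="deny_claim",
        ambiguity_score=0.5,
        evidence_quality=0.7,
        efficiency_score=0.8,
        episode_history=history,
    )
    print(f"training_reward: {train_r:.3f}")  # simple scalar used for GRPO
    print(f"eval_reward:     {eval_r:.3f}")   # normalised composite in [0, 1]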