"""Uncertainty calibration evaluation.""" from __future__ import annotations from app.evaluation.simulator_rollouts import run_rollouts def calibration_eval() -> dict[str, float]: rows = run_rollouts(episodes=8, difficulty="medium") if not rows: return {"ece_proxy": 1.0} calibration_scores = [ float((row.get("reward_breakdown", {}) or {}).get("uncertainty_calibration_score", 0.0)) for row in rows ] mean_calibration = sum(calibration_scores) / max(1, len(calibration_scores)) return {"ece_proxy": round(max(0.0, 1.0 - mean_calibration), 6)}