Spaces:
Running
Running
| """Uncertainty calibration evaluation.""" | |
| from __future__ import annotations | |
| from app.evaluation.simulator_rollouts import run_rollouts | |
def calibration_eval(*, episodes: int = 8, difficulty: str = "medium") -> dict[str, float]:
    """Compute a proxy expected-calibration-error (ECE) metric from rollouts.

    Runs simulator rollouts and averages each row's
    ``uncertainty_calibration_score`` (read from the row's
    ``reward_breakdown`` mapping, defaulting to 0.0 when the key or the
    mapping is absent/None), then reports ``1 - mean`` clamped below at 0.

    Args:
        episodes: Number of rollout episodes to run. Default 8 preserves the
            previously hard-coded value.
        difficulty: Difficulty label passed through to ``run_rollouts``.
            Default "medium" preserves the previously hard-coded value.

    Returns:
        ``{"ece_proxy": value}`` where ``value`` is rounded to 6 decimal
        places; ``1.0`` (worst case) when no rollouts are produced.
    """
    rows = run_rollouts(episodes=episodes, difficulty=difficulty)
    if not rows:
        # No data to evaluate: report the worst possible calibration proxy.
        return {"ece_proxy": 1.0}
    calibration_scores = [
        # `or {}` guards against reward_breakdown being present but None.
        float((row.get("reward_breakdown", {}) or {}).get("uncertainty_calibration_score", 0.0))
        for row in rows
    ]
    # `rows` is non-empty here, so the score list has >= 1 element and the
    # division is safe (the previous max(1, len(...)) guard was redundant).
    mean_calibration = sum(calibration_scores) / len(calibration_scores)
    return {"ece_proxy": round(max(0.0, 1.0 - mean_calibration), 6)}