| """ | |
| ECHO ULTIMATE β 5 calibration metrics implemented from scratch. | |
| ECE, MCE, Brier Score, Sharpness, Resolution β all with mathematical comments. | |
| """ | |
| import logging | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| import numpy as np | |
| from config import cfg | |
| logger = logging.getLogger(__name__) | |
# ── CalibrationReport ─────────────────────────────────────────────────────────
@dataclass
class CalibrationReport:
    """Complete calibration profile for an agent over N episodes."""

    ece: float = 0.0
    mce: float = 0.0
    brier_score: float = 0.25
    sharpness: float = 0.0
    resolution: float = 0.0
    accuracy: float = 0.0
    mean_confidence: float = 50.0
    overconfidence_rate: float = 0.0
    underconfidence_rate: float = 0.0
    abstention_rate: float = 0.0
    bin_data: dict = field(default_factory=dict)
    n_samples: int = 0
    domain: Optional[str] = None
    def to_dict(self) -> dict:
        return {
            "ece": round(self.ece, 4),
            "mce": round(self.mce, 4),
            "brier_score": round(self.brier_score, 4),
            "sharpness": round(self.sharpness, 4),
            "resolution": round(self.resolution, 4),
            "accuracy": round(self.accuracy, 4),
            "mean_confidence": round(self.mean_confidence, 2),
            "overconfidence_rate": round(self.overconfidence_rate, 4),
            "underconfidence_rate": round(self.underconfidence_rate, 4),
            "abstention_rate": round(self.abstention_rate, 4),
            "n_samples": self.n_samples,
            "domain": self.domain,
        }
    def summary_str(self) -> str:
        return (
            f"ECE={self.ece:.3f} | MCE={self.mce:.3f} | Brier={self.brier_score:.3f} | "
            f"Acc={self.accuracy:.1%} | MeanConf={self.mean_confidence:.0f}% | "
            f"OverconfRate={self.overconfidence_rate:.1%} | n={self.n_samples}"
        )
# ── Bin builder ───────────────────────────────────────────────────────────────
def _build_bins(
    confidences: list[int],
    correctness: list[bool],
    n_bins: int,
) -> dict[int, dict]:
    """
    Partition (confidence, outcome) pairs into equal-width bins [0,10), [10,20), …
    The last bin is closed on the right so a confidence of exactly 100 is counted.
    Returns dict keyed by bin center with accuracy, mean_conf, and count.
    """
    bins: dict[int, dict] = {}
    step = 100 // n_bins  # e.g. 10 for n_bins=10 (assumes n_bins divides 100)
    for bin_lower in range(0, 100, step):
        bin_upper = bin_lower + step
        center = bin_lower + step // 2
        indices = [
            i for i, c in enumerate(confidences)
            if bin_lower <= c < bin_upper or (c == 100 and bin_upper == 100)
        ]
        if not indices:
            bins[center] = {"accuracy": 0.0, "mean_conf": center / 100.0, "count": 0}
            continue
        acc = float(np.mean([correctness[i] for i in indices]))
        mc = float(np.mean([confidences[i] for i in indices])) / 100.0
        bins[center] = {"accuracy": acc, "mean_conf": mc, "count": len(indices)}
    return bins
# ── Metric functions ──────────────────────────────────────────────────────────
def ece(
    confidences: list[int],
    correctness: list[bool],
    n_bins: int = cfg.N_CALIBRATION_BINS,
) -> float:
    """
    Expected Calibration Error.
    ECE = Σ_{m=1}^{M} (|B_m| / n) * |acc(B_m) - conf(B_m)|
    where B_m = samples in bin m, acc = fraction correct, conf = mean confidence.
    Lower is better. Perfect calibration = 0.0.
    """
    if not confidences:
        return 0.0
    n = len(confidences)
    bins = _build_bins(confidences, correctness, n_bins)
    ece_val = 0.0
    for b in bins.values():
        if b["count"] == 0:
            continue
        ece_val += (b["count"] / n) * abs(b["accuracy"] - b["mean_conf"])
    return float(ece_val)
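# Worked example (illustrative): with n_bins=10, confidences [90, 90, 10, 10]
# and correctness [True, False, True, False] put two samples in bin [90, 100)
# (acc 0.5, mean_conf 0.9) and two in bin [10, 20) (acc 0.5, mean_conf 0.1),
# so ECE = 0.5 * |0.5 - 0.9| + 0.5 * |0.5 - 0.1| = 0.4.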
def mce(
    confidences: list[int],
    correctness: list[bool],
    n_bins: int = cfg.N_CALIBRATION_BINS,
) -> float:
    """
    Maximum Calibration Error.
    MCE = max_m |acc(B_m) - conf(B_m)|
    Worst-case calibration error across all non-empty bins.
    """
    if not confidences:
        return 0.0
    bins = _build_bins(confidences, correctness, n_bins)
    gaps = [
        abs(b["accuracy"] - b["mean_conf"])
        for b in bins.values() if b["count"] > 0
    ]
    return float(max(gaps)) if gaps else 0.0
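# Worked example (illustrative): for confidences [95, 55] and correctness
# [False, True] with n_bins=10, the per-bin gaps are |0.0 - 0.95| = 0.95 and
# |1.0 - 0.55| = 0.45, so MCE = 0.95 while ECE averages them to 0.70.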
def brier_score(
    confidences: list[int],
    correctness: list[bool],
) -> float:
    """
    Brier Score.
    BS = (1/n) Σ (p_i - o_i)^2
    p_i = confidence_i / 100 (forecast probability)
    o_i = 1 if correct, 0 if wrong (outcome)
    Range [0, 1]. Lower = better.
    Perfect model = 0. Random (50%) = 0.25.
    Full confidence (p_i = 1.0) on every wrong answer = 1.0.
    """
    if not confidences:
        return 0.25
    scores = [
        (c / 100.0 - float(o)) ** 2
        for c, o in zip(confidences, correctness)
    ]
    return float(np.mean(scores))
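# Worked example (illustrative): brier_score([90, 60], [True, False])
# = ((0.9 - 1)^2 + (0.6 - 0)^2) / 2 = (0.01 + 0.36) / 2 = 0.185.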
def sharpness(confidences: list[int]) -> float:
    """
    Sharpness.
    Sharpness = (1/n) Σ (p_i - mean(p))^2
    Variance of predicted probabilities.
    Higher sharpness = more decisive predictions.
    Can be good (confident correct) or bad (confident wrong).
    """
    if not confidences:
        return 0.0
    probs = [c / 100.0 for c in confidences]
    return float(np.var(probs))
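# Worked example (illustrative): sharpness([90, 60]) is the population
# variance of [0.9, 0.6], i.e. ((0.15)^2 + (0.15)^2) / 2 = 0.0225.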
def resolution(
    confidences: list[int],
    correctness: list[bool],
    n_bins: int = cfg.N_CALIBRATION_BINS,
) -> float:
    """
    Resolution.
    Resolution = (1/n) Σ_m |B_m| * (acc(B_m) - overall_acc)^2
    Measures how much the binned confidence predictions differ from overall accuracy.
    Higher resolution = predictions contain more information beyond the base rate.
    """
    if not correctness:
        return 0.0
    n = len(correctness)
    overall_acc = float(np.mean(correctness))
    bins = _build_bins(confidences, correctness, n_bins)
    res = 0.0
    for b in bins.values():
        if b["count"] == 0:
            continue
        res += (b["count"] / n) * (b["accuracy"] - overall_acc) ** 2
    return float(res)
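# Worked example (illustrative): with n_bins=10, confidences [90, 90, 10, 10]
# and correctness [True, True, False, False], the two occupied bins have
# accuracies 1.0 and 0.0 against an overall accuracy of 0.5, so
# resolution = 0.5 * (1.0 - 0.5)^2 + 0.5 * (0.0 - 0.5)^2 = 0.25.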
# ── Combined report ───────────────────────────────────────────────────────────
def compute_report(
    confidences: list[int],
    correctness: list[bool],
    abstentions: Optional[list[bool]] = None,
    domain: Optional[str] = None,
    n_bins: int = cfg.N_CALIBRATION_BINS,
) -> CalibrationReport:
    """
    Compute all 5 calibration metrics plus operational rates in one call.

    Args:
        confidences: list of int in [0, 100], parallel to correctness
        correctness: list of bool
        abstentions: list of bool (True = agent said "I don't know")
        domain: optional domain label for reporting
        n_bins: number of equal-width confidence bins
    """
    if not confidences:
        return CalibrationReport(n_samples=0, domain=domain)
    n = len(confidences)
    overall_acc = float(np.mean(correctness))
    # Overconfidence rate: fraction of WRONG answers with conf >= threshold
    wrong_mask = [not c for c in correctness]
    wrong_high = sum(
        1 for c, w in zip(confidences, wrong_mask)
        if w and c >= cfg.OVERCONFIDENCE_THRESHOLD
    )
    n_wrong = sum(wrong_mask)
    overconf_rate = wrong_high / max(n_wrong, 1)
    # Underconfidence rate: fraction of CORRECT answers with conf <= threshold
    correct_low = sum(
        1 for c, ok in zip(confidences, correctness)
        if ok and c <= cfg.UNDERCONFIDENCE_THRESHOLD
    )
    n_correct = sum(correctness)
    underconf_rate = correct_low / max(n_correct, 1)
    abst_rate = 0.0
    if abstentions:
        abst_rate = sum(abstentions) / n
    bins = _build_bins(confidences, correctness, n_bins)
    return CalibrationReport(
        ece=ece(confidences, correctness, n_bins),
        mce=mce(confidences, correctness, n_bins),
        brier_score=brier_score(confidences, correctness),
        sharpness=sharpness(confidences),
        resolution=resolution(confidences, correctness, n_bins),
        accuracy=overall_acc,
        mean_confidence=float(np.mean(confidences)),
        overconfidence_rate=overconf_rate,
        underconfidence_rate=underconf_rate,
        abstention_rate=abst_rate,
        bin_data=bins,
        n_samples=n,
        domain=domain,
    )
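# ── Demo ──────────────────────────────────────────────────────────────────────
# Minimal usage sketch (illustrative; the data values are made up). Running it
# assumes config.cfg defines N_CALIBRATION_BINS, OVERCONFIDENCE_THRESHOLD, and
# UNDERCONFIDENCE_THRESHOLD, as the functions above already require.
if __name__ == "__main__":
    demo_confidences = [95, 80, 60, 40, 90, 70, 55, 100]
    demo_correctness = [True, True, False, False, True, False, True, True]
    demo_abstentions = [False, False, False, True, False, False, False, False]
    report = compute_report(
        demo_confidences,
        demo_correctness,
        abstentions=demo_abstentions,
        domain="demo",
    )
    print(report.summary_str())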