# echo-ultimate/core/metrics.py
"""
ECHO ULTIMATE - 5 calibration metrics implemented from scratch.
ECE, MCE, Brier Score, Sharpness, Resolution - all with mathematical comments.
"""
import logging
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
from config import cfg
logger = logging.getLogger(__name__)
# ── CalibrationReport ─────────────────────────────────────────────────────────
@dataclass
class CalibrationReport:
"""Complete calibration profile for an agent over N episodes."""
ece: float = 0.0
mce: float = 0.0
brier_score: float = 0.25
sharpness: float = 0.0
resolution: float = 0.0
accuracy: float = 0.0
mean_confidence: float = 50.0
overconfidence_rate: float = 0.0
underconfidence_rate: float = 0.0
abstention_rate: float = 0.0
bin_data: dict = field(default_factory=dict)
n_samples: int = 0
domain: Optional[str] = None
def to_dict(self) -> dict:
return {
"ece": round(self.ece, 4),
"mce": round(self.mce, 4),
"brier_score": round(self.brier_score, 4),
"sharpness": round(self.sharpness, 4),
"resolution": round(self.resolution, 4),
"accuracy": round(self.accuracy, 4),
"mean_confidence": round(self.mean_confidence, 2),
"overconfidence_rate": round(self.overconfidence_rate, 4),
"underconfidence_rate": round(self.underconfidence_rate, 4),
"abstention_rate": round(self.abstention_rate, 4),
"n_samples": self.n_samples,
"domain": self.domain,
}
def summary_str(self) -> str:
return (
f"ECE={self.ece:.3f} | MCE={self.mce:.3f} | Brier={self.brier_score:.3f} | "
f"Acc={self.accuracy:.1%} | MeanConf={self.mean_confidence:.0f}% | "
f"OverconfRate={self.overconfidence_rate:.1%} | n={self.n_samples}"
)
# ── Bin builder ───────────────────────────────────────────────────────────────
def _build_bins(
confidences: list[int],
correctness: list[bool],
n_bins: int,
) -> dict[int, dict]:
"""
    Partition (confidence, outcome) pairs into equal-width bins
    [0,10), [10,20), …, with the top bin closed at 100 so that a
    confidence of exactly 100 is still binned.
    Returns a dict keyed by bin center with accuracy, mean_conf, and count.
"""
bins: dict[int, dict] = {}
step = 100 // n_bins # e.g. 10 for n_bins=10
for bin_lower in range(0, 100, step):
bin_upper = bin_lower + step
center = bin_lower + step // 2
        # The top bin is closed on the right so a confidence of exactly 100
        # is counted rather than silently dropped.
        is_top = bin_upper >= 100
        indices = [
            i for i, c in enumerate(confidences)
            if bin_lower <= c < bin_upper or (is_top and c == 100)
        ]
if not indices:
bins[center] = {"accuracy": 0.0, "mean_conf": center / 100.0, "count": 0}
continue
acc = float(np.mean([correctness[i] for i in indices]))
mc = float(np.mean([confidences[i] for i in indices])) / 100.0
bins[center] = {"accuracy": acc, "mean_conf": mc, "count": len(indices)}
return bins
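# A quick illustration of the bin structure (hypothetical inputs): with
# n_bins=10, confidences=[72, 78, 95] and correctness=[True, False, True],
# the [70,80) bin is keyed by its center 75 and holds
#   {"accuracy": 0.5, "mean_conf": 0.75, "count": 2},
# while bin 95 holds {"accuracy": 1.0, "mean_conf": 0.95, "count": 1}.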
# ── Metric functions ──────────────────────────────────────────────────────────
def ece(
confidences: list[int],
correctness: list[bool],
n_bins: int = cfg.N_CALIBRATION_BINS,
) -> float:
"""
Expected Calibration Error.
    ECE = Σ_{m=1}^{M} (|B_m| / n) * |acc(B_m) - conf(B_m)|
where B_m = samples in bin m, acc = fraction correct, conf = mean confidence.
Lower is better. Perfect calibration = 0.0.
"""
if not confidences:
return 0.0
n = len(confidences)
bins = _build_bins(confidences, correctness, n_bins)
ece_val = 0.0
for b in bins.values():
if b["count"] == 0:
continue
ece_val += (b["count"] / n) * abs(b["accuracy"] - b["mean_conf"])
return float(ece_val)
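# Worked example (made-up numbers): three samples with confidences
# [70, 70, 70] and outcomes [True, True, False] all land in one bin, so
# acc = 2/3, conf = 0.70, and ECE = (3/3) * |0.6667 - 0.70| ≈ 0.033.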
def mce(
confidences: list[int],
correctness: list[bool],
n_bins: int = cfg.N_CALIBRATION_BINS,
) -> float:
"""
Maximum Calibration Error.
MCE = max_m |acc(B_m) - conf(B_m)|
Worst-case calibration error across all non-empty bins.
"""
if not confidences:
return 0.0
bins = _build_bins(confidences, correctness, n_bins)
gaps = [
abs(b["accuracy"] - b["mean_conf"])
for b in bins.values() if b["count"] > 0
]
return float(max(gaps)) if gaps else 0.0
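# Contrast with ECE (made-up numbers): confidences [95, 95, 55] with
# correctness [False, False, True] give bin gaps |0.0 - 0.95| = 0.95 and
# |1.0 - 0.55| = 0.45, so MCE = 0.95 while ECE = (2/3)*0.95 + (1/3)*0.45
# ≈ 0.78; MCE flags the single worst bin rather than the weighted average.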
def brier_score(
confidences: list[int],
correctness: list[bool],
) -> float:
"""
Brier Score.
    BS = (1/n) Σ (p_i - o_i)^2
p_i = confidence_i / 100 (forecast probability)
o_i = 1 if correct, 0 if wrong (outcome)
Range [0, 1]. Lower = better.
    Perfect model = 0. A constant 50% forecast = 0.25 regardless of outcomes.
    Full confidence (100) on every answer while always wrong = 1.0, the worst case.
"""
if not confidences:
return 0.25
scores = [
(c / 100.0 - float(o)) ** 2
for c, o in zip(confidences, correctness)
]
return float(np.mean(scores))
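# Worked example: confidences [90, 60], correctness [True, False] give
# ((0.90 - 1)^2 + (0.60 - 0)^2) / 2 = (0.01 + 0.36) / 2 = 0.185.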
def sharpness(confidences: list[int]) -> float:
"""
Sharpness.
    Sharpness = (1/n) Σ (p_i - mean(p))^2
Variance of predicted probabilities.
Higher sharpness = more decisive predictions.
Can be good (confident correct) or bad (confident wrong).
"""
if not confidences:
return 0.0
probs = [c / 100.0 for c in confidences]
return float(np.var(probs))
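# Example: an agent that always answers 50 has sharpness 0.0; one that
# alternates between 0 and 100 has probs [0.0, 1.0, ...] with variance 0.25,
# the maximum possible for values in [0, 1].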
def resolution(
confidences: list[int],
correctness: list[bool],
n_bins: int = cfg.N_CALIBRATION_BINS,
) -> float:
"""
Resolution.
    Resolution = (1/n) Σ_m |B_m| * (acc(B_m) - overall_acc)^2
    Measures how far per-bin accuracies deviate from the overall accuracy.
Higher resolution = predictions contain more information beyond the base rate.
"""
if not correctness:
return 0.0
n = len(correctness)
overall_acc = float(np.mean(correctness))
bins = _build_bins(confidences, correctness, n_bins)
res = 0.0
for b in bins.values():
if b["count"] == 0:
continue
res += (b["count"] / n) * (b["accuracy"] - overall_acc) ** 2
return float(res)
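# Example: with overall accuracy 0.5 and two equally weighted bins at
# accuracies 0.2 and 0.8, resolution = 0.5*(0.3)^2 + 0.5*(0.3)^2 = 0.09;
# bins that all sit at the base rate score exactly 0.0.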
# ── Combined report ───────────────────────────────────────────────────────────
def compute_report(
confidences: list[int],
correctness: list[bool],
abstentions: Optional[list[bool]] = None,
domain: Optional[str] = None,
n_bins: int = cfg.N_CALIBRATION_BINS,
) -> CalibrationReport:
"""
Compute all 5 calibration metrics plus operational rates in one call.
Args:
confidences: list of int [0, 100]
correctness: list of bool
abstentions: list of bool (True = agent said "I don't know")
domain: optional domain label for reporting
"""
if not confidences:
return CalibrationReport(n_samples=0, domain=domain)
n = len(confidences)
overall_acc = float(np.mean(correctness))
# Overconfidence rate: fraction of WRONG answers with conf >= threshold
wrong_mask = [not c for c in correctness]
wrong_high = sum(
1 for c, w in zip(confidences, wrong_mask)
if w and c >= cfg.OVERCONFIDENCE_THRESHOLD
)
n_wrong = sum(wrong_mask)
overconf_rate = wrong_high / max(n_wrong, 1)
# Underconfidence rate: fraction of CORRECT answers with conf <= threshold
correct_low = sum(
1 for c, ok in zip(confidences, correctness)
if ok and c <= cfg.UNDERCONFIDENCE_THRESHOLD
)
n_correct = sum(correctness)
underconf_rate = correct_low / max(n_correct, 1)
abst_rate = 0.0
if abstentions:
abst_rate = sum(abstentions) / n
bins = _build_bins(confidences, correctness, n_bins)
return CalibrationReport(
ece=ece(confidences, correctness, n_bins),
mce=mce(confidences, correctness, n_bins),
brier_score=brier_score(confidences, correctness),
sharpness=sharpness(confidences),
resolution=resolution(confidences, correctness, n_bins),
accuracy=overall_acc,
mean_confidence=float(np.mean(confidences)),
overconfidence_rate=overconf_rate,
underconfidence_rate=underconf_rate,
abstention_rate=abst_rate,
bin_data=bins,
n_samples=n,
domain=domain,
)
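# ── Demo ──────────────────────────────────────────────────────────────────────
# Minimal smoke test on made-up data. Assumes cfg provides N_CALIBRATION_BINS,
# OVERCONFIDENCE_THRESHOLD, and UNDERCONFIDENCE_THRESHOLD as referenced above;
# this is a sketch, not part of the library surface.
if __name__ == "__main__":
    demo_confs = [95, 85, 70, 90, 40, 60, 80, 55]
    demo_outcomes = [True, True, False, False, False, True, True, True]
    report = compute_report(demo_confs, demo_outcomes, domain="demo")
    print(report.summary_str())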