"""
Calibrator – Transforms raw NLI scores into calibrated probabilities.
Uses isotonic regression models (fitted in Week 0) to produce trustworthy
confidence scores and calibration error estimates.
Falls back gracefully to raw scores if no calibrator file is available.
"""
import logging
import pickle
from pathlib import Path
logger = logging.getLogger(__name__)
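
# A minimal sketch of how a compatible calibrator file could be produced
# offline. This helper is illustrative (its name and signature are not part
# of the original module) and assumes scikit-learn is installed.
def fit_and_save_calibrators(
    raw_probs: dict[str, list[float]],
    is_correct: dict[str, list[int]],
    out_path: str | Path,
) -> None:
    """Fit one IsotonicRegression per label and pickle the {label: model} dict."""
    from sklearn.isotonic import IsotonicRegression  # local import: optional dependency

    calibrators = {}
    for label, probs in raw_probs.items():
        iso = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
        # x = raw model probability, y = 1 if the prediction was correct, else 0
        iso.fit(probs, is_correct[label])
        calibrators[label] = iso
    with open(out_path, "wb") as f:
        pickle.dump(calibrators, f)
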
class Calibrator:
"""Apply isotonic regression calibration to raw NLI probabilities."""
def __init__(self, calibrator_path: str | Path | None = None):
"""
Load a pre-fitted calibrator from a pickle file.
Args:
calibrator_path: Path to the pickle file containing a dict of
{label_name: IsotonicRegression} objects.
If None or file doesn't exist, falls back to raw scores.
"""
self._calibrators: dict | None = None
if calibrator_path is not None:
path = Path(calibrator_path)
if path.exists():
                try:
                    with open(path, "rb") as f:
                        self._calibrators = pickle.load(f)
                    logger.info(
                        "Loaded calibrator from %s (labels: %s)",
                        path, list(self._calibrators.keys()),
                    )
                except Exception:  # unreadable/corrupt file, missing sklearn, etc.
                    self._calibrators = None
                    logger.exception(
                        "Failed to load calibrator from %s. Using raw scores.", path
                    )
else:
logger.warning("Calibrator file not found: %s. Using raw scores.", path)
@property
def is_calibrated(self) -> bool:
"""Whether a calibrator is loaded."""
return self._calibrators is not None
def calibrate(self, raw_scores: dict[str, float]) -> tuple[float, float]:
"""
Calibrate raw NLI probabilities.
Args:
raw_scores: Dict mapping label names to raw probabilities
(e.g. {"entailment": 0.95, "neutral": 0.03, "contradiction": 0.02}).
Returns:
(calibrated_confidence, calibration_error)
- calibrated_confidence: The calibrated probability for the predicted label.
- calibration_error: Absolute difference between raw and calibrated confidence.
"""
if not raw_scores:
return 0.0, 0.0
# Find the predicted label (highest raw score)
predicted_label = max(raw_scores, key=raw_scores.get)
raw_confidence = raw_scores[predicted_label]
if not self.is_calibrated:
# Fallback: return raw confidence with an estimated error
            return round(raw_confidence, 4), self._estimate_error(raw_confidence)
# Apply isotonic regression for each label
calibrated_scores = {}
for label, raw_prob in raw_scores.items():
if label in self._calibrators:
                # predict() takes a 1-D array of samples (portable across sklearn versions)
                cal_prob = float(self._calibrators[label].predict([raw_prob])[0])
calibrated_scores[label] = max(0.0, min(1.0, cal_prob))
else:
calibrated_scores[label] = raw_prob
calibrated_confidence = calibrated_scores.get(predicted_label, raw_confidence)
calibration_error = abs(raw_confidence - calibrated_confidence)
return round(calibrated_confidence, 4), round(calibration_error, 4)
@staticmethod
def _estimate_error(raw_confidence: float) -> float:
"""Rough error estimate when no calibrator is available."""
# Higher confidence → lower estimated error, but never zero
return round(max(0.02, (1.0 - raw_confidence) * 0.3), 4)
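

if __name__ == "__main__":
    # Minimal usage sketch. The scores below are illustrative NLI outputs,
    # not real model predictions, and "calibrator.pkl" is a hypothetical path:
    # if it does not exist, the raw-score fallback path is exercised.
    logging.basicConfig(level=logging.INFO)
    cal = Calibrator("calibrator.pkl")
    scores = {"entailment": 0.95, "neutral": 0.03, "contradiction": 0.02}
    confidence, error = cal.calibrate(scores)
    # Uncalibrated fallback: confidence == 0.95, error == max(0.02, 0.05 * 0.3) == 0.02
    print(f"is_calibrated={cal.is_calibrated} confidence={confidence} error={error}")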