Spaces:

SouravNath
/

repomind-api

Running

App Files Files Community

repomind-api / tests /test_phase6_uncertainty.py

SouravNath

Initial commit

dc71cad 4 days ago

raw

history blame contribute delete

18.6 kB

	"""
	tests/test_phase6_uncertainty.py
	──────────────────────────────────
	Unit tests for Phase 6: Conformal Prediction + Temperature Scaling.

	Tests verify:
	- Coverage guarantee property (marginal coverage >= 1-alpha)
	- Prediction set size properties (non-emptiness, monotonicity w.r.t. alpha)
	- Temperature scaling NLL reduction and ECE improvement
	- CalibrationStore persistence (save/load)
	- RAPS prediction set properties
	- UncertaintyReport output format
	- Pipeline integration with mock localisation

	Run with: pytest tests/test_phase6_uncertainty.py -v
	"""
	from __future__ import annotations

	import json
	import math
	import tempfile
	from pathlib import Path

	import numpy as np
	import pytest


	# ── CalibrationStore ──────────────────────────────────────────────────────────

	class TestCalibrationStore:
	    """Checks for ``CalibrationStore``: adding scores, persistence, quantiles, stats.

	    Each test backs its store with a ``cal.json`` file under pytest's
	    per-test ``tmp_path`` directory.
	    """

	    def test_add_and_scores(self, tmp_path):
	        """Added scores are expected to surface in ``scores`` as ``1 - score``."""
	        from uncertainty.conformal_predictor import CalibrationStore

	        store = CalibrationStore(tmp_path / "cal.json")
	        store.add(0.8, "inst-1", "django/django")
	        store.add(0.3, "inst-2", "django/django")

	        assert store.n == 2
	        # Expected nonconformity values: 1 - 0.8 and 1 - 0.3.
	        assert abs(store.scores[0] - 0.2) < 1e-6
	        assert abs(store.scores[1] - 0.7) < 1e-6

	    def test_save_and_load(self, tmp_path):
	        """A store re-opened on the same path sees the saved entries."""
	        from uncertainty.conformal_predictor import CalibrationStore

	        store_path = tmp_path / "cal.json"
	        written = CalibrationStore(store_path)
	        for idx in range(10):
	            written.add(idx / 10.0, f"inst-{idx}")
	        written.save()

	        # A second instance on the same file should pick up all ten points.
	        reloaded = CalibrationStore(store_path)
	        assert reloaded.n == 10
	        mean_gap = abs(reloaded.scores.mean() - written.scores.mean())
	        assert mean_gap < 1e-6

	    def test_quantile_increases_with_alpha(self, tmp_path):
	        """The quantile at alpha=0.20 must not exceed the one at alpha=0.10.

	        Despite the test name, the assertion checks that the threshold is
	        non-increasing as alpha grows.
	        """
	        from uncertainty.conformal_predictor import CalibrationStore

	        store = CalibrationStore(tmp_path / "cal.json")
	        for value in np.linspace(0, 1, 50).tolist():
	            store.add(value)

	        # Presumably the (1 - alpha) quantile of the stored nonconformity scores.
	        threshold_alpha_10 = store.quantile(0.10)
	        threshold_alpha_20 = store.quantile(0.20)
	        assert threshold_alpha_20 <= threshold_alpha_10

	    def test_empty_store_quantile(self, tmp_path):
	        """With no calibration data the quantile is expected to be exactly 1.0."""
	        from uncertainty.conformal_predictor import CalibrationStore

	        empty_store = CalibrationStore(tmp_path / "cal.json")
	        assert empty_store.quantile(0.10) == 1.0

	    def test_stats_structure(self, tmp_path):
	        """``stats()`` exposes the keys ``n``, ``mean_nonconformity`` and ``q50``."""
	        from uncertainty.conformal_predictor import CalibrationStore

	        store = CalibrationStore(tmp_path / "cal.json")
	        for value in np.linspace(0.5, 1.0, 20).tolist():
	            store.add(value)

	        summary = store.stats()
	        for expected_key in ("n", "mean_nonconformity", "q50"):
	            assert expected_key in summary

	    def test_add_batch(self, tmp_path):
	        """``add_batch`` with three tuples yields a store with ``n == 3``."""
	        from uncertainty.conformal_predictor import CalibrationStore

	        store = CalibrationStore(tmp_path / "cal.json")
	        entries = [(0.7, "a", "repo"), (0.5, "b", "repo"), (0.9, "c", "repo")]
	        store.add_batch(entries)
	        assert store.n == 3


	# ── ConformalPredictor ────────────────────────────────────────────────────────

	class TestConformalPredictor:
	    """Checks for ``ConformalPredictor`` output types, coverage and set sizes."""

	    def _make_predictor(self, tmp_path, n_cal=100, alpha=0.10):
	        """Return a predictor calibrated on ``n_cal`` seeded Beta(2, 5) scores.

	        The global NumPy RNG is reseeded with 42 on every call, so repeated
	        calls with the same ``n_cal`` feed identical scores to the store.
	        """
	        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

	        store = CalibrationStore(tmp_path / "cal.json")
	        np.random.seed(42)
	        # Beta(2, 5) is skewed toward low values.
	        for drawn in np.random.beta(2, 5, n_cal).tolist():
	            store.add(drawn)
	        return ConformalPredictor(store, alpha=alpha)

	    def test_prediction_returns_correct_types(self, tmp_path):
	        """``predict`` returns a ``LocalisationWithUncertainty`` with one hit per file."""
	        from uncertainty.conformal_predictor import LocalisationWithUncertainty

	        predictor = self._make_predictor(tmp_path)
	        outcome = predictor.predict(["a.py", "b.py", "c.py"], [0.8, 0.5, 0.2])

	        assert isinstance(outcome, LocalisationWithUncertainty)
	        assert len(outcome.hits) == 3

	    def test_coverage_guarantee_satisfied(self, tmp_path):
	        """
	        Core guarantee test: empirical coverage on a synthetic test set.

	        Calibration and test gold scores are both drawn from Beta(3, 2).
	        The assertion allows 0.08 of slack below ``1 - alpha`` to absorb
	        sampling noise over the 200 test instances.
	        """
	        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

	        np.random.seed(123)
	        alpha = 0.10

	        # Large calibration set for a stable quantile.
	        store = CalibrationStore(tmp_path / "cal.json")
	        n_cal = 500
	        for gold_cal_score in np.random.beta(3, 2, n_cal).tolist():
	            store.add(gold_cal_score)

	        predictor = ConformalPredictor(store, alpha=alpha)

	        n_test = 200
	        hits = 0
	        for _ in range(n_test):
	            # One gold file plus nine distractors drawn from Beta(1, 3).
	            gold_score = float(np.random.beta(3, 2))
	            distractors = list(np.random.beta(1, 3, 9))
	            ranked_scores = sorted([gold_score] + distractors, reverse=True)
	            ranked_files = [f"file_{i}.py" for i in range(10)]

	            # The gold file takes the name at the position where its score lands.
	            gold_file = ranked_files[ranked_scores.index(gold_score)]

	            outcome = predictor.predict(ranked_files, ranked_scores)
	            hits += int(gold_file in outcome.prediction_set_files)

	        empirical_coverage = hits / n_test
	        assert empirical_coverage >= (1 - alpha - 0.08), (
	            f"Coverage {empirical_coverage:.3f} < guarantee {1-alpha:.2f}"
	        )

	    def test_prediction_set_includes_high_score_file(self, tmp_path):
	        """A file scored 0.99 must appear in the prediction set."""
	        predictor = self._make_predictor(tmp_path)
	        outcome = predictor.predict(
	            ["best.py", "ok.py", "bad.py"],
	            [0.99, 0.3, 0.01],
	        )
	        assert "best.py" in outcome.prediction_set_files

	    def test_confidence_in_0_1_range(self, tmp_path):
	        """Every hit's ``confidence`` and ``p_value`` lie in [0, 1]."""
	        predictor = self._make_predictor(tmp_path)
	        outcome = predictor.predict(["a.py", "b.py"], [0.7, 0.4])
	        for entry in outcome.hits:
	            assert 0.0 <= entry.confidence <= 1.0
	            assert 0.0 <= entry.p_value <= 1.0

	    def test_ranks_sequential(self, tmp_path):
	        """Hits carry ranks 1, 2, 3 in order."""
	        predictor = self._make_predictor(tmp_path)
	        outcome = predictor.predict(["a.py", "b.py", "c.py"], [0.8, 0.5, 0.2])
	        observed_ranks = [entry.rank for entry in outcome.hits]
	        assert observed_ranks == [1, 2, 3]

	    def test_no_calibration_data_maximum_uncertainty(self, tmp_path):
	        """With an empty store the single hit's p-value is expected to be 1.0."""
	        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

	        empty_store = CalibrationStore(tmp_path / "cal.json")
	        predictor = ConformalPredictor(empty_store, alpha=0.10)
	        outcome = predictor.predict(["a.py"], [0.9])
	        # Only the p-value is checked here; presumably the smoothed p-value with n=0.
	        assert outcome.hits[0].p_value == 1.0

	    def test_tighter_alpha_gives_larger_set(self, tmp_path):
	        """Lower alpha (e.g. 0.05) should produce a set at least as large as 0.20."""
	        strict = self._make_predictor(tmp_path, alpha=0.05)
	        lenient = self._make_predictor(tmp_path, alpha=0.20)

	        candidates = [f"f{i}.py" for i in range(10)]
	        candidate_scores = list(np.linspace(0.9, 0.1, 10))

	        strict_outcome = strict.predict(candidates, candidate_scores)
	        lenient_outcome = lenient.predict(candidates, candidate_scores)

	        # Stricter coverage requirement → prediction set no smaller.
	        assert strict_outcome.prediction_set_size >= lenient_outcome.prediction_set_size

	    def test_uncertainty_labels(self, tmp_path):
	        """``uncertainty_label`` is one of the four known label strings."""
	        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

	        store = CalibrationStore(tmp_path / "cal.json")
	        # 100 identical calibration points at score 0.99. The store appears to
	        # keep 1 - score (see TestCalibrationStore.test_add_and_scores), which
	        # would make these low-nonconformity entries — TODO confirm.
	        for _ in range(100):
	            store.add(0.99)

	        predictor = ConformalPredictor(store, alpha=0.10)
	        outcome = predictor.predict(["only.py"], [0.99])
	        allowed_labels = ("confident", "moderate", "uncertain", "very_uncertain")
	        assert outcome.uncertainty_label in allowed_labels

	    def test_evaluate_coverage_api(self, tmp_path):
	        """``evaluate_coverage`` reports coverage in [0, 1] and an average set size."""
	        predictor = self._make_predictor(tmp_path, n_cal=200)
	        # Each instance is (files, scores, gold_file).
	        instances = [
	            (["a.py", "b.py", "c.py"], [0.8, 0.5, 0.2], "a.py"),
	            (["x.py", "y.py"], [0.9, 0.1], "x.py"),
	        ]
	        report = predictor.evaluate_coverage(instances)

	        assert "empirical_coverage" in report
	        assert "avg_set_size" in report
	        assert 0 <= report["empirical_coverage"] <= 1

	    def test_file_confidence_property(self, tmp_path):
	        """``confidence_pct`` renders a confidence of 0.85 with the text ``85.0%``."""
	        from uncertainty.conformal_predictor import FileConfidence

	        field_values = dict(
	            file_path="test.py",
	            rrf_score=0.75,
	            p_value=0.15,
	            in_prediction_set=True,
	            confidence=0.85,
	            rank=1,
	        )
	        record = FileConfidence(**field_values)
	        assert "85.0%" in record.confidence_pct


	# ── Temperature Scaling ───────────────────────────────────────────────────────

	class TestTemperatureScaler:
	    """Checks for ``TemperatureScaler`` scaling, fitting and persistence."""

	    def _make_overconfident_data(self, n=200, seed=42):
	        """Build seeded two-class logits with a large margin for the labelled class.

	        Returns ``(logits, labels)``: ``logits`` has shape ``(n, 2)`` and
	        ``labels`` holds ``n`` integers in {0, 1}. The labelled class draws
	        its logit from U(3, 6); the other class draws from U(-2, 0).
	        """
	        np.random.seed(seed)
	        labels = np.random.randint(0, 2, n)

	        # Draw order matters for reproducibility: high/low for column 0,
	        # then high/low for column 1.
	        high_for_col0 = np.random.uniform(3, 6, n)
	        low_for_col0 = np.random.uniform(-2, 0, n)
	        high_for_col1 = np.random.uniform(3, 6, n)
	        low_for_col1 = np.random.uniform(-2, 0, n)

	        column0 = np.where(labels == 0, high_for_col0, low_for_col0)
	        column1 = np.where(labels == 1, high_for_col1, low_for_col1)
	        logits = np.column_stack([column0, column1])
	        return logits, labels

	    def test_scale_output_sums_to_one(self):
	        """Each row of scaled probabilities sums to 1 (within 1e-6)."""
	        from uncertainty.temperature_scaling import TemperatureScaler

	        scaler = TemperatureScaler(T=1.5)
	        raw = np.array([[2.0, -1.0], [0.5, 0.8], [-3.0, 5.0]])
	        row_totals = scaler.scale(raw).sum(axis=1)
	        np.testing.assert_allclose(row_totals, 1.0, atol=1e-6)

	    def test_scale_output_in_0_1(self):
	        """Scaled probabilities stay inside [0, 1] for random logits."""
	        from uncertainty.temperature_scaling import TemperatureScaler

	        scaler = TemperatureScaler(T=2.0)
	        raw = np.random.randn(50, 2)
	        scaled = scaler.scale(raw)
	        assert scaled.min() >= 0
	        assert scaled.max() <= 1

	    def test_T_greater_than_1_softens(self):
	        """For a confident logit pair, T=3 gives a lower top probability than T=1."""
	        from uncertainty.temperature_scaling import TemperatureScaler

	        confident_logits = np.array([[5.0, -5.0]])
	        sharp = TemperatureScaler(T=1.0)
	        soft = TemperatureScaler(T=3.0)
	        sharp_prob = sharp.scale(confident_logits)[0, 0]
	        soft_prob = soft.scale(confident_logits)[0, 0]
	        # The T=1 probability should sit nearer 1.0, the T=3 one nearer 0.5.
	        assert sharp_prob > soft_prob

	    def test_fit_reduces_nll(self, tmp_path):
	        """``fit`` must not report a higher NLL after fitting than before."""
	        from uncertainty.temperature_scaling import TemperatureScaler

	        logits, labels = self._make_overconfident_data()
	        scaler = TemperatureScaler(T=1.0)
	        fit_report = scaler.fit(logits, labels)
	        assert fit_report["nll_after"] <= fit_report["nll_before"]

	    def test_fit_T_greater_than_1_for_overconfident(self, tmp_path):
	        """After fitting, ``T`` stays above 0.5.

	        The assertion is weaker than the test name suggests: it only
	        checks that the fitted temperature remains positive and reasonable.
	        """
	        from uncertainty.temperature_scaling import TemperatureScaler

	        logits, labels = self._make_overconfident_data()
	        scaler = TemperatureScaler(T=1.0)
	        scaler.fit(logits, labels)
	        assert scaler.T > 0.5

	    def test_save_and_load(self, tmp_path):
	        """``save`` then ``load`` round-trips ``T`` and the ``_fitted`` flag."""
	        from uncertainty.temperature_scaling import TemperatureScaler

	        target = tmp_path / "ts.json"
	        scaler = TemperatureScaler(T=2.345)
	        # Mark as fitted by hand so the flag's persistence can be checked.
	        scaler._fitted = True
	        scaler.save(target)

	        restored = TemperatureScaler.load(target)
	        assert abs(restored.T - 2.345) < 1e-6
	        assert restored._fitted is True

	    def test_scale_score_single_value(self):
	        """``scale_score`` on a single value returns a number strictly in (0, 1)."""
	        from uncertainty.temperature_scaling import TemperatureScaler

	        scaler = TemperatureScaler(T=1.0)
	        single_prob = scaler.scale_score(2.0)
	        assert 0 < single_prob < 1

	    def test_reliability_diagram_data(self):
	        """Each returned bin carries ``confidence``, ``accuracy`` and ``count``."""
	        from uncertainty.temperature_scaling import reliability_diagram_data

	        np.random.seed(42)
	        probs = np.random.uniform(0, 1, 100)
	        # Noisy labels: positive when the probability plus noise exceeds 0.5.
	        noise = np.random.randn(100) * 0.2
	        labels = (probs + noise > 0.5).astype(int)

	        bins = reliability_diagram_data(probs, labels, n_bins=5)
	        assert len(bins) > 0
	        for bucket in bins:
	            assert "confidence" in bucket
	            assert "accuracy" in bucket
	            assert "count" in bucket


	# ── RAPS ─────────────────────────────────────────────────────────────────────

	class TestRAPS:
	    """Checks for ``raps_predict`` prediction sets."""

	    def test_raps_returns_nonempty(self, tmp_path):
	        """With a populated store the returned set has at least one entry."""
	        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

	        store = CalibrationStore(tmp_path / "cal.json")
	        for value in np.linspace(0, 1, 50).tolist():
	            store.add(value)

	        candidates = ["a.py", "b.py", "c.py"]
	        candidate_scores = np.array([0.6, 0.3, 0.1])
	        selected = raps_predict(candidates, candidate_scores, store, alpha=0.10)
	        assert len(selected) >= 1

	    def test_raps_top1_always_included(self, tmp_path):
	        """With an empty store the top-scoring file is still returned."""
	        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

	        # No calibration points are added; presumably triggers a top-k fallback.
	        empty_store = CalibrationStore(tmp_path / "cal.json")
	        candidates = ["best.py", "ok.py"]
	        candidate_scores = np.array([0.9, 0.1])

	        selected = raps_predict(candidates, candidate_scores, empty_store, alpha=0.10)
	        # The first element of each result entry is the file path.
	        selected_paths = [entry[0] for entry in selected]
	        assert "best.py" in selected_paths

	    def test_raps_scores_positive(self, tmp_path):
	        """Every (path, score) pair returned carries a strictly positive score."""
	        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

	        store = CalibrationStore(tmp_path / "cal.json")
	        for value in np.linspace(0.1, 0.9, 30).tolist():
	            store.add(value)

	        candidates = [f"f{i}.py" for i in range(5)]
	        candidate_scores = np.array([0.5, 0.2, 0.15, 0.1, 0.05])
	        selected = raps_predict(candidates, candidate_scores, store)
	        assert all(score > 0 for _, score in selected)


	# ── UncertaintyAwarePipeline ──────────────────────────────────────────────────

	class TestUncertaintyAwarePipeline:
	    """Checks for ``UncertaintyAwarePipeline`` using a mocked localisation pipeline."""

	    def _mock_localisation_pipeline(self, files, scores):
	        """Return a ``MagicMock`` whose ``localise`` yields hits for ``files``/``scores``.

	        Hits are ranked 1..N in the order given. ``index_repo`` is stubbed
	        to return a small dict.
	        """
	        from unittest.mock import MagicMock
	        from localisation.pipeline import LocalisationResult, LocalisationHit

	        stub = MagicMock()
	        ranked_hits = [
	            LocalisationHit(file_path=path, relevance_score=score, rank=position)
	            for position, (path, score) in enumerate(zip(files, scores), start=1)
	        ]
	        stub.localise.return_value = LocalisationResult(
	            hits=ranked_hits,
	            elapsed_seconds=0.1,
	        )
	        stub.index_repo.return_value = {"elapsed": 0.1}
	        return stub

	    def _build_pipeline(self, tmp_path, files, scores, **extra):
	        """Wrap a mocked localisation pipeline in an ``UncertaintyAwarePipeline``.

	        The calibration store lives at ``tmp_path / "cal.json"``; ``extra``
	        is forwarded to the constructor as additional keyword arguments.
	        """
	        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

	        stub = self._mock_localisation_pipeline(files, scores)
	        return UncertaintyAwarePipeline(
	            localisation_pipeline=stub,
	            calibration_store_path=tmp_path / "cal.json",
	            **extra,
	        )

	    def test_localise_with_uncertainty_returns_result(self, tmp_path):
	        """Three mocked hits give three files and a non-empty prediction set."""
	        pipeline = self._build_pipeline(
	            tmp_path,
	            ["models.py", "views.py", "utils.py"],
	            [0.8, 0.5, 0.2],
	        )
	        outcome = pipeline.localise_with_uncertainty("fix the bug", top_k=3)
	        assert len(outcome.files) == 3
	        assert len(outcome.prediction_set) >= 1

	    def test_prediction_set_never_empty(self, tmp_path):
	        """Even with a single candidate the prediction set is non-empty."""
	        pipeline = self._build_pipeline(tmp_path, ["only.py"], [0.9])
	        outcome = pipeline.localise_with_uncertainty("some issue")
	        assert len(outcome.prediction_set) >= 1

	    def test_token_savings_computed(self, tmp_path):
	        """The naive budget is files x tokens_per_file; the used budget never exceeds it."""
	        candidates = [f"f{i}.py" for i in range(10)]
	        candidate_scores = list(np.linspace(0.9, 0.1, 10))
	        pipeline = self._build_pipeline(
	            tmp_path,
	            candidates,
	            candidate_scores,
	            tokens_per_file=1500,
	        )

	        outcome = pipeline.localise_with_uncertainty("issue", top_k=10)
	        assert outcome.token_budget_naive == 10 * 1500
	        assert outcome.token_budget_used <= outcome.token_budget_naive

	    def test_uncertainty_report_to_dict(self, tmp_path):
	        """``to_dict`` keeps the label and renders percentages as text."""
	        from uncertainty.uncertainty_pipeline import UncertaintyReport

	        report_fields = dict(
	            uncertainty_label="confident",
	            prediction_set_size=2,
	            coverage_guarantee=0.90,
	            top_file_confidence=0.87,
	            avg_confidence=0.65,
	            estimated_token_savings=0.60,
	            calibration_n=150,
	        )
	        as_dict = UncertaintyReport(**report_fields).to_dict()

	        assert as_dict["uncertainty_label"] == "confident"
	        assert "90%" in as_dict["coverage_guarantee"]
	        assert "87.0%" in as_dict["top_file_confidence"]

	    def test_record_calibration_point(self, tmp_path):
	        """Recording one gold instance leaves the calibration store with ``n == 1``."""
	        pipeline = self._build_pipeline(tmp_path, ["a.py"], [0.8])
	        pipeline.record_calibration_point(
	            rrf_scores={"a.py": 0.8, "b.py": 0.3},
	            gold_files=["a.py"],
	            instance_id="test-1",
	        )
	        assert pipeline.cal_store.n == 1

	    def test_calibration_stats(self, tmp_path):
	        """``calibration_stats`` returns a mapping that contains the key ``n``."""
	        pipeline = self._build_pipeline(tmp_path, ["a.py"], [0.8])
	        summary = pipeline.calibration_stats()
	        assert "n" in summary