# NOTE: page-header residue from the source listing ("Spaces: Running"); not part of the module.
"""
tests/test_phase6_uncertainty.py
────────────────────────────────
Unit tests for Phase 6: Conformal Prediction + Temperature Scaling.

Tests verify:
  - Coverage guarantee property (marginal coverage >= 1-alpha)
  - Prediction set size properties (non-emptiness, monotonicity w.r.t. alpha)
  - Temperature scaling NLL reduction and ECE improvement
  - CalibrationStore persistence (save/load)
  - RAPS prediction set properties
  - UncertaintyReport output format
  - Pipeline integration with mock localisation

Run with: pytest tests/test_phase6_uncertainty.py -v
"""
from __future__ import annotations

import json
import math
import tempfile
from pathlib import Path

import numpy as np
import pytest
# ── CalibrationStore ─────────────────────────────────────────────────────────
class TestCalibrationStore:
    """Tests for ``CalibrationStore``: adding entries, persistence, quantiles, stats."""

    def test_add_and_scores(self, tmp_path):
        """Each added score ``s`` is exposed through ``scores`` as ``1 - s``."""
        from uncertainty.conformal_predictor import CalibrationStore

        cs = CalibrationStore(tmp_path / "cal.json")
        cs.add(0.8, "inst-1", "django/django")
        cs.add(0.3, "inst-2", "django/django")

        assert cs.n == 2
        assert abs(cs.scores[0] - 0.2) < 1e-6  # 1 - 0.8
        assert abs(cs.scores[1] - 0.7) < 1e-6  # 1 - 0.3

    def test_save_and_load(self, tmp_path):
        """A second store opened on a saved path sees the same entries."""
        from uncertainty.conformal_predictor import CalibrationStore

        cs = CalibrationStore(tmp_path / "cal.json")
        for i in range(10):
            cs.add(float(i) / 10, f"inst-{i}")
        cs.save()

        cs2 = CalibrationStore(tmp_path / "cal.json")
        assert cs2.n == 10
        assert abs(cs2.scores.mean() - cs.scores.mean()) < 1e-6

    def test_quantile_increases_with_alpha(self, tmp_path):
        """The quantile threshold must not grow when alpha grows.

        Despite the test's name, the asserted direction is
        ``quantile(0.20) <= quantile(0.10)``.
        """
        from uncertainty.conformal_predictor import CalibrationStore

        cs = CalibrationStore(tmp_path / "cal.json")
        for s in np.linspace(0, 1, 50):
            cs.add(float(s))

        q10 = cs.quantile(0.10)  # 90th percentile
        q20 = cs.quantile(0.20)  # 80th percentile
        # Higher alpha → lower quantile threshold (more permissive)
        assert q20 <= q10

    def test_empty_store_quantile(self, tmp_path):
        """With no calibration data the quantile is exactly 1.0."""
        from uncertainty.conformal_predictor import CalibrationStore

        cs = CalibrationStore(tmp_path / "cal.json")
        # Should return 1.0 (worst case) when no calibration data
        assert cs.quantile(0.10) == 1.0

    def test_stats_structure(self, tmp_path):
        """``stats()`` exposes the ``n``, ``mean_nonconformity`` and ``q50`` keys."""
        from uncertainty.conformal_predictor import CalibrationStore

        cs = CalibrationStore(tmp_path / "cal.json")
        for s in np.linspace(0.5, 1.0, 20):
            cs.add(float(s))

        stats = cs.stats()
        assert "n" in stats
        assert "mean_nonconformity" in stats
        assert "q50" in stats

    def test_add_batch(self, tmp_path):
        """``add_batch`` with three tuples results in three stored entries."""
        from uncertainty.conformal_predictor import CalibrationStore

        cs = CalibrationStore(tmp_path / "cal.json")
        batch = [(0.7, "a", "repo"), (0.5, "b", "repo"), (0.9, "c", "repo")]
        cs.add_batch(batch)
        assert cs.n == 3
# ── ConformalPredictor ───────────────────────────────────────────────────────
class TestConformalPredictor:
    """Tests for ``ConformalPredictor`` prediction sets, p-values and coverage."""

    def _make_predictor(self, tmp_path, n_cal=100, alpha=0.10):
        """Build a predictor calibrated on ``n_cal`` seeded Beta(2, 5) scores.

        Args:
            tmp_path: directory in which the ``cal.json`` store path is placed.
            n_cal: number of synthetic calibration scores to add.
            alpha: miscoverage level passed to ``ConformalPredictor``.

        Returns:
            A ``ConformalPredictor`` wrapping the populated store.
        """
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        cs = CalibrationStore(tmp_path / "cal.json")
        # Fixed seed so every predictor built by this helper sees the same scores.
        np.random.seed(42)
        cal_scores = np.random.beta(2, 5, n_cal)
        for s in cal_scores:
            cs.add(float(s))
        return ConformalPredictor(cs, alpha=alpha)

    def test_prediction_returns_correct_types(self, tmp_path):
        """``predict`` returns a ``LocalisationWithUncertainty`` with one hit per file."""
        from uncertainty.conformal_predictor import LocalisationWithUncertainty

        cp = self._make_predictor(tmp_path)
        files = ["a.py", "b.py", "c.py"]
        scores = [0.8, 0.5, 0.2]
        result = cp.predict(files, scores)

        assert isinstance(result, LocalisationWithUncertainty)
        assert len(result.hits) == 3

    def test_coverage_guarantee_satisfied(self, tmp_path):
        """
        Core guarantee test:
        Empirical coverage >= 1 - alpha (minus a finite-sample tolerance)
        on a synthetic test set.
        """
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        np.random.seed(123)
        alpha = 0.10
        tolerance = 0.08  # slack allowed for the finite test sample

        # Large calibration set for stable quantile
        cs = CalibrationStore(tmp_path / "cal.json")
        n_cal = 500
        cal_rrf_scores = np.random.beta(3, 2, n_cal)  # gold file scores
        for s in cal_rrf_scores:
            cs.add(float(s))
        cp = ConformalPredictor(cs, alpha=alpha)

        # Test instances: gold file has score sampled from same distribution
        n_test = 200
        all_files = [f"file_{i}.py" for i in range(10)]
        covered = 0
        for _ in range(n_test):
            gold_score = float(np.random.beta(3, 2))
            other_scores = list(np.random.beta(1, 3, 9))  # 9 non-gold files
            all_scores = sorted([gold_score] + other_scores, reverse=True)
            # File names follow rank order, so the gold file is the one at the
            # gold score's position in the sorted list.
            gold_file = all_files[all_scores.index(gold_score)]

            result = cp.predict(all_files, all_scores)
            if gold_file in result.prediction_set_files:
                covered += 1

        empirical_coverage = covered / n_test
        # Should be >= 1 - alpha with high probability
        assert empirical_coverage >= (1 - alpha - tolerance), (
            f"Coverage {empirical_coverage:.3f} < guarantee {1 - alpha:.2f} "
            f"minus tolerance {tolerance:.2f}"
        )

    def test_prediction_set_includes_high_score_file(self, tmp_path):
        """High-scoring file should always be in prediction set."""
        cp = self._make_predictor(tmp_path)
        result = cp.predict(["best.py", "ok.py", "bad.py"], [0.99, 0.3, 0.01])
        pred_paths = result.prediction_set_files
        assert "best.py" in pred_paths

    def test_confidence_in_0_1_range(self, tmp_path):
        """Every hit's ``confidence`` and ``p_value`` lie in [0, 1]."""
        cp = self._make_predictor(tmp_path)
        result = cp.predict(["a.py", "b.py"], [0.7, 0.4])
        for hit in result.hits:
            assert 0.0 <= hit.confidence <= 1.0
            assert 0.0 <= hit.p_value <= 1.0

    def test_ranks_sequential(self, tmp_path):
        """Hits carry ranks 1, 2, 3 in order."""
        cp = self._make_predictor(tmp_path)
        result = cp.predict(["a.py", "b.py", "c.py"], [0.8, 0.5, 0.2])
        assert [h.rank for h in result.hits] == [1, 2, 3]

    def test_no_calibration_data_maximum_uncertainty(self, tmp_path):
        """With an empty store the only hit gets a p-value of exactly 1.0."""
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        cs = CalibrationStore(tmp_path / "cal.json")
        cp = ConformalPredictor(cs, alpha=0.10)
        result = cp.predict(["a.py"], [0.9])
        # Maximum uncertainty: smoothed p-value with n=0
        assert result.hits[0].p_value == 1.0

    def test_tighter_alpha_gives_larger_set(self, tmp_path):
        """Lower alpha (e.g. 0.05) should produce larger prediction sets."""
        cp_strict = self._make_predictor(tmp_path, alpha=0.05)
        cp_lenient = self._make_predictor(tmp_path, alpha=0.20)

        files = [f"f{i}.py" for i in range(10)]
        scores = list(np.linspace(0.9, 0.1, 10))
        r_strict = cp_strict.predict(files, scores)
        r_lenient = cp_lenient.predict(files, scores)

        # Stricter coverage requirement → larger prediction set
        assert r_strict.prediction_set_size >= r_lenient.prediction_set_size

    def test_uncertainty_labels(self, tmp_path):
        """``uncertainty_label`` is always one of the four known label strings."""
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        cs = CalibrationStore(tmp_path / "cal.json")
        # Degenerate calibration set: 100 identical entries.
        for _ in range(100):
            cs.add(0.99)  # gold score 0.99 → nonconformity 1 - 0.99 = 0.01
        cp = ConformalPredictor(cs, alpha=0.10)

        # Single candidate with a very high score; only the label vocabulary is checked.
        result = cp.predict(["only.py"], [0.99])
        assert result.uncertainty_label in ("confident", "moderate", "uncertain", "very_uncertain")

    def test_evaluate_coverage_api(self, tmp_path):
        """``evaluate_coverage`` reports coverage in [0, 1] and an average set size."""
        cp = self._make_predictor(tmp_path, n_cal=200)
        test_instances = [
            (["a.py", "b.py", "c.py"], [0.8, 0.5, 0.2], "a.py"),
            (["x.py", "y.py"], [0.9, 0.1], "x.py"),
        ]
        result = cp.evaluate_coverage(test_instances)

        assert "empirical_coverage" in result
        assert "avg_set_size" in result
        assert 0 <= result["empirical_coverage"] <= 1

    def test_file_confidence_property(self, tmp_path):
        """``confidence_pct`` renders a confidence of 0.85 as text containing "85.0%"."""
        from uncertainty.conformal_predictor import FileConfidence

        fc = FileConfidence(
            file_path="test.py",
            rrf_score=0.75,
            p_value=0.15,
            in_prediction_set=True,
            confidence=0.85,
            rank=1,
        )
        assert "85.0%" in fc.confidence_pct
# ── Temperature Scaling ──────────────────────────────────────────────────────
class TestTemperatureScaler:
    """Tests for ``TemperatureScaler`` and ``reliability_diagram_data``."""

    def _make_overconfident_data(self, n=200, seed=42):
        """Generate seeded large-magnitude two-class logits with random labels.

        The labelled class gets a logit from U(3, 6) and the other class a
        logit from U(-2, 0). Intended to simulate overconfident DeBERTa logits.

        NOTE(review): the labelled class always has the larger logit, so every
        sample is classified correctly — confirm this matches the intended
        "overconfident" scenario.

        Returns:
            ``(logits, labels)`` with shapes ``(n, 2)`` and ``(n,)``.
        """
        np.random.seed(seed)
        labels = np.random.randint(0, 2, n)
        # Overconfident: logits have large magnitude
        logits = np.column_stack([
            np.where(labels == 0, np.random.uniform(3, 6, n), np.random.uniform(-2, 0, n)),
            np.where(labels == 1, np.random.uniform(3, 6, n), np.random.uniform(-2, 0, n)),
        ])
        return logits, labels

    def test_scale_output_sums_to_one(self):
        """Each row of scaled probabilities sums to 1."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=1.5)
        logits = np.array([[2.0, -1.0], [0.5, 0.8], [-3.0, 5.0]])
        probs = ts.scale(logits)
        np.testing.assert_allclose(probs.sum(axis=1), 1.0, atol=1e-6)

    def test_scale_output_in_0_1(self):
        """Scaled probabilities stay within [0, 1]."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=2.0)
        # Local seeded generator: reproducible input without touching global RNG state.
        rng = np.random.default_rng(0)
        logits = rng.standard_normal((50, 2))
        probs = ts.scale(logits)
        assert probs.min() >= 0
        assert probs.max() <= 1

    def test_T_greater_than_1_softens(self):
        """A larger temperature yields a less extreme top-class probability."""
        from uncertainty.temperature_scaling import TemperatureScaler

        logits = np.array([[5.0, -5.0]])  # very confident
        ts1 = TemperatureScaler(T=1.0)
        ts2 = TemperatureScaler(T=3.0)
        prob1 = ts1.scale(logits)[0, 0]
        prob2 = ts2.scale(logits)[0, 0]
        # T=3 should produce softer (closer to 0.5) probability
        assert prob1 > prob2  # prob1 closer to 1.0, prob2 closer to 0.5

    def test_fit_reduces_nll(self, tmp_path):
        """``fit`` reports ``nll_after`` no larger than ``nll_before``."""
        from uncertainty.temperature_scaling import TemperatureScaler

        logits, labels = self._make_overconfident_data()
        ts = TemperatureScaler(T=1.0)
        result = ts.fit(logits, labels)
        assert result["nll_after"] <= result["nll_before"]

    def test_fit_T_greater_than_1_for_overconfident(self, tmp_path):
        """After fitting, ``T`` stays above 0.5.

        Despite the test's name, only ``T > 0.5`` is asserted, not ``T > 1``.
        """
        from uncertainty.temperature_scaling import TemperatureScaler

        logits, labels = self._make_overconfident_data()
        ts = TemperatureScaler(T=1.0)
        ts.fit(logits, labels)
        # Overconfident model → T should increase to soften probabilities
        assert ts.T > 0.5  # just check it stays positive and reasonable

    def test_save_and_load(self, tmp_path):
        """``save`` followed by ``load`` round-trips ``T`` and the fitted flag."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=2.345)
        ts._fitted = True
        ts.save(tmp_path / "ts.json")

        ts2 = TemperatureScaler.load(tmp_path / "ts.json")
        assert abs(ts2.T - 2.345) < 1e-6
        assert ts2._fitted is True

    def test_scale_score_single_value(self):
        """``scale_score`` maps a single logit to a value strictly inside (0, 1)."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=1.0)
        prob = ts.scale_score(2.0)
        assert 0 < prob < 1

    def test_reliability_diagram_data(self):
        """Every returned bin exposes ``confidence``, ``accuracy`` and ``count``."""
        from uncertainty.temperature_scaling import reliability_diagram_data

        np.random.seed(42)
        probs = np.random.uniform(0, 1, 100)
        # Noisy labels correlated with the probabilities.
        labels = (probs + np.random.randn(100) * 0.2 > 0.5).astype(int)
        bins = reliability_diagram_data(probs, labels, n_bins=5)

        assert len(bins) > 0
        for b in bins:
            assert "confidence" in b
            assert "accuracy" in b
            assert "count" in b
# ── RAPS ─────────────────────────────────────────────────────────────────────
class TestRAPS:
    """Tests for the ``raps_predict`` prediction-set function."""

    def test_raps_returns_nonempty(self, tmp_path):
        """With a populated store, the returned set has at least one entry."""
        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

        cs = CalibrationStore(tmp_path / "cal.json")
        for s in np.linspace(0, 1, 50):
            cs.add(float(s))

        files = ["a.py", "b.py", "c.py"]
        scores = np.array([0.6, 0.3, 0.1])
        result = raps_predict(files, scores, cs, alpha=0.10)
        assert len(result) >= 1

    def test_raps_top1_always_included(self, tmp_path):
        """With an empty store, the top-scoring file is still in the result."""
        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

        cs = CalibrationStore(tmp_path / "cal.json")
        # Empty calibration → fallback to top-k
        files = ["best.py", "ok.py"]
        scores = np.array([0.9, 0.1])
        result = raps_predict(files, scores, cs, alpha=0.10)

        # Each result entry is indexed as a sequence whose first item is the path.
        paths = [r[0] for r in result]
        assert "best.py" in paths

    def test_raps_scores_positive(self, tmp_path):
        """Every (path, score) pair returned carries a strictly positive score."""
        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

        cs = CalibrationStore(tmp_path / "cal.json")
        for s in np.linspace(0.1, 0.9, 30):
            cs.add(float(s))

        files = [f"f{i}.py" for i in range(5)]
        scores = np.array([0.5, 0.2, 0.15, 0.1, 0.05])
        result = raps_predict(files, scores, cs)
        assert all(s > 0 for _, s in result)
# ── UncertaintyAwarePipeline ─────────────────────────────────────────────────
class TestUncertaintyAwarePipeline:
    """Tests for ``UncertaintyAwarePipeline`` and ``UncertaintyReport`` using a mocked localiser."""

    def _mock_localisation_pipeline(self, files, scores):
        """Create a mock pipeline that returns pre-set results.

        Args:
            files: file paths, in rank order.
            scores: relevance scores paired positionally with ``files``.

        Returns:
            A ``MagicMock`` whose ``localise`` returns a ``LocalisationResult``
            built from the inputs and whose ``index_repo`` returns a small dict.
        """
        from unittest.mock import MagicMock

        from localisation.pipeline import LocalisationHit, LocalisationResult

        mock = MagicMock()
        # Ranks are 1-based and follow input order.
        hits = [
            LocalisationHit(file_path=fp, relevance_score=s, rank=rank)
            for rank, (fp, s) in enumerate(zip(files, scores), start=1)
        ]
        mock.localise.return_value = LocalisationResult(hits=hits, elapsed_seconds=0.1)
        mock.index_repo.return_value = {"elapsed": 0.1}
        return mock

    def test_localise_with_uncertainty_returns_result(self, tmp_path):
        """The result lists all three files and a non-empty prediction set."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        files = ["models.py", "views.py", "utils.py"]
        scores = [0.8, 0.5, 0.2]
        mock_pipeline = self._mock_localisation_pipeline(files, scores)
        up = UncertaintyAwarePipeline(
            localisation_pipeline=mock_pipeline,
            calibration_store_path=tmp_path / "cal.json",
        )

        result = up.localise_with_uncertainty("fix the bug", top_k=3)
        assert len(result.files) == 3
        assert len(result.prediction_set) >= 1

    def test_prediction_set_never_empty(self, tmp_path):
        """Even with a single candidate file the prediction set is non-empty."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        mock = self._mock_localisation_pipeline(["only.py"], [0.9])
        up = UncertaintyAwarePipeline(
            localisation_pipeline=mock,
            calibration_store_path=tmp_path / "cal.json",
        )

        result = up.localise_with_uncertainty("some issue")
        assert len(result.prediction_set) >= 1

    def test_token_savings_computed(self, tmp_path):
        """The naive budget is files × tokens_per_file and the used budget never exceeds it."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        files = [f"f{i}.py" for i in range(10)]
        scores = list(np.linspace(0.9, 0.1, 10))
        mock = self._mock_localisation_pipeline(files, scores)
        up = UncertaintyAwarePipeline(
            localisation_pipeline=mock,
            calibration_store_path=tmp_path / "cal.json",
            tokens_per_file=1500,
        )

        result = up.localise_with_uncertainty("issue", top_k=10)
        assert result.token_budget_naive == 10 * 1500
        assert result.token_budget_used <= result.token_budget_naive

    def test_uncertainty_report_to_dict(self, tmp_path):
        """``to_dict`` keeps the label and renders the two ratios as percentage text."""
        from uncertainty.uncertainty_pipeline import UncertaintyReport

        report = UncertaintyReport(
            uncertainty_label="confident",
            prediction_set_size=2,
            coverage_guarantee=0.90,
            top_file_confidence=0.87,
            avg_confidence=0.65,
            estimated_token_savings=0.60,
            calibration_n=150,
        )

        d = report.to_dict()
        assert d["uncertainty_label"] == "confident"
        assert "90%" in d["coverage_guarantee"]
        assert "87.0%" in d["top_file_confidence"]

    def test_record_calibration_point(self, tmp_path):
        """Recording one instance with one gold file leaves one entry in ``cal_store``."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        mock = self._mock_localisation_pipeline(["a.py"], [0.8])
        up = UncertaintyAwarePipeline(
            localisation_pipeline=mock,
            calibration_store_path=tmp_path / "cal.json",
        )

        up.record_calibration_point(
            rrf_scores={"a.py": 0.8, "b.py": 0.3},
            gold_files=["a.py"],
            instance_id="test-1",
        )
        assert up.cal_store.n == 1

    def test_calibration_stats(self, tmp_path):
        """``calibration_stats`` returns a mapping that includes the ``n`` key."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        mock = self._mock_localisation_pipeline(["a.py"], [0.8])
        up = UncertaintyAwarePipeline(
            localisation_pipeline=mock,
            calibration_store_path=tmp_path / "cal.json",
        )

        stats = up.calibration_stats()
        assert "n" in stats