"""
tests/test_phase6_uncertainty.py
──────────────────────────────────
Unit tests for Phase 6: Conformal Prediction + Temperature Scaling.
Tests verify:
- Coverage guarantee property (marginal coverage >= 1-alpha)
- Prediction set size properties (non-emptiness, monotonicity w.r.t. alpha)
- Temperature scaling NLL reduction and ECE improvement
- CalibrationStore persistence (save/load)
- RAPS prediction set properties
- UncertaintyReport output format
- Pipeline integration with mock localisation
Run with: pytest tests/test_phase6_uncertainty.py -v
"""
from __future__ import annotations
import json
import math
import tempfile
from pathlib import Path
import numpy as np
import pytest
# ── CalibrationStore ──────────────────────────────────────────────────────────
class TestCalibrationStore:
    """Tests for ``CalibrationStore``: adding scores, persistence, quantiles, stats."""

    def test_add_and_scores(self, tmp_path):
        """Each added score is expected to be stored as ``1 - score``."""
        from uncertainty.conformal_predictor import CalibrationStore

        store = CalibrationStore(tmp_path / "cal.json")
        store.add(0.8, "inst-1", "django/django")
        store.add(0.3, "inst-2", "django/django")

        assert store.n == 2
        # Expected nonconformity values: 1 - 0.8 and 1 - 0.3.
        for position, expected in enumerate((0.2, 0.7)):
            assert abs(store.scores[position] - expected) < 1e-6

    def test_save_and_load(self, tmp_path):
        """A second store opened on the saved path sees the same entries."""
        from uncertainty.conformal_predictor import CalibrationStore

        cal_path = tmp_path / "cal.json"
        original = CalibrationStore(cal_path)
        for i in range(10):
            original.add(i / 10, f"inst-{i}")
        original.save()

        reloaded = CalibrationStore(cal_path)
        assert reloaded.n == 10
        assert abs(reloaded.scores.mean() - original.scores.mean()) < 1e-6

    def test_quantile_increases_with_alpha(self, tmp_path):
        """The threshold for alpha=0.20 must not exceed the one for alpha=0.10.

        Despite the test name, the assertion checks that the quantile does not
        grow when alpha grows.
        """
        from uncertainty.conformal_predictor import CalibrationStore

        store = CalibrationStore(tmp_path / "cal.json")
        for value in np.linspace(0, 1, 50).tolist():
            store.add(value)

        threshold_at_10 = store.quantile(0.10)  # 90th percentile
        threshold_at_20 = store.quantile(0.20)  # 80th percentile
        # Higher alpha -> lower quantile threshold (more permissive).
        assert threshold_at_20 <= threshold_at_10

    def test_empty_store_quantile(self, tmp_path):
        """With no calibration data the quantile is expected to be 1.0 (worst case)."""
        from uncertainty.conformal_predictor import CalibrationStore

        empty_store = CalibrationStore(tmp_path / "cal.json")
        assert empty_store.quantile(0.10) == 1.0

    def test_stats_structure(self, tmp_path):
        """``stats()`` exposes at least the ``n``, ``mean_nonconformity`` and ``q50`` keys."""
        from uncertainty.conformal_predictor import CalibrationStore

        store = CalibrationStore(tmp_path / "cal.json")
        for value in np.linspace(0.5, 1.0, 20).tolist():
            store.add(value)

        summary = store.stats()
        for required_key in ("n", "mean_nonconformity", "q50"):
            assert required_key in summary

    def test_add_batch(self, tmp_path):
        """``add_batch`` registers one entry per ``(score, instance, repo)`` tuple."""
        from uncertainty.conformal_predictor import CalibrationStore

        store = CalibrationStore(tmp_path / "cal.json")
        entries = [(0.7, "a", "repo"), (0.5, "b", "repo"), (0.9, "c", "repo")]
        store.add_batch(entries)
        assert store.n == 3
# ── ConformalPredictor ────────────────────────────────────────────────────────
class TestConformalPredictor:
    """Tests for ``ConformalPredictor``: output types, coverage, set size, labels."""

    def _make_predictor(self, tmp_path, n_cal=100, alpha=0.10):
        """Build a predictor calibrated on ``n_cal`` seeded Beta(2, 5) draws.

        Args:
            tmp_path: Existing directory in which ``cal.json`` is located.
            n_cal: Number of calibration scores to add.
            alpha: Miscoverage level passed to ``ConformalPredictor``.

        Returns:
            A ``ConformalPredictor`` backed by the freshly filled store.
        """
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        cs = CalibrationStore(tmp_path / "cal.json")
        # Fixed seed: every predictor built by this helper sees the same draw.
        np.random.seed(42)
        # Beta(2, 5) is skewed towards 0, so most values passed to add() are low.
        # NOTE(review): test_add_and_scores expects add(x) to store 1 - x, so low
        # inputs would mean HIGH nonconformity; the original comment read
        # "model is good" -- confirm which convention is intended here.
        cal_scores = np.random.beta(2, 5, n_cal)
        for s in cal_scores:
            cs.add(float(s))
        return ConformalPredictor(cs, alpha=alpha)

    def test_prediction_returns_correct_types(self, tmp_path):
        """``predict`` returns a ``LocalisationWithUncertainty`` with one hit per file."""
        from uncertainty.conformal_predictor import LocalisationWithUncertainty

        cp = self._make_predictor(tmp_path)
        files = ["a.py", "b.py", "c.py"]
        scores = [0.8, 0.5, 0.2]
        result = cp.predict(files, scores)
        assert isinstance(result, LocalisationWithUncertainty)
        assert len(result.hits) == 3

    def test_coverage_guarantee_satisfied(self, tmp_path):
        """
        Core guarantee test:
        Empirical coverage >= 1 - alpha (minus a sampling tolerance) on a
        synthetic test set whose gold scores follow the calibration distribution.
        """
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        np.random.seed(123)
        alpha = 0.10
        # Slack for finite-sample noise: coverage is estimated from only
        # n_test instances, so it fluctuates around the nominal level.
        tolerance = 0.08

        # Large calibration set for stable quantile
        cs = CalibrationStore(tmp_path / "cal.json")
        n_cal = 500
        cal_rrf_scores = np.random.beta(3, 2, n_cal)  # gold file scores
        for s in cal_rrf_scores:
            cs.add(float(s))
        cp = ConformalPredictor(cs, alpha=alpha)

        # Test instances: gold file has score sampled from same distribution
        n_test = 200
        covered = 0
        for _ in range(n_test):
            gold_score = float(np.random.beta(3, 2))
            other_scores = list(np.random.beta(1, 3, 9))  # 9 non-gold files
            all_scores = sorted([gold_score] + other_scores, reverse=True)
            all_files = [f"file_{i}.py" for i in range(10)]
            # File names follow rank order, so the gold file is the one at the
            # position where the gold score landed after sorting.
            gold_idx = all_scores.index(gold_score)
            gold_file = all_files[gold_idx]
            result = cp.predict(all_files, all_scores)
            pred_set = result.prediction_set_files
            if gold_file in pred_set:
                covered += 1

        empirical_coverage = covered / n_test
        min_coverage = 1 - alpha - tolerance
        # The message reports the threshold actually enforced, not just the
        # nominal guarantee, so a failure is not misread.
        assert empirical_coverage >= min_coverage, (
            f"Coverage {empirical_coverage:.3f} < {min_coverage:.2f} "
            f"(guarantee {1 - alpha:.2f} minus sampling tolerance {tolerance:.2f})"
        )

    def test_prediction_set_includes_high_score_file(self, tmp_path):
        """High-scoring file should always be in prediction set."""
        cp = self._make_predictor(tmp_path)
        result = cp.predict(["best.py", "ok.py", "bad.py"], [0.99, 0.3, 0.01])
        pred_paths = result.prediction_set_files
        assert "best.py" in pred_paths

    def test_confidence_in_0_1_range(self, tmp_path):
        """Every hit's ``confidence`` and ``p_value`` lie in [0, 1]."""
        cp = self._make_predictor(tmp_path)
        result = cp.predict(["a.py", "b.py"], [0.7, 0.4])
        for hit in result.hits:
            assert 0.0 <= hit.confidence <= 1.0
            assert 0.0 <= hit.p_value <= 1.0

    def test_ranks_sequential(self, tmp_path):
        """Hits for descending scores carry ranks 1, 2, 3 in order."""
        cp = self._make_predictor(tmp_path)
        result = cp.predict(["a.py", "b.py", "c.py"], [0.8, 0.5, 0.2])
        assert [h.rank for h in result.hits] == [1, 2, 3]

    def test_no_calibration_data_maximum_uncertainty(self, tmp_path):
        """With an empty store the single hit is expected to get ``p_value == 1.0``."""
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        cs = CalibrationStore(tmp_path / "cal.json")
        cp = ConformalPredictor(cs, alpha=0.10)
        result = cp.predict(["a.py"], [0.9])
        # All files should be in prediction set (maximum uncertainty)
        assert result.hits[0].p_value == 1.0  # smoothed p-value with n=0

    def test_tighter_alpha_gives_larger_set(self, tmp_path):
        """Lower alpha (e.g. 0.05) should produce larger prediction sets."""
        # Give each predictor its own directory so the two calibration stores
        # cannot share a cal.json file (a store opened on an existing file is
        # expected to load it, see TestCalibrationStore.test_save_and_load).
        strict_dir = tmp_path / "strict"
        lenient_dir = tmp_path / "lenient"
        strict_dir.mkdir()
        lenient_dir.mkdir()
        cp_strict = self._make_predictor(strict_dir, alpha=0.05)
        cp_lenient = self._make_predictor(lenient_dir, alpha=0.20)

        files = [f"f{i}.py" for i in range(10)]
        scores = list(np.linspace(0.9, 0.1, 10))
        r_strict = cp_strict.predict(files, scores)
        r_lenient = cp_lenient.predict(files, scores)
        # Stricter coverage requirement -> larger prediction set
        assert r_strict.prediction_set_size >= r_lenient.prediction_set_size

    def test_uncertainty_labels(self, tmp_path):
        """``uncertainty_label`` is one of the four known label strings."""
        from uncertainty.conformal_predictor import CalibrationStore, ConformalPredictor

        cs = CalibrationStore(tmp_path / "cal.json")
        # 100 identical calibration points with score 0.99.
        # NOTE(review): the original comment said these represent gold files
        # with score 0.01 / high nonconformity, but test_add_and_scores expects
        # add(x) to store 1 - x, i.e. nonconformity 0.01 here -- confirm intent.
        for _ in range(100):
            cs.add(0.99)
        cp = ConformalPredictor(cs, alpha=0.10)
        # Only membership in the label vocabulary is checked, not which label.
        result = cp.predict(["only.py"], [0.99])
        assert result.uncertainty_label in ("confident", "moderate", "uncertain", "very_uncertain")

    def test_evaluate_coverage_api(self, tmp_path):
        """``evaluate_coverage`` reports coverage in [0, 1] and an average set size."""
        cp = self._make_predictor(tmp_path, n_cal=200)
        # Each instance is (files, scores, gold_file).
        test_instances = [
            (["a.py", "b.py", "c.py"], [0.8, 0.5, 0.2], "a.py"),
            (["x.py", "y.py"], [0.9, 0.1], "x.py"),
        ]
        result = cp.evaluate_coverage(test_instances)
        assert "empirical_coverage" in result
        assert "avg_set_size" in result
        assert 0 <= result["empirical_coverage"] <= 1

    def test_file_confidence_property(self, tmp_path):
        """``FileConfidence.confidence_pct`` renders 0.85 as text containing ``85.0%``."""
        from uncertainty.conformal_predictor import FileConfidence

        fc = FileConfidence(
            file_path="test.py",
            rrf_score=0.75,
            p_value=0.15,
            in_prediction_set=True,
            confidence=0.85,
            rank=1,
        )
        assert "85.0%" in fc.confidence_pct
# ── Temperature Scaling ───────────────────────────────────────────────────────
class TestTemperatureScaler:
    """Tests for ``TemperatureScaler`` and ``reliability_diagram_data``."""

    def _make_overconfident_data(self, n=200, seed=42):
        """Return seeded synthetic two-class ``(logits, labels)``.

        The labelled class gets a logit drawn from U(3, 6) and the other class
        one from U(-2, 0), so the argmax of every row equals its label.

        NOTE(review): the data is therefore confident *and* always correct,
        not overconfident in the calibration sense; confirm whether
        misclassified samples were meant to be included.

        Args:
            n: Number of samples.
            seed: Seed for NumPy's global random state.

        Returns:
            ``(logits, labels)`` with ``logits`` of shape ``(n, 2)`` and
            ``labels`` an integer array of length ``n`` with values in {0, 1}.
        """
        np.random.seed(seed)
        labels = np.random.randint(0, 2, n)
        # Large-magnitude logits for the labelled class, small for the other.
        logits = np.column_stack([
            np.where(labels == 0, np.random.uniform(3, 6, n), np.random.uniform(-2, 0, n)),
            np.where(labels == 1, np.random.uniform(3, 6, n), np.random.uniform(-2, 0, n)),
        ])
        return logits, labels

    def test_scale_output_sums_to_one(self):
        """Each row returned by ``scale`` sums to 1."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=1.5)
        logits = np.array([[2.0, -1.0], [0.5, 0.8], [-3.0, 5.0]])
        probs = ts.scale(logits)
        np.testing.assert_allclose(probs.sum(axis=1), 1.0, atol=1e-6)

    def test_scale_output_in_0_1(self):
        """Every value returned by ``scale`` lies in [0, 1]."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=2.0)
        # Local seeded generator: the input is reproducible across runs and
        # the global NumPy random state is left untouched.
        logits = np.random.default_rng(0).standard_normal((50, 2))
        probs = ts.scale(logits)
        assert probs.min() >= 0
        assert probs.max() <= 1

    def test_T_greater_than_1_softens(self):
        """A larger temperature yields a lower probability for the confident class."""
        from uncertainty.temperature_scaling import TemperatureScaler

        logits = np.array([[5.0, -5.0]])  # very confident
        ts1 = TemperatureScaler(T=1.0)
        ts2 = TemperatureScaler(T=3.0)
        prob1 = ts1.scale(logits)[0, 0]
        prob2 = ts2.scale(logits)[0, 0]
        # T=3 should produce softer (closer to 0.5) probability
        assert prob1 > prob2  # prob1 closer to 1.0, prob2 closer to 0.5

    def test_fit_reduces_nll(self, tmp_path):
        """``fit`` reports ``nll_after`` no greater than ``nll_before``."""
        from uncertainty.temperature_scaling import TemperatureScaler

        logits, labels = self._make_overconfident_data()
        ts = TemperatureScaler(T=1.0)
        result = ts.fit(logits, labels)
        assert result["nll_after"] <= result["nll_before"]

    def test_fit_T_greater_than_1_for_overconfident(self, tmp_path):
        """After fitting, ``T`` stays above 0.5.

        Despite the test name, only ``T > 0.5`` is asserted, not ``T > 1``.
        """
        from uncertainty.temperature_scaling import TemperatureScaler

        logits, labels = self._make_overconfident_data()
        ts = TemperatureScaler(T=1.0)
        ts.fit(logits, labels)
        # Deliberately weak bound: just check T stays positive and reasonable.
        assert ts.T > 0.5

    def test_save_and_load(self, tmp_path):
        """``save`` followed by ``load`` round-trips ``T`` and the fitted flag."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=2.345)
        ts._fitted = True  # mark as fitted without running fit()
        ts.save(tmp_path / "ts.json")
        ts2 = TemperatureScaler.load(tmp_path / "ts.json")
        assert abs(ts2.T - 2.345) < 1e-6
        assert ts2._fitted is True

    def test_scale_score_single_value(self):
        """``scale_score`` maps a single value to a number strictly inside (0, 1)."""
        from uncertainty.temperature_scaling import TemperatureScaler

        ts = TemperatureScaler(T=1.0)
        prob = ts.scale_score(2.0)
        assert 0 < prob < 1

    def test_reliability_diagram_data(self):
        """Each returned bin has ``confidence``, ``accuracy`` and ``count`` keys."""
        from uncertainty.temperature_scaling import reliability_diagram_data

        np.random.seed(42)
        probs = np.random.uniform(0, 1, 100)
        # Noisy labels correlated with the probabilities.
        labels = (probs + np.random.randn(100) * 0.2 > 0.5).astype(int)
        bins = reliability_diagram_data(probs, labels, n_bins=5)
        assert len(bins) > 0
        for b in bins:
            assert "confidence" in b
            assert "accuracy" in b
            assert "count" in b
# ── RAPS ─────────────────────────────────────────────────────────────────────
class TestRAPS:
    """Tests for ``raps_predict``: non-empty output, top-1 inclusion, positive scores."""

    def test_raps_returns_nonempty(self, tmp_path):
        """With a populated store, ``raps_predict`` returns at least one entry."""
        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

        store = CalibrationStore(tmp_path / "cal.json")
        for value in np.linspace(0, 1, 50).tolist():
            store.add(value)

        candidate_files = ["a.py", "b.py", "c.py"]
        candidate_scores = np.array([0.6, 0.3, 0.1])
        prediction = raps_predict(candidate_files, candidate_scores, store, alpha=0.10)
        assert len(prediction) >= 1

    def test_raps_top1_always_included(self, tmp_path):
        """The best-scoring file is returned even when the store is empty."""
        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

        # Nothing is added: the store is deliberately left without calibration data.
        empty_store = CalibrationStore(tmp_path / "cal.json")
        candidate_files = ["best.py", "ok.py"]
        candidate_scores = np.array([0.9, 0.1])
        prediction = raps_predict(candidate_files, candidate_scores, empty_store, alpha=0.10)

        returned_paths = [entry[0] for entry in prediction]
        assert "best.py" in returned_paths

    def test_raps_scores_positive(self, tmp_path):
        """Every ``(path, score)`` pair returned carries a strictly positive score."""
        from uncertainty.conformal_predictor import CalibrationStore, raps_predict

        store = CalibrationStore(tmp_path / "cal.json")
        for value in np.linspace(0.1, 0.9, 30).tolist():
            store.add(value)

        candidate_files = [f"f{i}.py" for i in range(5)]
        candidate_scores = np.array([0.5, 0.2, 0.15, 0.1, 0.05])
        prediction = raps_predict(candidate_files, candidate_scores, store)
        assert all(score > 0 for _, score in prediction)
# ── UncertaintyAwarePipeline ──────────────────────────────────────────────────
class TestUncertaintyAwarePipeline:
    """Integration tests for ``UncertaintyAwarePipeline`` over a mocked localiser."""

    def _mock_localisation_pipeline(self, files, scores):
        """Return a ``MagicMock`` pipeline whose ``localise`` yields the given hits.

        Args:
            files: File paths, in rank order.
            scores: Relevance scores aligned with ``files``.

        Returns:
            A mock with ``localise`` and ``index_repo`` return values preset.
        """
        from unittest.mock import MagicMock
        from localisation.pipeline import LocalisationResult, LocalisationHit

        # Ranks are 1-based and follow the order of the inputs.
        canned_hits = []
        for rank, (path, score) in enumerate(zip(files, scores), start=1):
            canned_hits.append(
                LocalisationHit(file_path=path, relevance_score=score, rank=rank)
            )

        pipeline = MagicMock()
        pipeline.localise.return_value = LocalisationResult(
            hits=canned_hits, elapsed_seconds=0.1
        )
        pipeline.index_repo.return_value = {"elapsed": 0.1}
        return pipeline

    def test_localise_with_uncertainty_returns_result(self, tmp_path):
        """Three mocked hits yield three files and a non-empty prediction set."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        paths = ["models.py", "views.py", "utils.py"]
        relevance = [0.8, 0.5, 0.2]
        localiser = self._mock_localisation_pipeline(paths, relevance)
        pipeline = UncertaintyAwarePipeline(
            localisation_pipeline=localiser,
            calibration_store_path=tmp_path / "cal.json",
        )

        outcome = pipeline.localise_with_uncertainty("fix the bug", top_k=3)
        assert len(outcome.files) == 3
        assert len(outcome.prediction_set) >= 1

    def test_prediction_set_never_empty(self, tmp_path):
        """A single mocked hit still produces a non-empty prediction set."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        localiser = self._mock_localisation_pipeline(["only.py"], [0.9])
        pipeline = UncertaintyAwarePipeline(
            localisation_pipeline=localiser,
            calibration_store_path=tmp_path / "cal.json",
        )

        outcome = pipeline.localise_with_uncertainty("some issue")
        assert len(outcome.prediction_set) >= 1

    def test_token_savings_computed(self, tmp_path):
        """The naive budget is ``top_k * tokens_per_file``; the used budget never exceeds it."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        paths = [f"f{i}.py" for i in range(10)]
        relevance = list(np.linspace(0.9, 0.1, 10))
        localiser = self._mock_localisation_pipeline(paths, relevance)
        pipeline = UncertaintyAwarePipeline(
            localisation_pipeline=localiser,
            calibration_store_path=tmp_path / "cal.json",
            tokens_per_file=1500,
        )

        outcome = pipeline.localise_with_uncertainty("issue", top_k=10)
        assert outcome.token_budget_naive == 10 * 1500
        assert outcome.token_budget_used <= outcome.token_budget_naive

    def test_uncertainty_report_to_dict(self, tmp_path):
        """``to_dict`` keeps the label and renders the ratios as percentage text."""
        from uncertainty.uncertainty_pipeline import UncertaintyReport

        report_fields = {
            "uncertainty_label": "confident",
            "prediction_set_size": 2,
            "coverage_guarantee": 0.90,
            "top_file_confidence": 0.87,
            "avg_confidence": 0.65,
            "estimated_token_savings": 0.60,
            "calibration_n": 150,
        }
        as_dict = UncertaintyReport(**report_fields).to_dict()

        assert as_dict["uncertainty_label"] == "confident"
        assert "90%" in as_dict["coverage_guarantee"]
        assert "87.0%" in as_dict["top_file_confidence"]

    def test_record_calibration_point(self, tmp_path):
        """Recording one instance leaves the pipeline's store with ``n == 1``."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        localiser = self._mock_localisation_pipeline(["a.py"], [0.8])
        pipeline = UncertaintyAwarePipeline(
            localisation_pipeline=localiser,
            calibration_store_path=tmp_path / "cal.json",
        )

        pipeline.record_calibration_point(
            rrf_scores={"a.py": 0.8, "b.py": 0.3},
            gold_files=["a.py"],
            instance_id="test-1",
        )
        assert pipeline.cal_store.n == 1

    def test_calibration_stats(self, tmp_path):
        """``calibration_stats()`` returns a mapping containing the ``n`` key."""
        from uncertainty.uncertainty_pipeline import UncertaintyAwarePipeline

        localiser = self._mock_localisation_pipeline(["a.py"], [0.8])
        pipeline = UncertaintyAwarePipeline(
            localisation_pipeline=localiser,
            calibration_store_path=tmp_path / "cal.json",
        )

        summary = pipeline.calibration_stats()
        assert "n" in summary
|