Spaces:
Running
Running
| """ | |
| uncertainty/uncertainty_pipeline.py | |
| βββββββββββββββββββββββββββββββββββββ | |
| Uncertainty-aware localisation pipeline. | |
| Wraps the Phase 3 LocalisationPipeline to add: | |
| 1. Per-file confidence scores (from ConformalPredictor) | |
| 2. Token budget gating β skip low-confidence files (<threshold) | |
| 3. Adaptive top-k β expand/contract prediction set size based on uncertainty | |
| 4. Confidence report for the UI dashboard | |
| The key insight: don't send 10 files to the LLM when you're only | |
| confident about 2. Conformal prediction tells you the minimum set of | |
| files needed to achieve 90% recall guarantee. | |
| Token budget reduction: instead of always sending 10 files Γ 150 lines | |
| = 15,000 tokens, we send only the prediction set (avg ~2.3 files on | |
| confident instances) = ~3,450 tokens. This drops token cost by ~77% | |
| on easy issues while maintaining the coverage guarantee. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
| from uncertainty.conformal_predictor import ( | |
| CalibrationStore, | |
| ConformalPredictor, | |
| FileConfidence, | |
| LocalisationWithUncertainty, | |
| ) | |
| logger = logging.getLogger(__name__) | |
| class UncertaintyReport: | |
| """Summary of uncertainty metrics for a localisation query.""" | |
| uncertainty_label: str # confident / moderate / uncertain / very_uncertain | |
| prediction_set_size: int # |C(x)| at alpha=0.10 | |
| coverage_guarantee: float # 0.90 | |
| top_file_confidence: float # confidence of rank-1 file | |
| avg_confidence: float | |
| estimated_token_savings: float # fraction of tokens saved by skipping low-conf files | |
| calibration_n: int | |
| def to_dict(self) -> dict: | |
| return { | |
| "uncertainty_label": self.uncertainty_label, | |
| "prediction_set_size": self.prediction_set_size, | |
| "coverage_guarantee": f"{self.coverage_guarantee*100:.0f}%", | |
| "top_file_confidence": f"{self.top_file_confidence*100:.1f}%", | |
| "avg_confidence": f"{self.avg_confidence*100:.1f}%", | |
| "estimated_token_savings": f"{self.estimated_token_savings*100:.0f}%", | |
| "calibration_n": self.calibration_n, | |
| } | |
| class UncertaintyAwareResult: | |
| """Full result from the uncertainty-aware pipeline.""" | |
| # Files in order, with confidence annotations | |
| files: list[FileConfidence] | |
| # Prediction set (files to actually send to LLM) | |
| prediction_set: list[str] | |
| # Full uncertainty report | |
| uncertainty: UncertaintyReport | |
| # Estimated token cost vs. naive top-k approach | |
| token_budget_used: int | |
| token_budget_naive: int | |
| class UncertaintyAwarePipeline: | |
| """ | |
| Uncertainty-aware localisation pipeline. | |
| Adds conformal prediction on top of the Phase 3 LocalisationPipeline. | |
| The prediction set (not just top-k) is what gets sent to the LLM. | |
| Configuration: | |
| alpha = 0.10 β 90% coverage guarantee | |
| min_conf_threshold β skip files below this confidence | |
| max_prediction_set β hard cap on prediction set size | |
| tokens_per_file β estimated tokens per file (for budget calc) | |
| """ | |
| def __init__( | |
| self, | |
| localisation_pipeline, | |
| calibration_store_path: Path = Path(".cache/conformal_calibration.json"), | |
| alpha: float = 0.10, | |
| min_conf_threshold: float = 0.20, # skip files with <20% confidence | |
| max_prediction_set: int = 8, | |
| tokens_per_file: int = 1500, | |
| ): | |
| self.pipeline = localisation_pipeline | |
| self.alpha = alpha | |
| self.min_conf_threshold = min_conf_threshold | |
| self.max_prediction_set = max_prediction_set | |
| self.tokens_per_file = tokens_per_file | |
| # Load or create calibration store | |
| self.cal_store = CalibrationStore(Path(calibration_store_path)) | |
| self.cp = ConformalPredictor(self.cal_store, alpha=alpha) | |
| logger.info( | |
| "UncertaintyAwarePipeline: alpha=%.2f, cal_n=%d, threshold=%.2f", | |
| alpha, self.cal_store.n, min_conf_threshold | |
| ) | |
| def index_repo(self, file_symbols: list, dependency_graph=None) -> dict: | |
| """Delegate to underlying localisation pipeline.""" | |
| return self.pipeline.index_repo(file_symbols, dependency_graph) | |
| def localise_with_uncertainty( | |
| self, | |
| issue_text: str, | |
| top_k: int = 10, | |
| gold_files: Optional[list[str]] = None, | |
| ) -> UncertaintyAwareResult: | |
| """ | |
| Localise files with conformal uncertainty quantification. | |
| Returns the prediction set (not just top-k) annotated with | |
| calibrated confidence scores. | |
| Args: | |
| issue_text: GitHub issue description | |
| top_k: initial candidate pool size | |
| gold_files: for evaluation (computes empirical recall) | |
| """ | |
| # ββ Stage 1: Run localisation pipeline ββββββββββββββββββββββββββββ | |
| loc_result = self.pipeline.localise( | |
| issue_text, top_k=top_k, gold_files=gold_files | |
| ) | |
| file_paths = loc_result.top_k_paths | |
| rrf_scores = [h.relevance_score for h in loc_result.hits] | |
| if not file_paths: | |
| return self._empty_result() | |
| # ββ Stage 2: Conformal prediction βββββββββββββββββββββββββββββββββ | |
| cp_result: LocalisationWithUncertainty = self.cp.predict( | |
| file_paths, rrf_scores | |
| ) | |
| # ββ Stage 3: Build prediction set βββββββββββββββββββββββββββββββββ | |
| # Start with conformal prediction set | |
| pred_set_files = [ | |
| h.file_path for h in cp_result.hits | |
| if h.in_prediction_set and h.confidence >= self.min_conf_threshold | |
| ] | |
| # Guarantee: always include at least top-1 file | |
| if not pred_set_files and file_paths: | |
| pred_set_files = [file_paths[0]] | |
| # Apply hard cap | |
| pred_set_files = pred_set_files[:self.max_prediction_set] | |
| # ββ Stage 4: Token budget calculation βββββββββββββββββββββββββββββ | |
| tokens_used = len(pred_set_files) * self.tokens_per_file | |
| tokens_naive = top_k * self.tokens_per_file | |
| savings = 1.0 - (tokens_used / max(tokens_naive, 1)) | |
| # ββ Stage 5: Build uncertainty report βββββββββββββββββββββββββββββ | |
| top_conf = cp_result.hits[0].confidence if cp_result.hits else 0.0 | |
| report = UncertaintyReport( | |
| uncertainty_label=cp_result.uncertainty_label, | |
| prediction_set_size=cp_result.prediction_set_size, | |
| coverage_guarantee=cp_result.coverage_guarantee, | |
| top_file_confidence=top_conf, | |
| avg_confidence=cp_result.avg_confidence, | |
| estimated_token_savings=savings, | |
| calibration_n=self.cal_store.n, | |
| ) | |
| logger.info( | |
| "Uncertainty: label=%s | pred_set=%d/%d | top_conf=%.1f%% | savings=%.0f%%", | |
| report.uncertainty_label, len(pred_set_files), top_k, | |
| top_conf * 100, savings * 100, | |
| ) | |
| return UncertaintyAwareResult( | |
| files=cp_result.hits, | |
| prediction_set=pred_set_files, | |
| uncertainty=report, | |
| token_budget_used=tokens_used, | |
| token_budget_naive=tokens_naive, | |
| ) | |
| def record_calibration_point( | |
| self, | |
| rrf_scores: dict[str, float], # {file_path: score} | |
| gold_files: list[str], | |
| instance_id: str = "", | |
| repo: str = "", | |
| ) -> None: | |
| """ | |
| Record a calibration point from a solved instance. | |
| This should be called after each evaluation run to grow the | |
| calibration set. More calibration points β tighter prediction sets. | |
| Args: | |
| rrf_scores: {file_path: rrf_score} from localisation run | |
| gold_files: true files from the patch | |
| instance_id: for diagnostics | |
| repo: repository name | |
| """ | |
| for gold_fp in gold_files: | |
| score = rrf_scores.get(gold_fp, 0.0) # 0 if not retrieved | |
| self.cal_store.add(score, instance_id, repo) | |
| self.cal_store.save() | |
| def calibration_stats(self) -> dict: | |
| """Return calibration store statistics.""" | |
| return self.cal_store.stats() | |
| def evaluate_coverage( | |
| self, | |
| test_instances: list[tuple[list[str], list[float], str]], | |
| ) -> dict: | |
| """Evaluate empirical coverage on a test set.""" | |
| return self.cp.evaluate_coverage(test_instances, self.alpha) | |
| def _empty_result(self) -> UncertaintyAwareResult: | |
| report = UncertaintyReport( | |
| uncertainty_label="very_uncertain", | |
| prediction_set_size=0, | |
| coverage_guarantee=1.0 - self.alpha, | |
| top_file_confidence=0.0, | |
| avg_confidence=0.0, | |
| estimated_token_savings=0.0, | |
| calibration_n=self.cal_store.n, | |
| ) | |
| return UncertaintyAwareResult( | |
| files=[], prediction_set=[], | |
| uncertainty=report, | |
| token_budget_used=0, token_budget_naive=0, | |
| ) | |