"""
uncertainty/uncertainty_pipeline.py
─────────────────────────────────────
Uncertainty-aware localisation pipeline.

Wraps the Phase 3 LocalisationPipeline to add:
  1. Per-file confidence scores (from ConformalPredictor)
  2. Token budget gating — skip low-confidence files (confidence below
     ``min_conf_threshold``)
"""

# NOTE(review): in the copy of this file that was reviewed, everything between
# the "(" that ends item 2 above and the return annotation of
# ``UncertaintyReport.to_dict`` had been stripped.  The stdlib imports, the
# ``logger`` definition and the ``UncertaintyReport`` header below were rebuilt
# from how the surviving code uses them — confirm against version control.
#
# NOTE(review): the project imports that bring ``CalibrationStore``,
# ``ConformalPredictor``, ``FileConfidence`` and
# ``LocalisationWithUncertainty`` into scope were lost as well.  Their module
# path is not visible here, so they are deliberately not guessed; restore them.

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

# presumably the conventional module-level logger; the surviving code only
# shows ``logger.info(...)`` with %-style arguments — TODO confirm the name.
logger = logging.getLogger(__name__)


@dataclass
class UncertaintyReport:
    """Summary of how certain one localisation run is.

    NOTE(review): the class header and field list are reconstructed from the
    keys read in ``to_dict`` and the keyword arguments used at both
    construction sites in this file.  Field order and the absence of default
    values are assumptions — confirm against version control.
    """

    uncertainty_label: str
    prediction_set_size: int
    coverage_guarantee: float
    top_file_confidence: float
    avg_confidence: float
    estimated_token_savings: float
    calibration_n: int

    def to_dict(self) -> dict:
        """Return the report as a dict with fractions rendered as percent strings."""
        return {
            "uncertainty_label": self.uncertainty_label,
            "prediction_set_size": self.prediction_set_size,
            "coverage_guarantee": f"{self.coverage_guarantee*100:.0f}%",
            "top_file_confidence": f"{self.top_file_confidence*100:.1f}%",
            "avg_confidence": f"{self.avg_confidence*100:.1f}%",
            "estimated_token_savings": f"{self.estimated_token_savings*100:.0f}%",
            "calibration_n": self.calibration_n,
        }


@dataclass
class UncertaintyAwareResult:
    """Full result from the uncertainty-aware pipeline."""

    # Files in order, with confidence annotations.  Written as a forward
    # reference so that defining this dataclass does not require the project
    # type to be resolvable at import time.
    files: "list[FileConfidence]"

    # Prediction set (files to actually send to LLM)
    prediction_set: list[str]

    # Full uncertainty report
    uncertainty: UncertaintyReport

    # Estimated token cost vs. naive top-k approach
    token_budget_used: int
    token_budget_naive: int


class UncertaintyAwarePipeline:
    """
    Uncertainty-aware localisation pipeline.

    Adds conformal prediction on top of the Phase 3 LocalisationPipeline.
    The prediction set (not just top-k) is what gets sent to the LLM.

    Configuration:
        alpha = 0.10           → 90% coverage guarantee
        min_conf_threshold     → skip files below this confidence
        max_prediction_set     → hard cap on prediction set size
        tokens_per_file        → estimated tokens per file (for budget calc)
    """

    def __init__(
        self,
        localisation_pipeline,
        calibration_store_path: Path = Path(".cache/conformal_calibration.json"),
        alpha: float = 0.10,
        min_conf_threshold: float = 0.20,  # skip files with <20% confidence
        max_prediction_set: int = 8,
        tokens_per_file: int = 1500,
    ):
        """
        Store the configuration and build the conformal predictor.

        Args:
            localisation_pipeline: object providing ``index_repo`` and
                                   ``localise`` (both are delegated to)
            calibration_store_path: file handed to ``CalibrationStore``
            alpha:                 miscoverage level; the coverage reported
                                   for an empty result is ``1 - alpha``
            min_conf_threshold:    files below this confidence are skipped
            max_prediction_set:    hard cap on prediction set size
            tokens_per_file:       estimated tokens per file
        """
        self.pipeline = localisation_pipeline
        self.alpha = alpha
        self.min_conf_threshold = min_conf_threshold
        self.max_prediction_set = max_prediction_set
        self.tokens_per_file = tokens_per_file

        # Load or create calibration store
        self.cal_store = CalibrationStore(Path(calibration_store_path))
        self.cp = ConformalPredictor(self.cal_store, alpha=alpha)

        logger.info(
            "UncertaintyAwarePipeline: alpha=%.2f, cal_n=%d, threshold=%.2f",
            alpha, self.cal_store.n, min_conf_threshold
        )

    def index_repo(self, file_symbols: list, dependency_graph=None) -> dict:
        """Delegate to underlying localisation pipeline."""
        return self.pipeline.index_repo(file_symbols, dependency_graph)

    def localise_with_uncertainty(
        self,
        issue_text: str,
        top_k: int = 10,
        gold_files: Optional[list[str]] = None,
    ) -> UncertaintyAwareResult:
        """
        Localise files with conformal uncertainty quantification.

        Returns the prediction set (not just top-k) annotated with
        calibrated confidence scores.

        Args:
            issue_text: GitHub issue description
            top_k:      initial candidate pool size
            gold_files: for evaluation (computes empirical recall)

        Returns:
            An UncertaintyAwareResult.  When the underlying pipeline returns
            no candidate paths, the result is empty (no files, zero budgets)
            and labelled "very_uncertain".
        """
        # ── Stage 1: Run localisation pipeline ────────────────────────────
        loc_result = self.pipeline.localise(
            issue_text, top_k=top_k, gold_files=gold_files
        )

        file_paths = loc_result.top_k_paths
        rrf_scores = [h.relevance_score for h in loc_result.hits]

        if not file_paths:
            return self._empty_result()

        # ── Stage 2: Conformal prediction ─────────────────────────────────
        cp_result: LocalisationWithUncertainty = self.cp.predict(
            file_paths, rrf_scores
        )

        # ── Stage 3: Build prediction set ─────────────────────────────────
        # Start with the conformal prediction set, minus files whose
        # confidence is below the configured threshold.
        pred_set_files = [
            h.file_path
            for h in cp_result.hits
            if h.in_prediction_set and h.confidence >= self.min_conf_threshold
        ]

        # Guarantee: always include at least top-1 file.  file_paths is known
        # to be non-empty here because of the early return above.
        if not pred_set_files:
            pred_set_files = [file_paths[0]]

        # Apply hard cap.  The cap is clamped to at least 1: a zero cap would
        # empty the set and a negative one would slice files off the tail,
        # either of which breaks the top-1 guarantee above.
        cap = max(self.max_prediction_set, 1)
        pred_set_files = pred_set_files[:cap]

        # ── Stage 4: Token budget calculation ─────────────────────────────
        tokens_used = len(pred_set_files) * self.tokens_per_file
        tokens_naive = top_k * self.tokens_per_file
        # max(..., 1) avoids dividing by zero when the naive budget is 0.
        savings = 1.0 - (tokens_used / max(tokens_naive, 1))

        # ── Stage 5: Build uncertainty report ─────────────────────────────
        top_conf = cp_result.hits[0].confidence if cp_result.hits else 0.0

        # prediction_set_size is the size reported by the conformal predictor;
        # it can differ from len(pred_set_files), which additionally reflects
        # the threshold filter, the top-1 fallback and the hard cap.
        report = UncertaintyReport(
            uncertainty_label=cp_result.uncertainty_label,
            prediction_set_size=cp_result.prediction_set_size,
            coverage_guarantee=cp_result.coverage_guarantee,
            top_file_confidence=top_conf,
            avg_confidence=cp_result.avg_confidence,
            estimated_token_savings=savings,
            calibration_n=self.cal_store.n,
        )

        logger.info(
            "Uncertainty: label=%s | pred_set=%d/%d | top_conf=%.1f%% | savings=%.0f%%",
            report.uncertainty_label,
            len(pred_set_files), top_k,
            top_conf * 100,
            savings * 100,
        )

        return UncertaintyAwareResult(
            files=cp_result.hits,
            prediction_set=pred_set_files,
            uncertainty=report,
            token_budget_used=tokens_used,
            token_budget_naive=tokens_naive,
        )

    def record_calibration_point(
        self,
        rrf_scores: dict[str, float],  # {file_path: score}
        gold_files: list[str],
        instance_id: str = "",
        repo: str = "",
    ) -> None:
        """
        Record a calibration point from a solved instance.

        This should be called after each evaluation run to grow the
        calibration set. More calibration points → tighter prediction sets.

        One entry is added per gold file, then the store is saved once.

        Args:
            rrf_scores:  {file_path: rrf_score} from localisation run
            gold_files:  true files from the patch
            instance_id: for diagnostics
            repo:        repository name
        """
        for gold_fp in gold_files:
            score = rrf_scores.get(gold_fp, 0.0)  # 0 if not retrieved
            self.cal_store.add(score, instance_id, repo)
        self.cal_store.save()

    def calibration_stats(self) -> dict:
        """Return calibration store statistics."""
        return self.cal_store.stats()

    def evaluate_coverage(
        self,
        test_instances: list[tuple[list[str], list[float], str]],
    ) -> dict:
        """
        Evaluate empirical coverage on a test set.

        Delegates to the conformal predictor using this pipeline's alpha.

        Args:
            test_instances: presumably (file_paths, scores, gold_file) tuples;
                            the tuple layout is defined by
                            ConformalPredictor.evaluate_coverage — TODO confirm
        """
        return self.cp.evaluate_coverage(test_instances, self.alpha)

    def _empty_result(self) -> UncertaintyAwareResult:
        """Build the result returned when localisation yields no candidates."""
        report = UncertaintyReport(
            uncertainty_label="very_uncertain",
            prediction_set_size=0,
            coverage_guarantee=1.0 - self.alpha,
            top_file_confidence=0.0,
            avg_confidence=0.0,
            estimated_token_savings=0.0,
            calibration_n=self.cal_store.n,
        )
        return UncertaintyAwareResult(
            files=[],
            prediction_set=[],
            uncertainty=report,
            token_budget_used=0,
            token_budget_naive=0,
        )