Spaces:

SouravNath
/

repomind-api

Running

App Files Files Community

repomind-api / uncertainty /uncertainty_pipeline.py

SouravNath

Initial commit

dc71cad 6 days ago

raw

history blame contribute delete

9.44 kB

	"""
	uncertainty/uncertainty_pipeline.py
	─────────────────────────────────────
	Uncertainty-aware localisation pipeline.

	Wraps the Phase 3 LocalisationPipeline to add:
	1. Per-file confidence scores (from ConformalPredictor)
	2. Token budget gating — skip low-confidence files (<threshold)
	3. Adaptive top-k — expand/contract prediction set size based on uncertainty
	4. Confidence report for the UI dashboard

	The key insight: don't send 10 files to the LLM when you're only
	confident about 2. Conformal prediction tells you the minimum set of
	files needed to achieve 90% recall guarantee.

	Token budget reduction: instead of always sending 10 files × 150 lines
	(~1,500 tokens per file) = 15,000 tokens, we send only the prediction
	set (avg ~2.3 files on confident instances) = ~3,450 tokens. This drops
	token cost by ~77% on easy issues while maintaining the coverage guarantee.
	"""
	from __future__ import annotations

	import logging
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Optional

	from uncertainty.conformal_predictor import (
	CalibrationStore,
	ConformalPredictor,
	FileConfidence,
	LocalisationWithUncertainty,
	)

	logger = logging.getLogger(__name__)


	@dataclass
	class UncertaintyReport:
	    """Uncertainty metrics describing a single localisation query.

	    Fractional fields (coverage, confidences, savings) are stored as
	    floats and are only rendered as percentage strings by ``to_dict``.
	    """

	    uncertainty_label: str  # confident / moderate / uncertain / very_uncertain
	    prediction_set_size: int  # |C(x)| at alpha=0.10
	    coverage_guarantee: float  # 0.90
	    top_file_confidence: float  # confidence of rank-1 file
	    avg_confidence: float
	    estimated_token_savings: float  # fraction of tokens saved by skipping low-conf files
	    calibration_n: int

	    def to_dict(self) -> dict:
	        """Return the report as a dict with fractions formatted as percentages.

	        Coverage and token savings are rounded to whole percents; the two
	        confidence values keep one decimal place. The label and the two
	        integer counts are passed through unchanged.
	        """

	        def as_percent(fraction: float, decimals: int) -> str:
	            # Scale a fraction to a percentage string with fixed decimals.
	            return f"{fraction * 100:.{decimals}f}%"

	        payload: dict = {}
	        payload["uncertainty_label"] = self.uncertainty_label
	        payload["prediction_set_size"] = self.prediction_set_size
	        payload["coverage_guarantee"] = as_percent(self.coverage_guarantee, 0)
	        payload["top_file_confidence"] = as_percent(self.top_file_confidence, 1)
	        payload["avg_confidence"] = as_percent(self.avg_confidence, 1)
	        payload["estimated_token_savings"] = as_percent(
	            self.estimated_token_savings, 0
	        )
	        payload["calibration_n"] = self.calibration_n
	        return payload


	@dataclass
	class UncertaintyAwareResult:
	    """Container for everything the uncertainty-aware pipeline returns.

	    Fields:
	        files: ranked files, each annotated with its confidence.
	        prediction_set: paths of the files to actually send to the LLM.
	        uncertainty: the full uncertainty report for the query.
	        token_budget_used: estimated tokens for the prediction set.
	        token_budget_naive: estimated tokens for the naive top-k approach.
	    """

	    files: list[FileConfidence]
	    prediction_set: list[str]
	    uncertainty: UncertaintyReport
	    token_budget_used: int
	    token_budget_naive: int

	class UncertaintyAwarePipeline:
	    """
	    Uncertainty-aware localisation pipeline.

	    Adds conformal prediction on top of the Phase 3 LocalisationPipeline.
	    The prediction set (not just top-k) is what gets sent to the LLM.

	    Configuration:
	        alpha = 0.10          → 90% coverage guarantee
	        min_conf_threshold    → skip files below this confidence
	        max_prediction_set    → hard cap on prediction set size (values
	                                below 1 are treated as 1 so the top file
	                                is never dropped)
	        tokens_per_file       → estimated tokens per file (for budget calc)
	    """

	    def __init__(
	        self,
	        localisation_pipeline,
	        calibration_store_path: Path = Path(".cache/conformal_calibration.json"),
	        alpha: float = 0.10,
	        min_conf_threshold: float = 0.20,  # skip files with <20% confidence
	        max_prediction_set: int = 8,
	        tokens_per_file: int = 1500,
	    ):
	        """
	        Store the configuration and build the conformal predictor.

	        Args:
	            localisation_pipeline: object providing ``index_repo`` and
	                ``localise``; all retrieval is delegated to it.
	            calibration_store_path: location handed to CalibrationStore.
	            alpha: miscoverage level passed to ConformalPredictor.
	            min_conf_threshold: minimum confidence for a file to stay in
	                the prediction set.
	            max_prediction_set: hard cap on the prediction set size.
	            tokens_per_file: per-file token estimate for budget figures.
	        """
	        self.pipeline = localisation_pipeline
	        self.alpha = alpha
	        self.min_conf_threshold = min_conf_threshold
	        self.max_prediction_set = max_prediction_set
	        self.tokens_per_file = tokens_per_file

	        # Calibration store backing the conformal predictor
	        # (presumably loads existing data from the path — confirm in
	        # CalibrationStore).
	        self.cal_store = CalibrationStore(Path(calibration_store_path))
	        self.cp = ConformalPredictor(self.cal_store, alpha=alpha)

	        logger.info(
	            "UncertaintyAwarePipeline: alpha=%.2f, cal_n=%d, threshold=%.2f",
	            alpha, self.cal_store.n, min_conf_threshold
	        )

	    def index_repo(self, file_symbols: list, dependency_graph=None) -> dict:
	        """Delegate to underlying localisation pipeline."""
	        return self.pipeline.index_repo(file_symbols, dependency_graph)

	    def localise_with_uncertainty(
	        self,
	        issue_text: str,
	        top_k: int = 10,
	        gold_files: Optional[list[str]] = None,
	    ) -> UncertaintyAwareResult:
	        """
	        Localise files with conformal uncertainty quantification.

	        Returns the prediction set (not just top-k) annotated with
	        calibrated confidence scores.

	        Args:
	            issue_text: GitHub issue description
	            top_k: initial candidate pool size
	            gold_files: for evaluation (forwarded to the wrapped pipeline)

	        Returns:
	            An UncertaintyAwareResult; an empty result when the wrapped
	            pipeline returns no candidate paths.
	        """
	        # ── Stage 1: Run localisation pipeline ────────────────────────────
	        loc_result = self.pipeline.localise(
	            issue_text, top_k=top_k, gold_files=gold_files
	        )

	        file_paths = loc_result.top_k_paths
	        rrf_scores = [h.relevance_score for h in loc_result.hits]

	        if not file_paths:
	            return self._empty_result()

	        # ── Stage 2: Conformal prediction ─────────────────────────────────
	        cp_result: LocalisationWithUncertainty = self.cp.predict(
	            file_paths, rrf_scores
	        )

	        # ── Stage 3: Build prediction set ─────────────────────────────────
	        pred_set_files = self._select_prediction_set(cp_result.hits, file_paths)

	        # ── Stage 4: Token budget calculation ─────────────────────────────
	        tokens_used = len(pred_set_files) * self.tokens_per_file
	        tokens_naive = top_k * self.tokens_per_file
	        # max(..., 1) avoids a ZeroDivisionError when the naive budget is 0.
	        savings = 1.0 - (tokens_used / max(tokens_naive, 1))

	        # ── Stage 5: Build uncertainty report ─────────────────────────────
	        top_conf = cp_result.hits[0].confidence if cp_result.hits else 0.0
	        report = UncertaintyReport(
	            uncertainty_label=cp_result.uncertainty_label,
	            prediction_set_size=cp_result.prediction_set_size,
	            coverage_guarantee=cp_result.coverage_guarantee,
	            top_file_confidence=top_conf,
	            avg_confidence=cp_result.avg_confidence,
	            estimated_token_savings=savings,
	            calibration_n=self.cal_store.n,
	        )

	        # Plain "|" separators: "\|" is an invalid escape sequence that
	        # would put literal backslashes into the log line.
	        logger.info(
	            "Uncertainty: label=%s | pred_set=%d/%d | top_conf=%.1f%% | savings=%.0f%%",
	            report.uncertainty_label, len(pred_set_files), top_k,
	            top_conf * 100, savings * 100,
	        )

	        return UncertaintyAwareResult(
	            files=cp_result.hits,
	            prediction_set=pred_set_files,
	            uncertainty=report,
	            token_budget_used=tokens_used,
	            token_budget_naive=tokens_naive,
	        )

	    def _select_prediction_set(self, hits, file_paths: list[str]) -> list[str]:
	        """Choose the file paths that are actually sent to the LLM.

	        Keeps hits flagged ``in_prediction_set`` whose confidence reaches
	        ``min_conf_threshold``, falls back to the first candidate path when
	        nothing survives, then applies the ``max_prediction_set`` cap.
	        """
	        selected = [
	            h.file_path for h in hits
	            if h.in_prediction_set and h.confidence >= self.min_conf_threshold
	        ]

	        # Guarantee: always include at least top-1 file
	        if not selected and file_paths:
	            selected = [file_paths[0]]

	        # Floor the cap at 1: a zero or negative max_prediction_set would
	        # otherwise empty the list (or slice it from the wrong end) and
	        # break the top-1 guarantee above.
	        cap = max(1, self.max_prediction_set)
	        return selected[:cap]

	    def record_calibration_point(
	        self,
	        rrf_scores: dict[str, float],  # {file_path: score}
	        gold_files: list[str],
	        instance_id: str = "",
	        repo: str = "",
	    ) -> None:
	        """
	        Record a calibration point from a solved instance.

	        This should be called after each evaluation run to grow the
	        calibration set. More calibration points → tighter prediction sets.

	        Args:
	            rrf_scores: {file_path: rrf_score} from localisation run
	            gold_files: true files from the patch
	            instance_id: for diagnostics
	            repo: repository name
	        """
	        for gold_fp in gold_files:
	            score = rrf_scores.get(gold_fp, 0.0)  # 0 if not retrieved
	            self.cal_store.add(score, instance_id, repo)
	        # Persist once, after all gold files have been added.
	        self.cal_store.save()

	    def calibration_stats(self) -> dict:
	        """Return calibration store statistics."""
	        return self.cal_store.stats()

	    def evaluate_coverage(
	        self,
	        test_instances: list[tuple[list[str], list[float], str]],
	    ) -> dict:
	        """Evaluate empirical coverage on a test set (delegates to the predictor)."""
	        return self.cp.evaluate_coverage(test_instances, self.alpha)

	    def _empty_result(self) -> UncertaintyAwareResult:
	        """Build the result returned when localisation yields no candidates."""
	        report = UncertaintyReport(
	            uncertainty_label="very_uncertain",
	            prediction_set_size=0,
	            coverage_guarantee=1.0 - self.alpha,
	            top_file_confidence=0.0,
	            avg_confidence=0.0,
	            estimated_token_savings=0.0,
	            calibration_n=self.cal_store.n,
	        )
	        return UncertaintyAwareResult(
	            files=[], prediction_set=[],
	            uncertainty=report,
	            token_budget_used=0, token_budget_naive=0,
	        )