# Source: codebase-nav-env / server/confidence_calibrator.py
# Commit 0b0338d — "v4 Research Modules & Pre-submission tweaks" (Chirag0123)
# server/confidence_calibrator.py
"""
Confidence Calibration Engine — v4.0
The key scientific question: Is the agent calibrated?
An agent is calibrated when its certainty level (inferred from behavior)
matches its likelihood of being correct.
Since agents don't expose probability distributions directly, we infer
confidence from behavioral proxies:
- How quickly did it commit to a hypothesis (read β†’ write speed)?
- How much did it re-explore after writing (re-reads after write)?
- Did it verify (run_tests) before submitting?
- How many steps did it spend before the first write?
We then compare inferred confidence to actual accuracy (final_score).
Overconfident agents submit fast but score poorly.
Underconfident agents explore extensively but still score well.
Well-calibrated agents: confidence ∝ accuracy.
This is NOT measured by any existing benchmark or tracing tool.
"""
from __future__ import annotations

import bisect
import math
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
class CalibrationProfile(str, Enum):
    """Four-way classification of how well behavioral confidence tracks accuracy."""

    WELL_CALIBRATED = "WELL_CALIBRATED"  # confidence ≈ accuracy
    OVERCONFIDENT = "OVERCONFIDENT"      # acts certain, scores poorly
    UNDERCONFIDENT = "UNDERCONFIDENT"    # acts uncertain, scores well
    ERRATIC = "ERRATIC"                  # no coherent confidence signal
@dataclass
class ConfidenceSample:
    """One point on the inferred-confidence trajectory.

    Attributes:
        step: Step number within the episode.
        action_type: Action taken at this step (e.g. "read_file", "submit").
        inferred_confidence: Behavioral-proxy confidence in [0.0, 1.0].
        actual_accuracy: Known test_pass_rate at this step, or None.
        calibration_error: |confidence - accuracy| when accuracy is known, else None.
    """
    step: int
    action_type: str
    inferred_confidence: float
    actual_accuracy: Optional[float]
    calibration_error: Optional[float]
@dataclass
class CalibrationReport:
    """Full confidence-calibration analysis for a single episode.

    Bundles the inferred confidence level, the decomposed behavioral
    signals it was derived from, the per-step confidence trajectory,
    and the resulting profile/diagnosis.
    """
    episode_id: str
    task: str
    profile: CalibrationProfile
    calibration_score: float                # 1.0 = perfectly calibrated
    # Inferred overall confidence level (behavioral proxy)
    inferred_confidence: float              # 0.0-1.0
    actual_performance: float               # final_score
    # Decomposed signals
    commitment_speed: float                 # 0 = slow/careful, 1 = committed fast
    re_exploration_rate: float              # reads after first write / total reads
    verification_rate: float                # run_tests per write_file
    submit_speed: float                     # submit step / max_steps (early = overconfident)
    # Trajectory of inferred confidence
    confidence_trajectory: List[ConfidenceSample]
    # Calibration error
    expected_calibration_error: float       # mean |conf - acc| where acc is known
    confidence_accuracy_correlation: float  # should be high for good agents
    diagnosis: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Serialize the report to a JSON-friendly dict (floats rounded to 3 dp)."""
        def r3(value: float) -> float:
            return round(value, 3)

        def opt3(value: Optional[float]) -> Optional[float]:
            return None if value is None else round(value, 3)

        trajectory = [
            {
                "step": sample.step,
                "action": sample.action_type,
                "confidence": r3(sample.inferred_confidence),
                "accuracy": opt3(sample.actual_accuracy),
                "error": opt3(sample.calibration_error),
            }
            for sample in self.confidence_trajectory
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "profile": self.profile.value,
            "calibration_score": r3(self.calibration_score),
            "inferred_confidence": r3(self.inferred_confidence),
            "actual_performance": r3(self.actual_performance),
            "signals": {
                "commitment_speed": r3(self.commitment_speed),
                "re_exploration_rate": r3(self.re_exploration_rate),
                "verification_rate": r3(self.verification_rate),
                "submit_speed": r3(self.submit_speed),
            },
            "expected_calibration_error": r3(self.expected_calibration_error),
            "confidence_accuracy_correlation": r3(self.confidence_accuracy_correlation),
            "confidence_trajectory": trajectory,
            "diagnosis": self.diagnosis,
            "recommendations": self.recommendations,
        }
class ConfidenceCalibrator:
    """
    Infers behavioral confidence and compares it to actual performance.

    Confidence proxy model:
      - Reading files  = low confidence (still exploring)
      - Writing files  = medium-high confidence (committed to hypothesis)
      - Running tests  = verification (moderate, checking own hypothesis)
      - Submitting     = maximum commitment (fully confident)

    Each action type has a base confidence weight (ACTION_CONFIDENCE); the
    per-step value is then adjusted by a progress bonus, a bonus for writes
    already made, and a penalty for re-reading previously touched files.
    """

    # Base behavioral-confidence weight per action type.
    ACTION_CONFIDENCE = {
        "read_file": 0.2,    # exploring, uncertain
        "search_code": 0.3,  # slightly more directed
        "run_tests": 0.6,    # confident enough to test
        "write_file": 0.75,  # committed to hypothesis
        "submit": 1.0,       # maximum confidence
    }
    # Base weight for any action type not listed above.
    DEFAULT_CONFIDENCE = 0.3

    def calibrate(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        final_score: float,
        max_steps: int = 20,
    ) -> CalibrationReport:
        """Compute the full calibration report for one episode.

        Args:
            episode_id: Identifier of the episode being analyzed.
            task: Human-readable task name.
            trajectory_steps: Per-step dicts; each may carry "action_type",
                "step_number", "action_path" and "test_pass_rate".
            final_score: Final accuracy of the episode in [0, 1].
            max_steps: Step budget, used to judge how early a submit was.

        Returns:
            A CalibrationReport with profile, signals, trajectory and advice.
        """
        if not trajectory_steps:
            return self._empty_report(episode_id, task, final_score)

        action_types = [s.get("action_type", "read_file") for s in trajectory_steps]
        total_steps = len(trajectory_steps)

        confidence_traj = self._build_confidence_trajectory(trajectory_steps, total_steps)

        # ── Behavioral signal computation ─────────────────────────────────────
        total = max(total_steps, 1)
        read_steps = [i for i, a in enumerate(action_types) if a == "read_file"]
        write_steps = [i for i, a in enumerate(action_types) if a == "write_file"]
        # BUG FIX: a missing submit previously defaulted to the episode length,
        # so short never-submitting episodes looked like early (overconfident)
        # submits. "Never submitted" now counts as submitting at the end of
        # the step budget instead.
        submit_step = next(
            (s.get("step_number", total) for s in trajectory_steps
             if s.get("action_type") == "submit"),
            max(total, max_steps),
        )

        # Commitment speed: few reads before the first write = fast commitment.
        if write_steps:
            reads_before_first_write = sum(1 for r in read_steps if r < write_steps[0])
            commitment_speed = max(0.0, 1.0 - reads_before_first_write / max(total, 1))
        else:
            commitment_speed = 0.0  # Never wrote = very cautious

        # Re-exploration rate: share of reads that happen after the first write.
        if write_steps and read_steps:
            reads_after_write = sum(1 for r in read_steps if r > write_steps[0])
            re_exploration_rate = reads_after_write / len(read_steps)
        else:
            re_exploration_rate = 0.0

        # Verification rate: run_tests per write (may exceed 1.0).
        test_count = action_types.count("run_tests")
        write_count = action_types.count("write_file")
        verification_rate = test_count / max(write_count, 1)

        # Submit speed: the earlier the submit, the more overconfident.
        submit_speed = max(0.0, min(1.0, 1.0 - submit_step / max(max_steps, 1)))

        # ── Inferred overall confidence (weighted behavioral proxy) ───────────
        inferred_confidence = (
            commitment_speed * 0.30 +
            (1.0 - re_exploration_rate) * 0.15 +
            verification_rate * 0.15 +
            submit_speed * 0.20 +
            (confidence_traj[-1].inferred_confidence if confidence_traj else 0.5) * 0.20
        )
        inferred_confidence = min(1.0, max(0.0, inferred_confidence))

        # ── Calibration error (where we have both conf + acc) ─────────────────
        calib_errors = [
            s.calibration_error for s in confidence_traj
            if s.calibration_error is not None
        ]
        ece = (sum(calib_errors) / len(calib_errors) if calib_errors
               else abs(inferred_confidence - final_score))

        # ── Confidence-accuracy correlation ───────────────────────────────────
        paired = [
            (s.inferred_confidence, s.actual_accuracy)
            for s in confidence_traj
            if s.actual_accuracy is not None
        ]
        if len(paired) >= 2:
            corr = self._pearson_r([p[0] for p in paired], [p[1] for p in paired])
        else:
            # Fallback: derive a pseudo-correlation from the final point only.
            corr = 1.0 - abs(inferred_confidence - final_score) * 2
        corr = max(-1.0, min(1.0, corr))

        # ── Calibration score: half ECE, half (positive) correlation ──────────
        calibration_score = max(0.0, 1.0 - ece) * 0.5 + max(0.0, corr) * 0.5
        calibration_score = max(0.0, min(1.0, calibration_score))

        # ── Profile classification ────────────────────────────────────────────
        conf_diff = inferred_confidence - final_score
        if abs(conf_diff) <= 0.2:
            profile = CalibrationProfile.WELL_CALIBRATED
        elif conf_diff > 0.2:
            profile = CalibrationProfile.OVERCONFIDENT
        elif conf_diff < -0.2:
            profile = CalibrationProfile.UNDERCONFIDENT
        else:
            # Only reachable when conf_diff is NaN (e.g. a NaN final_score).
            profile = CalibrationProfile.ERRATIC

        return CalibrationReport(
            episode_id=episode_id,
            task=task,
            profile=profile,
            calibration_score=calibration_score,
            inferred_confidence=inferred_confidence,
            actual_performance=final_score,
            commitment_speed=commitment_speed,
            re_exploration_rate=re_exploration_rate,
            verification_rate=verification_rate,
            submit_speed=submit_speed,
            confidence_trajectory=confidence_traj,
            expected_calibration_error=ece,
            confidence_accuracy_correlation=corr,
            diagnosis=self._diagnose(profile, inferred_confidence, final_score),
            recommendations=self._recommendations(
                profile, verification_rate, re_exploration_rate
            ),
        )

    def _build_confidence_trajectory(
        self,
        trajectory_steps: List[dict],
        total_steps: int,
    ) -> List[ConfidenceSample]:
        """Infer one ConfidenceSample per trajectory step.

        The original implementation rescanned the whole trajectory inside the
        loop (O(n^2)); write counts and first-seen paths are now precomputed
        in a single pass.
        """
        # Sorted step numbers of all writes, for "writes before step N" counts.
        write_step_numbers = sorted(
            s.get("step_number", 99)
            for s in trajectory_steps
            if s.get("action_type") == "write_file"
        )
        # Earliest step number at which each (non-None) path was touched.
        earliest_path_step: Dict[str, int] = {}
        for s in trajectory_steps:
            path = s.get("action_path")
            if path is not None:
                step_n = s.get("step_number", 0)
                prev = earliest_path_step.get(path)
                if prev is None or step_n < prev:
                    earliest_path_step[path] = step_n

        samples: List[ConfidenceSample] = []
        for s in trajectory_steps:
            atype = s.get("action_type", "read_file")
            base_conf = self.ACTION_CONFIDENCE.get(atype, self.DEFAULT_CONFIDENCE)
            step_n = s.get("step_number", 1)
            # Confidence grows slightly as the episode progresses (max +0.1).
            progress_bonus = (step_n / max(total_steps, 1)) * 0.1
            # Having already written raises confidence (max +0.15).
            prior_writes = bisect.bisect_left(write_step_numbers, step_n)
            post_write_bonus = min(0.15, prior_writes * 0.05)
            # Re-reading a previously touched file signals lingering doubt.
            # BUG FIX: the old check compared action_path with `==` across
            # *all* steps, so two steps that both lacked a path (None == None)
            # counted as a re-read; a real path is now required.
            path = s.get("action_path")
            is_reread = (
                atype == "read_file"
                and path is not None
                and earliest_path_step.get(path, step_n) < step_n
            )
            reread_penalty = -0.1 if is_reread else 0.0
            inferred = min(1.0, max(0.0,
                base_conf + progress_bonus + post_write_bonus + reread_penalty
            ))
            # Ground-truth accuracy at this step, when the step recorded one.
            actual_acc = s.get("test_pass_rate")
            calib_err = abs(inferred - actual_acc) if actual_acc is not None else None
            samples.append(ConfidenceSample(
                step=step_n,
                action_type=atype,
                inferred_confidence=inferred,
                actual_accuracy=actual_acc,
                calibration_error=calib_err,
            ))
        return samples

    def _diagnose(
        self,
        profile: CalibrationProfile,
        inferred_confidence: float,
        final_score: float,
    ) -> str:
        """Return the human-readable diagnosis text for a profile."""
        diagnoses = {
            CalibrationProfile.WELL_CALIBRATED: (
                f"Agent is well-calibrated: inferred confidence ({inferred_confidence:.2f}) "
                f"closely matches actual performance ({final_score:.2f}). "
                "This indicates genuine self-awareness — the agent commits when ready and "
                "explores when uncertain."
            ),
            CalibrationProfile.OVERCONFIDENT: (
                f"Agent is overconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"significantly exceeds actual performance ({final_score:.2f}). "
                "Agent committed to a hypothesis too early, skipped verification, "
                "or submitted without adequate exploration. This is the profile of agents "
                "that 'feel certain but are wrong'."
            ),
            CalibrationProfile.UNDERCONFIDENT: (
                f"Agent is underconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"is well below actual performance ({final_score:.2f}). "
                "Agent explored far more than necessary, re-read files unnecessarily, "
                "or hesitated to commit despite having the right information. "
                "This wastes compute and steps without improving accuracy."
            ),
            CalibrationProfile.ERRATIC: (
                "Agent calibration is erratic — confidence signals are inconsistent "
                "with behavior. The agent may be applying a rigid strategy regardless "
                "of the task difficulty."
            ),
        }
        return diagnoses[profile]

    def _recommendations(
        self,
        profile: CalibrationProfile,
        verification_rate: float,
        re_exploration_rate: float,
    ) -> List[str]:
        """Actionable advice derived from the profile and the raw signals."""
        recs: List[str] = []
        if profile == CalibrationProfile.OVERCONFIDENT:
            recs.append("Read more files before writing — commit only when you've seen the full causal chain.")
            recs.append("Always run_tests after writing — don't trust your fix without verification.")
        elif profile == CalibrationProfile.UNDERCONFIDENT:
            recs.append("Commit to hypotheses earlier — excessive re-reading wastes steps.")
            recs.append("After reading tests + source files, write your fix. Stop re-reading.")
        if verification_rate < 0.5:
            recs.append("Increase test verification rate: run_tests after each write.")
        if re_exploration_rate > 0.5:
            recs.append("High re-exploration after writing suggests uncalibrated hypothesis formation.")
        return recs

    def _pearson_r(self, xs: List[float], ys: List[float]) -> float:
        """Pearson correlation of xs and ys; 0.0 when undefined (n < 2 or zero variance)."""
        n = len(xs)
        if n < 2:
            return 0.0
        mx, my = sum(xs) / n, sum(ys) / n
        num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
        dx = math.sqrt(sum((x - mx) ** 2 for x in xs))
        dy = math.sqrt(sum((y - my) ** 2 for y in ys))
        if dx * dy == 0:
            return 0.0
        return num / (dx * dy)

    def _empty_report(self, episode_id: str, task: str, final_score: float) -> CalibrationReport:
        """Degenerate report for an episode with no trajectory data."""
        return CalibrationReport(
            episode_id=episode_id, task=task,
            profile=CalibrationProfile.ERRATIC,
            calibration_score=0.0,
            inferred_confidence=0.0, actual_performance=final_score,
            commitment_speed=0.0, re_exploration_rate=0.0,
            verification_rate=0.0, submit_speed=0.0,
            confidence_trajectory=[],
            expected_calibration_error=1.0,
            confidence_accuracy_correlation=0.0,
            diagnosis="No trajectory data.", recommendations=[],
        )