# Source: codebase-nav-env / server/confidence_calibrator.py
# Commit 0b0338d — "v4 Research Modules & Pre-submission tweaks" (Chirag0123)
# server/confidence_calibrator.py
"""
Confidence Calibration Engine — v4.0
The key scientific question: Is the agent calibrated?
An agent is calibrated when its certainty level (inferred from behavior)
matches its likelihood of being correct.
Since agents don't expose probability distributions directly, we infer
confidence from behavioral proxies:
- How quickly did it commit to a hypothesis (read β†’ write speed)?
- How much did it re-explore after writing (re-reads after write)?
- Did it verify (run_tests) before submitting?
- How many steps did it spend before the first write?
We then compare inferred confidence to actual accuracy (final_score).
Overconfident agents submit fast but score poorly.
Underconfident agents explore extensively but still score well.
Well-calibrated agents: confidence ∝ accuracy.
This is NOT measured by any existing benchmark or tracing tool.
"""
from __future__ import annotations

import bisect
import math
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
class CalibrationProfile(str, Enum):
    """Four-way classification of how well behavioral confidence tracks accuracy."""

    WELL_CALIBRATED = "WELL_CALIBRATED"  # confidence ≈ accuracy
    OVERCONFIDENT = "OVERCONFIDENT"      # acts certain, scores poorly
    UNDERCONFIDENT = "UNDERCONFIDENT"    # acts uncertain, scores well
    ERRATIC = "ERRATIC"                  # no coherent confidence signal
@dataclass
class ConfidenceSample:
    """One point on the inferred-confidence trajectory.

    Attributes:
        step: Step number within the episode.
        action_type: Action taken at this step (e.g. "read_file", "submit").
        inferred_confidence: Behavioral-proxy confidence in [0.0, 1.0].
        actual_accuracy: Known test_pass_rate at this step, or None.
        calibration_error: |confidence - accuracy| when accuracy is known, else None.
    """
    step: int
    action_type: str
    inferred_confidence: float
    actual_accuracy: Optional[float]
    calibration_error: Optional[float]
@dataclass
class CalibrationReport:
    """Full confidence-calibration analysis for a single episode.

    Bundles the inferred confidence level, the decomposed behavioral
    signals it was derived from, the per-step confidence trajectory,
    and the resulting profile/diagnosis.
    """
    episode_id: str
    task: str
    profile: CalibrationProfile
    calibration_score: float                # 1.0 = perfectly calibrated
    # Inferred overall confidence level (behavioral proxy)
    inferred_confidence: float              # 0.0-1.0
    actual_performance: float               # final_score
    # Decomposed signals
    commitment_speed: float                 # 0 = slow/careful, 1 = committed fast
    re_exploration_rate: float              # reads after first write / total reads
    verification_rate: float                # run_tests per write_file
    submit_speed: float                     # submit step / max_steps (early = overconfident)
    # Trajectory of inferred confidence
    confidence_trajectory: List[ConfidenceSample]
    # Calibration error
    expected_calibration_error: float       # mean |conf - acc| where acc is known
    confidence_accuracy_correlation: float  # should be high for good agents
    diagnosis: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Serialize the report to a JSON-friendly dict (floats rounded to 3 dp)."""
        def r3(value: float) -> float:
            return round(value, 3)

        def opt3(value: Optional[float]) -> Optional[float]:
            return None if value is None else round(value, 3)

        trajectory = [
            {
                "step": sample.step,
                "action": sample.action_type,
                "confidence": r3(sample.inferred_confidence),
                "accuracy": opt3(sample.actual_accuracy),
                "error": opt3(sample.calibration_error),
            }
            for sample in self.confidence_trajectory
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "profile": self.profile.value,
            "calibration_score": r3(self.calibration_score),
            "inferred_confidence": r3(self.inferred_confidence),
            "actual_performance": r3(self.actual_performance),
            "signals": {
                "commitment_speed": r3(self.commitment_speed),
                "re_exploration_rate": r3(self.re_exploration_rate),
                "verification_rate": r3(self.verification_rate),
                "submit_speed": r3(self.submit_speed),
            },
            "expected_calibration_error": r3(self.expected_calibration_error),
            "confidence_accuracy_correlation": r3(self.confidence_accuracy_correlation),
            "confidence_trajectory": trajectory,
            "diagnosis": self.diagnosis,
            "recommendations": self.recommendations,
        }
class ConfidenceCalibrator:
    """
    Infers behavioral confidence and compares it to actual performance.

    Confidence proxy model:
      - Reading files  = low confidence (still exploring)
      - Writing files  = medium-high confidence (committed to hypothesis)
      - Running tests  = verification (moderate, checking own hypothesis)
      - Submitting     = maximum commitment (fully confident)

    Each action type has a base confidence weight (ACTION_CONFIDENCE); the
    per-step value is then adjusted by a progress bonus, a bonus for writes
    already made, and a penalty for re-reading previously touched files.
    """

    # Base behavioral-confidence weight per action type.
    ACTION_CONFIDENCE = {
        "read_file": 0.2,    # exploring, uncertain
        "search_code": 0.3,  # slightly more directed
        "run_tests": 0.6,    # confident enough to test
        "write_file": 0.75,  # committed to hypothesis
        "submit": 1.0,       # maximum confidence
    }
    # Base weight for any action type not listed above.
    DEFAULT_CONFIDENCE = 0.3

    def calibrate(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        final_score: float,
        max_steps: int = 20,
    ) -> CalibrationReport:
        """Compute the full calibration report for one episode.

        Args:
            episode_id: Identifier of the episode being analyzed.
            task: Human-readable task name.
            trajectory_steps: Per-step dicts; each may carry "action_type",
                "step_number", "action_path" and "test_pass_rate".
            final_score: Final accuracy of the episode in [0, 1].
            max_steps: Step budget, used to judge how early a submit was.

        Returns:
            A CalibrationReport with profile, signals, trajectory and advice.
        """
        if not trajectory_steps:
            return self._empty_report(episode_id, task, final_score)

        action_types = [s.get("action_type", "read_file") for s in trajectory_steps]
        total_steps = len(trajectory_steps)

        confidence_traj = self._build_confidence_trajectory(trajectory_steps, total_steps)

        # ── Behavioral signal computation ─────────────────────────────────────
        total = max(total_steps, 1)
        read_steps = [i for i, a in enumerate(action_types) if a == "read_file"]
        write_steps = [i for i, a in enumerate(action_types) if a == "write_file"]
        # BUG FIX: a missing submit previously defaulted to the episode length,
        # so short never-submitting episodes looked like early (overconfident)
        # submits. "Never submitted" now counts as submitting at the end of
        # the step budget instead.
        submit_step = next(
            (s.get("step_number", total) for s in trajectory_steps
             if s.get("action_type") == "submit"),
            max(total, max_steps),
        )

        # Commitment speed: few reads before the first write = fast commitment.
        if write_steps:
            reads_before_first_write = sum(1 for r in read_steps if r < write_steps[0])
            commitment_speed = max(0.0, 1.0 - reads_before_first_write / max(total, 1))
        else:
            commitment_speed = 0.0  # Never wrote = very cautious

        # Re-exploration rate: share of reads that happen after the first write.
        if write_steps and read_steps:
            reads_after_write = sum(1 for r in read_steps if r > write_steps[0])
            re_exploration_rate = reads_after_write / len(read_steps)
        else:
            re_exploration_rate = 0.0

        # Verification rate: run_tests per write (may exceed 1.0).
        test_count = action_types.count("run_tests")
        write_count = action_types.count("write_file")
        verification_rate = test_count / max(write_count, 1)

        # Submit speed: the earlier the submit, the more overconfident.
        submit_speed = max(0.0, min(1.0, 1.0 - submit_step / max(max_steps, 1)))

        # ── Inferred overall confidence (weighted behavioral proxy) ───────────
        inferred_confidence = (
            commitment_speed * 0.30 +
            (1.0 - re_exploration_rate) * 0.15 +
            verification_rate * 0.15 +
            submit_speed * 0.20 +
            (confidence_traj[-1].inferred_confidence if confidence_traj else 0.5) * 0.20
        )
        inferred_confidence = min(1.0, max(0.0, inferred_confidence))

        # ── Calibration error (where we have both conf + acc) ─────────────────
        calib_errors = [
            s.calibration_error for s in confidence_traj
            if s.calibration_error is not None
        ]
        ece = (sum(calib_errors) / len(calib_errors) if calib_errors
               else abs(inferred_confidence - final_score))

        # ── Confidence-accuracy correlation ───────────────────────────────────
        paired = [
            (s.inferred_confidence, s.actual_accuracy)
            for s in confidence_traj
            if s.actual_accuracy is not None
        ]
        if len(paired) >= 2:
            corr = self._pearson_r([p[0] for p in paired], [p[1] for p in paired])
        else:
            # Fallback: derive a pseudo-correlation from the final point only.
            corr = 1.0 - abs(inferred_confidence - final_score) * 2
        corr = max(-1.0, min(1.0, corr))

        # ── Calibration score: half ECE, half (positive) correlation ──────────
        calibration_score = max(0.0, 1.0 - ece) * 0.5 + max(0.0, corr) * 0.5
        calibration_score = max(0.0, min(1.0, calibration_score))

        # ── Profile classification ────────────────────────────────────────────
        conf_diff = inferred_confidence - final_score
        if abs(conf_diff) <= 0.2:
            profile = CalibrationProfile.WELL_CALIBRATED
        elif conf_diff > 0.2:
            profile = CalibrationProfile.OVERCONFIDENT
        elif conf_diff < -0.2:
            profile = CalibrationProfile.UNDERCONFIDENT
        else:
            # Only reachable when conf_diff is NaN (e.g. a NaN final_score).
            profile = CalibrationProfile.ERRATIC

        return CalibrationReport(
            episode_id=episode_id,
            task=task,
            profile=profile,
            calibration_score=calibration_score,
            inferred_confidence=inferred_confidence,
            actual_performance=final_score,
            commitment_speed=commitment_speed,
            re_exploration_rate=re_exploration_rate,
            verification_rate=verification_rate,
            submit_speed=submit_speed,
            confidence_trajectory=confidence_traj,
            expected_calibration_error=ece,
            confidence_accuracy_correlation=corr,
            diagnosis=self._diagnose(profile, inferred_confidence, final_score),
            recommendations=self._recommendations(
                profile, verification_rate, re_exploration_rate
            ),
        )

    def _build_confidence_trajectory(
        self,
        trajectory_steps: List[dict],
        total_steps: int,
    ) -> List[ConfidenceSample]:
        """Infer one ConfidenceSample per trajectory step.

        The original implementation rescanned the whole trajectory inside the
        loop (O(n^2)); write counts and first-seen paths are now precomputed
        in a single pass.
        """
        # Sorted step numbers of all writes, for "writes before step N" counts.
        write_step_numbers = sorted(
            s.get("step_number", 99)
            for s in trajectory_steps
            if s.get("action_type") == "write_file"
        )
        # Earliest step number at which each (non-None) path was touched.
        earliest_path_step: Dict[str, int] = {}
        for s in trajectory_steps:
            path = s.get("action_path")
            if path is not None:
                step_n = s.get("step_number", 0)
                prev = earliest_path_step.get(path)
                if prev is None or step_n < prev:
                    earliest_path_step[path] = step_n

        samples: List[ConfidenceSample] = []
        for s in trajectory_steps:
            atype = s.get("action_type", "read_file")
            base_conf = self.ACTION_CONFIDENCE.get(atype, self.DEFAULT_CONFIDENCE)
            step_n = s.get("step_number", 1)
            # Confidence grows slightly as the episode progresses (max +0.1).
            progress_bonus = (step_n / max(total_steps, 1)) * 0.1
            # Having already written raises confidence (max +0.15).
            prior_writes = bisect.bisect_left(write_step_numbers, step_n)
            post_write_bonus = min(0.15, prior_writes * 0.05)
            # Re-reading a previously touched file signals lingering doubt.
            # BUG FIX: the old check compared action_path with `==` across
            # *all* steps, so two steps that both lacked a path (None == None)
            # counted as a re-read; a real path is now required.
            path = s.get("action_path")
            is_reread = (
                atype == "read_file"
                and path is not None
                and earliest_path_step.get(path, step_n) < step_n
            )
            reread_penalty = -0.1 if is_reread else 0.0
            inferred = min(1.0, max(0.0,
                base_conf + progress_bonus + post_write_bonus + reread_penalty
            ))
            # Ground-truth accuracy at this step, when the step recorded one.
            actual_acc = s.get("test_pass_rate")
            calib_err = abs(inferred - actual_acc) if actual_acc is not None else None
            samples.append(ConfidenceSample(
                step=step_n,
                action_type=atype,
                inferred_confidence=inferred,
                actual_accuracy=actual_acc,
                calibration_error=calib_err,
            ))
        return samples

    def _diagnose(
        self,
        profile: CalibrationProfile,
        inferred_confidence: float,
        final_score: float,
    ) -> str:
        """Return the human-readable diagnosis text for a profile."""
        diagnoses = {
            CalibrationProfile.WELL_CALIBRATED: (
                f"Agent is well-calibrated: inferred confidence ({inferred_confidence:.2f}) "
                f"closely matches actual performance ({final_score:.2f}). "
                "This indicates genuine self-awareness — the agent commits when ready and "
                "explores when uncertain."
            ),
            CalibrationProfile.OVERCONFIDENT: (
                f"Agent is overconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"significantly exceeds actual performance ({final_score:.2f}). "
                "Agent committed to a hypothesis too early, skipped verification, "
                "or submitted without adequate exploration. This is the profile of agents "
                "that 'feel certain but are wrong'."
            ),
            CalibrationProfile.UNDERCONFIDENT: (
                f"Agent is underconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"is well below actual performance ({final_score:.2f}). "
                "Agent explored far more than necessary, re-read files unnecessarily, "
                "or hesitated to commit despite having the right information. "
                "This wastes compute and steps without improving accuracy."
            ),
            CalibrationProfile.ERRATIC: (
                "Agent calibration is erratic — confidence signals are inconsistent "
                "with behavior. The agent may be applying a rigid strategy regardless "
                "of the task difficulty."
            ),
        }
        return diagnoses[profile]

    def _recommendations(
        self,
        profile: CalibrationProfile,
        verification_rate: float,
        re_exploration_rate: float,
    ) -> List[str]:
        """Actionable advice derived from the profile and the raw signals."""
        recs: List[str] = []
        if profile == CalibrationProfile.OVERCONFIDENT:
            recs.append("Read more files before writing — commit only when you've seen the full causal chain.")
            recs.append("Always run_tests after writing — don't trust your fix without verification.")
        elif profile == CalibrationProfile.UNDERCONFIDENT:
            recs.append("Commit to hypotheses earlier — excessive re-reading wastes steps.")
            recs.append("After reading tests + source files, write your fix. Stop re-reading.")
        if verification_rate < 0.5:
            recs.append("Increase test verification rate: run_tests after each write.")
        if re_exploration_rate > 0.5:
            recs.append("High re-exploration after writing suggests uncalibrated hypothesis formation.")
        return recs

    def _pearson_r(self, xs: List[float], ys: List[float]) -> float:
        """Pearson correlation of xs and ys; 0.0 when undefined (n < 2 or zero variance)."""
        n = len(xs)
        if n < 2:
            return 0.0
        mx, my = sum(xs) / n, sum(ys) / n
        num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
        dx = math.sqrt(sum((x - mx) ** 2 for x in xs))
        dy = math.sqrt(sum((y - my) ** 2 for y in ys))
        if dx * dy == 0:
            return 0.0
        return num / (dx * dy)

    def _empty_report(self, episode_id: str, task: str, final_score: float) -> CalibrationReport:
        """Degenerate report for an episode with no trajectory data."""
        return CalibrationReport(
            episode_id=episode_id, task=task,
            profile=CalibrationProfile.ERRATIC,
            calibration_score=0.0,
            inferred_confidence=0.0, actual_performance=final_score,
            commitment_speed=0.0, re_exploration_rate=0.0,
            verification_rate=0.0, submit_speed=0.0,
            confidence_trajectory=[],
            expected_calibration_error=1.0,
            confidence_accuracy_correlation=0.0,
            diagnosis="No trajectory data.", recommendations=[],
        )