Spaces:
Sleeping
Sleeping
# server/confidence_calibrator.py
"""
Confidence Calibration Engine — v4.0

The key scientific question: Is the agent calibrated?
An agent is calibrated when its certainty level (inferred from behavior)
matches its likelihood of being correct.

Since agents don't expose probability distributions directly, we infer
confidence from behavioral proxies:
- How quickly did it commit to a hypothesis (read → write speed)?
- How much did it re-explore after writing (re-reads after write)?
- Did it verify (run_tests) before submitting?
- How many steps did it spend before the first write?

We then compare inferred confidence to actual accuracy (final_score).
Overconfident agents submit fast but score poorly.
Underconfident agents explore extensively but still score well.
Well-calibrated agents: confidence ≈ accuracy.

This is NOT measured by any existing benchmark or tracing tool.
"""
from __future__ import annotations

import math
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
class CalibrationProfile(str, Enum):
    """Qualitative label for how inferred confidence relates to actual score.

    Members are also ``str`` instances, so ``member.value`` (and the member
    itself) can be serialized as plain text.
    """

    WELL_CALIBRATED = "WELL_CALIBRATED"  # Confidence ≈ accuracy
    OVERCONFIDENT = "OVERCONFIDENT"      # High confidence, low accuracy
    UNDERCONFIDENT = "UNDERCONFIDENT"    # Low confidence, high accuracy
    ERRATIC = "ERRATIC"                  # Confidence changes randomly
@dataclass
class ConfidenceSample:
    """Inferred confidence at one point in the trajectory.

    Declared as a dataclass so it can be built with keyword arguments; a bare
    class with only annotations has no ``__init__`` accepting these fields.
    """

    step: int
    action_type: str
    inferred_confidence: float  # 0.0–1.0 based on behavioral proxy
    actual_accuracy: Optional[float]  # test_pass_rate at this step if known
    calibration_error: Optional[float]  # |confidence - accuracy| if both known
@dataclass
class CalibrationReport:
    """Full confidence calibration analysis.

    Plain data holder for one episode's calibration result. Declared as a
    dataclass so it can be built with keyword arguments. ``to_dict`` gives a
    serializable view in which every float is rounded to three decimals.
    """

    episode_id: str
    task: str
    profile: "CalibrationProfile"
    calibration_score: float  # 1.0 = perfectly calibrated

    # Inferred overall confidence level (behavioral proxy)
    inferred_confidence: float  # 0.0–1.0
    actual_performance: float  # final_score

    # Decomposed signals
    commitment_speed: float  # How fast did agent commit? (0=slow/careful, 1=fast)
    re_exploration_rate: float  # Reads after first write / total reads
    verification_rate: float  # run_tests per write_file
    submit_speed: float  # 1 - submit step / max_steps (early submit = high = overconfident)

    # Trajectory of inferred confidence
    confidence_trajectory: "List[ConfidenceSample]"

    # Calibration error
    expected_calibration_error: float  # Mean(|conf - acc|) where acc is known
    confidence_accuracy_correlation: float  # Should be high for good agents

    diagnosis: str
    recommendations: List[str]

    def to_dict(self) -> dict:
        """Return the report as a nested dict of built-in types.

        Floats are rounded to 3 decimals; per-step ``accuracy`` and ``error``
        stay ``None`` when they were not known for that step. The four
        behavioral signals are grouped under the ``"signals"`` key.
        """

        def rounded_or_none(value):
            # Optional per-step values must survive as None, not crash round().
            return round(value, 3) if value is not None else None

        trajectory = [
            {
                "step": sample.step,
                "action": sample.action_type,
                "confidence": round(sample.inferred_confidence, 3),
                "accuracy": rounded_or_none(sample.actual_accuracy),
                "error": rounded_or_none(sample.calibration_error),
            }
            for sample in self.confidence_trajectory
        ]
        return {
            "episode_id": self.episode_id,
            "task": self.task,
            "profile": self.profile.value,
            "calibration_score": round(self.calibration_score, 3),
            "inferred_confidence": round(self.inferred_confidence, 3),
            "actual_performance": round(self.actual_performance, 3),
            "signals": {
                "commitment_speed": round(self.commitment_speed, 3),
                "re_exploration_rate": round(self.re_exploration_rate, 3),
                "verification_rate": round(self.verification_rate, 3),
                "submit_speed": round(self.submit_speed, 3),
            },
            "expected_calibration_error": round(self.expected_calibration_error, 3),
            "confidence_accuracy_correlation": round(self.confidence_accuracy_correlation, 3),
            "confidence_trajectory": trajectory,
            "diagnosis": self.diagnosis,
            "recommendations": self.recommendations,
        }
class ConfidenceCalibrator:
    """
    Infers behavioral confidence and compares to actual performance.

    Confidence proxy model:
    - Reading files = low confidence (still exploring)
    - Writing files = medium-high confidence (committed to hypothesis)
    - Running tests = verification (moderate, checking own hypothesis)
    - Submitting = maximum commitment (fully confident)

    Each action type has a confidence weight:
        read_file:   0.2   (exploring, uncertain)
        search_code: 0.3   (slightly more directed)
        run_tests:   0.6   (confident enough to test)
        write_file:  0.75  (committed to hypothesis)
        submit:      1.0   (maximum confidence)

    We track how this evolves over the trajectory.
    """

    # Base confidence per action type; unknown action types fall back to 0.3.
    ACTION_CONFIDENCE = {
        "read_file": 0.2,
        "search_code": 0.3,
        "run_tests": 0.6,
        "write_file": 0.75,
        "submit": 1.0,
    }

    def calibrate(
        self,
        episode_id: str,
        task: str,
        trajectory_steps: List[dict],
        final_score: float,
        max_steps: int = 20,
    ) -> "CalibrationReport":
        """Compute the full calibration report for one episode.

        Args:
            episode_id: Identifier copied verbatim into the report.
            task: Task name copied verbatim into the report.
            trajectory_steps: One dict per step. Keys read here are
                ``action_type`` (default ``"read_file"``), ``step_number``
                (default: 1-based position in the list), ``action_path`` and
                ``test_pass_rate`` (both optional).
            final_score: Actual performance the inferred confidence is
                compared against.
            max_steps: Step budget used to normalise the submit step.

        Returns:
            A ``CalibrationReport``; the "no trajectory" report when
            ``trajectory_steps`` is empty.
        """
        if not trajectory_steps:
            return self._empty_report(episode_id, task, final_score)

        # ── Build confidence trajectory ──────────────────────────────────────
        confidence_traj = [
            ConfidenceSample(
                step=step_n,
                action_type=atype,
                inferred_confidence=inferred,
                actual_accuracy=actual_acc,
                calibration_error=calib_err,
            )
            for step_n, atype, inferred, actual_acc, calib_err
            in self._step_confidences(trajectory_steps)
        ]

        # ── Behavioral signal computation ────────────────────────────────────
        signals = self._behavioral_signals(trajectory_steps, max_steps)
        commitment_speed = signals["commitment_speed"]
        re_exploration_rate = signals["re_exploration_rate"]
        verification_rate = signals["verification_rate"]
        submit_speed = signals["submit_speed"]

        # ── Inferred overall confidence ──────────────────────────────────────
        # Weighted behavioral proxy.
        # NOTE(review): verification_rate is a ratio and can exceed 1.0 (many
        # test runs per write), so its term is not bounded by its 0.15 weight;
        # only the final clamp bounds the sum. Confirm whether that is intended.
        final_step_conf = confidence_traj[-1].inferred_confidence if confidence_traj else 0.5
        inferred_confidence = (
            commitment_speed * 0.30
            + (1.0 - re_exploration_rate) * 0.15
            + verification_rate * 0.15
            + submit_speed * 0.20
            + final_step_conf * 0.20
        )
        inferred_confidence = min(1.0, max(0.0, inferred_confidence))

        # ── Calibration error (where we have both conf + acc) ────────────────
        calib_errors = [
            sample.calibration_error
            for sample in confidence_traj
            if sample.calibration_error is not None
        ]
        if calib_errors:
            ece = sum(calib_errors) / len(calib_errors)
        else:
            # No per-step accuracy available: fall back to the episode-level gap.
            ece = abs(inferred_confidence - final_score)

        # ── Confidence-accuracy correlation ──────────────────────────────────
        paired = [
            (sample.inferred_confidence, sample.actual_accuracy)
            for sample in confidence_traj
            if sample.actual_accuracy is not None
        ]
        if len(paired) >= 2:
            corr = self._pearson_r([c for c, _ in paired], [a for _, a in paired])
        else:
            # Fallback: use final point only
            corr = 1.0 - abs(inferred_confidence - final_score) * 2
        corr = max(-1.0, min(1.0, corr))

        # ── Calibration score ────────────────────────────────────────────────
        calibration_score = max(0.0, 1.0 - ece) * 0.5 + max(0.0, corr) * 0.5
        calibration_score = max(0.0, min(1.0, calibration_score))

        # ── Profile classification ───────────────────────────────────────────
        conf_diff = inferred_confidence - final_score
        if abs(conf_diff) <= 0.2:
            profile = CalibrationProfile.WELL_CALIBRATED
        elif conf_diff > 0.2:
            profile = CalibrationProfile.OVERCONFIDENT
        elif conf_diff < -0.2:
            profile = CalibrationProfile.UNDERCONFIDENT
        else:
            # Only reachable when conf_diff is NaN (every comparison is False).
            profile = CalibrationProfile.ERRATIC

        # ── Diagnosis ────────────────────────────────────────────────────────
        diagnoses = {
            CalibrationProfile.WELL_CALIBRATED: (
                f"Agent is well-calibrated: inferred confidence ({inferred_confidence:.2f}) "
                f"closely matches actual performance ({final_score:.2f}). "
                "This indicates genuine self-awareness — the agent commits when ready and "
                "explores when uncertain."
            ),
            CalibrationProfile.OVERCONFIDENT: (
                f"Agent is overconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"significantly exceeds actual performance ({final_score:.2f}). "
                "Agent committed to a hypothesis too early, skipped verification, "
                "or submitted without adequate exploration. This is the profile of agents "
                "that 'feel certain but are wrong'."
            ),
            CalibrationProfile.UNDERCONFIDENT: (
                f"Agent is underconfident: behavioral confidence ({inferred_confidence:.2f}) "
                f"is well below actual performance ({final_score:.2f}). "
                "Agent explored far more than necessary, re-read files unnecessarily, "
                "or hesitated to commit despite having the right information. "
                "This wastes compute and steps without improving accuracy."
            ),
            CalibrationProfile.ERRATIC: (
                "Agent calibration is erratic — confidence signals are inconsistent "
                "with behavior. The agent may be applying a rigid strategy regardless "
                "of the task difficulty."
            ),
        }

        recs = []
        if profile == CalibrationProfile.OVERCONFIDENT:
            recs.append("Read more files before writing — commit only when you've seen the full causal chain.")
            recs.append("Always run_tests after writing — don't trust your fix without verification.")
        elif profile == CalibrationProfile.UNDERCONFIDENT:
            recs.append("Commit to hypotheses earlier — excessive re-reading wastes steps.")
            recs.append("After reading tests + source files, write your fix. Stop re-reading.")
        if verification_rate < 0.5:
            recs.append("Increase test verification rate: run_tests after each write.")
        if re_exploration_rate > 0.5:
            recs.append("High re-exploration after writing suggests uncalibrated hypothesis formation.")

        return CalibrationReport(
            episode_id=episode_id,
            task=task,
            profile=profile,
            calibration_score=calibration_score,
            inferred_confidence=inferred_confidence,
            actual_performance=final_score,
            commitment_speed=commitment_speed,
            re_exploration_rate=re_exploration_rate,
            verification_rate=verification_rate,
            submit_speed=submit_speed,
            confidence_trajectory=confidence_traj,
            expected_calibration_error=ece,
            confidence_accuracy_correlation=corr,
            diagnosis=diagnoses[profile],
            recommendations=recs,
        )

    @staticmethod
    def _step_numbers(trajectory_steps: List[dict]) -> List[int]:
        """Return each step's ``step_number``, defaulting to its 1-based position.

        One consistent default keeps a step without ``step_number`` from being
        ordered before itself when steps are compared with each other.
        """
        return [
            step.get("step_number", position)
            for position, step in enumerate(trajectory_steps, start=1)
        ]

    def _step_confidences(self, trajectory_steps: List[dict]) -> List[tuple]:
        """Score every step; return ``(step, action, confidence, accuracy, error)`` tuples.

        Confidence is the action's base weight, plus up to 0.1 for progress
        through the episode, plus 0.05 per earlier write (capped at 0.15),
        minus 0.1 when the step re-reads a path touched by an earlier step.
        The result is clamped to [0, 1]. ``accuracy`` is the step's
        ``test_pass_rate`` (or ``None``) and ``error`` the absolute gap.
        """
        numbers = self._step_numbers(trajectory_steps)
        total_steps = max(len(trajectory_steps), 1)

        # Trajectory-wide facts, computed once instead of once per step.
        write_numbers = [
            n for n, step in zip(numbers, trajectory_steps)
            if step.get("action_type") == "write_file"
        ]
        touched = [
            (step.get("action_path"), n)
            for n, step in zip(numbers, trajectory_steps)
        ]

        rows = []
        for step_n, step in zip(numbers, trajectory_steps):
            atype = step.get("action_type", "read_file")
            path = step.get("action_path")

            base_conf = self.ACTION_CONFIDENCE.get(atype, 0.3)
            # Confidence grows as episode progresses
            progress_bonus = (step_n / total_steps) * 0.1
            # After a write, confidence should be higher
            prior_writes = sum(1 for n in write_numbers if n < step_n)
            post_write_bonus = min(0.15, prior_writes * 0.05)
            # Re-reads slightly lower confidence. A read without a path cannot
            # be a re-read, so path-less steps never match each other.
            is_reread = (
                atype == "read_file"
                and path is not None
                and any(p == path and n < step_n for p, n in touched)
            )
            reread_penalty = -0.1 if is_reread else 0.0

            inferred = min(1.0, max(0.0,
                base_conf + progress_bonus + post_write_bonus + reread_penalty
            ))

            # Actual accuracy at this step if test_pass_rate is known
            actual_acc = step.get("test_pass_rate")
            calib_err = abs(inferred - actual_acc) if actual_acc is not None else None
            rows.append((step_n, atype, inferred, actual_acc, calib_err))
        return rows

    def _behavioral_signals(self, trajectory_steps: List[dict], max_steps: int) -> Dict[str, float]:
        """Return the four episode-level behavioral signals as a dict.

        Keys: ``commitment_speed``, ``re_exploration_rate``,
        ``verification_rate``, ``submit_speed``.
        """
        action_types = [step.get("action_type", "read_file") for step in trajectory_steps]
        numbers = self._step_numbers(trajectory_steps)
        total = max(len(trajectory_steps), 1)

        read_steps = [i for i, a in enumerate(action_types) if a == "read_file"]
        write_steps = [i for i, a in enumerate(action_types) if a == "write_file"]
        # Step number of the first submit; the episode length when never submitted.
        submit_step = next(
            (n for n, a in zip(numbers, action_types) if a == "submit"),
            total,
        )

        # Commitment speed: how many reads before first write?
        if write_steps:
            first_write = write_steps[0]
            reads_before_first_write = sum(1 for r in read_steps if r < first_write)
            # Low reads before write = high commitment speed = overconfident
            commitment_speed = max(0.0, 1.0 - reads_before_first_write / total)
        else:
            commitment_speed = 0.0  # Never wrote = very cautious

        # Re-exploration rate: reads after first write / total reads
        if write_steps and read_steps:
            reads_after_write = sum(1 for r in read_steps if r > write_steps[0])
            re_exploration_rate = reads_after_write / len(read_steps)
        else:
            re_exploration_rate = 0.0

        # Verification rate: run_tests per write
        test_count = action_types.count("run_tests")
        write_count = len(write_steps)
        verification_rate = test_count / max(write_count, 1)

        # Submit speed: earlier = more overconfident
        submit_speed = 1.0 - (submit_step / max(max_steps, 1))
        submit_speed = max(0.0, min(1.0, submit_speed))

        return {
            "commitment_speed": commitment_speed,
            "re_exploration_rate": re_exploration_rate,
            "verification_rate": verification_rate,
            "submit_speed": submit_speed,
        }

    def _pearson_r(self, xs: List[float], ys: List[float]) -> float:
        """Return the Pearson correlation of *xs* and *ys*.

        Returns 0.0 when the correlation is undefined (fewer than two pairs,
        or either series has zero variance). If the lengths differ, only the
        common prefix is used. The result is clamped to [-1, 1] to absorb
        floating-point drift.
        """
        n = min(len(xs), len(ys))
        if n < 2:
            return 0.0
        xs, ys = xs[:n], ys[:n]
        mx, my = sum(xs) / n, sum(ys) / n
        num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
        dx = math.sqrt(sum((x - mx) ** 2 for x in xs))
        dy = math.sqrt(sum((y - my) ** 2 for y in ys))
        denom = dx * dy
        if denom == 0:
            return 0.0
        return max(-1.0, min(1.0, num / denom))

    def _empty_report(self, episode_id: str, task: str, final_score: float) -> "CalibrationReport":
        """Return the placeholder report used when there is no trajectory data."""
        return CalibrationReport(
            episode_id=episode_id,
            task=task,
            profile=CalibrationProfile.ERRATIC,
            calibration_score=0.0,
            inferred_confidence=0.0,
            actual_performance=final_score,
            commitment_speed=0.0,
            re_exploration_rate=0.0,
            verification_rate=0.0,
            submit_speed=0.0,
            confidence_trajectory=[],
            expected_calibration_error=1.0,
            confidence_accuracy_correlation=0.0,
            diagnosis="No trajectory data.",
            recommendations=[],
        )