codebase-nav-env / server /benchmark_runner.py
Chirag0123's picture
v4 Research Modules & Pre-submission tweaks
0b0338d
# server/benchmark_runner.py
"""
Benchmark Runner + Leaderboard — v4.0
Automatically runs ALL tasks × selected agent configurations and generates
a research-grade leaderboard output with per-task, per-strategy breakdowns.
Unlike existing benchmarks (SWE-bench, HumanEval) which require manual setup,
this runs end-to-end in-process with deterministic strategies.
Output format:
- Leaderboard table (ranked by composite score)
- Per-task breakdown
- Per-failure-type breakdown
- Generalization score (variance across tasks)
- Robustness score (from counterfactual engine)
- A "benchmark JSON" suitable for publishing or comparing systems
"""
from __future__ import annotations
import time
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
@dataclass
class BenchmarkResult:
"""Result of running one agent on one task variant."""
agent_name: str
task: str
variant_id: str
final_score: float
total_steps: int
cumulative_reward: float
duration_seconds: float
strategy: str
failure_type: str
reliability_index: float
causal_score: float
robustness_score: float
calibration_score: float
action_sequence: List[str]
@dataclass
class AgentBenchmarkSummary:
"""Aggregated results for one agent across all tasks."""
agent_name: str
tasks_run: int
mean_score: float
std_score: float
generalization_score: float # 1 - std (lower variance = more generalizable)
mean_steps: float
best_task: str
worst_task: str
mean_reliability: float
mean_causal_score: float
mean_robustness_score: float
mean_calibration_score: float
dominant_strategy: str
dominant_failure: str
composite_rank_score: float # Weighted final score for leaderboard
per_task_scores: Dict[str, float]
def to_dict(self) -> dict:
return {
"agent_name": self.agent_name,
"tasks_run": self.tasks_run,
"scores": {
"mean": round(self.mean_score, 3),
"std": round(self.std_score, 3),
"generalization": round(self.generalization_score, 3),
"reliability": round(self.mean_reliability, 3),
"causal_reasoning": round(self.mean_causal_score, 3),
"robustness": round(self.mean_robustness_score, 3),
"calibration": round(self.mean_calibration_score, 3),
"composite": round(self.composite_rank_score, 3),
},
"efficiency": {
"mean_steps": round(self.mean_steps, 1),
},
"behavior": {
"dominant_strategy": self.dominant_strategy,
"dominant_failure": self.dominant_failure,
},
"per_task_scores": {k: round(v, 3) for k, v in self.per_task_scores.items()},
"best_task": self.best_task,
"worst_task": self.worst_task,
}
@dataclass
class LeaderboardReport:
"""Full benchmark leaderboard."""
benchmark_id: str
tasks_evaluated: List[str]
agents_evaluated: List[str]
total_episodes: int
run_duration_seconds: float
rankings: List[AgentBenchmarkSummary]
raw_results: List[BenchmarkResult]
def to_dict(self) -> dict:
return {
"benchmark_id": self.benchmark_id,
"tasks_evaluated": self.tasks_evaluated,
"agents_evaluated": self.agents_evaluated,
"total_episodes": self.total_episodes,
"run_duration_seconds": round(self.run_duration_seconds, 2),
"leaderboard": [r.to_dict() for r in self.rankings],
"winner": self.rankings[0].agent_name if self.rankings else "none",
"insights": self._generate_insights(),
}
def _generate_insights(self) -> List[str]:
if not self.rankings:
return []
insights = []
top = self.rankings[0]
bottom = self.rankings[-1]
if top.composite_rank_score - bottom.composite_rank_score > 0.2:
insights.append(
f"Large performance gap: '{top.agent_name}' ({top.composite_rank_score:.2f}) "
f"vs '{bottom.agent_name}' ({bottom.composite_rank_score:.2f})"
)
if top.generalization_score > 0.7:
insights.append(
f"'{top.agent_name}' shows strong generalization "
f"(std={top.std_score:.3f} across {top.tasks_run} tasks)"
)
for r in self.rankings:
if r.mean_causal_score > 0.6:
insights.append(
f"'{r.agent_name}' demonstrated genuine causal reasoning "
f"(causal_score={r.mean_causal_score:.2f})"
)
strategies = [r.dominant_strategy for r in self.rankings]
if len(set(strategies)) > 1:
best_strategy = self.rankings[0].dominant_strategy
insights.append(
f"Strategy '{best_strategy}' produced the highest composite score."
)
return insights
def render_table(self) -> str:
"""Render ASCII leaderboard table."""
if not self.rankings:
return "No results."
lines = [
f"{'═'*90}",
f" 🏆 BENCHMARK LEADERBOARD — {self.benchmark_id}",
f" Tasks: {', '.join(self.tasks_evaluated)} | Agents: {len(self.agents_evaluated)} | Episodes: {self.total_episodes}",
f"{'═'*90}",
f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Causal':<8} {'Robust':<8} {'Calibr':<8} {'Genrz':<8} {'Steps':<7} {'Strategy'}",
f"{'─'*90}",
]
for i, r in enumerate(self.rankings):
medal = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f" #{i+1}"
lines.append(
f"{medal:<5} {r.agent_name:<16} {r.mean_score:<8.3f} "
f"{r.mean_causal_score:<8.3f} {r.mean_robustness_score:<8.3f} "
f"{r.mean_calibration_score:<8.3f} {r.generalization_score:<8.3f} "
f"{r.mean_steps:<7.1f} {r.dominant_strategy}"
)
lines.append(f"{'═'*90}")
lines.append("\n📊 Per-Task Breakdown:")
for r in self.rankings:
task_str = " | ".join(f"{t}: {s:.2f}" for t, s in sorted(r.per_task_scores.items()))
lines.append(f" {r.agent_name:<16} {task_str}")
if self._generate_insights():
lines.append("\n💡 Insights:")
lines.extend(f" → {i}" for i in self._generate_insights())
return "\n".join(lines)
class BenchmarkRunner:
"""
Automated benchmark runner.
Runs each agent in AGENT_CONFIGS across each task, collecting:
- Final score
- All intelligence metrics (causal, counterfactual, confidence)
- Strategy and failure classification
- Reliability index
Then generates a ranked leaderboard.
"""
def run(
self,
env,
tasks: Optional[List[str]] = None,
agents: Optional[List[str]] = None,
benchmark_id: Optional[str] = None,
) -> LeaderboardReport:
"""Run the full benchmark."""
import uuid
from server.models import RepoAction
from server.strategy_detector import StrategyDetector
from server.failure_classifier import FailureClassifier
from server.advanced_metrics import AdvancedMetricsEngine
from server.causal_probe import CausalProbe
from server.counterfactual_engine import CounterfactualEngine
from server.confidence_calibrator import ConfidenceCalibrator
benchmark_id = benchmark_id or f"bench_{uuid.uuid4().hex[:8]}"
tasks = tasks or ["task1", "task2", "task3"]
agent_configs = self._get_agent_configs()
if agents:
agent_configs = {k: v for k, v in agent_configs.items() if k in agents}
clf = FailureClassifier()
det = StrategyDetector()
adv = AdvancedMetricsEngine()
causal = CausalProbe()
counter = CounterfactualEngine()
calibrator = ConfidenceCalibrator()
start_time = time.time()
all_results: List[BenchmarkResult] = []
for task in tasks:
for agent_name, agent_fn in agent_configs.items():
try:
result = self._run_episode(
env, task, agent_name, agent_fn,
clf, det, adv, causal, counter, calibrator
)
all_results.append(result)
except Exception as e:
# Don't crash the whole benchmark on one failure
all_results.append(BenchmarkResult(
agent_name=agent_name, task=task, variant_id="error",
final_score=0.0, total_steps=0, cumulative_reward=0.0,
duration_seconds=0.0, strategy="ERROR", failure_type="BENCHMARK_ERROR",
reliability_index=0.0, causal_score=0.0, robustness_score=0.0,
calibration_score=0.0, action_sequence=[],
))
total_duration = time.time() - start_time
rankings = self._compute_rankings(all_results, tasks)
return LeaderboardReport(
benchmark_id=benchmark_id,
tasks_evaluated=tasks,
agents_evaluated=list(agent_configs.keys()),
total_episodes=len(all_results),
run_duration_seconds=total_duration,
rankings=rankings,
raw_results=all_results,
)
def _run_episode(
self, env, task, agent_name, agent_fn,
clf, det, adv, causal, counter, calibrator
) -> BenchmarkResult:
from server.models import RepoAction
reset_result = env.reset(task=task)
obs = reset_result.observation
variant_id = reset_result.info.get("variant_id", "unknown")
context = {}
obs_dict = obs.model_dump()
start = time.time()
cumulative_reward = 0.0
files_read, files_written, action_sequence = [], [], []
max_steps = 15
for step_num in range(1, max_steps + 1):
if env.done:
break
action_dict = agent_fn(obs_dict, step_num, context)
action = RepoAction(
action_type=action_dict.get("action_type", "submit"),
path=action_dict.get("path"),
query=action_dict.get("query"),
)
result = env.step(action)
obs = result.observation
obs_dict = obs.model_dump()
cumulative_reward += result.reward
action_sequence.append(action.action_type)
if action.path and action.action_type == "read_file":
files_read.append(action.path)
if action.path and action.action_type == "write_file":
files_written.append(action.path)
if result.done:
break
if not env.done:
r = env.step(RepoAction(action_type="submit"))
cumulative_reward += r.reward
action_sequence.append("submit")
duration = time.time() - start
final_score = env.final_score
traj = env.get_trajectory()
steps = traj.get("steps", []) if traj else []
meta = env.variant.meta if env.variant else {}
# Intelligence metrics
fail_r = clf.classify(
traj.get("episode_id", "") if traj else "", task,
steps, meta, files_read, files_written, final_score
)
strat_r = det.detect(steps, task, meta, files_read, final_score)
adv_r = adv.compute(steps, meta, final_score, files_read, files_written)
causal_r = causal.probe(
traj.get("episode_id", "") if traj else "", task,
steps, meta, files_read, files_written, final_score
)
counter_r = counter.analyze(
traj.get("episode_id", "") if traj else "", task,
steps, meta, files_read, files_written, final_score
)
calib_r = calibrator.calibrate(
traj.get("episode_id", "") if traj else "", task,
steps, final_score,
)
return BenchmarkResult(
agent_name=agent_name,
task=task,
variant_id=variant_id,
final_score=final_score,
total_steps=len(action_sequence),
cumulative_reward=cumulative_reward,
duration_seconds=duration,
strategy=strat_r.strategy,
failure_type=fail_r.primary_failure,
reliability_index=adv_r.reliability_index,
causal_score=causal_r.causal_score,
robustness_score=counter_r.robustness_score,
calibration_score=calib_r.calibration_score,
action_sequence=action_sequence,
)
def _compute_rankings(
self, results: List[BenchmarkResult], tasks: List[str]
) -> List[AgentBenchmarkSummary]:
import math
from collections import Counter
# Group by agent
agent_results: Dict[str, List[BenchmarkResult]] = {}
for r in results:
agent_results.setdefault(r.agent_name, []).append(r)
summaries = []
for agent_name, agent_res in agent_results.items():
scores = [r.final_score for r in agent_res]
mean_score = sum(scores) / len(scores)
if len(scores) > 1:
variance = sum((s - mean_score) ** 2 for s in scores) / len(scores)
std_score = math.sqrt(variance)
else:
std_score = 0.0
generalization_score = max(0.0, 1.0 - std_score)
per_task = {r.task: r.final_score for r in agent_res}
strategies = Counter(r.strategy for r in agent_res)
failures = Counter(r.failure_type for r in agent_res)
mean_steps = sum(r.total_steps for r in agent_res) / len(agent_res)
mean_reliability = sum(r.reliability_index for r in agent_res) / len(agent_res)
mean_causal = sum(r.causal_score for r in agent_res) / len(agent_res)
mean_robustness = sum(r.robustness_score for r in agent_res) / len(agent_res)
mean_calibration = sum(r.calibration_score for r in agent_res) / len(agent_res)
# Composite leaderboard score — weighted across all dimensions
composite = (
mean_score * 0.35 +
mean_causal * 0.20 +
mean_robustness * 0.15 +
mean_calibration * 0.15 +
generalization_score * 0.15
)
best_task = max(per_task, key=per_task.get)
worst_task = min(per_task, key=per_task.get)
summaries.append(AgentBenchmarkSummary(
agent_name=agent_name,
tasks_run=len(agent_res),
mean_score=mean_score,
std_score=std_score,
generalization_score=generalization_score,
mean_steps=mean_steps,
best_task=best_task,
worst_task=worst_task,
mean_reliability=mean_reliability,
mean_causal_score=mean_causal,
mean_robustness_score=mean_robustness,
mean_calibration_score=mean_calibration,
dominant_strategy=strategies.most_common(1)[0][0],
dominant_failure=failures.most_common(1)[0][0],
composite_rank_score=composite,
per_task_scores=per_task,
))
summaries.sort(key=lambda s: -s.composite_rank_score)
return summaries
def _get_agent_configs(self) -> Dict:
"""Reuse built-in strategies from multi_agent.py."""
from server.multi_agent import MultiAgentComparison
return MultiAgentComparison.AGENT_CONFIGS