Spaces:
Sleeping
Sleeping
| # server/benchmark_runner.py | |
| """ | |
| Benchmark Runner + Leaderboard — v4.0 | |
| Automatically runs ALL tasks × selected agent configurations and generates | |
| a research-grade leaderboard output with per-task, per-strategy breakdowns. | |
| Unlike existing benchmarks (SWE-bench, HumanEval) which require manual setup, | |
| this runs end-to-end in-process with deterministic strategies. | |
| Output format: | |
| - Leaderboard table (ranked by composite score) | |
| - Per-task breakdown | |
| - Per-failure-type breakdown | |
| - Generalization score (variance across tasks) | |
| - Robustness score (from counterfactual engine) | |
| - A "benchmark JSON" suitable for publishing or comparing systems | |
| """ | |
| from __future__ import annotations | |
| import time | |
| import json | |
| from typing import List, Dict, Any, Optional | |
| from dataclasses import dataclass, field | |
@dataclass
class BenchmarkResult:
    """Result of running one agent on one task variant.

    Declared as a dataclass so it can be built with keyword arguments, which
    is how ``BenchmarkRunner`` creates it both for completed episodes and for
    the placeholder recorded when an episode raises.
    """

    agent_name: str  # key of the agent in the agent-config mapping
    task: str  # task identifier passed to ``env.reset``
    variant_id: str  # from the reset info; "unknown" if absent, "error" on failure
    final_score: float  # ``env.final_score`` at the end of the episode
    total_steps: int  # number of actions taken, including a forced submit
    cumulative_reward: float  # sum of per-step rewards
    duration_seconds: float  # wall-clock time spent stepping the environment
    strategy: str  # strategy label; "ERROR" for failed episodes
    failure_type: str  # primary failure label; "BENCHMARK_ERROR" for failed episodes
    reliability_index: float  # from the advanced-metrics engine
    causal_score: float  # from the causal probe
    robustness_score: float  # from the counterfactual engine
    calibration_score: float  # from the confidence calibrator
    action_sequence: List[str]  # action types in the order they were executed
@dataclass
class AgentBenchmarkSummary:
    """Aggregated results for one agent across all tasks.

    Declared as a dataclass so it can be built with keyword arguments, which
    is how the ranking code creates it.
    """

    agent_name: str
    tasks_run: int  # number of episodes aggregated for this agent
    mean_score: float
    std_score: float
    generalization_score: float  # 1 - std (lower variance = more generalizable)
    mean_steps: float
    best_task: str
    worst_task: str
    mean_reliability: float
    mean_causal_score: float
    mean_robustness_score: float
    mean_calibration_score: float
    dominant_strategy: str  # most frequent strategy label across episodes
    dominant_failure: str  # most frequent failure label across episodes
    composite_rank_score: float  # Weighted final score for leaderboard
    per_task_scores: Dict[str, float]  # task name -> final score

    def to_dict(self) -> dict:
        """Return a JSON-friendly nested dict of this summary.

        Score-like values are rounded to 3 decimals and ``mean_steps`` to 1.
        """
        scores = {
            "mean": round(self.mean_score, 3),
            "std": round(self.std_score, 3),
            "generalization": round(self.generalization_score, 3),
            "reliability": round(self.mean_reliability, 3),
            "causal_reasoning": round(self.mean_causal_score, 3),
            "robustness": round(self.mean_robustness_score, 3),
            "calibration": round(self.mean_calibration_score, 3),
            "composite": round(self.composite_rank_score, 3),
        }
        return {
            "agent_name": self.agent_name,
            "tasks_run": self.tasks_run,
            "scores": scores,
            "efficiency": {
                "mean_steps": round(self.mean_steps, 1),
            },
            "behavior": {
                "dominant_strategy": self.dominant_strategy,
                "dominant_failure": self.dominant_failure,
            },
            "per_task_scores": {
                task: round(score, 3)
                for task, score in self.per_task_scores.items()
            },
            "best_task": self.best_task,
            "worst_task": self.worst_task,
        }
@dataclass
class LeaderboardReport:
    """Full benchmark leaderboard.

    Declared as a dataclass so it can be built with keyword arguments, which
    is how ``BenchmarkRunner.run`` creates it. ``rankings`` is expected to be
    ordered best-first: index 0 is reported as the winner.
    """

    benchmark_id: str
    tasks_evaluated: List[str]
    agents_evaluated: List[str]
    total_episodes: int
    run_duration_seconds: float
    rankings: List[AgentBenchmarkSummary]
    raw_results: List[BenchmarkResult]

    def to_dict(self) -> dict:
        """Return a JSON-friendly summary of the leaderboard.

        ``raw_results`` is not included; only the per-agent rankings,
        the winner ("none" when there are no rankings) and the insights.
        """
        return {
            "benchmark_id": self.benchmark_id,
            "tasks_evaluated": self.tasks_evaluated,
            "agents_evaluated": self.agents_evaluated,
            "total_episodes": self.total_episodes,
            "run_duration_seconds": round(self.run_duration_seconds, 2),
            "leaderboard": [r.to_dict() for r in self.rankings],
            "winner": self.rankings[0].agent_name if self.rankings else "none",
            "insights": self._generate_insights(),
        }

    def _generate_insights(self) -> List[str]:
        """Derive human-readable observations from the rankings.

        Returns an empty list when there are no rankings.
        """
        if not self.rankings:
            return []
        insights = []
        top = self.rankings[0]
        bottom = self.rankings[-1]
        # Spread between the first and last ranked agents.
        if top.composite_rank_score - bottom.composite_rank_score > 0.2:
            insights.append(
                f"Large performance gap: '{top.agent_name}' ({top.composite_rank_score:.2f}) "
                f"vs '{bottom.agent_name}' ({bottom.composite_rank_score:.2f})"
            )
        if top.generalization_score > 0.7:
            insights.append(
                f"'{top.agent_name}' shows strong generalization "
                f"(std={top.std_score:.3f} across {top.tasks_run} tasks)"
            )
        for r in self.rankings:
            if r.mean_causal_score > 0.6:
                insights.append(
                    f"'{r.agent_name}' demonstrated genuine causal reasoning "
                    f"(causal_score={r.mean_causal_score:.2f})"
                )
        # Only call out the winning strategy when agents actually differed.
        strategies = {r.dominant_strategy for r in self.rankings}
        if len(strategies) > 1:
            best_strategy = top.dominant_strategy
            insights.append(
                f"Strategy '{best_strategy}' produced the highest composite score."
            )
        return insights

    def render_table(self) -> str:
        """Render ASCII leaderboard table."""
        if not self.rankings:
            return "No results."
        lines = [
            f"{'═'*90}",
            f" 🏆 BENCHMARK LEADERBOARD — {self.benchmark_id}",
            f" Tasks: {', '.join(self.tasks_evaluated)} | Agents: {len(self.agents_evaluated)} | Episodes: {self.total_episodes}",
            f"{'═'*90}",
            f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Causal':<8} {'Robust':<8} {'Calibr':<8} {'Genrz':<8} {'Steps':<7} {'Strategy'}",
            f"{'─'*90}",
        ]
        for i, r in enumerate(self.rankings):
            medal = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else f" #{i+1}"
            lines.append(
                f"{medal:<5} {r.agent_name:<16} {r.mean_score:<8.3f} "
                f"{r.mean_causal_score:<8.3f} {r.mean_robustness_score:<8.3f} "
                f"{r.mean_calibration_score:<8.3f} {r.generalization_score:<8.3f} "
                f"{r.mean_steps:<7.1f} {r.dominant_strategy}"
            )
        lines.append(f"{'═'*90}")
        lines.append("\n📊 Per-Task Breakdown:")
        for r in self.rankings:
            task_str = " | ".join(f"{t}: {s:.2f}" for t, s in sorted(r.per_task_scores.items()))
            lines.append(f" {r.agent_name:<16} {task_str}")
        # Compute the insights once instead of once for the check and once
        # for the output.
        insights = self._generate_insights()
        if insights:
            lines.append("\n💡 Insights:")
            lines.extend(f" → {text}" for text in insights)
        return "\n".join(lines)
class BenchmarkRunner:
    """
    Automated benchmark runner.

    Runs each agent in AGENT_CONFIGS across each task, collecting:
    - Final score
    - All intelligence metrics (causal, counterfactual, confidence)
    - Strategy and failure classification
    - Reliability index
    Then generates a ranked leaderboard.
    """

    # Upper bound on agent-chosen actions per episode; a final "submit" is
    # forced afterwards if the environment still reports not done.
    MAX_STEPS_PER_EPISODE = 15

    def run(
        self,
        env,
        tasks: Optional[List[str]] = None,
        agents: Optional[List[str]] = None,
        benchmark_id: Optional[str] = None,
    ) -> LeaderboardReport:
        """Run the full benchmark.

        Args:
            env: Environment object; must provide ``reset(task=...)``,
                ``step(action)``, ``done``, ``final_score``,
                ``get_trajectory()`` and ``variant`` as used by
                ``_run_episode``.
            tasks: Task names to evaluate; defaults to
                ``["task1", "task2", "task3"]`` when empty or ``None``.
            agents: Optional subset of agent names to run. Names that are not
                in the agent-config mapping are skipped (and logged).
            benchmark_id: Identifier for the report; a random ``bench_<hex>``
                id is generated when omitted.

        Returns:
            A ``LeaderboardReport`` with one raw result per (task, agent)
            pair. Episodes that raise are recorded as zero-score results with
            strategy ``"ERROR"`` rather than aborting the run.
        """
        import logging
        import uuid
        # RepoAction is not used directly in this method; the import is kept
        # so a broken ``server.models`` raises here instead of being recorded
        # as a per-episode error inside the loop below.
        from server.models import RepoAction
        from server.strategy_detector import StrategyDetector
        from server.failure_classifier import FailureClassifier
        from server.advanced_metrics import AdvancedMetricsEngine
        from server.causal_probe import CausalProbe
        from server.counterfactual_engine import CounterfactualEngine
        from server.confidence_calibrator import ConfidenceCalibrator

        logger = logging.getLogger(__name__)

        benchmark_id = benchmark_id or f"bench_{uuid.uuid4().hex[:8]}"
        tasks = tasks or ["task1", "task2", "task3"]
        agent_configs = self._get_agent_configs()
        if agents:
            unknown = [name for name in agents if name not in agent_configs]
            if unknown:
                logger.warning("Ignoring unknown benchmark agents: %s", unknown)
            agent_configs = {k: v for k, v in agent_configs.items() if k in agents}

        clf = FailureClassifier()
        det = StrategyDetector()
        adv = AdvancedMetricsEngine()
        causal = CausalProbe()
        counter = CounterfactualEngine()
        calibrator = ConfidenceCalibrator()

        start_time = time.time()
        all_results: List[BenchmarkResult] = []
        for task in tasks:
            for agent_name, agent_fn in agent_configs.items():
                try:
                    result = self._run_episode(
                        env, task, agent_name, agent_fn,
                        clf, det, adv, causal, counter, calibrator
                    )
                    all_results.append(result)
                except Exception:
                    # Don't crash the whole benchmark on one failure, but do
                    # leave a trace of what went wrong.
                    logger.exception(
                        "Benchmark episode failed (agent=%s, task=%s)",
                        agent_name, task,
                    )
                    all_results.append(BenchmarkResult(
                        agent_name=agent_name, task=task, variant_id="error",
                        final_score=0.0, total_steps=0, cumulative_reward=0.0,
                        duration_seconds=0.0, strategy="ERROR", failure_type="BENCHMARK_ERROR",
                        reliability_index=0.0, causal_score=0.0, robustness_score=0.0,
                        calibration_score=0.0, action_sequence=[],
                    ))
        total_duration = time.time() - start_time

        rankings = self._compute_rankings(all_results, tasks)
        return LeaderboardReport(
            benchmark_id=benchmark_id,
            tasks_evaluated=tasks,
            agents_evaluated=list(agent_configs.keys()),
            total_episodes=len(all_results),
            run_duration_seconds=total_duration,
            rankings=rankings,
            raw_results=all_results,
        )

    def _run_episode(
        self, env, task, agent_name, agent_fn,
        clf, det, adv, causal, counter, calibrator
    ) -> BenchmarkResult:
        """Run one agent on one task and score the resulting trajectory.

        ``agent_fn`` is called as ``agent_fn(obs_dict, step_num, context)``
        and must return a dict with optional ``action_type`` (default
        ``"submit"``), ``path`` and ``query`` keys. ``context`` is a dict that
        starts empty and is passed to every call of the same episode.
        """
        from server.models import RepoAction

        reset_result = env.reset(task=task)
        variant_id = reset_result.info.get("variant_id", "unknown")
        context: Dict[str, Any] = {}
        obs_dict = reset_result.observation.model_dump()

        start = time.time()
        cumulative_reward = 0.0
        files_read: List[str] = []
        files_written: List[str] = []
        action_sequence: List[str] = []

        for step_num in range(1, self.MAX_STEPS_PER_EPISODE + 1):
            if env.done:
                break
            action_dict = agent_fn(obs_dict, step_num, context)
            action = RepoAction(
                action_type=action_dict.get("action_type", "submit"),
                path=action_dict.get("path"),
                query=action_dict.get("query"),
            )
            result = env.step(action)
            obs_dict = result.observation.model_dump()
            cumulative_reward += result.reward
            action_sequence.append(action.action_type)
            if action.path and action.action_type == "read_file":
                files_read.append(action.path)
            if action.path and action.action_type == "write_file":
                files_written.append(action.path)
            if result.done:
                break

        # Force a submit if the agent used up its step budget without
        # finishing the episode.
        if not env.done:
            r = env.step(RepoAction(action_type="submit"))
            cumulative_reward += r.reward
            action_sequence.append("submit")
        duration = time.time() - start

        final_score = env.final_score
        traj = env.get_trajectory()
        steps = traj.get("steps", []) if traj else []
        episode_id = traj.get("episode_id", "") if traj else ""
        meta = env.variant.meta if env.variant else {}

        # Intelligence metrics
        fail_r = clf.classify(
            episode_id, task,
            steps, meta, files_read, files_written, final_score
        )
        strat_r = det.detect(steps, task, meta, files_read, final_score)
        adv_r = adv.compute(steps, meta, final_score, files_read, files_written)
        causal_r = causal.probe(
            episode_id, task,
            steps, meta, files_read, files_written, final_score
        )
        counter_r = counter.analyze(
            episode_id, task,
            steps, meta, files_read, files_written, final_score
        )
        calib_r = calibrator.calibrate(
            episode_id, task,
            steps, final_score,
        )

        return BenchmarkResult(
            agent_name=agent_name,
            task=task,
            variant_id=variant_id,
            final_score=final_score,
            total_steps=len(action_sequence),
            cumulative_reward=cumulative_reward,
            duration_seconds=duration,
            strategy=strat_r.strategy,
            failure_type=fail_r.primary_failure,
            reliability_index=adv_r.reliability_index,
            causal_score=causal_r.causal_score,
            robustness_score=counter_r.robustness_score,
            calibration_score=calib_r.calibration_score,
            action_sequence=action_sequence,
        )

    @staticmethod
    def _mean(values: List[float]) -> float:
        """Arithmetic mean of a non-empty list."""
        return sum(values) / len(values)

    def _compute_rankings(
        self, results: List[BenchmarkResult], tasks: List[str]
    ) -> List[AgentBenchmarkSummary]:
        """Aggregate raw results per agent and rank by composite score.

        Args:
            results: Raw per-episode results (error placeholders included).
            tasks: Accepted for interface compatibility; not used here.

        Returns:
            One summary per agent, sorted by ``composite_rank_score``
            descending. Empty when ``results`` is empty.
        """
        import math
        from collections import Counter

        # Group by agent
        agent_results: Dict[str, List[BenchmarkResult]] = {}
        for r in results:
            agent_results.setdefault(r.agent_name, []).append(r)

        summaries = []
        for agent_name, agent_res in agent_results.items():
            scores = [r.final_score for r in agent_res]
            mean_score = self._mean(scores)
            if len(scores) > 1:
                # Population variance: the episodes are the whole set scored.
                variance = sum((s - mean_score) ** 2 for s in scores) / len(scores)
                std_score = math.sqrt(variance)
            else:
                std_score = 0.0
            generalization_score = max(0.0, 1.0 - std_score)

            # Average when the same task was run more than once, so repeated
            # runs are not silently reduced to the last one.
            task_scores: Dict[str, List[float]] = {}
            for r in agent_res:
                task_scores.setdefault(r.task, []).append(r.final_score)
            per_task = {t: self._mean(vals) for t, vals in task_scores.items()}

            strategies = Counter(r.strategy for r in agent_res)
            failures = Counter(r.failure_type for r in agent_res)

            mean_steps = self._mean([r.total_steps for r in agent_res])
            mean_reliability = self._mean([r.reliability_index for r in agent_res])
            mean_causal = self._mean([r.causal_score for r in agent_res])
            mean_robustness = self._mean([r.robustness_score for r in agent_res])
            mean_calibration = self._mean([r.calibration_score for r in agent_res])

            # Composite leaderboard score — weighted across all dimensions
            composite = (
                mean_score * 0.35 +
                mean_causal * 0.20 +
                mean_robustness * 0.15 +
                mean_calibration * 0.15 +
                generalization_score * 0.15
            )

            summaries.append(AgentBenchmarkSummary(
                agent_name=agent_name,
                tasks_run=len(agent_res),
                mean_score=mean_score,
                std_score=std_score,
                generalization_score=generalization_score,
                mean_steps=mean_steps,
                best_task=max(per_task, key=per_task.get),
                worst_task=min(per_task, key=per_task.get),
                mean_reliability=mean_reliability,
                mean_causal_score=mean_causal,
                mean_robustness_score=mean_robustness,
                mean_calibration_score=mean_calibration,
                dominant_strategy=strategies.most_common(1)[0][0],
                dominant_failure=failures.most_common(1)[0][0],
                composite_rank_score=composite,
                per_task_scores=per_task,
            ))

        summaries.sort(key=lambda s: s.composite_rank_score, reverse=True)
        return summaries

    def _get_agent_configs(self) -> Dict:
        """Reuse built-in strategies from multi_agent.py."""
        from server.multi_agent import MultiAgentComparison
        return MultiAgentComparison.AGENT_CONFIGS