Spaces:
Running
Running
| """ | |
| swe_bench/evaluator.py | |
| ────────────────────── | |
| Evaluation harness for measuring agent performance on SWE-bench Lite. | |
| Metrics tracked: | |
| - resolved_count : how many issues the agent fixed (tests pass) | |
| - resolved_rate : resolved_count / total_instances | |
| - avg_attempts : average number of attempts taken per issue | |
| - token_cost : total token usage | |
| - per_instance : dict keyed by instance_id with detailed results | |
| A result is 'resolved' if ALL fail_to_pass tests now pass AND | |
| all pass_to_pass tests still pass (no regressions). | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import time | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Literal | |
| logger = logging.getLogger(__name__) | |
# ── Result dataclasses ────────────────────────────────────────────────────────
@dataclass
class AttemptResult:
    """Result of a single patch attempt.

    Declared as a dataclass so the annotated fields below become constructor
    parameters and ``token_cost`` gets a fresh ``dict`` per instance (the
    ``field(default_factory=dict)`` default only works under ``@dataclass``).
    """

    attempt_num: int  # attempt number; returned by InstanceResult.attempts_to_fix
    patch: str  # unified diff generated
    test_stdout: str  # raw pytest output
    fail_to_pass_results: dict[str, bool]  # test_id -> passed
    pass_to_pass_results: dict[str, bool]  # test_id -> still passing
    resolved: bool  # whether this attempt resolved the issue
    # Coarse label for the attempt's outcome; stays "unknown" unless set.
    failure_category: Literal[
        "syntax_error",
        "hallucinated_api",
        "wrong_file_edit",
        "incomplete_patch",
        "flaky_test",
        "retrieval_miss",
        "success",
        "unknown",
    ] = "unknown"
    elapsed_seconds: float = 0.0
    # Token usage for this attempt. NOTE(review): key names are not fixed
    # anywhere in this module -- confirm against the producer.
    token_cost: dict[str, int] = field(default_factory=dict)
@dataclass
class InstanceResult:
    """Aggregated result for one SWE-bench instance.

    Declared as a dataclass so the annotated fields below become constructor
    parameters.
    """

    instance_id: str
    repo: str
    resolved: bool
    attempts: list[AttemptResult]
    total_attempts: int
    total_tokens: int = 0
    total_elapsed: float = 0.0
    error: str = ""  # non-empty if agent crashed entirely

    @property
    def attempts_to_fix(self) -> int:
        """Return the attempt number that resolved the instance.

        Exposed as a property because the aggregation and serialisation code
        in this module reads it as a plain attribute (``r.attempts_to_fix``).

        Returns:
            ``attempt_num`` of the first attempt in ``attempts`` whose
            ``resolved`` flag is set, or ``total_attempts`` when no attempt
            resolved the instance (including when ``attempts`` is empty).
        """
        for attempt in self.attempts:
            if attempt.resolved:
                return attempt.attempt_num
        return self.total_attempts
@dataclass
class EvalReport:
    """Aggregate evaluation metrics over all instances.

    Declared as a dataclass so it can be built both positionally and by
    keyword, which is how ``aggregate_results`` constructs it.
    """

    total_instances: int
    resolved_count: int
    resolved_rate: float
    avg_attempts: float
    total_tokens: int
    avg_tokens_per_instance: float
    avg_elapsed_seconds: float
    failure_categories: dict[str, int]  # category -> count
    per_instance: dict[str, InstanceResult]  # keyed by instance_id

    def to_dict(self) -> dict:
        """Return the summary metrics as a JSON-serialisable dict.

        ``per_instance`` is deliberately left out; only the aggregate numbers
        (rounded for readability) and the failure-category counts are kept.
        """
        return {
            "total_instances": self.total_instances,
            "resolved_count": self.resolved_count,
            "resolved_rate": round(self.resolved_rate, 4),
            "avg_attempts": round(self.avg_attempts, 3),
            "total_tokens": self.total_tokens,
            "avg_tokens_per_instance": round(self.avg_tokens_per_instance, 1),
            "avg_elapsed_seconds": round(self.avg_elapsed_seconds, 2),
            "failure_categories": self.failure_categories,
        }

    def print_summary(self) -> None:
        """Pretty-print summary to stdout.

        Renders a ``rich`` table when ``rich`` is importable and falls back
        to plain ``print`` output otherwise.
        """
        # Only the import can legitimately raise ImportError, so keep the
        # ``try`` limited to it rather than wrapping all of the rendering.
        try:
            from rich.console import Console
            from rich.table import Table
        except ImportError:
            # Fallback if rich is not installed
            print("\n=== SWE-bench Lite Evaluation Summary ===")
            print(f"Total instances : {self.total_instances}")
            print(f"Resolved count : {self.resolved_count}")
            print(f"Resolved rate : {self.resolved_rate:.1%}")
            print(f"Avg attempts : {self.avg_attempts:.2f}")
            print(f"Total tokens : {self.total_tokens:,}")
            # Report the same averages the rich table shows.
            print(f"Avg tokens / issue : {self.avg_tokens_per_instance:,.0f}")
            print(f"Avg elapsed (s) : {self.avg_elapsed_seconds:.1f}")
            print(f"Failure categories: {self.failure_categories}")
            return

        console = Console()
        # NOTE(review): the banner glyphs look mis-encoded (probably
        # box-drawing characters originally) -- confirm the intended text.
        console.print("\n[bold cyan]βββ SWE-bench Lite Evaluation Summary βββ[/bold cyan]")
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Metric", style="dim")
        table.add_column("Value", justify="right")
        table.add_row("Total instances", str(self.total_instances))
        table.add_row("Resolved count", f"[green]{self.resolved_count}[/green]")
        table.add_row("Resolved rate", f"[green]{self.resolved_rate:.1%}[/green]")
        table.add_row("Avg attempts to fix", str(round(self.avg_attempts, 2)))
        table.add_row("Total tokens", f"{self.total_tokens:,}")
        table.add_row("Avg tokens / issue", f"{self.avg_tokens_per_instance:,.0f}")
        table.add_row("Avg elapsed (s)", str(round(self.avg_elapsed_seconds, 1)))
        console.print(table)
        if self.failure_categories:
            console.print("\n[bold]Failure categories:[/bold]")
            # Most frequent category first.
            for cat, cnt in sorted(
                self.failure_categories.items(), key=lambda x: -x[1]
            ):
                console.print(f" {cat}: {cnt}")
# ── Aggregation helper ────────────────────────────────────────────────────────
def aggregate_results(instance_results: list[InstanceResult]) -> EvalReport:
    """Fold per-instance results into a single ``EvalReport``.

    An empty input yields an all-zero report with empty mappings. Otherwise
    every average is taken over *all* instances, and failure categories are
    tallied from the last attempt of each unresolved instance that has at
    least one recorded attempt.
    """
    count = len(instance_results)
    if not count:
        return EvalReport(0, 0, 0.0, 0.0, 0, 0.0, 0.0, {}, {})

    solved = len([item for item in instance_results if item.resolved])
    attempts_sum = sum([item.attempts_to_fix for item in instance_results])
    token_sum = sum(item.total_tokens for item in instance_results)
    elapsed_sum = sum(item.total_elapsed for item in instance_results)

    # Tally the final attempt's category for each unresolved instance.
    tally: dict[str, int] = {}
    for item in instance_results:
        if item.resolved or not item.attempts:
            continue
        label = item.attempts[-1].failure_category
        tally[label] = tally.get(label, 0) + 1

    by_id = {}
    for item in instance_results:
        by_id[item.instance_id] = item

    return EvalReport(
        total_instances=count,
        resolved_count=solved,
        resolved_rate=solved / count,
        avg_attempts=attempts_sum / count,
        total_tokens=token_sum,
        avg_tokens_per_instance=token_sum / count,
        avg_elapsed_seconds=elapsed_sum / count,
        failure_categories=tally,
        per_instance=by_id,
    )
def save_results(report: EvalReport, output_dir: Path) -> None:
    """Write the evaluation report to ``output_dir`` as JSON files.

    Creates ``output_dir`` (and any missing parents) if needed, then writes:

    - ``eval_summary.json``: ``report.to_dict()`` pretty-printed with a
      two-space indent.
    - ``per_instance_results.jsonl``: one JSON object per entry of
      ``report.per_instance``, one per line.
    """
    target = Path(output_dir)
    target.mkdir(parents=True, exist_ok=True)

    summary_file = target / "eval_summary.json"
    summary_text = json.dumps(report.to_dict(), indent=2)
    summary_file.write_text(summary_text)
    logger.info("Summary saved to %s", summary_file)

    details_file = target / "per_instance_results.jsonl"
    with details_file.open("w") as handle:
        for key, item in report.per_instance.items():
            row = json.dumps(
                {
                    "instance_id": key,
                    "repo": item.repo,
                    "resolved": item.resolved,
                    "total_attempts": item.total_attempts,
                    "attempts_to_fix": item.attempts_to_fix,
                    "total_tokens": item.total_tokens,
                    "error": item.error,
                }
            )
            handle.write(row + "\n")
    logger.info("Per-instance results saved to %s", details_file)