| """Evaluation runner for Agent Cost Optimizer benchmarks.""" |
|
|
| import argparse |
| import json |
| import sys |
| from datetime import datetime |
| from pathlib import Path |
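# Make the repo root importable so the local "aco" package resolves when
# this script is run directly rather than installed as a package.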
sys.path.insert(0, str(Path(__file__).parent))

from aco.benchmarks.benchmark_suite import BenchmarkSuite
from aco.config import ACOConfig


def run_evaluation(num_tasks: int = 1000, seed: int = 42, output_dir: str = "./eval_results"):
    """Run the full evaluation suite and write all artifacts to output_dir."""
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
    print(f" Tasks: {num_tasks}")
    print(f" Seed: {seed}")
    print()

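    # Load config.yaml from the working directory when present; otherwise use defaults.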
    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
    suite = BenchmarkSuite(config)

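    # Step 1: generate a reproducible synthetic workload from the given seed.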
| print(f"[{datetime.now().isoformat()}] Generating synthetic traces...") |
| traces = suite.generate_benchmark_data(num_tasks, seed=seed) |
| |
| |
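    # Persist traces as JSONL (one serialized trace per line) so runs can be replayed.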
    traces_path = output_path / "traces.jsonl"
    with open(traces_path, "w") as f:
        for trace in traces:
            f.write(json.dumps(trace.to_dict()) + "\n")
    print(f" Saved {len(traces)} traces to {traces_path}")

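    # Step 2: evaluate every baseline policy on the shared trace set.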
| print(f"\n[{datetime.now().isoformat()}] Running baselines...") |
| baseline_results = suite.run_all_baselines(traces) |
| |
| baseline_path = output_path / "baseline_results.json" |
| suite.export(baseline_results, baseline_path) |
| print(f" Saved baseline results to {baseline_path}") |
| |
| |
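    # Step 3: run the ablation variants on the same traces for a like-for-like comparison.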
| print(f"\n[{datetime.now().isoformat()}] Running ablations...") |
| ablation_results = suite.run_ablations(traces) |
| |
| ablation_path = output_path / "ablation_results.json" |
| suite.export(ablation_results, ablation_path) |
| print(f" Saved ablation results to {ablation_path}") |
| |
| |
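    # Merge the two result sets; run names are assumed to be unique across both dicts.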
    all_results = {**baseline_results, **ablation_results}

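    # Step 4: render and save the human-readable summary report.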
    report = suite.report(all_results)
    report_path = output_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(report)
    print(f"\n Saved report to {report_path}")

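    # Step 5: compute the cost-quality Pareto frontier across all runs.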
    frontier = analyze_cost_quality_frontier(all_results)
    frontier_path = output_path / "cost_quality_frontier.json"
    with open(frontier_path, "w") as f:
        json.dump(frontier, f, indent=2)
    print(f" Saved cost-quality frontier to {frontier_path}")

| print("\n" + "=" * 80) |
| print(report) |
| print("=" * 80) |
| |
| return all_results |
|
|
|
|
def analyze_cost_quality_frontier(results):
    """Analyze the cost-quality Pareto frontier."""
    points = []
    for name, result in results.items():
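        # Partial completions count toward the success rate for frontier purposes.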
        success_rate = (result.num_success + result.num_partial) / result.num_tasks
        avg_cost = result.avg_cost_success
        points.append({
            "baseline": name,
            "success_rate": success_rate,
            "avg_cost_per_success": avg_cost,
            "total_cost": result.total_cost,
            "latency_ms": result.avg_latency_ms,
            "regression_rate": result.regression_rate,
            "false_done_rate": result.false_done_rate,
            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
            "missed_escalation_rate": result.missed_escalation_rate,
        })

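    # A point is Pareto-dominated when another point is at least as good on
    # both axes (success rate up, cost down) and strictly better on at least one.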
    frontier = []
    for p in points:
        dominated = False
        for q in points:
            if q["baseline"] == p["baseline"]:
                continue
            if q["success_rate"] >= p["success_rate"] and q["avg_cost_per_success"] <= p["avg_cost_per_success"]:
                if q["success_rate"] > p["success_rate"] or q["avg_cost_per_success"] < p["avg_cost_per_success"]:
                    dominated = True
                    break
        if not dominated:
            frontier.append(p)

    frontier.sort(key=lambda x: x["success_rate"], reverse=True)

    return {
        "all_points": points,
        "pareto_frontier": frontier,
        "frontier_baselines": [p["baseline"] for p in frontier],
    }


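# Example invocation (the script name here is illustrative):
#   python run_eval.py --tasks 500 --seed 7 --output ./eval_results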
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser(description="ACO Evaluation Runner") |
| parser.add_argument("--tasks", "-n", type=int, default=1000, help="Number of tasks") |
| parser.add_argument("--seed", "-s", type=int, default=42, help="Random seed") |
| parser.add_argument("--output", "-o", default="./eval_results", help="Output directory") |
| args = parser.parse_args() |
| |
| run_evaluation(args.tasks, args.seed, args.output) |