narcolepticchicken committed on
Commit
c315ccc
·
verified ·
1 Parent(s): 2a55899

Upload run_benchmark.py

Browse files
Files changed (1) hide show
  1. run_benchmark.py +124 -0
run_benchmark.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Self-contained benchmark runner for Agent Cost Optimizer."""
3
+
4
+ import sys
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ from datetime import datetime
9
+
10
+ # Add project root to path when running as script
11
+ sys.path.insert(0, str(Path(__file__).parent))
12
+
13
+ from aco.benchmarks.benchmark_suite import BenchmarkSuite
14
+ from aco.config import ACOConfig
15
+
16
+
17
def analyze_cost_quality_frontier(results):
    """Analyze the cost-quality Pareto frontier across benchmark results.

    Args:
        results: Mapping of baseline name -> result object exposing
            ``num_success``, ``num_partial``, ``num_tasks``,
            ``avg_cost_success``, ``total_cost``, ``avg_latency_ms``,
            ``regression_rate``, ``false_done_rate``,
            ``unsafe_cheap_miss_rate``, and ``missed_escalation_rate``.

    Returns:
        Dict with keys:
            "all_points": one summary dict per baseline,
            "pareto_frontier": the non-dominated points, sorted by
                success rate descending,
            "frontier_baselines": names of the frontier points.
    """
    points = []
    for name, result in results.items():
        # Guard against degenerate zero-task results, which would
        # otherwise raise ZeroDivisionError; report 0.0 success instead.
        num_tasks = result.num_tasks or 1
        success_rate = (result.num_success + result.num_partial) / num_tasks
        points.append({
            "baseline": name,
            "success_rate": success_rate,
            "avg_cost_per_success": result.avg_cost_success,
            "total_cost": result.total_cost,
            "latency_ms": result.avg_latency_ms,
            "regression_rate": result.regression_rate,
            "false_done_rate": result.false_done_rate,
            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
            "missed_escalation_rate": result.missed_escalation_rate,
        })

    # A point is dominated when another point is at least as good on both
    # axes (success rate up, cost-per-success down) and strictly better on
    # at least one. O(n^2) is fine: n == number of baselines (small).
    frontier = []
    for p in points:
        dominated = False
        for q in points:
            if q["baseline"] == p["baseline"]:
                continue
            if (q["success_rate"] >= p["success_rate"]
                    and q["avg_cost_per_success"] <= p["avg_cost_per_success"]):
                if (q["success_rate"] > p["success_rate"]
                        or q["avg_cost_per_success"] < p["avg_cost_per_success"]):
                    dominated = True
                    break
        if not dominated:
            frontier.append(p)

    frontier.sort(key=lambda x: x["success_rate"], reverse=True)

    return {
        "all_points": points,
        "pareto_frontier": frontier,
        "frontier_baselines": [p["baseline"] for p in frontier],
    }
55
+
56
+
57
def main(num_tasks=1000, seed=42, output_dir="./eval_results"):
    """Run the full ACO evaluation end to end.

    Generates synthetic traces, runs all baselines and ablations, and
    writes traces, per-suite results, a text report, and a cost-quality
    frontier analysis into *output_dir*.

    Args:
        num_tasks: Number of synthetic tasks to generate (default 1000).
        seed: RNG seed for reproducible trace generation (default 42).
        output_dir: Output directory; created if missing
            (default "./eval_results").
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
    print(f" Tasks: {num_tasks}")
    print(f" Seed: {seed}")
    print()

    # Prefer an on-disk config when present; otherwise use defaults.
    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
    suite = BenchmarkSuite(config)

    # Generate synthetic evaluation data.
    print(f"[{datetime.now().isoformat()}] Generating synthetic traces...")
    traces = suite.generate_benchmark_data(num_tasks, seed=seed)

    # Persist traces as JSONL, one trace per line.
    traces_path = output_path / "traces.jsonl"
    with open(traces_path, "w") as f:
        for trace in traces:
            f.write(json.dumps(trace.to_dict()) + "\n")
    print(f" Saved {len(traces)} traces to {traces_path}")

    # Run the main baselines.
    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
    baseline_results = suite.run_all_baselines(traces)

    baseline_path = output_path / "baseline_results.json"
    suite.export(baseline_results, str(baseline_path))
    print(f" Saved baseline results to {baseline_path}")

    # Run the ablation configurations.
    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
    ablation_results = suite.run_ablations(traces)

    ablation_path = output_path / "ablation_results.json"
    suite.export(ablation_results, str(ablation_path))
    print(f" Saved ablation results to {ablation_path}")

    # Combined report over baselines + ablations.
    all_results = {**baseline_results, **ablation_results}

    # Generate and save the text report.
    report = suite.report(all_results)
    report_path = output_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(report)
    print(f"\n Saved report to {report_path}")

    # Cost-quality Pareto frontier analysis.
    frontier = analyze_cost_quality_frontier(all_results)
    frontier_path = output_path / "cost_quality_frontier.json"
    with open(frontier_path, "w") as f:
        # Idiomatic json.dump call: file object is positional, not fp=.
        json.dump(frontier, f, indent=2)
    print(f" Saved cost-quality frontier to {frontier_path}")

    # Echo the full report to stdout.
    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)
121
+
122
+
123
# Script entry point: run the full evaluation with default settings.
if __name__ == "__main__":
    main()