narcolepticchicken committed
Commit 6537c78 · verified · 1 Parent(s): 9a3f54b

Upload eval_runner.py

Files changed (1)
  1. eval_runner.py +125 -0
eval_runner.py ADDED
@@ -0,0 +1,125 @@
+"""Evaluation runner for Agent Cost Optimizer benchmarks."""
+
+import argparse
+import json
+from datetime import datetime
+from pathlib import Path
+
+from aco.benchmarks.benchmark_suite import BenchmarkSuite
+from aco.config import ACOConfig
+
+
+def run_evaluation(num_tasks: int = 1000, seed: int = 42, output_dir: str = "./eval_results"):
+    """Run full evaluation suite."""
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
+    print(f" Tasks: {num_tasks}")
+    print(f" Seed: {seed}")
+    print()
+
+    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
+    suite = BenchmarkSuite(config)
+
+    # Generate data
+    print(f"[{datetime.now().isoformat()}] Generating synthetic traces...")
+    traces = suite.generate_benchmark_data(num_tasks, seed=seed)
+
+    # Save traces
+    traces_path = output_path / "traces.jsonl"
+    with open(traces_path, "w") as f:
+        for trace in traces:
+            f.write(json.dumps(trace.to_dict()) + "\n")
+    print(f" Saved {len(traces)} traces to {traces_path}")
+
+    # Run main baselines
+    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
+    baseline_results = suite.run_all_baselines(traces)
+
+    baseline_path = output_path / "baseline_results.json"
+    suite.export(baseline_results, baseline_path)
+    print(f" Saved baseline results to {baseline_path}")
+
+    # Run ablations
+    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
+    ablation_results = suite.run_ablations(traces)
+
+    ablation_path = output_path / "ablation_results.json"
+    suite.export(ablation_results, ablation_path)
+    print(f" Saved ablation results to {ablation_path}")
+
+    # Combined report
+    all_results = {**baseline_results, **ablation_results}
+
+    # Generate text report
+    report = suite.report(all_results)
+    report_path = output_path / "report.txt"
+    with open(report_path, "w") as f:
+        f.write(report)
+    print(f"\n Saved report to {report_path}")
+
+    # Generate cost-quality frontier analysis
+    frontier = analyze_cost_quality_frontier(all_results)
+    frontier_path = output_path / "cost_quality_frontier.json"
+    with open(frontier_path, "w") as f:
+        json.dump(frontier, f, indent=2)
+    print(f" Saved cost-quality frontier to {frontier_path}")
+
+    # Print to stdout
+    print("\n" + "=" * 80)
+    print(report)
+    print("=" * 80)
+
+    return all_results
+
+
+def analyze_cost_quality_frontier(results):
+    """Analyze the cost-quality Pareto frontier."""
+    points = []
+    for name, result in results.items():
+        success_rate = (result.num_success + result.num_partial) / result.num_tasks
+        avg_cost = result.avg_cost_success
+        points.append({
+            "baseline": name,
+            "success_rate": success_rate,
+            "avg_cost_per_success": avg_cost,
+            "total_cost": result.total_cost,
+            "latency_ms": result.avg_latency_ms,
+            "regression_rate": result.regression_rate,
+            "false_done_rate": result.false_done_rate,
+            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
+            "missed_escalation_rate": result.missed_escalation_rate,
+        })
+
+    # Find Pareto frontier: drop any point that another point matches or beats
+    # on both success rate and cost, and strictly beats on at least one
+    frontier = []
+    for p in points:
+        dominated = False
+        for q in points:
+            if q["baseline"] == p["baseline"]:
+                continue
+            if q["success_rate"] >= p["success_rate"] and q["avg_cost_per_success"] <= p["avg_cost_per_success"]:
+                if q["success_rate"] > p["success_rate"] or q["avg_cost_per_success"] < p["avg_cost_per_success"]:
+                    dominated = True
+                    break
+        if not dominated:
+            frontier.append(p)
+
+    frontier.sort(key=lambda x: x["success_rate"], reverse=True)
+
+    return {
+        "all_points": points,
+        "pareto_frontier": frontier,
+        "frontier_baselines": [p["baseline"] for p in frontier],
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="ACO Evaluation Runner")
+    parser.add_argument("--tasks", "-n", type=int, default=1000, help="Number of tasks")
+    parser.add_argument("--seed", "-s", type=int, default=42, help="Random seed")
+    parser.add_argument("--output", "-o", default="./eval_results", help="Output directory")
+    args = parser.parse_args()
+
+    run_evaluation(args.tasks, args.seed, args.output)
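
The script runs end to end from the command line with the flags defined above, for example: python eval_runner.py --tasks 200 --seed 7 --output ./eval_results

To see the frontier logic in isolation, the sketch below exercises analyze_cost_quality_frontier with stub result objects. The helper make_stub, the baseline names, and all numbers are hypothetical illustrations; only the attribute names are taken from the fields the function reads above.

from types import SimpleNamespace

from eval_runner import analyze_cost_quality_frontier

def make_stub(num_success, num_partial, num_tasks, cost_per_success):
    # Hypothetical stand-in exposing only the attributes the frontier code touches.
    return SimpleNamespace(
        num_success=num_success,
        num_partial=num_partial,
        num_tasks=num_tasks,
        avg_cost_success=cost_per_success,
        total_cost=cost_per_success * num_success,
        avg_latency_ms=0.0,
        regression_rate=0.0,
        false_done_rate=0.0,
        unsafe_cheap_miss_rate=0.0,
        missed_escalation_rate=0.0,
    )

results = {
    "cheap_only": make_stub(60, 10, 100, 0.01),   # 70% success, cheap
    "frontier_llm": make_stub(90, 5, 100, 0.10),  # 95% success, expensive
    "worst_case": make_stub(55, 5, 100, 0.12),    # beaten on both axes
}

print(analyze_cost_quality_frontier(results)["frontier_baselines"])
# ['frontier_llm', 'cheap_only']; 'worst_case' is dominated by 'cheap_only'

A point survives only when no other baseline is at least as good on both success rate and average cost per success and strictly better on one, so the low-success, high-cost stub drops off the frontier while both ends of the cost-quality trade-off remain.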