Upload run_benchmark.py
run_benchmark.py (ADDED, +124 -0)
#!/usr/bin/env python3
"""Self-contained benchmark runner for Agent Cost Optimizer."""

import sys
import json
from pathlib import Path
from datetime import datetime

# Add project root to path when running as script
sys.path.insert(0, str(Path(__file__).parent))

from aco.benchmarks.benchmark_suite import BenchmarkSuite
from aco.config import ACOConfig


def analyze_cost_quality_frontier(results):
    """Analyze the cost-quality Pareto frontier."""
    points = []
    for name, result in results.items():
        success_rate = (result.num_success + result.num_partial) / result.num_tasks
        avg_cost = result.avg_cost_success
        points.append({
            "baseline": name,
            "success_rate": success_rate,
            "avg_cost_per_success": avg_cost,
            "total_cost": result.total_cost,
            "latency_ms": result.avg_latency_ms,
            "regression_rate": result.regression_rate,
            "false_done_rate": result.false_done_rate,
            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
            "missed_escalation_rate": result.missed_escalation_rate,
        })

    # A point is dominated if some other baseline is at least as good on both
    # axes (success rate up, cost per success down) and strictly better on one.
    frontier = []
    for p in points:
        dominated = False
        for q in points:
            if q["baseline"] == p["baseline"]:
                continue
            if q["success_rate"] >= p["success_rate"] and q["avg_cost_per_success"] <= p["avg_cost_per_success"]:
                if q["success_rate"] > p["success_rate"] or q["avg_cost_per_success"] < p["avg_cost_per_success"]:
                    dominated = True
                    break
        if not dominated:
            frontier.append(p)

    frontier.sort(key=lambda x: x["success_rate"], reverse=True)

    return {
        "all_points": points,
        "pareto_frontier": frontier,
        "frontier_baselines": [p["baseline"] for p in frontier],
    }


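# A minimal worked example of the dominance rule above (hypothetical numbers,
# comments only, never executed): given three points
#   A: success_rate=0.90, avg_cost_per_success=0.050
#   B: success_rate=0.85, avg_cost_per_success=0.020
#   C: success_rate=0.85, avg_cost_per_success=0.060
# C is dominated (B matches its success rate at strictly lower cost), while
# A and B both stay on the frontier: A wins on success rate, B wins on cost.

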
def main():
    num_tasks = 1000
    seed = 42
    output_dir = "./eval_results"

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
    print(f" Tasks: {num_tasks}")
    print(f" Seed: {seed}")
    print()

    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
    suite = BenchmarkSuite(config)

    # Generate data
    print(f"[{datetime.now().isoformat()}] Generating synthetic traces...")
    traces = suite.generate_benchmark_data(num_tasks, seed=seed)

    # Save traces
    traces_path = output_path / "traces.jsonl"
    with open(traces_path, "w") as f:
        for trace in traces:
            f.write(json.dumps(trace.to_dict()) + "\n")
    print(f" Saved {len(traces)} traces to {traces_path}")

    # Run main baselines
    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
    baseline_results = suite.run_all_baselines(traces)

    baseline_path = output_path / "baseline_results.json"
    suite.export(baseline_results, str(baseline_path))
    print(f" Saved baseline results to {baseline_path}")

    # Run ablations
    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
    ablation_results = suite.run_ablations(traces)

    ablation_path = output_path / "ablation_results.json"
    suite.export(ablation_results, str(ablation_path))
    print(f" Saved ablation results to {ablation_path}")

    # Combined report
    all_results = {**baseline_results, **ablation_results}

    # Generate text report
    report = suite.report(all_results)
    report_path = output_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(report)
    print(f"\n Saved report to {report_path}")

    # Generate cost-quality frontier analysis
    frontier = analyze_cost_quality_frontier(all_results)
    frontier_path = output_path / "cost_quality_frontier.json"
    with open(frontier_path, "w") as f:
        json.dump(frontier, f, indent=2)
    print(f" Saved cost-quality frontier to {frontier_path}")

    # Print to stdout
    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)


if __name__ == "__main__":
    main()
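
# Usage sketch (an assumption: the `aco` package sits alongside this script,
# as the sys.path insert at the top suggests):
#   $ python run_benchmark.py
# Outputs are written to ./eval_results/: traces.jsonl, baseline_results.json,
# ablation_results.json, report.txt, and cost_quality_frontier.json.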