#!/usr/bin/env python3
"""Self-contained benchmark runner for Agent Cost Optimizer."""
import sys
import json
from pathlib import Path
from datetime import datetime
# Add project root to path when running as script
sys.path.insert(0, str(Path(__file__).parent))
from aco.benchmarks.benchmark_suite import BenchmarkSuite
from aco.config import ACOConfig


def analyze_cost_quality_frontier(results):
    """Analyze the cost-quality Pareto frontier."""
    points = []
    for name, result in results.items():
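        # Partial completions count toward the success rate alongside full successes.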
        success_rate = (result.num_success + result.num_partial) / result.num_tasks
        avg_cost = result.avg_cost_success
        points.append({
            "baseline": name,
            "success_rate": success_rate,
            "avg_cost_per_success": avg_cost,
            "total_cost": result.total_cost,
            "latency_ms": result.avg_latency_ms,
            "regression_rate": result.regression_rate,
            "false_done_rate": result.false_done_rate,
            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
            "missed_escalation_rate": result.missed_escalation_rate,
        })
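    # A point is Pareto-dominated if some other baseline is at least as good on
    # both axes (success rate and cost per success) and strictly better on one.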
    frontier = []
    for p in points:
        dominated = False
        for q in points:
            if q["baseline"] == p["baseline"]:
                continue
            if q["success_rate"] >= p["success_rate"] and q["avg_cost_per_success"] <= p["avg_cost_per_success"]:
                if q["success_rate"] > p["success_rate"] or q["avg_cost_per_success"] < p["avg_cost_per_success"]:
                    dominated = True
                    break
        if not dominated:
            frontier.append(p)
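    # On the frontier, a strictly higher success rate necessarily comes with a
    # higher cost per success, so this also orders the points by cost.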
    frontier.sort(key=lambda x: x["success_rate"], reverse=True)
    return {
        "all_points": points,
        "pareto_frontier": frontier,
        "frontier_baselines": [p["baseline"] for p in frontier],
    }


def main():
    num_tasks = 1000
    seed = 42
    output_dir = "./eval_results"
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
    print(f" Tasks: {num_tasks}")
    print(f" Seed: {seed}")
    print()

    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
    suite = BenchmarkSuite(config)

    # Generate data
    print(f"[{datetime.now().isoformat()}] Generating synthetic traces...")
    traces = suite.generate_benchmark_data(num_tasks, seed=seed)

    # Save traces
    traces_path = output_path / "traces.jsonl"
    with open(traces_path, "w") as f:
        for trace in traces:
            f.write(json.dumps(trace.to_dict()) + "\n")
    print(f" Saved {len(traces)} traces to {traces_path}")

    # Run main baselines
    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
    baseline_results = suite.run_all_baselines(traces)
    baseline_path = output_path / "baseline_results.json"
    suite.export(baseline_results, str(baseline_path))
    print(f" Saved baseline results to {baseline_path}")

    # Run ablations
    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
    ablation_results = suite.run_ablations(traces)
    ablation_path = output_path / "ablation_results.json"
    suite.export(ablation_results, str(ablation_path))
    print(f" Saved ablation results to {ablation_path}")

    # Combined report
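    # Dict merge is last-writer-wins: an ablation entry overwrites a baseline
    # entry if the two share a name.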
    all_results = {**baseline_results, **ablation_results}

    # Generate text report
    report = suite.report(all_results)
    report_path = output_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(report)
    print(f"\n Saved report to {report_path}")

    # Generate cost-quality frontier analysis
    frontier = analyze_cost_quality_frontier(all_results)
    frontier_path = output_path / "cost_quality_frontier.json"
    with open(frontier_path, "w") as f:
        json.dump(frontier, f, indent=2)
    print(f" Saved cost-quality frontier to {frontier_path}")

    # Print to stdout
    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)


if __name__ == "__main__":
    main()