"""Evaluation runner for Agent Cost Optimizer benchmarks."""
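# Example invocation (script name assumed here; the flags are defined in the
# argparse block at the bottom of this file):
#   python run_evaluation.py --tasks 1000 --seed 42 --output ./eval_results
#
# Results written to the output directory: traces.jsonl, baseline_results.json,
# ablation_results.json, report.txt, and cost_quality_frontier.json.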
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
# Ensure aco package is importable
sys.path.insert(0, str(Path(__file__).parent))
from aco.benchmarks.benchmark_suite import BenchmarkSuite
from aco.config import ACOConfig


def run_evaluation(num_tasks: int = 1000, seed: int = 42, output_dir: str = "./eval_results"):
    """Run full evaluation suite."""
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
    print(f" Tasks: {num_tasks}")
    print(f" Seed: {seed}")
    print()

    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
    suite = BenchmarkSuite(config)

    # Generate data
    print(f"[{datetime.now().isoformat()}] Generating synthetic traces...")
    traces = suite.generate_benchmark_data(num_tasks, seed=seed)

    # Save traces
    traces_path = output_path / "traces.jsonl"
    with open(traces_path, "w") as f:
        for trace in traces:
            f.write(json.dumps(trace.to_dict()) + "\n")
    print(f" Saved {len(traces)} traces to {traces_path}")

    # Run main baselines
    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
    baseline_results = suite.run_all_baselines(traces)
    baseline_path = output_path / "baseline_results.json"
    suite.export(baseline_results, baseline_path)
    print(f" Saved baseline results to {baseline_path}")

    # Run ablations
    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
    ablation_results = suite.run_ablations(traces)
    ablation_path = output_path / "ablation_results.json"
    suite.export(ablation_results, ablation_path)
    print(f" Saved ablation results to {ablation_path}")

    # Combined report
    all_results = {**baseline_results, **ablation_results}

    # Generate text report
    report = suite.report(all_results)
    report_path = output_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(report)
    print(f"\n Saved report to {report_path}")

    # Generate cost-quality frontier analysis
    frontier = analyze_cost_quality_frontier(all_results)
    frontier_path = output_path / "cost_quality_frontier.json"
    with open(frontier_path, "w") as f:
        json.dump(frontier, f, indent=2)
    print(f" Saved cost-quality frontier to {frontier_path}")

    # Print to stdout
    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)

    return all_results


def analyze_cost_quality_frontier(results):
    """Analyze the cost-quality Pareto frontier across baseline results."""
    points = []
    for name, result in results.items():
        success_rate = (result.num_success + result.num_partial) / result.num_tasks
        avg_cost = result.avg_cost_success
        points.append({
            "baseline": name,
            "success_rate": success_rate,
            "avg_cost_per_success": avg_cost,
            "total_cost": result.total_cost,
            "latency_ms": result.avg_latency_ms,
            "regression_rate": result.regression_rate,
            "false_done_rate": result.false_done_rate,
            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
            "missed_escalation_rate": result.missed_escalation_rate,
        })

    # Find the Pareto frontier: a point is dominated if some other point is at
    # least as good on both success rate and cost, and strictly better on at
    # least one of the two.
    frontier = []
    for p in points:
        dominated = False
        for q in points:
            if q["baseline"] == p["baseline"]:
                continue
            if q["success_rate"] >= p["success_rate"] and q["avg_cost_per_success"] <= p["avg_cost_per_success"]:
                if q["success_rate"] > p["success_rate"] or q["avg_cost_per_success"] < p["avg_cost_per_success"]:
                    dominated = True
                    break
        if not dominated:
            frontier.append(p)
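
    # Illustration of the domination rule above (hypothetical numbers):
    #   A: success_rate=0.90, avg_cost_per_success=1.20
    #   B: success_rate=0.85, avg_cost_per_success=1.50
    # A is at least as good as B on both axes and strictly better, so B is
    # dominated and only A can sit on the frontier.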

    frontier.sort(key=lambda x: x["success_rate"], reverse=True)
    return {
        "all_points": points,
        "pareto_frontier": frontier,
        "frontier_baselines": [p["baseline"] for p in frontier],
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ACO Evaluation Runner")
    parser.add_argument("--tasks", "-n", type=int, default=1000, help="Number of tasks")
    parser.add_argument("--seed", "-s", type=int, default=42, help="Random seed")
    parser.add_argument("--output", "-o", default="./eval_results", help="Output directory")
    args = parser.parse_args()

    run_evaluation(args.tasks, args.seed, args.output)