"""Evaluation runner for Agent Cost Optimizer benchmarks."""

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path

# Ensure aco package is importable
sys.path.insert(0, str(Path(__file__).parent))

from aco.benchmarks.benchmark_suite import BenchmarkSuite
from aco.config import ACOConfig


def run_evaluation(num_tasks: int = 1000, seed: int = 42, output_dir: str = "./eval_results"):
    """Run full evaluation suite."""
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    
    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
    print(f"  Tasks: {num_tasks}")
    print(f"  Seed: {seed}")
    print()
    
    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
    suite = BenchmarkSuite(config)
    
    # Generate data
    print(f"[{datetime.now().isoformat()}] Generating synthetic traces...")
    traces = suite.generate_benchmark_data(num_tasks, seed=seed)
    
    # Save traces
    traces_path = output_path / "traces.jsonl"
    with open(traces_path, "w") as f:
        for trace in traces:
            f.write(json.dumps(trace.to_dict()) + "\n")
    print(f"  Saved {len(traces)} traces to {traces_path}")
    
    # Run main baselines
    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
    baseline_results = suite.run_all_baselines(traces)
    
    baseline_path = output_path / "baseline_results.json"
    suite.export(baseline_results, baseline_path)
    print(f"  Saved baseline results to {baseline_path}")
    
    # Run ablations
    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
    ablation_results = suite.run_ablations(traces)
    
    ablation_path = output_path / "ablation_results.json"
    suite.export(ablation_results, ablation_path)
    print(f"  Saved ablation results to {ablation_path}")
    
    # Combined report
    all_results = {**baseline_results, **ablation_results}
    
    # Generate text report
    report = suite.report(all_results)
    report_path = output_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(report)
    print(f"\n  Saved report to {report_path}")
    
    # Generate cost-quality frontier analysis
    frontier = analyze_cost_quality_frontier(all_results)
    frontier_path = output_path / "cost_quality_frontier.json"
    with open(frontier_path, "w") as f:
        json.dump(frontier, f, indent=2)
    print(f"  Saved cost-quality frontier to {frontier_path}")
    
    # Print to stdout
    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)
    
    return all_results


def analyze_cost_quality_frontier(results):
    """Analyze the cost-quality Pareto frontier."""
    points = []
    for name, result in results.items():
        success_rate = (result.num_success + result.num_partial) / result.num_tasks
        avg_cost = result.avg_cost_success
        points.append({
            "baseline": name,
            "success_rate": success_rate,
            "avg_cost_per_success": avg_cost,
            "total_cost": result.total_cost,
            "latency_ms": result.avg_latency_ms,
            "regression_rate": result.regression_rate,
            "false_done_rate": result.false_done_rate,
            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
            "missed_escalation_rate": result.missed_escalation_rate,
        })
    
    # Find the Pareto frontier: keep a point unless some other point is at least as
    # good on both axes (success rate and cost per success) and strictly better on one.
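    # Illustrative (hypothetical) numbers: with A(success=0.90, cost=1.0),
    # B(0.80, 2.0) and C(0.95, 1.8), B is dominated by A (and by C), while A and C
    # do not dominate each other, so the frontier is {A, C}.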
    frontier = []
    for p in points:
        dominated = False
        for q in points:
            if q["baseline"] == p["baseline"]:
                continue
            if q["success_rate"] >= p["success_rate"] and q["avg_cost_per_success"] <= p["avg_cost_per_success"]:
                if q["success_rate"] > p["success_rate"] or q["avg_cost_per_success"] < p["avg_cost_per_success"]:
                    dominated = True
                    break
        if not dominated:
            frontier.append(p)
    
    frontier.sort(key=lambda x: x["success_rate"], reverse=True)
    
    return {
        "all_points": points,
        "pareto_frontier": frontier,
        "frontier_baselines": [p["baseline"] for p in frontier],
    }


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="ACO Evaluation Runner")
    parser.add_argument("--tasks", "-n", type=int, default=1000, help="Number of tasks")
    parser.add_argument("--seed", "-s", type=int, default=42, help="Random seed")
    parser.add_argument("--output", "-o", default="./eval_results", help="Output directory")
    args = parser.parse_args()
    
    run_evaluation(args.tasks, args.seed, args.output)