File size: 4,177 Bytes
c315ccc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
"""Self-contained benchmark runner for Agent Cost Optimizer."""

import sys
import json
import os
from pathlib import Path
from datetime import datetime

# Add project root to path when running as script
sys.path.insert(0, str(Path(__file__).parent))

from aco.benchmarks.benchmark_suite import BenchmarkSuite
from aco.config import ACOConfig


def analyze_cost_quality_frontier(results):
    """Analyze the cost-quality Pareto frontier across baselines.

    Args:
        results: Mapping of baseline name -> benchmark result object
            exposing ``num_success``, ``num_partial``, ``num_tasks``,
            ``avg_cost_success``, ``total_cost``, ``avg_latency_ms``,
            ``regression_rate``, ``false_done_rate``,
            ``unsafe_cheap_miss_rate`` and ``missed_escalation_rate``.

    Returns:
        dict with:
            "all_points": one summary dict per baseline,
            "pareto_frontier": the non-dominated points, sorted by
                success rate descending,
            "frontier_baselines": the names of the frontier points.
    """
    points = []
    for name, result in results.items():
        # Guard against an empty benchmark run; a 0-task result would
        # otherwise raise ZeroDivisionError.
        if result.num_tasks:
            success_rate = (result.num_success + result.num_partial) / result.num_tasks
        else:
            success_rate = 0.0
        points.append({
            "baseline": name,
            "success_rate": success_rate,
            "avg_cost_per_success": result.avg_cost_success,
            "total_cost": result.total_cost,
            "latency_ms": result.avg_latency_ms,
            "regression_rate": result.regression_rate,
            "false_done_rate": result.false_done_rate,
            "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
            "missed_escalation_rate": result.missed_escalation_rate,
        })

    # The frontier is every point no other point strictly dominates.
    frontier = [p for p in points if not _is_dominated(p, points)]
    frontier.sort(key=lambda x: x["success_rate"], reverse=True)

    return {
        "all_points": points,
        "pareto_frontier": frontier,
        "frontier_baselines": [p["baseline"] for p in frontier],
    }


def _is_dominated(p, points):
    """Return True if another point is at least as good on both axes
    (higher-or-equal success, lower-or-equal cost) and strictly better
    on at least one of them."""
    for q in points:
        if q["baseline"] == p["baseline"]:
            continue
        if (q["success_rate"] >= p["success_rate"]
                and q["avg_cost_per_success"] <= p["avg_cost_per_success"]
                and (q["success_rate"] > p["success_rate"]
                     or q["avg_cost_per_success"] < p["avg_cost_per_success"])):
            return True
    return False


def main(num_tasks=1000, seed=42, output_dir="./eval_results"):
    """Run the full ACO evaluation pipeline end-to-end.

    Generates synthetic traces, runs all baselines and ablations,
    then writes traces, per-run JSON results, a text report, and a
    cost-quality frontier analysis under *output_dir*.

    Args:
        num_tasks: Number of synthetic tasks to generate.
        seed: RNG seed for reproducible trace generation.
        output_dir: Directory where all artifacts are written
            (created if missing).
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"[{datetime.now().isoformat()}] Starting ACO Evaluation")
    print(f"  Tasks: {num_tasks}")
    print(f"  Seed: {seed}")
    print()

    # Use the on-disk config when present; otherwise fall back to defaults.
    config = ACOConfig.from_yaml("config.yaml") if Path("config.yaml").exists() else ACOConfig()
    suite = BenchmarkSuite(config)

    # Generate data
    print(f"[{datetime.now().isoformat()}] Generating synthetic traces...")
    traces = suite.generate_benchmark_data(num_tasks, seed=seed)

    # Save traces as JSON Lines (one trace dict per line).
    traces_path = output_path / "traces.jsonl"
    with open(traces_path, "w") as f:
        for trace in traces:
            f.write(json.dumps(trace.to_dict()) + "\n")
    print(f"  Saved {len(traces)} traces to {traces_path}")

    # Run main baselines
    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
    baseline_results = suite.run_all_baselines(traces)

    baseline_path = output_path / "baseline_results.json"
    suite.export(baseline_results, str(baseline_path))
    print(f"  Saved baseline results to {baseline_path}")

    # Run ablations
    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
    ablation_results = suite.run_ablations(traces)

    ablation_path = output_path / "ablation_results.json"
    suite.export(ablation_results, str(ablation_path))
    print(f"  Saved ablation results to {ablation_path}")

    # Combined report over baselines and ablations together.
    all_results = {**baseline_results, **ablation_results}

    # Generate text report
    report = suite.report(all_results)
    report_path = output_path / "report.txt"
    with open(report_path, "w") as f:
        f.write(report)
    print(f"\n  Saved report to {report_path}")

    # Generate cost-quality frontier analysis
    frontier = analyze_cost_quality_frontier(all_results)
    frontier_path = output_path / "cost_quality_frontier.json"
    with open(frontier_path, "w") as f:
        json.dump(frontier, f, indent=2)
    print(f"  Saved cost-quality frontier to {frontier_path}")

    # Print to stdout
    print("\n" + "=" * 80)
    print(report)
    print("=" * 80)

if __name__ == "__main__":
    main()