narcolepticchicken committed · verified
Commit 1a7f750 · Parent: d18f367

Upload aco/benchmarks/benchmark_suite.py

Files changed (1): aco/benchmarks/benchmark_suite.py (+490 -0)
aco/benchmarks/benchmark_suite.py ADDED
@@ -0,0 +1,490 @@
"""Benchmark Suite for Agent Cost Optimizer.

Benchmarks:
A. Coding Agent Tasks
B. Research Agent Tasks
C. Tool-Use Tasks
D. Document / Contract / QA Tasks
E. Long-Horizon Agent Tasks

Baselines:
A. always frontier model
B. always cheap model
C. static model routing
D. prompt-only router
E. rules-only optimizer
F. learned model router
G. learned router + context budgeter
H. learned router + context + verifier budgeter
I. full Agent Cost Optimizer

Metrics:
- task success
- cost per successful task
- cost reduction at iso-quality
- latency
- token usage
- model calls
- tool calls
- verifier calls
- retries
- cache hit rate
- context tokens
- false-DONE rate
- unsafe cheap-model miss rate
- missed escalation rate
- user correction rate
- regression rate
- quality/cost frontier
"""

import json
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, field

from aco.optimizer import AgentCostOptimizer, OptimizationResult
from aco.config import ACOConfig, ModelConfig, ToolConfig, VerifierConfig, RoutingPolicy
from aco.trace_schema import AgentTrace, TaskType, Outcome
from aco.datasets.synthetic_traces import SyntheticTraceGenerator
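

# Illustrative only, and not used by the suite below: the two headline
# metrics from the module docstring written out as standalone helpers,
# mirroring the inline arithmetic in run_baseline. The helper names are
# ours, not part of the aco API.
def cost_per_successful_task(total_cost: float, num_success: int) -> float:
    # Same max(..., 1) guard run_baseline uses to avoid division by zero.
    return total_cost / max(num_success, 1)


def cost_reduction_at_iso_quality(frontier_cost: float, actual_cost: float) -> float:
    # Fraction of the frontier baseline's spend avoided at equal quality.
    return (frontier_cost - actual_cost) / max(frontier_cost, 1)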


@dataclass
class BenchmarkConfig:
    name: str
    task_types: List[TaskType]
    num_tasks: int
    routing_mode: str = "cascade"
    enable_modules: Dict[str, bool] = field(default_factory=dict)
    baseline_name: str = ""


@dataclass
class BenchmarkResult:
    benchmark_name: str
    baseline_name: str
    num_tasks: int
    num_success: int
    num_partial: int
    num_failure: int
    num_false_done: int
    num_blocked: int
    total_cost: float
    avg_cost_success: float
    avg_latency_ms: float
    total_tool_calls: int
    total_verifier_calls: int
    total_retries: int
    avg_cache_hit_rate: float
    total_context_tokens: int
    cost_reduction_vs_frontier: float
    false_done_rate: float
    unsafe_cheap_miss_rate: float
    missed_escalation_rate: float
    regression_rate: float
    quality_cost_frontier: List[Dict[str, float]] = field(default_factory=list)
    per_task_results: List[Dict[str, Any]] = field(default_factory=list)


class BenchmarkSuite:
    """Runs ACO benchmarks across tasks and baselines."""

    def __init__(self, config: Optional[ACOConfig] = None):
        self.config = config or self._default_config()

    def _default_config(self) -> ACOConfig:
        # Five model tiers spanning a 150x input-cost range, a set of common
        # agent tools, and a single mid-strength verifier.
        models = {
            "tiny": ModelConfig("tiny_local", "local", 0.0001, 0.0002, latency_ms_estimate=200, strength_tier=1),
            "cheap": ModelConfig("cheap_cloud", "cloud", 0.0005, 0.001, latency_ms_estimate=500, strength_tier=2),
            "medium": ModelConfig("medium", "cloud", 0.003, 0.006, latency_ms_estimate=800, strength_tier=3),
            "frontier": ModelConfig("frontier", "cloud", 0.01, 0.03, latency_ms_estimate=1500, strength_tier=4),
            "specialist": ModelConfig("specialist", "cloud", 0.015, 0.045, latency_ms_estimate=2000, strength_tier=5),
        }
        tools = {
            "search": ToolConfig("search", 0.002, 500),
            "retrieve": ToolConfig("retrieve", 0.001, 300),
            "code_execution": ToolConfig("code_execution", 0.005, 1000),
            "linter": ToolConfig("linter", 0.001, 200),
            "file_read": ToolConfig("file_read", 0.0005, 100),
            "compliance_check": ToolConfig("compliance_check", 0.01, 1500),
            "summarize": ToolConfig("summarize", 0.002, 400),
        }
        verifiers = {
            "verifier_medium": VerifierConfig("verifier_medium", 0.005, 800, 0.8),
        }
        return ACOConfig(
            project_name="aco-benchmark",
            models=models,
            tools=tools,
            verifiers=verifiers,
            routing_policy=RoutingPolicy("benchmark"),
        )
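
    # Usage sketch (hypothetical values): the defaults above can be replaced
    # by passing a custom ACOConfig to the constructor, e.g.
    #   cfg = BenchmarkSuite()._default_config()
    #   cfg.models["frontier"] = ModelConfig("frontier_v2", "cloud", 0.008, 0.024,
    #                                        latency_ms_estimate=1200, strength_tier=4)
    #   suite = BenchmarkSuite(cfg)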

    def generate_benchmark_data(self, n: int = 1000, seed: int = 42) -> List[AgentTrace]:
        """Generate synthetic traces for benchmarking."""
        gen = SyntheticTraceGenerator(seed=seed)
        return gen.generate(n)

    def run_baseline(
        self,
        traces: List[AgentTrace],
        baseline_name: str,
    ) -> BenchmarkResult:
        """Run a single baseline over the benchmark traces."""

        # Routing modes the optimizer understands directly; anything else
        # (e.g. "rules_only", "full", or the "no_*" ablations) falls back
        # to "cascade".
        known_modes = {
            "always_frontier", "always_cheap", "static", "prompt_only",
            "learned", "learned_verifier", "cascade",
        }

        # Adjust config based on baseline
        config = self._default_config()
        if baseline_name in ("always_frontier", "always_cheap"):
            config.enable_router = False  # fixed model choice, no routing
        elif baseline_name == "static":
            pass  # default static routing
        elif baseline_name == "rules_only":
            config.enable_classifier = True
            config.enable_router = True
            config.enable_context_budgeter = True
            config.enable_cache_layout = True
            config.enable_tool_gate = True
            config.enable_verifier_budgeter = True
            config.enable_retry_optimizer = True
            config.enable_meta_tool_miner = False
            config.enable_early_termination = True
        elif baseline_name == "full":
            pass  # all modules enabled

        # For ablations, disable the named module
        if baseline_name.startswith("no_"):
            module_name = baseline_name.replace("no_", "", 1)
            if hasattr(config, f"enable_{module_name}"):
                setattr(config, f"enable_{module_name}", False)

        optimizer = AgentCostOptimizer(config)

        results = []
        total_cost = 0.0
        total_latency = 0.0
        total_tools = 0
        total_verifiers = 0
        total_retries = 0
        total_context = 0
        cache_rates = []

        success_count = 0
        partial_count = 0
        failure_count = 0
        false_done_count = 0
        blocked_count = 0

        cheap_misses = 0
        escalation_misses = 0
        regression_count = 0

        frontier_costs = []
        actual_costs = []

        for trace in traces:
            # Build the run state the optimizer sees for this trace's request
            run_state = {
                "trace_id": trace.trace_id,
                "routing_mode": baseline_name if baseline_name in known_modes else "cascade",
                "current_cost": 0.0,
                "planned_tools": [
                    (tc.tool_name, tc.tool_input)
                    for step in trace.steps
                    for tc in step.tool_calls
                ],
                "previous_tool_calls": [
                    tc for step in trace.steps for tc in step.tool_calls
                ],
                "step_number": len(trace.steps),
                "total_steps": len(trace.steps),
                "is_irreversible": trace.task_type == TaskType.LEGAL_REGULATED,
            }

            result = optimizer.optimize(trace.user_request, run_state)

            # Simulate execution based on optimization decisions
            sim_cost, sim_latency, sim_success = self._simulate(trace, result, baseline_name)

            total_cost += sim_cost
            total_latency += sim_latency
            total_tools += len(result.tool_decisions)
            if result.verifier_decision:
                total_verifiers += 1
            # Skipped tool decisions stand in for retries avoided by the gate
            total_retries += sum(1 for d in result.tool_decisions if d.decision.value == "skip")
            total_context += sum(s.context_size_tokens for s in trace.steps)

            # Estimate what the task would have cost on the frontier model:
            # traces that already ran frontier report their real model cost;
            # everything else is approximated at 2x the recorded trace cost.
            frontier_cost = sum(
                s.model_call.total_cost if s.model_call else 0
                for s in trace.steps
            ) if trace.metadata.get("scenario") == "frontier_unnecessary" else trace.total_cost * 2
            frontier_costs.append(frontier_cost)
            actual_costs.append(sim_cost)

            outcome = trace.final_outcome
            if sim_success:
                if outcome == Outcome.SUCCESS:
                    success_count += 1
                elif outcome == Outcome.PARTIAL_SUCCESS:
                    partial_count += 1
                else:
                    # Simulation succeeded where the recorded trace did not
                    regression_count += 1
            else:
                if outcome == Outcome.FALSE_DONE:
                    false_done_count += 1
                elif outcome == Outcome.BLOCKED:
                    blocked_count += 1
                else:
                    failure_count += 1

            # Cheap-model miss: a known-hard task was still routed to tier <= 2
            if trace.metadata.get("scenario") == "cheap_failure" and result.routing_decision.tier <= 2:
                cheap_misses += 1

            # Missed escalation: a task needing a stronger model stayed below tier 3
            if trace.metadata.get("scenario") in ("cheap_failure", "tool_underuse") and result.routing_decision.tier < 3:
                escalation_misses += 1

            cache_rates.append(trace.cache_hit_rate)

            results.append({
                "trace_id": trace.trace_id,
                "task_type": trace.task_type.value,
                "scenario": trace.metadata.get("scenario", "normal"),
                "simulated_cost": sim_cost,
                "simulated_success": sim_success,
                "routing_tier": result.routing_decision.tier,
                "model_id": result.routing_decision.model_id,
                "tool_count": len(result.tool_decisions),
                "verifier_used": result.verifier_decision is not None,
            })

        n = max(len(traces), 1)  # guard against an empty trace list
        avg_cost_success = total_cost / max(success_count + partial_count, 1)

        # Cost reduction vs the frontier baseline
        cost_reduction = (sum(frontier_costs) - sum(actual_costs)) / max(sum(frontier_costs), 1)

        return BenchmarkResult(
            benchmark_name="synthetic_benchmark",
            baseline_name=baseline_name,
            num_tasks=len(traces),
            num_success=success_count,
            num_partial=partial_count,
            num_failure=failure_count,
            num_false_done=false_done_count,
            num_blocked=blocked_count,
            total_cost=total_cost,
            avg_cost_success=avg_cost_success,
            avg_latency_ms=total_latency / n,
            total_tool_calls=total_tools,
            total_verifier_calls=total_verifiers,
            total_retries=total_retries,
            avg_cache_hit_rate=sum(cache_rates) / n,
            total_context_tokens=total_context,
            cost_reduction_vs_frontier=cost_reduction,
            false_done_rate=false_done_count / n,
            unsafe_cheap_miss_rate=cheap_misses / n,
            missed_escalation_rate=escalation_misses / n,
            regression_rate=regression_count / n,
            quality_cost_frontier=[
                {"cost": c, "success": 1.0 if s else 0.0}
                for c, s in zip(actual_costs, [r["simulated_success"] for r in results])
            ],
            per_task_results=results,
        )
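
    # Single-baseline usage sketch (relies only on methods defined here):
    #   suite = BenchmarkSuite()
    #   traces = suite.generate_benchmark_data(n=100)
    #   res = suite.run_baseline(traces, "full")
    #   print(f"{res.cost_reduction_vs_frontier:.1%} cheaper than always-frontier")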

    def _simulate(self, trace: AgentTrace, result: OptimizationResult, baseline: str) -> Tuple[float, float, bool]:
        """Simulate execution based on optimizer decisions."""

        # Base cost from the trace
        base_cost = trace.total_cost_computed

        # Scale cost by routing tier: cheaper tiers run at a fraction of the
        # recorded cost, the specialist tier at a premium
        tier = result.routing_decision.tier
        cost_mult = {
            1: 0.05, 2: 0.25, 3: 0.75, 4: 1.0, 5: 1.5,
        }.get(tier, 1.0)

        # Apply tool-gate savings: each skipped or cached tool call saves a flat $0.005
        tools_skipped = sum(1 for d in result.tool_decisions if d.decision.value in ("skip", "use_cache"))
        tool_savings = tools_skipped * 0.005

        # Apply prompt-cache savings
        cache_savings = 0.0
        if result.prompt_layout:
            cache_savings = result.prompt_layout.cache_discount

        sim_cost = base_cost * cost_mult - tool_savings - cache_savings
        sim_cost = max(sim_cost, 0.001)  # floor: no task is free

        # Simulate latency: cheaper tiers are also assumed faster
        sim_latency = trace.total_latency_ms * cost_mult * 0.8

        # Simulate success probability by tier and scenario
        scenario = trace.metadata.get("scenario", "normal")

        success_prob = 0.95 if tier >= 3 else 0.7
        if scenario == "cheap_failure":
            success_prob = 0.3 if tier <= 2 else 0.85
        elif scenario == "tool_underuse":
            success_prob = 0.6 if tools_skipped > 0 else 0.8
        elif scenario == "retry_loop":
            success_prob = 0.2
        elif scenario == "frontier_unnecessary":
            success_prob = 0.95
        elif scenario == "meta_tool_success":
            success_prob = 0.9
        elif scenario == "meta_tool_bad":
            success_prob = 0.4
        elif scenario == "false_done":
            success_prob = 0.1
        elif scenario in ("blocked", "stopped_doom"):
            success_prob = 0.0
        elif scenario == "human_escalation":
            success_prob = 0.5

        # Verifier improves success for high-risk tasks
        if result.verifier_decision and result.verifier_decision.decision.value == "call_verifier":
            success_prob += 0.05

        # Meta-tool success bonus
        if result.meta_tool_match:
            success_prob += 0.03

        sim_success = success_prob > 0.5  # simplified deterministic threshold

        return sim_cost, sim_latency, sim_success
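
    # The `success_prob > 0.5` threshold in _simulate collapses probabilities
    # into a step function. A minimal stochastic alternative, offered as a
    # sketch (an assumption, not part of the original design):
    @staticmethod
    def _sample_success(success_prob: float, rng=None) -> bool:
        import random  # local import: this helper is illustrative only
        rng = rng or random.Random(42)
        return rng.random() < success_prob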

    def run_all_baselines(self, traces: List[AgentTrace]) -> Dict[str, BenchmarkResult]:
        """Run all baseline configurations."""
        baselines = [
            "always_frontier",
            "always_cheap",
            "static",
            "prompt_only",
            "cascade",
            "rules_only",
            "full",
        ]

        results = {}
        for baseline in baselines:
            print(f"Running baseline: {baseline}...")
            results[baseline] = self.run_baseline(traces, baseline)

        return results

    def run_ablations(self, traces: List[AgentTrace]) -> Dict[str, BenchmarkResult]:
        """Run ablation study disabling each module."""
        ablations = [
            "no_router",
            "no_context_budgeter",
            "no_cache_layout",
            "no_tool_gate",
            "no_verifier_budgeter",
            "no_retry_optimizer",
            "no_meta_tool_miner",
            "no_early_termination",
        ]

        results = {}
        for ablation in ablations:
            print(f"Running ablation: {ablation}...")
            results[ablation] = self.run_baseline(traces, ablation)

        return results

    def report(self, results: Dict[str, BenchmarkResult]) -> str:
        """Generate a formatted benchmark report."""
        lines = ["=" * 80, "AGENT COST OPTIMIZER BENCHMARK REPORT", "=" * 80, ""]

        headers = ["Baseline", "Success", "Partial", "Fail", "Blocked", "False-DONE",
                   "Total Cost", "Avg Cost/Succ", "Latency(ms)", "Tools", "Verifiers",
                   "Retries", "Cache Hit", "Cost Reduction", "Regression"]

        lines.append(" | ".join(headers))
        lines.append("-" * len(lines[-1]))  # divider sized to the header row

        for name, result in results.items():
            nt = max(result.num_tasks, 1)  # guard against empty runs
            row = [
                name[:20].ljust(20),
                f"{result.num_success / nt:.1%}",
                f"{result.num_partial / nt:.1%}",
                f"{result.num_failure / nt:.1%}",
                f"{result.num_blocked / nt:.1%}",
                f"{result.false_done_rate:.1%}",
                f"${result.total_cost:.2f}",
                f"${result.avg_cost_success:.4f}",
                f"{result.avg_latency_ms:.0f}",
                str(result.total_tool_calls),
                str(result.total_verifier_calls),
                str(result.total_retries),
                f"{result.avg_cache_hit_rate:.1%}",
                f"{result.cost_reduction_vs_frontier:.1%}",
                f"{result.regression_rate:.1%}",
            ]
            lines.append(" | ".join(row))

        lines.append("")
        lines.append("=" * 80)

        # Composite score: reward success, penalize cost per success and
        # regressions (weights are heuristic)
        best_score = -float("inf")
        best_name = ""
        for name, result in results.items():
            success_rate = (result.num_success + result.num_partial) / max(result.num_tasks, 1)
            score = success_rate * 10 - result.avg_cost_success * 100 - result.regression_rate * 50
            if score > best_score:
                best_score = score
                best_name = name

        lines.append(f"BEST OVERALL: {best_name} (score={best_score:.2f})")
        lines.append("")

        return "\n".join(lines)

    def export(self, results: Dict[str, BenchmarkResult], path: str) -> None:
        """Export aggregate results to JSON."""
        # Per-task details and the quality/cost frontier are intentionally
        # omitted to keep the export compact.
        export_data = {}
        for name, result in results.items():
            export_data[name] = {
                "benchmark_name": result.benchmark_name,
                "baseline_name": result.baseline_name,
                "num_tasks": result.num_tasks,
                "num_success": result.num_success,
                "num_partial": result.num_partial,
                "num_failure": result.num_failure,
                "num_blocked": result.num_blocked,
                "num_false_done": result.num_false_done,
                "total_cost": result.total_cost,
                "avg_cost_success": result.avg_cost_success,
                "avg_latency_ms": result.avg_latency_ms,
                "total_tool_calls": result.total_tool_calls,
                "total_verifier_calls": result.total_verifier_calls,
                "total_retries": result.total_retries,
                "avg_cache_hit_rate": result.avg_cache_hit_rate,
                "total_context_tokens": result.total_context_tokens,
                "cost_reduction_vs_frontier": result.cost_reduction_vs_frontier,
                "false_done_rate": result.false_done_rate,
                "unsafe_cheap_miss_rate": result.unsafe_cheap_miss_rate,
                "missed_escalation_rate": result.missed_escalation_rate,
                "regression_rate": result.regression_rate,
            }

        with open(path, "w") as f:
            json.dump(export_data, f, indent=2)
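

# Minimal end-to-end sketch of how the suite is meant to be driven. It only
# calls functions defined in this file; the output path is an arbitrary
# example, not a path the project prescribes.
if __name__ == "__main__":
    suite = BenchmarkSuite()
    traces = suite.generate_benchmark_data(n=200)
    all_results = suite.run_all_baselines(traces)
    all_results.update(suite.run_ablations(traces))
    print(suite.report(all_results))
    suite.export(all_results, "benchmark_results.json")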