narcolepticchicken commited on
Commit
4358507
·
verified ·
1 Parent(s): f5d8bfc

Upload standalone_eval_v2.py

Browse files
Files changed (1) hide show
  1. standalone_eval_v2.py +498 -0
standalone_eval_v2.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Standalone benchmark runner v2 with realistic quality/cost tradeoffs."""
3
import argparse
import json
import os
import random
import sys
import uuid
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ class TaskType(Enum):
10
+ QUICK_ANSWER="quick_answer"; RESEARCH="research"; CODING="coding"
11
+ DOCUMENT_DRAFTING="document_drafting"; LEGAL_REGULATED="legal_regulated"
12
+ TOOL_HEAVY="tool_heavy"; RETRIEVAL_HEAVY="retrieval_heavy"
13
+ LONG_HORIZON="long_horizon"; UNKNOWN_AMBIGUOUS="unknown_ambiguous"
14
+
15
+ class Outcome(Enum):
16
+ SUCCESS="success"; PARTIAL_SUCCESS="partial_success"; FAILURE="failure"
17
+ FALSE_DONE="false_done"; BLOCKED="blocked"; ESCALATED_HUMAN="escalated_human"
18
+ STOPPED_DOOM="stopped_doom"
19
+
20
+ class FailureTag(Enum):
21
+ MODEL_TOO_WEAK="model_too_weak"; CONTEXT_TOO_SMALL="context_too_small"
22
+ TOOL_FAILED="tool_failed"; TOOL_UNNECESSARY="tool_unnecessary"
23
+ TOOL_MISSED="tool_missed"; RETRY_LOOP="retry_loop"
24
+ CACHE_BREAK="cache_break"; HALLUCINATION="hallucination"
25
+ TIMEOUT="timeout"; COST_EXCEEDED="cost_exceeded"
26
+ UNSAFE_CHEAP_MODEL="unsafe_cheap_model"; MISSED_ESCALATION="missed_escalation"
27
+ VERIFIER_FALSE_PASS="verifier_false_pass"
28
+
29
+ @dataclass
30
+ class ToolCall:
31
+ tool_name:str; tool_input:Dict[str,Any]; tool_output:Optional[str]=None
32
+ tool_cost:float=0.0; tool_latency_ms:float=0.0; cache_hit:bool=False
33
+ repeated:bool=False; ignored_result:bool=False; failed:bool=False
34
+
35
+ @dataclass
36
+ class ModelCall:
37
+ model_id:str; provider:str; input_tokens:int=0; output_tokens:int=0
38
+ reasoning_tokens:int=0; cost_per_1k_input:float=0.0; cost_per_1k_output:float=0.0
39
+ cache_hit_input_tokens:int=0; latency_ms:float=0.0
40
+ @property
41
+ def total_cost(self): return (self.input_tokens/1000)*self.cost_per_1k_input + (self.output_tokens/1000)*self.cost_per_1k_output - (self.cache_hit_input_tokens/1000)*self.cost_per_1k_input*0.5
42
+
43
+ @dataclass
44
+ class VerifierCall:
45
+ verifier_model_id:str; target_step_id:str; passed:bool=False
46
+ confidence:float=0.0; cost:float=0.0; latency_ms:float=0.0
47
+
48
+ @dataclass
49
+ class TraceStep:
50
+ step_id:str; timestamp:datetime; task_type:TaskType; model_call:ModelCall
51
+ tool_calls:List[ToolCall]=field(default_factory=list)
52
+ verifier_calls:List[VerifierCall]=field(default_factory=list)
53
+ context_size_tokens:int=0; context_sources:List[str]=field(default_factory=list)
54
+ retry_count:int=0; recovery_action:Optional[str]=None
55
+ artifacts_created:List[str]=field(default_factory=list)
56
+ step_outcome:Optional[Outcome]=None
57
+ @property
58
+ def step_cost(self): return (self.model_call.total_cost if self.model_call else 0.0)+sum(t.tool_cost for t in self.tool_calls)+sum(v.cost for v in self.verifier_calls)
59
+ @property
60
+ def step_latency_ms(self): return (self.model_call.latency_ms if self.model_call else 0.0)+sum(t.tool_latency_ms for t in self.tool_calls)+sum(v.latency_ms for v in self.verifier_calls)
61
+
62
+ @dataclass
63
+ class AgentTrace:
64
+ trace_id:str; user_request:str; task_type:TaskType
65
+ steps:List[TraceStep]=field(default_factory=list)
66
+ final_outcome:Optional[Outcome]=None; final_artifacts:List[str]=field(default_factory=list)
67
+ failure_tags:List[FailureTag]=field(default_factory=list)
68
+ user_satisfaction:Optional[float]=None
69
+ total_cost:Optional[float]=None
70
+ metadata:Dict[str,Any]=field(default_factory=dict)
71
+ @property
72
+ def total_cost_computed(self): return sum(s.step_cost for s in self.steps)
73
+ @property
74
+ def total_latency_ms(self): return sum(s.step_latency_ms for s in self.steps)
75
+ @property
76
+ def total_retries(self): return sum(s.retry_count for s in self.steps)
77
+ @property
78
+ def total_tool_calls(self): return sum(len(s.tool_calls) for s in self.steps)
79
+ @property
80
+ def total_verifier_calls(self): return sum(len(s.verifier_calls) for s in self.steps)
81
+ @property
82
+ def cache_hit_rate(self):
83
+ mc=[s.model_call for s in self.steps if s.model_call]
84
+ if not mc: return 0.0
85
+ ti=sum(m.input_tokens for m in mc)
86
+ return sum(m.cache_hit_input_tokens for m in mc)/ti if ti>0 else 0.0
87
+ def to_dict(self):
88
+ return {"trace_id":self.trace_id,"user_request":self.user_request,"task_type":self.task_type.value,
89
+ "steps":[{"step_id":s.step_id,"timestamp":s.timestamp.isoformat(),"task_type":s.task_type.value,
90
+ "model_call":{"model_id":s.model_call.model_id,"provider":s.model_call.provider,
91
+ "input_tokens":s.model_call.input_tokens,"output_tokens":s.model_call.output_tokens,
92
+ "reasoning_tokens":s.model_call.reasoning_tokens,"cost":s.model_call.total_cost,
93
+ "latency_ms":s.model_call.latency_ms,"cache_hit_input_tokens":s.model_call.cache_hit_input_tokens},
94
+ "tool_calls":[{"tool_name":t.tool_name,"tool_cost":t.tool_cost,"tool_latency_ms":t.tool_latency_ms,
95
+ "cache_hit":t.cache_hit,"repeated":t.repeated,"ignored_result":t.ignored_result,"failed":t.failed} for t in s.tool_calls],
96
+ "verifier_calls":[{"verifier_model_id":v.verifier_model_id,"passed":v.passed,
97
+ "confidence":v.confidence,"cost":v.cost} for v in s.verifier_calls],
98
+ "context_size_tokens":s.context_size_tokens,"retry_count":s.retry_count,
99
+ "recovery_action":s.recovery_action,"step_outcome":s.step_outcome.value if s.step_outcome else None,
100
+ "step_cost":s.step_cost,"step_latency_ms":s.step_latency_ms} for s in self.steps],
101
+ "final_outcome":self.final_outcome.value if self.final_outcome else None,
102
+ "failure_tags":[f.value for f in self.failure_tags],
103
+ "total_cost":self.total_cost_computed,"total_latency_ms":self.total_latency_ms,
104
+ "total_retries":self.total_retries,"total_tool_calls":self.total_tool_calls,
105
+ "total_verifier_calls":self.total_verifier_calls,
106
+ "cache_hit_rate":self.cache_hit_rate,"metadata":self.metadata}
107
+
108
+ class SyntheticTraceGenerator:
109
+ # Realistic provider pricing (per 1K tokens)
110
+ MODEL_CONFIGS = {
111
+ "tiny_local": {"tier":1,"cost_input":0.0001,"cost_output":0.0002,"latency":200,"strength":0.35,"name":"Tiny Local (Qwen-0.5B)"},
112
+ "cheap_cloud": {"tier":2,"cost_input":0.00015,"cost_output":0.0006,"latency":400,"strength":0.55,"name":"GPT-4o-mini"},
113
+ "medium": {"tier":3,"cost_input":0.0015,"cost_output":0.006,"latency":800,"strength":0.80,"name":"Claude-3.5-Sonnet"},
114
+ "frontier": {"tier":4,"cost_input":0.005,"cost_output":0.015,"latency":1500,"strength":0.93,"name":"GPT-4o / Claude-3-Opus"},
115
+ "specialist": {"tier":5,"cost_input":0.01,"cost_output":0.03,"latency":2000,"strength":0.97,"name":"o1 / o3-mini"},
116
+ }
117
+ TOOL_COSTS = {"search":0.002,"retrieve":0.001,"fetch":0.003,"code_execution":0.005,
118
+ "linter":0.001,"test_runner":0.003,"file_read":0.0005,"file_write":0.0005,
119
+ "calculator":0.0001,"database_query":0.004,"compliance_check":0.01,
120
+ "summarize":0.002,"task_planner":0.001,"progress_tracker":0.0005}
121
+ # Task difficulty: [tier_needed, risk_level]
122
+ TASK_DIFFICULTY = {
123
+ TaskType.QUICK_ANSWER: (1, 0.1),
124
+ TaskType.CODING: (3, 0.4),
125
+ TaskType.RESEARCH: (3, 0.5),
126
+ TaskType.DOCUMENT_DRAFTING: (2, 0.2),
127
+ TaskType.LEGAL_REGULATED: (4, 0.8),
128
+ TaskType.TOOL_HEAVY: (2, 0.3),
129
+ TaskType.RETRIEVAL_HEAVY: (2, 0.35),
130
+ TaskType.LONG_HORIZON: (3, 0.6),
131
+ TaskType.UNKNOWN_AMBIGUOUS: (3, 0.7),
132
+ }
133
+ SCENARIOS = [
134
+ {"name":"quick_answer_success","prob":0.18,"task_type":TaskType.QUICK_ANSWER,"tier":[1,2],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":1},
135
+ {"name":"quick_answer_cheap_fail","prob":0.02,"task_type":TaskType.QUICK_ANSWER,"tier":[1],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.MODEL_TOO_WEAK],"difficulty":2},
136
+ {"name":"coding_success_frontier","prob":0.08,"task_type":TaskType.CODING,"tier":[4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":4},
137
+ {"name":"coding_success_medium","prob":0.10,"task_type":TaskType.CODING,"tier":[3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":3},
138
+ {"name":"coding_cheap_fail","prob":0.05,"task_type":TaskType.CODING,"tier":[1,2],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.MODEL_TOO_WEAK],"difficulty":4},
139
+ {"name":"coding_tool_underuse","prob":0.04,"task_type":TaskType.CODING,"tier":[3,4],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.TOOL_MISSED],"difficulty":3},
140
+ {"name":"research_success","prob":0.10,"task_type":TaskType.RESEARCH,"tier":[3,4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":3},
141
+ {"name":"research_cheap_fail","prob":0.03,"task_type":TaskType.RESEARCH,"tier":[1,2],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.MODEL_TOO_WEAK],"difficulty":4},
142
+ {"name":"document_draft_success","prob":0.08,"task_type":TaskType.DOCUMENT_DRAFTING,"tier":[2,3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":2},
143
+ {"name":"legal_frontier_success","prob":0.04,"task_type":TaskType.LEGAL_REGULATED,"tier":[4,5],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":5},
144
+ {"name":"legal_cheap_unsafe","prob":0.02,"task_type":TaskType.LEGAL_REGULATED,"tier":[1,2],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.UNSAFE_CHEAP_MODEL],"difficulty":5},
145
+ {"name":"tool_heavy_success","prob":0.06,"task_type":TaskType.TOOL_HEAVY,"tier":[2,3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":2},
146
+ {"name":"retrieval_success","prob":0.06,"task_type":TaskType.RETRIEVAL_HEAVY,"tier":[2,3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":2},
147
+ {"name":"long_horizon_success","prob":0.05,"task_type":TaskType.LONG_HORIZON,"tier":[3,4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":4},
148
+ {"name":"long_horizon_retry_loop","prob":0.03,"task_type":TaskType.LONG_HORIZON,"tier":[3],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.RETRY_LOOP],"difficulty":4},
149
+ {"name":"unknown_ambiguous_success","prob":0.03,"task_type":TaskType.UNKNOWN_AMBIGUOUS,"tier":[3,4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":3},
150
+ {"name":"unknown_ambiguous_blocked","prob":0.02,"task_type":TaskType.UNKNOWN_AMBIGUOUS,"tier":[3,4],"outcome":Outcome.BLOCKED,"failure_tags":[FailureTag.MISSED_ESCALATION],"difficulty":3},
151
+ {"name":"tool_overuse","prob":0.04,"task_type":TaskType.CODING,"tier":[3,4],"outcome":Outcome.PARTIAL_SUCCESS,"failure_tags":[FailureTag.TOOL_UNNECESSARY],"difficulty":3},
152
+ {"name":"cache_break_scenario","prob":0.03,"task_type":TaskType.RESEARCH,"tier":[3,4],"outcome":Outcome.PARTIAL_SUCCESS,"failure_tags":[FailureTag.CACHE_BREAK],"difficulty":3},
153
+ {"name":"false_done_scenario","prob":0.02,"task_type":TaskType.CODING,"tier":[3,4],"outcome":Outcome.FALSE_DONE,"failure_tags":[FailureTag.VERIFIER_FALSE_PASS],"difficulty":3},
154
+ ]
155
+ def __init__(self,seed=42): self.rng=random.Random(seed)
156
+ def generate(self,n=10000): return [self._generate_trace(i) for i in range(n)]
157
+ def _pick_scenario(self): return self.rng.choices(self.SCENARIOS,weights=[s["prob"] for s in self.SCENARIOS])[0]
158
+ def _tier_to_model(self,tier): return {1:"tiny_local",2:"cheap_cloud",3:"medium",4:"frontier",5:"specialist"}.get(tier,"medium")
159
+ def _generate_trace(self,idx):
160
+ scenario=self._pick_scenario()
161
+ trace_id=f"synth_{idx}_{uuid.uuid4().hex[:8]}"
162
+ task_type=scenario["task_type"]
163
+ user_request=self._generate_request(task_type,scenario["name"])
164
+ base_steps=self.rng.randint(1,8)
165
+ if "long_horizon" in scenario["name"] or "retry_loop" in scenario["name"]: base_steps=self.rng.randint(4,12)
166
+ elif "coding" in scenario["name"] and scenario["outcome"]==Outcome.FAILURE: base_steps=self.rng.randint(3,8)
167
+ tier=self.rng.choice(scenario["tier"])
168
+ model_key=self._tier_to_model(tier)
169
+ model_cfg=self.MODEL_CONFIGS[model_key]
170
+ steps=[]
171
+ for step_idx in range(base_steps):
172
+ steps.append(self._generate_step(trace_id,step_idx,task_type,model_key,model_cfg,scenario,step_idx==base_steps-1))
173
+ return AgentTrace(
174
+ trace_id=trace_id,user_request=user_request,task_type=task_type,steps=steps,
175
+ final_outcome=scenario["outcome"],failure_tags=list(scenario.get("failure_tags",[])),
176
+ total_cost=sum(s.step_cost for s in steps),
177
+ metadata={"scenario":scenario["name"],"synthetic":True,"difficulty":scenario["difficulty"],
178
+ "optimal_tier":scenario["difficulty"],"actual_tier":tier})
179
+ def _generate_request(self,task_type,scenario_name):
180
+ templates={
181
+ TaskType.QUICK_ANSWER:["What is the capital of France?","Briefly explain quantum computing.","Summarize article X.","What is 237 * 452?"],
182
+ TaskType.CODING:["Write a Python function to reverse a linked list.","Fix the bug in this React component.","Refactor auth module to JWT.","Implement LRU cache in Go.","Debug this segfault in C++ thread pool."],
183
+ TaskType.RESEARCH:["Research latest transformer advances.","Find sources comparing LoRA and full FT.","Investigate data center climate impact.","What does literature say on speculative decoding?"],
184
+ TaskType.DOCUMENT_DRAFTING:["Draft project proposal for ML pipeline.","Write email to team about deployment.","Create technical report on performance."],
185
+ TaskType.LEGAL_REGULATED:["Review this contract for liability clauses.","Check GDPR compliance for data pipeline.","Draft privacy policy section."],
186
+ TaskType.TOOL_HEAVY:["Search open issues and create summary.","Fetch API docs and generate client code.","Query Q3 sales and produce chart."],
187
+ TaskType.RETRIEVAL_HEAVY:["Answer based on 50-page document.","Find all 'payment processing' mentions.","Retrieve relevant cases for legal query."],
188
+ TaskType.LONG_HORIZON:["Plan 3-month roadmap.","Orchestrate multi-region deployment.","Redesign data architecture end-to-end."],
189
+ TaskType.UNKNOWN_AMBIGUOUS:["Help me with this thing.","I need something about the server.","Can you look into that issue?"],
190
+ }
191
+ return self.rng.choice(templates.get(task_type,["Generic request"]))
192
+ def _get_tools_for_task(self,task_type):
193
+ return {TaskType.QUICK_ANSWER:["calculator","search"],
194
+ TaskType.CODING:["file_read","file_write","code_execution","linter","test_runner"],
195
+ TaskType.RESEARCH:["search","retrieve","fetch","summarize"],
196
+ TaskType.DOCUMENT_DRAFTING:["file_read","summarize"],
197
+ TaskType.LEGAL_REGULATED:["document_retrieval","compliance_check","search"],
198
+ TaskType.TOOL_HEAVY:["search","fetch","api_call","database_query"],
199
+ TaskType.RETRIEVAL_HEAVY:["retrieve","search","fetch"],
200
+ TaskType.LONG_HORIZON:["task_planner","progress_tracker","file_read"],
201
+ TaskType.UNKNOWN_AMBIGUOUS:["search"]}.get(task_type,["search"])
202
+ def _generate_step(self,trace_id,step_idx,task_type,model_key,model_cfg,scenario,is_last):
203
+ step_id=f"{trace_id}_step_{step_idx}"
204
+ input_tokens=self.rng.randint(800,12000)
205
+ output_tokens=self.rng.randint(200,6000)
206
+ cache_hit=self.rng.random()<0.35
207
+ cache_hit_tokens=int(input_tokens*self.rng.random()*0.6) if cache_hit else 0
208
+ model_call=ModelCall(model_id=model_key,provider="synthetic",input_tokens=input_tokens,output_tokens=output_tokens,
209
+ reasoning_tokens=output_tokens//4 if model_key in ("frontier","specialist") else 0,
210
+ cost_per_1k_input=model_cfg["cost_input"],cost_per_1k_output=model_cfg["cost_output"],
211
+ cache_hit_input_tokens=cache_hit_tokens,latency_ms=model_cfg["latency"]*self.rng.uniform(0.8,1.5))
212
+ tool_calls=[]; base_tools=self._get_tools_for_task(task_type); num_tools=self.rng.randint(0,len(base_tools))
213
+ if scenario["name"]=="tool_overuse": num_tools+=3
214
+ for t in range(min(num_tools,len(base_tools))):
215
+ tool_name=base_tools[t]
216
+ tool_calls.append(ToolCall(tool_name=tool_name,tool_input={"query":f"auto_{tool_name}"},
217
+ tool_cost=self.TOOL_COSTS.get(tool_name,0.001),tool_latency_ms=self.rng.uniform(100,1200),
218
+ cache_hit=self.rng.random()<0.2,repeated=self.rng.random()<0.1,
219
+ ignored_result=self.rng.random()<0.05,
220
+ failed=self.rng.random()<(0.3 if "retry_loop" in scenario["name"] else 0.05)))
221
+ verifier_calls=[]; num_verifiers=0
222
+ if task_type==TaskType.LEGAL_REGULATED: num_verifiers=1
223
+ elif task_type in (TaskType.CODING,TaskType.RESEARCH) and model_key in ("frontier","specialist"): num_verifiers=1 if self.rng.random()<0.4 else 0
224
+ for _ in range(num_verifiers):
225
+ verifier_calls.append(VerifierCall(verifier_model_id="verifier_medium",target_step_id=step_id,
226
+ passed=self.rng.random()<0.85,confidence=self.rng.uniform(0.6,0.99),cost=0.005,latency_ms=500))
227
+ context_size=self.rng.randint(1500,20000)
228
+ if scenario["name"]=="cache_break_scenario": context_size+=self.rng.randint(8000,30000)
229
+ retries=0
230
+ if "retry_loop" in scenario["name"]: retries=self.rng.randint(4,8)
231
+ elif self.rng.random()<0.12: retries=self.rng.randint(1,3)
232
+ recovery=None
233
+ if retries>0: recovery=self.rng.choice(["retry_same","retry_changed_prompt","repair_tool","switch_model","ask_clarification"])
234
+ step_outcome=Outcome.SUCCESS
235
+ if is_last: step_outcome=scenario["outcome"]
236
+ elif "retry_loop" in scenario["name"] and step_idx>=2: step_outcome=Outcome.FAILURE
237
+ return TraceStep(step_id=step_id,timestamp=datetime.utcnow()+timedelta(seconds=step_idx*30),task_type=task_type,
238
+ model_call=model_call,tool_calls=tool_calls,verifier_calls=verifier_calls,
239
+ context_size_tokens=context_size,context_sources=["system_rules","tool_descriptions","user_preferences","recent_messages"],
240
+ retry_count=retries,recovery_action=recovery,
241
+ artifacts_created=[f"artifact_{step_idx}"] if self.rng.random()<0.25 else [],
242
+ step_outcome=step_outcome)
243
+
244
+ @dataclass
245
+ class BenchmarkResult:
246
+ baseline_name:str; num_tasks:int; num_success:int; num_partial:int
247
+ num_failure:int; num_false_done:int; num_blocked:int
248
+ total_cost:float; avg_cost_success:float; avg_latency_ms:float
249
+ total_tool_calls:int; total_verifier_calls:int; total_retries:int
250
+ avg_cache_hit_rate:float; cost_reduction_vs_frontier:float
251
+ false_done_rate:float; unsafe_cheap_miss_rate:float
252
+ missed_escalation_rate:float; regression_rate:float
253
+ per_scenario_stats:Dict[str,Dict[str,Any]]=field(default_factory=dict)
254
+
255
+ class BenchmarkSuite:
256
+ MODEL_CONFIGS = SyntheticTraceGenerator.MODEL_CONFIGS
257
+ TASK_DIFFICULTY = SyntheticTraceGenerator.TASK_DIFFICULTY
258
+ def __init__(self): pass
259
+ def generate_benchmark_data(self,n=1000,seed=42): return SyntheticTraceGenerator(seed=seed).generate(n)
260
+
261
+ def run_all_baselines(self,traces):
262
+ baselines=["always_frontier","always_cheap","static","cascade","full_optimizer"]
263
+ results={}
264
+ for baseline in baselines:
265
+ print(f"Running baseline: {baseline}...")
266
+ results[baseline]=self._run_baseline(traces,baseline)
267
+ return results
268
+
269
+ def run_ablations(self,traces):
270
+ ablations=["no_router","no_tool_gate","no_verifier","no_early_term","no_context_budget"]
271
+ results={}
272
+ for ablation in ablations:
273
+ print(f"Running ablation: {ablation}...")
274
+ results[ablation]=self._run_baseline(traces,ablation)
275
+ return results
276
+
277
+ def _run_baseline(self,traces,baseline_name):
278
+ success_count=0; partial_count=0; failure_count=0; false_done_count=0; blocked_count=0
279
+ total_cost=0.0; total_latency=0.0; total_tools=0; total_verifiers=0; total_retries=0
280
+ cache_rates=[]; cheap_misses=0; escalation_misses=0; regression_count=0
281
+ per_scenario=defaultdict(lambda:{"count":0,"success":0,"cost":0.0})
282
+ for trace in traces:
283
+ sim_cost,sim_success,sim_outcome=self._simulate(trace,baseline_name)
284
+ total_cost+=sim_cost; total_latency+=trace.total_latency_ms*0.7
285
+ total_tools+=trace.total_tool_calls; total_verifiers+=trace.total_verifier_calls
286
+ total_retries+=trace.total_retries; cache_rates.append(trace.cache_hit_rate)
287
+ scenario=trace.metadata.get("scenario","normal")
288
+ per_scenario[scenario]["count"]+=1; per_scenario[scenario]["cost"]+=sim_cost
289
+ if sim_success:
290
+ if sim_outcome in (Outcome.SUCCESS,Outcome.PARTIAL_SUCCESS):
291
+ success_count+=1; per_scenario[scenario]["success"]+=1
292
+ else: regression_count+=1
293
+ else:
294
+ if sim_outcome==Outcome.FALSE_DONE: false_done_count+=1
295
+ elif sim_outcome==Outcome.BLOCKED: blocked_count+=1
296
+ else: failure_count+=1
297
+ # Track cheap model misses
298
+ difficulty=trace.metadata.get("difficulty",3)
299
+ actual_tier=trace.metadata.get("actual_tier",3)
300
+ if actual_tier<difficulty and actual_tier<=2 and sim_outcome in (Outcome.FAILURE,Outcome.PARTIAL_SUCCESS):
301
+ cheap_misses+=1
302
+ if actual_tier<difficulty and sim_outcome in (Outcome.FAILURE,Outcome.BLOCKED):
303
+ escalation_misses+=1
304
+ n=len(traces); avg_cost_success=total_cost/max(success_count,1)
305
+ frontier_total=sum(t.total_cost_computed*4 for t in traces) # frontier costs ~4x medium
306
+ cost_reduction=(frontier_total-total_cost)/max(frontier_total,1)
307
+ return BenchmarkResult(
308
+ baseline_name=baseline_name,num_tasks=n,num_success=success_count,num_partial=partial_count,
309
+ num_failure=failure_count,num_false_done=false_done_count,num_blocked=blocked_count,
310
+ total_cost=total_cost,avg_cost_success=avg_cost_success,avg_latency_ms=total_latency/n,
311
+ total_tool_calls=total_tools,total_verifier_calls=total_verifiers,total_retries=total_retries,
312
+ avg_cache_hit_rate=sum(cache_rates)/n,cost_reduction_vs_frontier=cost_reduction,
313
+ false_done_rate=false_done_count/n,unsafe_cheap_miss_rate=cheap_misses/n,
314
+ missed_escalation_rate=escalation_misses/n,regression_rate=regression_count/n,
315
+ per_scenario_stats=dict(per_scenario))
316
+
317
+ def _simulate(self,trace,baseline):
318
+ """Realistic simulation: tier vs difficulty determines success."""
319
+ scenario=trace.metadata.get("scenario","normal")
320
+ difficulty=trace.metadata.get("difficulty",3)
321
+ actual_tier=trace.metadata.get("actual_tier",3)
322
+ base_cost=trace.total_cost_computed
323
+ # Determine what tier the baseline would actually use
324
+ if baseline=="always_frontier": chosen_tier=4
325
+ elif baseline=="always_cheap": chosen_tier=2
326
+ elif baseline in ("no_router","static"): chosen_tier=actual_tier # uses same as trace, no optimization
327
+ elif baseline in ("cascade","full_optimizer"):
328
+ # Cascade tries lower tier first, escalates if needed
329
+ if difficulty<=2: chosen_tier=2
330
+ elif difficulty==3: chosen_tier=3 if self._tier_success_prob(3,difficulty)>0.7 else 4
331
+ elif difficulty==4: chosen_tier=3 if self._tier_success_prob(3,difficulty)>0.6 else 4
332
+ else: chosen_tier=4 if self._tier_success_prob(4,difficulty)>0.5 else 5
333
+ elif baseline=="no_tool_gate": chosen_tier=actual_tier # same tier, but no tool savings
334
+ elif baseline=="no_verifier": chosen_tier=actual_tier
335
+ elif baseline=="no_early_term": chosen_tier=actual_tier
336
+ elif baseline=="no_context_budget": chosen_tier=actual_tier
337
+ else: chosen_tier=actual_tier
338
+ # Cost multiplier based on chosen tier
339
+ tier_cost_mult={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}.get(chosen_tier,0.75)
340
+ actual_cost_mult={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}.get(actual_tier,0.75)
341
+ # Adjust cost: cascade uses cheaper tier when possible
342
+ cost_ratio=tier_cost_mult/actual_cost_mult if actual_cost_mult>0 else 1.0
343
+ sim_cost=base_cost*cost_ratio
344
+ # Tool gate savings for cascade/full
345
+ if baseline in ("cascade","full_optimizer"):
346
+ if "tool_overuse" in scenario: sim_cost*=0.75
347
+ # Cache savings
348
+ if baseline=="full_optimizer" and "cache_break" not in scenario: sim_cost*=0.92
349
+ # Verifier savings
350
+ if baseline=="full_optimizer" and chosen_tier>=3 and difficulty<4: sim_cost*=0.95
351
+ # Early termination savings
352
+ if baseline=="full_optimizer" and "retry_loop" in scenario: sim_cost*=0.60
353
+ if baseline=="no_early_term" and "retry_loop" in scenario: sim_cost*=1.4
354
+ # Determine success probability
355
+ success_prob=self._tier_success_prob(chosen_tier,difficulty)
356
+ # Apply baseline-specific modifiers
357
+ if baseline=="always_cheap" and difficulty>=3: success_prob*=0.3
358
+ elif baseline=="no_tool_gate" and "tool" in scenario: success_prob*=0.7
359
+ elif baseline=="no_verifier" and difficulty>=4: success_prob*=0.85
360
+ elif baseline=="full_optimizer": success_prob=min(1.0,success_prob+0.05)
361
+ # Special scenarios
362
+ if "false_done" in scenario: success_prob=0.1
363
+ elif "blocked" in scenario: success_prob=0.0
364
+ elif "retry_loop" in scenario and baseline not in ("full_optimizer",):
365
+ if baseline=="no_early_term": success_prob=0.1
366
+ else: success_prob=0.25
367
+ elif "retry_loop" in scenario and baseline=="full_optimizer":
368
+ success_prob=0.5 # Doom detector catches it
369
+ sim_success=success_prob>0.5
370
+ # Determine simulated outcome
371
+ if "false_done" in scenario: sim_outcome=Outcome.FALSE_DONE
372
+ elif "blocked" in scenario: sim_outcome=Outcome.BLOCKED
373
+ elif sim_success:
374
+ if success_prob>0.85: sim_outcome=Outcome.SUCCESS
375
+ else: sim_outcome=Outcome.PARTIAL_SUCCESS
376
+ else:
377
+ if "retry_loop" in scenario: sim_outcome=Outcome.FAILURE
378
+ elif success_prob<0.2: sim_outcome=Outcome.BLOCKED
379
+ else: sim_outcome=Outcome.FAILURE
380
+ return sim_cost,sim_success,sim_outcome
381
+
382
+ def _tier_success_prob(self,tier,difficulty):
383
+ strength={1:0.35,2:0.55,3:0.80,4:0.93,5:0.97}.get(tier,0.5)
384
+ # Success = strength^difficulty (harder tasks need exponentially more strength)
385
+ return strength**(difficulty*0.6)
386
+
387
+ def report(self,results):
388
+ lines=["="*100,"AGENT COST OPTIMIZER BENCHMARK REPORT v2","="*100,""]
389
+ headers=["Baseline","Success","Partial","Fail","Blocked","F-DONE",
390
+ "Total Cost","Avg$/Succ","Lat(ms)","Tools","Verif","Retry",
391
+ "Cache%","CostRed%","Regression","CheapMiss","EscMiss"]
392
+ lines.append(" | ".join(headers)); lines.append("-"*160)
393
+ for name,result in results.items():
394
+ row=[name[:22].ljust(22),
395
+ f"{result.num_success/result.num_tasks:.1%}",
396
+ f"{result.num_partial/result.num_tasks:.1%}",
397
+ f"{result.num_failure/result.num_tasks:.1%}",
398
+ f"{result.num_blocked/result.num_tasks:.1%}",
399
+ f"{result.false_done_rate:.1%}",
400
+ f"${result.total_cost:.2f}",
401
+ f"${result.avg_cost_success:.4f}",
402
+ f"{result.avg_latency_ms:.0f}",
403
+ str(result.total_tool_calls),str(result.total_verifier_calls),str(result.total_retries),
404
+ f"{result.avg_cache_hit_rate:.1%}",
405
+ f"{result.cost_reduction_vs_frontier:.1%}",
406
+ f"{result.regression_rate:.1%}",
407
+ f"{result.unsafe_cheap_miss_rate:.1%}",
408
+ f"{result.missed_escalation_rate:.1%}",
409
+ ]
410
+ lines.append(" | ".join(row))
411
+ lines.append(""); lines.append("="*100)
412
+ # Find best on Pareto frontier
413
+ best_score,best_name=-float("inf"),""
414
+ for name,result in results.items():
415
+ success_rate=(result.num_success+result.num_partial)/result.num_tasks
416
+ score=success_rate*20-result.avg_cost_success*50-result.regression_rate*30-result.unsafe_cheap_miss_rate*40
417
+ if score>best_score: best_score,best_name=score,name
418
+ lines.append(f"BEST PARETO: {best_name} (score={best_score:.2f})")
419
+ # Quality/cost ranking
420
+ lines.append(""); lines.append("QUALITY/COST FRONTIER (Success Rate vs Avg Cost per Success):")
421
+ points=[(name,(r.num_success+r.num_partial)/r.num_tasks,r.avg_cost_success) for name,r in results.items()]
422
+ points.sort(key=lambda x:(-x[1],x[2]))
423
+ for name,sr,cost in points:
424
+ lines.append(f" {name:22s} | Success: {sr:.1%} | Cost/Success: ${cost:.4f}")
425
+ lines.append(""); lines.append("="*100)
426
+ return "\n".join(lines)
427
+
428
+ def export(self,results,path):
429
+ export_data={}
430
+ for name,result in results.items():
431
+ export_data[name]={"baseline_name":result.baseline_name,"num_tasks":result.num_tasks,
432
+ "num_success":result.num_success,"num_partial":result.num_partial,
433
+ "num_failure":result.num_failure,"num_false_done":result.num_false_done,
434
+ "num_blocked":result.num_blocked,"total_cost":result.total_cost,
435
+ "avg_cost_success":result.avg_cost_success,"avg_latency_ms":result.avg_latency_ms,
436
+ "total_tool_calls":result.total_tool_calls,"total_verifier_calls":result.total_verifier_calls,
437
+ "total_retries":result.total_retries,"avg_cache_hit_rate":result.avg_cache_hit_rate,
438
+ "cost_reduction_vs_frontier":result.cost_reduction_vs_frontier,
439
+ "false_done_rate":result.false_done_rate,
440
+ "unsafe_cheap_miss_rate":result.unsafe_cheap_miss_rate,
441
+ "missed_escalation_rate":result.missed_escalation_rate,
442
+ "regression_rate":result.regression_rate,
443
+ "per_scenario_stats":result.per_scenario_stats}
444
+ with open(path,"w") as f: json.dump(export_data,f,indent=2)
445
+
446
+ if __name__=="__main__":
447
+ parser=argparse.ArgumentParser(description="ACO Evaluation Runner v2")
448
+ parser.add_argument("--tasks","-n",type=int,default=2000,help="Number of tasks")
449
+ parser.add_argument("--seed","-s",type=int,default=42,help="Random seed")
450
+ parser.add_argument("--output","-o",default="./eval_results_v2",help="Output directory")
451
+ args=parser.parse_args()
452
+ os.makedirs(args.output,exist_ok=True)
453
+ suite=BenchmarkSuite()
454
+ print(f"[{datetime.now().isoformat()}] Generating {args.tasks} synthetic traces...")
455
+ traces=suite.generate_benchmark_data(args.tasks,seed=args.seed)
456
+ traces_path=os.path.join(args.output,"traces.jsonl")
457
+ with open(traces_path,"w") as f:
458
+ for trace in traces: f.write(json.dumps(trace.to_dict())+"\n")
459
+ print(f" Saved {len(traces)} traces to {traces_path}")
460
+ print(f"\n[{datetime.now().isoformat()}] Running baselines...")
461
+ baseline_results=suite.run_all_baselines(traces)
462
+ baseline_path=os.path.join(args.output,"baseline_results.json")
463
+ suite.export(baseline_results,baseline_path)
464
+ print(f" Saved to {baseline_path}")
465
+ print(f"\n[{datetime.now().isoformat()}] Running ablations...")
466
+ ablation_results=suite.run_ablations(traces)
467
+ ablation_path=os.path.join(args.output,"ablation_results.json")
468
+ suite.export(ablation_results,ablation_path)
469
+ print(f" Saved to {ablation_path}")
470
+ all_results={**baseline_results,**ablation_results}
471
+ report=suite.report(all_results)
472
+ report_path=os.path.join(args.output,"report.txt")
473
+ with open(report_path,"w") as f: f.write(report)
474
+ print(f" Saved report to {report_path}")
475
+ # Cost-quality frontier
476
+ points=[]
477
+ for name,result in all_results.items():
478
+ sr=(result.num_success+result.num_partial)/result.num_tasks
479
+ points.append({"baseline":name,"success_rate":sr,"avg_cost_per_success":result.avg_cost_success,
480
+ "total_cost":result.total_cost,"regression_rate":result.regression_rate,
481
+ "false_done_rate":result.false_done_rate,"cheap_miss_rate":result.unsafe_cheap_miss_rate})
482
+ frontier=[]
483
+ for p in points:
484
+ dominated=False
485
+ for q in points:
486
+ if q["baseline"]==p["baseline"]: continue
487
+ if q["success_rate"]>=p["success_rate"] and q["avg_cost_per_success"]<=p["avg_cost_per_success"]:
488
+ if q["success_rate"]>p["success_rate"] or q["avg_cost_per_success"]<p["avg_cost_per_success"]:
489
+ dominated=True; break
490
+ if not dominated: frontier.append(p)
491
+ frontier.sort(key=lambda x:x["success_rate"],reverse=True)
492
+ frontier_data={"all_points":points,"pareto_frontier":frontier,"frontier_baselines":[p["baseline"] for p in frontier]}
493
+ frontier_path=os.path.join(args.output,"cost_quality_frontier.json")
494
+ with open(frontier_path,"w") as f: json.dump(frontier_data,indent=2,fp=f)
495
+ print(f" Saved frontier to {frontier_path}")
496
+ print("\n"+"="*100)
497
+ print(report)
498
+ print("="*100)