Upload standalone_eval_v2.py
Browse files- standalone_eval_v2.py +498 -0
standalone_eval_v2.py
ADDED
|
@@ -0,0 +1,498 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Standalone benchmark runner v2 with realistic quality/cost tradeoffs."""
|
| 3 |
+
import argparse
import json
import os
import random
import sys
import uuid
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Dict, List, Optional, Any, Tuple
|
| 8 |
+
|
| 9 |
+
class TaskType(Enum):
    """Category of user request the agent is asked to handle."""

    QUICK_ANSWER = "quick_answer"
    RESEARCH = "research"
    CODING = "coding"
    DOCUMENT_DRAFTING = "document_drafting"
    LEGAL_REGULATED = "legal_regulated"
    TOOL_HEAVY = "tool_heavy"
    RETRIEVAL_HEAVY = "retrieval_heavy"
    LONG_HORIZON = "long_horizon"
    UNKNOWN_AMBIGUOUS = "unknown_ambiguous"
|
| 14 |
+
|
| 15 |
+
class Outcome(Enum):
    """Terminal state of a trace or of an individual step."""

    SUCCESS = "success"
    PARTIAL_SUCCESS = "partial_success"
    FAILURE = "failure"
    FALSE_DONE = "false_done"
    BLOCKED = "blocked"
    ESCALATED_HUMAN = "escalated_human"
    STOPPED_DOOM = "stopped_doom"
|
| 19 |
+
|
| 20 |
+
class FailureTag(Enum):
    """Diagnostic label attached to a failed or degraded trace."""

    MODEL_TOO_WEAK = "model_too_weak"
    CONTEXT_TOO_SMALL = "context_too_small"
    TOOL_FAILED = "tool_failed"
    TOOL_UNNECESSARY = "tool_unnecessary"
    TOOL_MISSED = "tool_missed"
    RETRY_LOOP = "retry_loop"
    CACHE_BREAK = "cache_break"
    HALLUCINATION = "hallucination"
    TIMEOUT = "timeout"
    COST_EXCEEDED = "cost_exceeded"
    UNSAFE_CHEAP_MODEL = "unsafe_cheap_model"
    MISSED_ESCALATION = "missed_escalation"
    VERIFIER_FALSE_PASS = "verifier_false_pass"
|
| 28 |
+
|
| 29 |
+
@dataclass
class ToolCall:
    """Record of a single tool invocation within a trace step."""

    tool_name: str
    tool_input: Dict[str, Any]
    tool_output: Optional[str] = None
    tool_cost: float = 0.0          # dollars charged for this invocation
    tool_latency_ms: float = 0.0
    cache_hit: bool = False         # result served from a tool-level cache
    repeated: bool = False          # duplicate of an earlier call in the trace
    ignored_result: bool = False    # output was fetched but never used
    failed: bool = False
|
| 34 |
+
|
| 35 |
+
@dataclass
class ModelCall:
    """Record of one LLM invocation: token counts, pricing, and latency."""

    model_id: str
    provider: str
    input_tokens: int = 0
    output_tokens: int = 0
    reasoning_tokens: int = 0       # tracked separately; not billed in total_cost
    cost_per_1k_input: float = 0.0
    cost_per_1k_output: float = 0.0
    cache_hit_input_tokens: int = 0
    latency_ms: float = 0.0

    @property
    def total_cost(self) -> float:
        """Dollar cost of the call; cached input tokens get a 50% discount."""
        input_cost = (self.input_tokens / 1000) * self.cost_per_1k_input
        output_cost = (self.output_tokens / 1000) * self.cost_per_1k_output
        cache_discount = (self.cache_hit_input_tokens / 1000) * self.cost_per_1k_input * 0.5
        return input_cost + output_cost - cache_discount
|
| 42 |
+
|
| 43 |
+
@dataclass
class VerifierCall:
    """Record of a verifier model checking the output of one step."""

    verifier_model_id: str
    target_step_id: str
    passed: bool = False
    confidence: float = 0.0
    cost: float = 0.0
    latency_ms: float = 0.0
|
| 47 |
+
|
| 48 |
+
@dataclass
class TraceStep:
    """One step of an agent trace: a model call plus its tool and verifier calls."""

    step_id: str
    timestamp: datetime
    task_type: TaskType
    model_call: ModelCall
    tool_calls: List[ToolCall] = field(default_factory=list)
    verifier_calls: List[VerifierCall] = field(default_factory=list)
    context_size_tokens: int = 0
    context_sources: List[str] = field(default_factory=list)
    retry_count: int = 0
    recovery_action: Optional[str] = None
    artifacts_created: List[str] = field(default_factory=list)
    step_outcome: Optional[Outcome] = None

    @property
    def step_cost(self) -> float:
        """Total dollar cost of the step: model + tools + verifiers."""
        model_cost = self.model_call.total_cost if self.model_call else 0.0
        tools_cost = sum(call.tool_cost for call in self.tool_calls)
        verifiers_cost = sum(call.cost for call in self.verifier_calls)
        return model_cost + tools_cost + verifiers_cost

    @property
    def step_latency_ms(self) -> float:
        """Total latency of the step in milliseconds: model + tools + verifiers."""
        model_latency = self.model_call.latency_ms if self.model_call else 0.0
        tools_latency = sum(call.tool_latency_ms for call in self.tool_calls)
        verifiers_latency = sum(call.latency_ms for call in self.verifier_calls)
        return model_latency + tools_latency + verifiers_latency
|
| 61 |
+
|
| 62 |
+
@dataclass
class AgentTrace:
    """Complete record of one agent run: request, steps, outcome, and rollups."""

    trace_id: str
    user_request: str
    task_type: TaskType
    steps: List[TraceStep] = field(default_factory=list)
    final_outcome: Optional[Outcome] = None
    final_artifacts: List[str] = field(default_factory=list)
    failure_tags: List[FailureTag] = field(default_factory=list)
    user_satisfaction: Optional[float] = None
    total_cost: Optional[float] = None      # as recorded; see total_cost_computed
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def total_cost_computed(self) -> float:
        """Sum of per-step costs (independent of the stored total_cost field)."""
        return sum(step.step_cost for step in self.steps)

    @property
    def total_latency_ms(self) -> float:
        return sum(step.step_latency_ms for step in self.steps)

    @property
    def total_retries(self) -> int:
        return sum(step.retry_count for step in self.steps)

    @property
    def total_tool_calls(self) -> int:
        return sum(len(step.tool_calls) for step in self.steps)

    @property
    def total_verifier_calls(self) -> int:
        return sum(len(step.verifier_calls) for step in self.steps)

    @property
    def cache_hit_rate(self) -> float:
        """Fraction of input tokens served from cache across all model calls."""
        calls = [step.model_call for step in self.steps if step.model_call]
        if not calls:
            return 0.0
        total_input = sum(call.input_tokens for call in calls)
        if total_input > 0:
            return sum(call.cache_hit_input_tokens for call in calls) / total_input
        return 0.0

    @staticmethod
    def _step_to_dict(s: TraceStep) -> Dict[str, Any]:
        """Serialize one step (helper for to_dict)."""
        mc = s.model_call
        return {
            "step_id": s.step_id,
            "timestamp": s.timestamp.isoformat(),
            "task_type": s.task_type.value,
            "model_call": {
                "model_id": mc.model_id,
                "provider": mc.provider,
                "input_tokens": mc.input_tokens,
                "output_tokens": mc.output_tokens,
                "reasoning_tokens": mc.reasoning_tokens,
                "cost": mc.total_cost,
                "latency_ms": mc.latency_ms,
                "cache_hit_input_tokens": mc.cache_hit_input_tokens,
            },
            "tool_calls": [
                {
                    "tool_name": t.tool_name,
                    "tool_cost": t.tool_cost,
                    "tool_latency_ms": t.tool_latency_ms,
                    "cache_hit": t.cache_hit,
                    "repeated": t.repeated,
                    "ignored_result": t.ignored_result,
                    "failed": t.failed,
                }
                for t in s.tool_calls
            ],
            "verifier_calls": [
                {
                    "verifier_model_id": v.verifier_model_id,
                    "passed": v.passed,
                    "confidence": v.confidence,
                    "cost": v.cost,
                }
                for v in s.verifier_calls
            ],
            "context_size_tokens": s.context_size_tokens,
            "retry_count": s.retry_count,
            "recovery_action": s.recovery_action,
            "step_outcome": s.step_outcome.value if s.step_outcome else None,
            "step_cost": s.step_cost,
            "step_latency_ms": s.step_latency_ms,
        }

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable view of the trace, including derived rollups."""
        return {
            "trace_id": self.trace_id,
            "user_request": self.user_request,
            "task_type": self.task_type.value,
            "steps": [self._step_to_dict(s) for s in self.steps],
            "final_outcome": self.final_outcome.value if self.final_outcome else None,
            "failure_tags": [f.value for f in self.failure_tags],
            "total_cost": self.total_cost_computed,
            "total_latency_ms": self.total_latency_ms,
            "total_retries": self.total_retries,
            "total_tool_calls": self.total_tool_calls,
            "total_verifier_calls": self.total_verifier_calls,
            "cache_hit_rate": self.cache_hit_rate,
            "metadata": self.metadata,
        }
|
| 107 |
+
|
| 108 |
+
class SyntheticTraceGenerator:
    """Generates synthetic AgentTrace objects by sampling weighted scenarios.

    Deterministic for a given seed: all randomness flows through self.rng.
    Each scenario fixes the task type, the model tier(s) used, the final
    outcome, and any failure tags attached to the trace.
    """

    # Realistic provider pricing (per 1K tokens)
    MODEL_CONFIGS = {
        "tiny_local": {"tier":1,"cost_input":0.0001,"cost_output":0.0002,"latency":200,"strength":0.35,"name":"Tiny Local (Qwen-0.5B)"},
        "cheap_cloud": {"tier":2,"cost_input":0.00015,"cost_output":0.0006,"latency":400,"strength":0.55,"name":"GPT-4o-mini"},
        "medium": {"tier":3,"cost_input":0.0015,"cost_output":0.006,"latency":800,"strength":0.80,"name":"Claude-3.5-Sonnet"},
        "frontier": {"tier":4,"cost_input":0.005,"cost_output":0.015,"latency":1500,"strength":0.93,"name":"GPT-4o / Claude-3-Opus"},
        "specialist": {"tier":5,"cost_input":0.01,"cost_output":0.03,"latency":2000,"strength":0.97,"name":"o1 / o3-mini"},
    }
    # Flat per-invocation tool costs in dollars; unknown tools fall back to
    # 0.001 via .get() in _generate_step.
    TOOL_COSTS = {"search":0.002,"retrieve":0.001,"fetch":0.003,"code_execution":0.005,
        "linter":0.001,"test_runner":0.003,"file_read":0.0005,"file_write":0.0005,
        "calculator":0.0001,"database_query":0.004,"compliance_check":0.01,
        "summarize":0.002,"task_planner":0.001,"progress_tracker":0.0005}
    # Task difficulty: [tier_needed, risk_level]
    TASK_DIFFICULTY = {
        TaskType.QUICK_ANSWER: (1, 0.1),
        TaskType.CODING: (3, 0.4),
        TaskType.RESEARCH: (3, 0.5),
        TaskType.DOCUMENT_DRAFTING: (2, 0.2),
        TaskType.LEGAL_REGULATED: (4, 0.8),
        TaskType.TOOL_HEAVY: (2, 0.3),
        TaskType.RETRIEVAL_HEAVY: (2, 0.35),
        TaskType.LONG_HORIZON: (3, 0.6),
        TaskType.UNKNOWN_AMBIGUOUS: (3, 0.7),
    }
    # Weighted scenario mix. "prob" values need not sum to exactly 1:
    # random.choices normalizes the weights.
    SCENARIOS = [
        {"name":"quick_answer_success","prob":0.18,"task_type":TaskType.QUICK_ANSWER,"tier":[1,2],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":1},
        {"name":"quick_answer_cheap_fail","prob":0.02,"task_type":TaskType.QUICK_ANSWER,"tier":[1],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.MODEL_TOO_WEAK],"difficulty":2},
        {"name":"coding_success_frontier","prob":0.08,"task_type":TaskType.CODING,"tier":[4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":4},
        {"name":"coding_success_medium","prob":0.10,"task_type":TaskType.CODING,"tier":[3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":3},
        {"name":"coding_cheap_fail","prob":0.05,"task_type":TaskType.CODING,"tier":[1,2],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.MODEL_TOO_WEAK],"difficulty":4},
        {"name":"coding_tool_underuse","prob":0.04,"task_type":TaskType.CODING,"tier":[3,4],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.TOOL_MISSED],"difficulty":3},
        {"name":"research_success","prob":0.10,"task_type":TaskType.RESEARCH,"tier":[3,4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":3},
        {"name":"research_cheap_fail","prob":0.03,"task_type":TaskType.RESEARCH,"tier":[1,2],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.MODEL_TOO_WEAK],"difficulty":4},
        {"name":"document_draft_success","prob":0.08,"task_type":TaskType.DOCUMENT_DRAFTING,"tier":[2,3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":2},
        {"name":"legal_frontier_success","prob":0.04,"task_type":TaskType.LEGAL_REGULATED,"tier":[4,5],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":5},
        {"name":"legal_cheap_unsafe","prob":0.02,"task_type":TaskType.LEGAL_REGULATED,"tier":[1,2],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.UNSAFE_CHEAP_MODEL],"difficulty":5},
        {"name":"tool_heavy_success","prob":0.06,"task_type":TaskType.TOOL_HEAVY,"tier":[2,3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":2},
        {"name":"retrieval_success","prob":0.06,"task_type":TaskType.RETRIEVAL_HEAVY,"tier":[2,3],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":2},
        {"name":"long_horizon_success","prob":0.05,"task_type":TaskType.LONG_HORIZON,"tier":[3,4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":4},
        {"name":"long_horizon_retry_loop","prob":0.03,"task_type":TaskType.LONG_HORIZON,"tier":[3],"outcome":Outcome.FAILURE,"failure_tags":[FailureTag.RETRY_LOOP],"difficulty":4},
        {"name":"unknown_ambiguous_success","prob":0.03,"task_type":TaskType.UNKNOWN_AMBIGUOUS,"tier":[3,4],"outcome":Outcome.SUCCESS,"failure_tags":[],"difficulty":3},
        {"name":"unknown_ambiguous_blocked","prob":0.02,"task_type":TaskType.UNKNOWN_AMBIGUOUS,"tier":[3,4],"outcome":Outcome.BLOCKED,"failure_tags":[FailureTag.MISSED_ESCALATION],"difficulty":3},
        {"name":"tool_overuse","prob":0.04,"task_type":TaskType.CODING,"tier":[3,4],"outcome":Outcome.PARTIAL_SUCCESS,"failure_tags":[FailureTag.TOOL_UNNECESSARY],"difficulty":3},
        {"name":"cache_break_scenario","prob":0.03,"task_type":TaskType.RESEARCH,"tier":[3,4],"outcome":Outcome.PARTIAL_SUCCESS,"failure_tags":[FailureTag.CACHE_BREAK],"difficulty":3},
        {"name":"false_done_scenario","prob":0.02,"task_type":TaskType.CODING,"tier":[3,4],"outcome":Outcome.FALSE_DONE,"failure_tags":[FailureTag.VERIFIER_FALSE_PASS],"difficulty":3},
    ]
    def __init__(self,seed=42): self.rng=random.Random(seed)
    # Generate n independent traces; indices only feed the trace_id.
    def generate(self,n=10000): return [self._generate_trace(i) for i in range(n)]
    # Weighted draw over SCENARIOS (weights normalized by random.choices).
    def _pick_scenario(self): return self.rng.choices(self.SCENARIOS,weights=[s["prob"] for s in self.SCENARIOS])[0]
    # Map a numeric tier (1..5) to a MODEL_CONFIGS key; unknown tiers -> "medium".
    def _tier_to_model(self,tier): return {1:"tiny_local",2:"cheap_cloud",3:"medium",4:"frontier",5:"specialist"}.get(tier,"medium")
    def _generate_trace(self,idx):
        """Build one AgentTrace from a sampled scenario.

        The scenario dictates task type, the tier actually used, the final
        outcome, and failure tags; step count depends on the scenario family.
        """
        scenario=self._pick_scenario()
        trace_id=f"synth_{idx}_{uuid.uuid4().hex[:8]}"
        task_type=scenario["task_type"]
        user_request=self._generate_request(task_type,scenario["name"])
        # Default 1-8 steps; long-horizon/retry-loop and failing coding
        # scenarios get longer traces.
        base_steps=self.rng.randint(1,8)
        if "long_horizon" in scenario["name"] or "retry_loop" in scenario["name"]: base_steps=self.rng.randint(4,12)
        elif "coding" in scenario["name"] and scenario["outcome"]==Outcome.FAILURE: base_steps=self.rng.randint(3,8)
        tier=self.rng.choice(scenario["tier"])
        model_key=self._tier_to_model(tier)
        model_cfg=self.MODEL_CONFIGS[model_key]
        steps=[]
        for step_idx in range(base_steps):
            # The last step carries the scenario's terminal outcome.
            steps.append(self._generate_step(trace_id,step_idx,task_type,model_key,model_cfg,scenario,step_idx==base_steps-1))
        return AgentTrace(
            trace_id=trace_id,user_request=user_request,task_type=task_type,steps=steps,
            final_outcome=scenario["outcome"],failure_tags=list(scenario.get("failure_tags",[])),
            total_cost=sum(s.step_cost for s in steps),
            # "optimal_tier" mirrors difficulty; consumers compare it with
            # "actual_tier" to detect under/over-provisioning.
            metadata={"scenario":scenario["name"],"synthetic":True,"difficulty":scenario["difficulty"],
                "optimal_tier":scenario["difficulty"],"actual_tier":tier})
    def _generate_request(self,task_type,scenario_name):
        """Pick a plausible user-request string for the task type.

        scenario_name is currently unused; kept for future per-scenario text.
        """
        templates={
            TaskType.QUICK_ANSWER:["What is the capital of France?","Briefly explain quantum computing.","Summarize article X.","What is 237 * 452?"],
            TaskType.CODING:["Write a Python function to reverse a linked list.","Fix the bug in this React component.","Refactor auth module to JWT.","Implement LRU cache in Go.","Debug this segfault in C++ thread pool."],
            TaskType.RESEARCH:["Research latest transformer advances.","Find sources comparing LoRA and full FT.","Investigate data center climate impact.","What does literature say on speculative decoding?"],
            TaskType.DOCUMENT_DRAFTING:["Draft project proposal for ML pipeline.","Write email to team about deployment.","Create technical report on performance."],
            TaskType.LEGAL_REGULATED:["Review this contract for liability clauses.","Check GDPR compliance for data pipeline.","Draft privacy policy section."],
            TaskType.TOOL_HEAVY:["Search open issues and create summary.","Fetch API docs and generate client code.","Query Q3 sales and produce chart."],
            TaskType.RETRIEVAL_HEAVY:["Answer based on 50-page document.","Find all 'payment processing' mentions.","Retrieve relevant cases for legal query."],
            TaskType.LONG_HORIZON:["Plan 3-month roadmap.","Orchestrate multi-region deployment.","Redesign data architecture end-to-end."],
            TaskType.UNKNOWN_AMBIGUOUS:["Help me with this thing.","I need something about the server.","Can you look into that issue?"],
        }
        return self.rng.choice(templates.get(task_type,["Generic request"]))
    def _get_tools_for_task(self,task_type):
        """Return the pool of tool names a task type may draw from.

        NOTE(review): "document_retrieval" and "api_call" are not in
        TOOL_COSTS, so they fall back to the 0.001 default cost.
        """
        return {TaskType.QUICK_ANSWER:["calculator","search"],
            TaskType.CODING:["file_read","file_write","code_execution","linter","test_runner"],
            TaskType.RESEARCH:["search","retrieve","fetch","summarize"],
            TaskType.DOCUMENT_DRAFTING:["file_read","summarize"],
            TaskType.LEGAL_REGULATED:["document_retrieval","compliance_check","search"],
            TaskType.TOOL_HEAVY:["search","fetch","api_call","database_query"],
            TaskType.RETRIEVAL_HEAVY:["retrieve","search","fetch"],
            TaskType.LONG_HORIZON:["task_planner","progress_tracker","file_read"],
            TaskType.UNKNOWN_AMBIGUOUS:["search"]}.get(task_type,["search"])
    def _generate_step(self,trace_id,step_idx,task_type,model_key,model_cfg,scenario,is_last):
        """Build one TraceStep: model call, tool calls, verifiers, and retries.

        is_last marks the final step, which receives the scenario outcome.
        """
        step_id=f"{trace_id}_step_{step_idx}"
        input_tokens=self.rng.randint(800,12000)
        output_tokens=self.rng.randint(200,6000)
        # ~35% of calls hit the prompt cache; up to 60% of the input tokens.
        cache_hit=self.rng.random()<0.35
        cache_hit_tokens=int(input_tokens*self.rng.random()*0.6) if cache_hit else 0
        model_call=ModelCall(model_id=model_key,provider="synthetic",input_tokens=input_tokens,output_tokens=output_tokens,
            # Only top-tier models emit reasoning tokens (a quarter of output).
            reasoning_tokens=output_tokens//4 if model_key in ("frontier","specialist") else 0,
            cost_per_1k_input=model_cfg["cost_input"],cost_per_1k_output=model_cfg["cost_output"],
            cache_hit_input_tokens=cache_hit_tokens,latency_ms=model_cfg["latency"]*self.rng.uniform(0.8,1.5))
        tool_calls=[]; base_tools=self._get_tools_for_task(task_type); num_tools=self.rng.randint(0,len(base_tools))
        # tool_overuse asks for extra tools, but the min() below still caps
        # usage at the pool size (so "overuse" means "uses the whole pool").
        if scenario["name"]=="tool_overuse": num_tools+=3
        for t in range(min(num_tools,len(base_tools))):
            tool_name=base_tools[t]
            tool_calls.append(ToolCall(tool_name=tool_name,tool_input={"query":f"auto_{tool_name}"},
                tool_cost=self.TOOL_COSTS.get(tool_name,0.001),tool_latency_ms=self.rng.uniform(100,1200),
                cache_hit=self.rng.random()<0.2,repeated=self.rng.random()<0.1,
                ignored_result=self.rng.random()<0.05,
                # Retry-loop scenarios have a much higher tool failure rate.
                failed=self.rng.random()<(0.3 if "retry_loop" in scenario["name"] else 0.05)))
        verifier_calls=[]; num_verifiers=0
        # Legal work is always verified; coding/research only sometimes, and
        # only when a top-tier model produced the output.
        if task_type==TaskType.LEGAL_REGULATED: num_verifiers=1
        elif task_type in (TaskType.CODING,TaskType.RESEARCH) and model_key in ("frontier","specialist"): num_verifiers=1 if self.rng.random()<0.4 else 0
        for _ in range(num_verifiers):
            verifier_calls.append(VerifierCall(verifier_model_id="verifier_medium",target_step_id=step_id,
                passed=self.rng.random()<0.85,confidence=self.rng.uniform(0.6,0.99),cost=0.005,latency_ms=500))
        context_size=self.rng.randint(1500,20000)
        # Cache-break traces carry bloated contexts.
        if scenario["name"]=="cache_break_scenario": context_size+=self.rng.randint(8000,30000)
        retries=0
        if "retry_loop" in scenario["name"]: retries=self.rng.randint(4,8)
        elif self.rng.random()<0.12: retries=self.rng.randint(1,3)
        recovery=None
        if retries>0: recovery=self.rng.choice(["retry_same","retry_changed_prompt","repair_tool","switch_model","ask_clarification"])
        step_outcome=Outcome.SUCCESS
        if is_last: step_outcome=scenario["outcome"]
        elif "retry_loop" in scenario["name"] and step_idx>=2: step_outcome=Outcome.FAILURE
        return TraceStep(step_id=step_id,timestamp=datetime.utcnow()+timedelta(seconds=step_idx*30),task_type=task_type,
            model_call=model_call,tool_calls=tool_calls,verifier_calls=verifier_calls,
            context_size_tokens=context_size,context_sources=["system_rules","tool_descriptions","user_preferences","recent_messages"],
            retry_count=retries,recovery_action=recovery,
            artifacts_created=[f"artifact_{step_idx}"] if self.rng.random()<0.25 else [],
            step_outcome=step_outcome)
|
| 243 |
+
|
| 244 |
+
@dataclass
class BenchmarkResult:
    """Aggregate metrics for one baseline/ablation run over a set of traces."""

    baseline_name: str
    # Outcome counts
    num_tasks: int
    num_success: int
    num_partial: int
    num_failure: int
    num_false_done: int
    num_blocked: int
    # Cost and latency rollups
    total_cost: float
    avg_cost_success: float
    avg_latency_ms: float
    total_tool_calls: int
    total_verifier_calls: int
    total_retries: int
    avg_cache_hit_rate: float
    cost_reduction_vs_frontier: float
    # Safety / quality rates (fractions of num_tasks)
    false_done_rate: float
    unsafe_cheap_miss_rate: float
    missed_escalation_rate: float
    regression_rate: float
    per_scenario_stats: Dict[str, Dict[str, Any]] = field(default_factory=dict)
|
| 254 |
+
|
| 255 |
+
class BenchmarkSuite:
    """Runs baseline and ablation cost/quality simulations over synthetic traces."""

    # Aliased from the generator so simulated pricing and difficulty tables
    # stay consistent with how the traces were produced.
    MODEL_CONFIGS = SyntheticTraceGenerator.MODEL_CONFIGS
    TASK_DIFFICULTY = SyntheticTraceGenerator.TASK_DIFFICULTY
    def __init__(self): pass
    # Convenience wrapper: build n synthetic traces with a fresh generator.
    def generate_benchmark_data(self,n=1000,seed=42): return SyntheticTraceGenerator(seed=seed).generate(n)
|
| 260 |
+
|
| 261 |
+
def run_all_baselines(self,traces):
|
| 262 |
+
baselines=["always_frontier","always_cheap","static","cascade","full_optimizer"]
|
| 263 |
+
results={}
|
| 264 |
+
for baseline in baselines:
|
| 265 |
+
print(f"Running baseline: {baseline}...")
|
| 266 |
+
results[baseline]=self._run_baseline(traces,baseline)
|
| 267 |
+
return results
|
| 268 |
+
|
| 269 |
+
def run_ablations(self,traces):
|
| 270 |
+
ablations=["no_router","no_tool_gate","no_verifier","no_early_term","no_context_budget"]
|
| 271 |
+
results={}
|
| 272 |
+
for ablation in ablations:
|
| 273 |
+
print(f"Running ablation: {ablation}...")
|
| 274 |
+
results[ablation]=self._run_baseline(traces,ablation)
|
| 275 |
+
return results
|
| 276 |
+
|
| 277 |
+
def _run_baseline(self,traces,baseline_name):
|
| 278 |
+
success_count=0; partial_count=0; failure_count=0; false_done_count=0; blocked_count=0
|
| 279 |
+
total_cost=0.0; total_latency=0.0; total_tools=0; total_verifiers=0; total_retries=0
|
| 280 |
+
cache_rates=[]; cheap_misses=0; escalation_misses=0; regression_count=0
|
| 281 |
+
per_scenario=defaultdict(lambda:{"count":0,"success":0,"cost":0.0})
|
| 282 |
+
for trace in traces:
|
| 283 |
+
sim_cost,sim_success,sim_outcome=self._simulate(trace,baseline_name)
|
| 284 |
+
total_cost+=sim_cost; total_latency+=trace.total_latency_ms*0.7
|
| 285 |
+
total_tools+=trace.total_tool_calls; total_verifiers+=trace.total_verifier_calls
|
| 286 |
+
total_retries+=trace.total_retries; cache_rates.append(trace.cache_hit_rate)
|
| 287 |
+
scenario=trace.metadata.get("scenario","normal")
|
| 288 |
+
per_scenario[scenario]["count"]+=1; per_scenario[scenario]["cost"]+=sim_cost
|
| 289 |
+
if sim_success:
|
| 290 |
+
if sim_outcome in (Outcome.SUCCESS,Outcome.PARTIAL_SUCCESS):
|
| 291 |
+
success_count+=1; per_scenario[scenario]["success"]+=1
|
| 292 |
+
else: regression_count+=1
|
| 293 |
+
else:
|
| 294 |
+
if sim_outcome==Outcome.FALSE_DONE: false_done_count+=1
|
| 295 |
+
elif sim_outcome==Outcome.BLOCKED: blocked_count+=1
|
| 296 |
+
else: failure_count+=1
|
| 297 |
+
# Track cheap model misses
|
| 298 |
+
difficulty=trace.metadata.get("difficulty",3)
|
| 299 |
+
actual_tier=trace.metadata.get("actual_tier",3)
|
| 300 |
+
if actual_tier<difficulty and actual_tier<=2 and sim_outcome in (Outcome.FAILURE,Outcome.PARTIAL_SUCCESS):
|
| 301 |
+
cheap_misses+=1
|
| 302 |
+
if actual_tier<difficulty and sim_outcome in (Outcome.FAILURE,Outcome.BLOCKED):
|
| 303 |
+
escalation_misses+=1
|
| 304 |
+
n=len(traces); avg_cost_success=total_cost/max(success_count,1)
|
| 305 |
+
frontier_total=sum(t.total_cost_computed*4 for t in traces) # frontier costs ~4x medium
|
| 306 |
+
cost_reduction=(frontier_total-total_cost)/max(frontier_total,1)
|
| 307 |
+
return BenchmarkResult(
|
| 308 |
+
baseline_name=baseline_name,num_tasks=n,num_success=success_count,num_partial=partial_count,
|
| 309 |
+
num_failure=failure_count,num_false_done=false_done_count,num_blocked=blocked_count,
|
| 310 |
+
total_cost=total_cost,avg_cost_success=avg_cost_success,avg_latency_ms=total_latency/n,
|
| 311 |
+
total_tool_calls=total_tools,total_verifier_calls=total_verifiers,total_retries=total_retries,
|
| 312 |
+
avg_cache_hit_rate=sum(cache_rates)/n,cost_reduction_vs_frontier=cost_reduction,
|
| 313 |
+
false_done_rate=false_done_count/n,unsafe_cheap_miss_rate=cheap_misses/n,
|
| 314 |
+
missed_escalation_rate=escalation_misses/n,regression_rate=regression_count/n,
|
| 315 |
+
per_scenario_stats=dict(per_scenario))
|
| 316 |
+
|
| 317 |
+
    def _simulate(self,trace,baseline):
        """Realistic simulation: tier vs difficulty determines success.

        Args:
            trace: AgentTrace with "scenario"/"difficulty"/"actual_tier" metadata.
            baseline: policy name (baseline or ablation identifier).

        Returns:
            (sim_cost, sim_success, sim_outcome) — simulated dollar cost,
            a boolean success flag (success_prob > 0.5), and an Outcome.
        """
        scenario=trace.metadata.get("scenario","normal")
        difficulty=trace.metadata.get("difficulty",3)
        actual_tier=trace.metadata.get("actual_tier",3)
        base_cost=trace.total_cost_computed
        # Determine what tier the baseline would actually use
        if baseline=="always_frontier": chosen_tier=4
        elif baseline=="always_cheap": chosen_tier=2
        elif baseline in ("no_router","static"): chosen_tier=actual_tier # uses same as trace, no optimization
        elif baseline in ("cascade","full_optimizer"):
            # Cascade tries lower tier first, escalates if needed
            if difficulty<=2: chosen_tier=2
            elif difficulty==3: chosen_tier=3 if self._tier_success_prob(3,difficulty)>0.7 else 4
            elif difficulty==4: chosen_tier=3 if self._tier_success_prob(3,difficulty)>0.6 else 4
            else: chosen_tier=4 if self._tier_success_prob(4,difficulty)>0.5 else 5
        elif baseline=="no_tool_gate": chosen_tier=actual_tier # same tier, but no tool savings
        elif baseline=="no_verifier": chosen_tier=actual_tier
        elif baseline=="no_early_term": chosen_tier=actual_tier
        elif baseline=="no_context_budget": chosen_tier=actual_tier
        else: chosen_tier=actual_tier
        # Cost multiplier based on chosen tier
        tier_cost_mult={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}.get(chosen_tier,0.75)
        actual_cost_mult={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}.get(actual_tier,0.75)
        # Adjust cost: cascade uses cheaper tier when possible
        cost_ratio=tier_cost_mult/actual_cost_mult if actual_cost_mult>0 else 1.0
        sim_cost=base_cost*cost_ratio
        # Tool gate savings for cascade/full
        if baseline in ("cascade","full_optimizer"):
            if "tool_overuse" in scenario: sim_cost*=0.75
        # Cache savings
        if baseline=="full_optimizer" and "cache_break" not in scenario: sim_cost*=0.92
        # Verifier savings
        if baseline=="full_optimizer" and chosen_tier>=3 and difficulty<4: sim_cost*=0.95
        # Early termination savings
        if baseline=="full_optimizer" and "retry_loop" in scenario: sim_cost*=0.60
        if baseline=="no_early_term" and "retry_loop" in scenario: sim_cost*=1.4
        # Determine success probability
        success_prob=self._tier_success_prob(chosen_tier,difficulty)
        # Apply baseline-specific modifiers
        if baseline=="always_cheap" and difficulty>=3: success_prob*=0.3
        elif baseline=="no_tool_gate" and "tool" in scenario: success_prob*=0.7
        elif baseline=="no_verifier" and difficulty>=4: success_prob*=0.85
        elif baseline=="full_optimizer": success_prob=min(1.0,success_prob+0.05)
        # Special scenarios override the computed probability entirely.
        if "false_done" in scenario: success_prob=0.1
        elif "blocked" in scenario: success_prob=0.0
        elif "retry_loop" in scenario and baseline not in ("full_optimizer",):
            if baseline=="no_early_term": success_prob=0.1
            else: success_prob=0.25
        elif "retry_loop" in scenario and baseline=="full_optimizer":
            success_prob=0.5 # Doom detector catches it
        # NOTE(review): deterministic threshold, not a random draw — every
        # trace with the same metadata gets the same result.
        sim_success=success_prob>0.5
        # Determine simulated outcome
        if "false_done" in scenario: sim_outcome=Outcome.FALSE_DONE
        elif "blocked" in scenario: sim_outcome=Outcome.BLOCKED
        elif sim_success:
            if success_prob>0.85: sim_outcome=Outcome.SUCCESS
            else: sim_outcome=Outcome.PARTIAL_SUCCESS
        else:
            if "retry_loop" in scenario: sim_outcome=Outcome.FAILURE
            elif success_prob<0.2: sim_outcome=Outcome.BLOCKED
            else: sim_outcome=Outcome.FAILURE
        return sim_cost,sim_success,sim_outcome
|
| 381 |
+
|
| 382 |
+
def _tier_success_prob(self,tier,difficulty):
|
| 383 |
+
strength={1:0.35,2:0.55,3:0.80,4:0.93,5:0.97}.get(tier,0.5)
|
| 384 |
+
# Success = strength^difficulty (harder tasks need exponentially more strength)
|
| 385 |
+
return strength**(difficulty*0.6)
|
| 386 |
+
|
| 387 |
+
def report(self, results):
    """Render a plain-text benchmark report for `results` (name -> result object).

    The report contains: a per-baseline metrics table, the single best
    baseline under a composite score, and a quality/cost ranking sorted by
    success rate (descending) then cost per success (ascending).
    """
    rule = "=" * 100
    out = [rule, "AGENT COST OPTIMIZER BENCHMARK REPORT v2", rule, ""]
    header_cells = ["Baseline", "Success", "Partial", "Fail", "Blocked", "F-DONE",
                    "Total Cost", "Avg$/Succ", "Lat(ms)", "Tools", "Verif", "Retry",
                    "Cache%", "CostRed%", "Regression", "CheapMiss", "EscMiss"]
    out.append(" | ".join(header_cells))
    out.append("-" * 160)
    for name, res in results.items():
        n = res.num_tasks
        cells = [
            name[:22].ljust(22),
            f"{res.num_success / n:.1%}",
            f"{res.num_partial / n:.1%}",
            f"{res.num_failure / n:.1%}",
            f"{res.num_blocked / n:.1%}",
            f"{res.false_done_rate:.1%}",
            f"${res.total_cost:.2f}",
            f"${res.avg_cost_success:.4f}",
            f"{res.avg_latency_ms:.0f}",
            str(res.total_tool_calls),
            str(res.total_verifier_calls),
            str(res.total_retries),
            f"{res.avg_cache_hit_rate:.1%}",
            f"{res.cost_reduction_vs_frontier:.1%}",
            f"{res.regression_rate:.1%}",
            f"{res.unsafe_cheap_miss_rate:.1%}",
            f"{res.missed_escalation_rate:.1%}",
        ]
        out.append(" | ".join(cells))
    out.append("")
    out.append(rule)
    # Composite score: rewards success, penalizes cost per success and the
    # two safety metrics (regressions and unsafe cheap-model misses).
    best_name = ""
    best_score = -float("inf")
    for name, res in results.items():
        sr = (res.num_success + res.num_partial) / res.num_tasks
        score = (sr * 20
                 - res.avg_cost_success * 50
                 - res.regression_rate * 30
                 - res.unsafe_cheap_miss_rate * 40)
        if score > best_score:
            best_name, best_score = name, score
    out.append(f"BEST PARETO: {best_name} (score={best_score:.2f})")
    out.append("")
    out.append("QUALITY/COST FRONTIER (Success Rate vs Avg Cost per Success):")
    ranking = [(name, (r.num_success + r.num_partial) / r.num_tasks, r.avg_cost_success)
               for name, r in results.items()]
    ranking.sort(key=lambda item: (-item[1], item[2]))
    for name, sr, cost in ranking:
        out.append(f" {name:22s} | Success: {sr:.1%} | Cost/Success: ${cost:.4f}")
    out.append("")
    out.append(rule)
    return "\n".join(out)
def export(self, results, path):
    """Serialize per-baseline result metrics to a JSON file at `path`.

    Each result object is flattened into a dict of plain attributes; the
    field order below matches the original export layout.
    """
    fields = ("baseline_name", "num_tasks", "num_success", "num_partial",
              "num_failure", "num_false_done", "num_blocked", "total_cost",
              "avg_cost_success", "avg_latency_ms", "total_tool_calls",
              "total_verifier_calls", "total_retries", "avg_cache_hit_rate",
              "cost_reduction_vs_frontier", "false_done_rate",
              "unsafe_cheap_miss_rate", "missed_escalation_rate",
              "regression_rate", "per_scenario_stats")
    payload = {name: {field: getattr(result, field) for field in fields}
               for name, result in results.items()}
    with open(path, "w") as f:
        json.dump(payload, f, indent=2)
if __name__ == "__main__":
    # CLI entry point: generate synthetic traces, evaluate all baselines and
    # ablations, write JSON/text artifacts, and print the final report.
    parser = argparse.ArgumentParser(description="ACO Evaluation Runner v2")
    parser.add_argument("--tasks", "-n", type=int, default=2000, help="Number of tasks")
    parser.add_argument("--seed", "-s", type=int, default=42, help="Random seed")
    parser.add_argument("--output", "-o", default="./eval_results_v2", help="Output directory")
    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)
    suite = BenchmarkSuite()

    # 1) Synthetic trace generation, persisted as JSONL (one trace per line).
    print(f"[{datetime.now().isoformat()}] Generating {args.tasks} synthetic traces...")
    traces = suite.generate_benchmark_data(args.tasks, seed=args.seed)
    traces_path = os.path.join(args.output, "traces.jsonl")
    with open(traces_path, "w") as fh:
        fh.writelines(json.dumps(t.to_dict()) + "\n" for t in traces)
    print(f" Saved {len(traces)} traces to {traces_path}")

    # 2) Baseline runs.
    print(f"\n[{datetime.now().isoformat()}] Running baselines...")
    baseline_results = suite.run_all_baselines(traces)
    baseline_path = os.path.join(args.output, "baseline_results.json")
    suite.export(baseline_results, baseline_path)
    print(f" Saved to {baseline_path}")

    # 3) Ablation runs.
    print(f"\n[{datetime.now().isoformat()}] Running ablations...")
    ablation_results = suite.run_ablations(traces)
    ablation_path = os.path.join(args.output, "ablation_results.json")
    suite.export(ablation_results, ablation_path)
    print(f" Saved to {ablation_path}")

    # 4) Combined text report over baselines + ablations.
    combined = {**baseline_results, **ablation_results}
    report_text = suite.report(combined)
    report_path = os.path.join(args.output, "report.txt")
    with open(report_path, "w") as fh:
        fh.write(report_text)
    print(f" Saved report to {report_path}")

    # 5) Cost-quality Pareto frontier over (success_rate, avg_cost_per_success).
    points = [
        {"baseline": name,
         "success_rate": (res.num_success + res.num_partial) / res.num_tasks,
         "avg_cost_per_success": res.avg_cost_success,
         "total_cost": res.total_cost,
         "regression_rate": res.regression_rate,
         "false_done_rate": res.false_done_rate,
         "cheap_miss_rate": res.unsafe_cheap_miss_rate}
        for name, res in combined.items()
    ]

    def _dominates(q, p):
        """True when q is at least as good as p on both axes and strictly better on one."""
        if q["success_rate"] < p["success_rate"] or q["avg_cost_per_success"] > p["avg_cost_per_success"]:
            return False
        return (q["success_rate"] > p["success_rate"]
                or q["avg_cost_per_success"] < p["avg_cost_per_success"])

    frontier = [p for p in points
                if not any(_dominates(q, p) for q in points if q["baseline"] != p["baseline"])]
    frontier.sort(key=lambda pt: pt["success_rate"], reverse=True)
    frontier_data = {"all_points": points,
                     "pareto_frontier": frontier,
                     "frontier_baselines": [p["baseline"] for p in frontier]}
    frontier_path = os.path.join(args.output, "cost_quality_frontier.json")
    with open(frontier_path, "w") as fh:
        json.dump(frontier_data, fh, indent=2)
    print(f" Saved frontier to {frontier_path}")

    print("\n" + "=" * 100)
    print(report_text)
    print("=" * 100)
|