#!/usr/bin/env python3
"""ACO Benchmark Evaluation: Full system test with simulated agent traces."""
import sys,json,random,pickle,time
sys.path.insert(0,"/app")
from collections import defaultdict
TIER_STR={1:0.35,2:0.55,3:0.80,4:0.93,5:0.97}
TIER_COST={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}
TASK_FLOOR={"legal_regulated":4,"long_horizon":3,"research":3,"coding":3,
"unknown_ambiguous":3,"quick_answer":1,"document_drafting":2,
"tool_heavy":2,"retrieval_heavy":2}
CODE_KW=["python","javascript","code","function","bug","debug","refactor","implement","test"]
CRITICAL_KW=["critical","production","urgent","now","emergency","live","deployed","safety","security"]
SIMPLE_KW=["typo","simple","quick","brief","briefly","just","minor","small","easy","trivial","clarification"]
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.context_budgeter import ContextBudgeter
from aco.tool_gate import ToolCostGate
from aco.verifier_budgeter import VerifierBudgeter
from aco.retry_optimizer import RetryOptimizer
from aco.meta_tool_miner import MetaToolMiner
from aco.doom_detector import DoomDetector
TASKS={
"quick_answer":["What is 2+2?","Explain quantum computing briefly.","Just tell me what 2+2 is."],
"coding":["Write a Python function to reverse a linked list.","Fix a typo in the README.","Debug this critical production segfault NOW.","Just fix the typo in line 42."],
"research":["Research latest transformer advances.","Find sources comparing LoRA and full FT briefly."],
"document_drafting":["Draft project proposal for ML pipeline.","Write email to team about deployment."],
"legal_regulated":["Review this contract for liability clauses.","Check GDPR compliance for data pipeline urgently."],
"tool_heavy":["Search open issues and create summary.","Fetch API docs and generate client code."],
"retrieval_heavy":["Answer based on 50-page document.","Find all payment processing mentions."],
"long_horizon":["Plan 3-month roadmap.","Orchestrate complete multi-region deployment."],
"unknown_ambiguous":["Help me with this thing.","I need something about the server."],
}
TOOL_LIST=["web_search","code_search","file_read","file_write","code_execute","verify"]
TOOL_COST_ESTIMATES={"web_search":{"cost":0.01},"code_search":{"cost":0.005},"file_read":{"cost":0.001},"file_write":{"cost":0.001},"code_execute":{"cost":0.01},"verify":{"cost":0.02}}
VERIFIER_COST=0.02
print("="*80)
print("ACO FULL SYSTEM BENCHMARK EVALUATION")
print("="*80)
# Initialize modules
classifier=TaskCostClassifier()
router=ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
context_budgeter=ContextBudgeter()
tool_gate=ToolCostGate()
verifier_budgeter=VerifierBudgeter()
retry_optimizer=RetryOptimizer()
meta_tool_miner=MetaToolMiner()
doom_detector=DoomDetector()
# Simulate 2000 agent runs
rng=random.Random(42)
N=2000
results_aco=[]
results_frontier=[]
results_heuristic=[]
results_cheap=[]
for i in range(N):
    # Sample a task type, then a concrete request string for it.
    tt=rng.choice(list(TASKS.keys()))
    req=rng.choice(TASKS[tt])
    # Classify
    pred=classifier.classify(req)
    # Route — pred is assumed to carry at least task_type/difficulty/risk/
    # needs_retrieval/needs_tools keys (read below); confirm against classifier.
    routing=router.route(req, pred["task_type"], pred["difficulty"], pred)
    # Context budget
    budget=context_budgeter.budget(pred["task_type"],pred["difficulty"],pred["needs_retrieval"],pred["needs_tools"])
    # Tool decisions: only consult the gate when the classifier asked for tools
    # or the task type is inherently tool-using; otherwise tool_decisions stays empty.
    tool_decisions={}
    for tool in TOOL_LIST:
        if pred["needs_tools"] or tt in ("coding","tool_heavy","retrieval_heavy","research"):
            td=tool_gate.gate(tool,{"query":req},tt,1,5,routing.confidence)
            tool_decisions[tool]=td
    # Verifier
    vd=verifier_budgeter.should_verify(tt,pred["risk"],routing.confidence,False,False,routing.tier)
    # Simulate success: P(success) = tier_strength ** (difficulty * 0.6),
    # so harder tasks penalize weak tiers disproportionately.
    ps=TIER_STR[routing.tier]**(pred["difficulty"]*0.6)
    success=rng.random()<ps
    # Compute cost: model call + gated tools + optional verification pass.
    model_cost=TIER_COST[routing.tier]
    tool_cost=sum(TOOL_COST_ESTIMATES.get(t,{}).get("cost",0.02) for t,td in tool_decisions.items() if td.action=="use")
    ver_cost=VERIFIER_COST if vd.should_verify else 0
    total_cost=model_cost+tool_cost+ver_cost
    results_aco.append({"tt":tt,"tier":routing.tier,"success":success,"cost":total_cost,
    "model_cost":model_cost,"tool_cost":tool_cost,"ver_cost":ver_cost,
    "context_tokens":budget.total_tokens,"verified":vd.should_verify,
    "tools_used":sum(1 for td in tool_decisions.values() if td.action=="use"),
    "escalated":routing.escalated,"downgraded":routing.downgraded})
    # Baseline: always frontier
    # NOTE(review): baselines reuse this run's tool_cost / ver_cost, i.e. tool
    # spend is held constant across routers — presumably deliberate for a fair
    # comparison; confirm.
    ps_f=TIER_STR[4]**(pred["difficulty"]*0.6)
    s_f=rng.random()<ps_f
    results_frontier.append({"tt":tt,"tier":4,"success":s_f,"cost":1.0+tool_cost+VERIFIER_COST})
    # Baseline: heuristic — difficulty+1, clamped to [TASK_FLOOR, 5]
    h_tier=min(pred["difficulty"]+1,5)
    h_tier=max(h_tier,TASK_FLOOR.get(tt,2))
    ps_h=TIER_STR[h_tier]**(pred["difficulty"]*0.6)
    s_h=rng.random()<ps_h
    results_heuristic.append({"tt":tt,"tier":h_tier,"success":s_h,"cost":TIER_COST[h_tier]+tool_cost+ver_cost})
    # Baseline: always cheap
    ps_c=TIER_STR[1]**(pred["difficulty"]*0.6)
    s_c=rng.random()<ps_c
    results_cheap.append({"tt":tt,"tier":1,"success":s_c,"cost":0.05+tool_cost})
    # Per-iteration reset — presumably clears the verifier's per-run budget; confirm.
    verifier_budgeter.reset_run()
# Compute metrics
def compute_metrics(results, name):
    """Aggregate per-run simulation records into one summary-metrics dict.

    Args:
        results: non-empty list of per-run dicts. Every record must have
            "success" and "cost"; the component fields (model_cost, tool_cost,
            ver_cost, context_tokens, verified, tools_used, escalated,
            downgraded) are optional because baseline records omit them.
        name: label stored under the returned dict's "name" key.

    Returns:
        Dict with success rate, average costs (total and per component),
        average context tokens, verification count, average tools used,
        and escalation/downgrade counts.

    Raises:
        ValueError: if `results` is empty (the averages would divide by zero).
    """
    if not results:
        raise ValueError("compute_metrics: 'results' must be non-empty")
    n = len(results)
    succ = sum(1 for r in results if r["success"])
    cost = sum(r["cost"] for r in results)
    # Baseline records lack component fields; default model_cost to the full
    # cost and the other components to 0/8000/True so the same aggregation
    # works for both ACO and baseline result lists.
    model_cost = sum(r.get("model_cost", r["cost"]) for r in results)
    tool_cost = sum(r.get("tool_cost", 0) for r in results)
    ver_cost = sum(r.get("ver_cost", 0) for r in results)
    ctx = sum(r.get("context_tokens", 8000) for r in results) / n
    verified = sum(1 for r in results if r.get("verified", True))
    tools = sum(r.get("tools_used", 0) for r in results) / n
    escalations = sum(1 for r in results if r.get("escalated", False))
    downgrades = sum(1 for r in results if r.get("downgraded", False))
    return {
        "name": name,
        "success_rate": succ / n,
        "avg_cost": cost / n,
        "model_cost": model_cost / n,
        "tool_cost": tool_cost / n,
        "ver_cost": ver_cost / n,
        "avg_context_tokens": ctx,
        "verifications": verified,
        "avg_tools": tools,
        "escalations": escalations,
        "downgrades": downgrades,
    }
# Aggregate metrics for ACO and each baseline.
m=compute_metrics(results_aco,"aco_v8")
m_f=compute_metrics(results_frontier,"always_frontier")
m_h=compute_metrics(results_heuristic,"heuristic")
m_c=compute_metrics(results_cheap,"always_cheap")
# Summary table; CostRed = cost reduction relative to the always-frontier baseline.
print(f"\n{'Router':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'ModelCost':>10} {'ToolCost':>10} {'VerCost':>10} {'Context':>10} {'Verifs':>8}")
print("-"*100)
for r in [m_f,m_h,m,m_c]:
    cr=(1-r["avg_cost"]/m_f["avg_cost"])*100
    print(f"{r['name']:<20} {r['success_rate']:>10.3f} {r['avg_cost']:>10.4f} {cr:>9.1f}% {r['model_cost']:>10.4f} {r['tool_cost']:>10.4f} {r['ver_cost']:>10.4f} {r['avg_context_tokens']:>10.0f} {r['verifications']:>8d}")
# Per-task breakdown: ACO vs. always-frontier success and cost, per task type.
print(f"\n\nPer-task breakdown:")
for tt in sorted(set(r["tt"] for r in results_aco)):
    aco_tt=[r for r in results_aco if r["tt"]==tt]
    front_tt=[r for r in results_frontier if r["tt"]==tt]
    # Both lists share one record per loop iteration, so counts match.
    n_tt=len(aco_tt)
    a_s=sum(1 for r in aco_tt if r["success"])/n_tt
    a_c=sum(r["cost"] for r in aco_tt)/n_tt
    f_c=sum(r["cost"] for r in front_tt)/n_tt
    f_s=sum(1 for r in front_tt if r["success"])/n_tt
    cr=(1-a_c/f_c)*100  # cost reduction vs. frontier for this task type
    print(f"  {tt:<20} n={n_tt:>4} aco_success={a_s:.3f} frontier_success={f_s:.3f} aco_cost={a_c:.4f} costRed={cr:.1f}%")
# Cost-quality frontier: list the four strategies ordered by average cost.
print(f"\n\nCost-Quality Frontier:")
frontier_points=[]
for r in [m_c,m_h,m,m_f]:
    frontier_points.append((r["avg_cost"],r["success_rate"],r["name"]))
frontier_points.sort(key=lambda x:x[0])  # cheapest first
for cost,succ,name in frontier_points:
    print(f"  {name:<20} cost={cost:.4f} success={succ:.3f}")
# Key findings: headline ACO numbers, then persist all four metric dicts.
print(f"\n\nKEY FINDINGS:")
print(f"  ACO v8 success rate: {m['success_rate']:.3f}")
print(f"  ACO v8 cost reduction: {(1-m['avg_cost']/m_f['avg_cost'])*100:.1f}%")
print(f"  ACO v8 avg context: {m['avg_context_tokens']:.0f} tokens")
print(f"  ACO v8 verifications: {m['verifications']}/{N}")
print(f"  Escalations: {m['escalations']} ({m['escalations']/N*100:.1f}%)")
print(f"  Downgrades: {m['downgrades']} ({m['downgrades']/N*100:.1f}%)")
# Save the aggregated metrics as JSON for downstream comparison.
with open("/app/aco_benchmark_results.json","w") as f:
    json.dump({"aco_v8":m,"frontier":m_f,"heuristic":m_h,"cheap":m_c},f,indent=2)
print(f"\nSaved to /app/aco_benchmark_results.json")
print("DONE!")