| |
| """ACO Benchmark Evaluation: Full system test with simulated agent traces.""" |
| import sys,json,random,pickle,time |
| sys.path.insert(0,"/app") |
| from collections import defaultdict |
|
|
# Per-tier base success probability of the simulated model.
TIER_STR = {1: 0.35, 2: 0.55, 3: 0.80, 4: 0.93, 5: 0.97}

# Per-tier cost of one model call (arbitrary cost units).
TIER_COST = {1: 0.05, 2: 0.15, 3: 0.75, 4: 1.0, 5: 1.5}

# Minimum acceptable model tier per task type (risk floor for the heuristic arm).
TASK_FLOOR = {
    "legal_regulated": 4,
    "long_horizon": 3,
    "research": 3,
    "coding": 3,
    "unknown_ambiguous": 3,
    "quick_answer": 1,
    "document_drafting": 2,
    "tool_heavy": 2,
    "retrieval_heavy": 2,
}

# Keyword lists. NOTE(review): these are not read anywhere below in this
# script — presumably kept for parity with the classifier's vocabulary.
CODE_KW = [
    "python", "javascript", "code", "function", "bug",
    "debug", "refactor", "implement", "test",
]
CRITICAL_KW = [
    "critical", "production", "urgent", "now", "emergency",
    "live", "deployed", "safety", "security",
]
SIMPLE_KW = [
    "typo", "simple", "quick", "brief", "briefly", "just",
    "minor", "small", "easy", "trivial", "clarification",
]
|
|
| from aco.classifier import TaskCostClassifier |
| from aco.router import ModelCascadeRouter |
| from aco.context_budgeter import ContextBudgeter |
| from aco.tool_gate import ToolCostGate |
| from aco.verifier_budgeter import VerifierBudgeter |
| from aco.retry_optimizer import RetryOptimizer |
| from aco.meta_tool_miner import MetaToolMiner |
| from aco.doom_detector import DoomDetector |
|
|
# Simulated request pool, keyed by task type. One request is drawn per
# simulated episode.
TASKS = {
    "quick_answer": [
        "What is 2+2?",
        "Explain quantum computing briefly.",
        "Just tell me what 2+2 is.",
    ],
    "coding": [
        "Write a Python function to reverse a linked list.",
        "Fix a typo in the README.",
        "Debug this critical production segfault NOW.",
        "Just fix the typo in line 42.",
    ],
    "research": [
        "Research latest transformer advances.",
        "Find sources comparing LoRA and full FT briefly.",
    ],
    "document_drafting": [
        "Draft project proposal for ML pipeline.",
        "Write email to team about deployment.",
    ],
    "legal_regulated": [
        "Review this contract for liability clauses.",
        "Check GDPR compliance for data pipeline urgently.",
    ],
    "tool_heavy": [
        "Search open issues and create summary.",
        "Fetch API docs and generate client code.",
    ],
    "retrieval_heavy": [
        "Answer based on 50-page document.",
        "Find all payment processing mentions.",
    ],
    "long_horizon": [
        "Plan 3-month roadmap.",
        "Orchestrate complete multi-region deployment.",
    ],
    "unknown_ambiguous": [
        "Help me with this thing.",
        "I need something about the server.",
    ],
}


# Tools the gate may approve, with per-call cost estimates.
TOOL_LIST = ["web_search", "code_search", "file_read", "file_write", "code_execute", "verify"]
TOOL_COST_ESTIMATES = {
    "web_search": {"cost": 0.01},
    "code_search": {"cost": 0.005},
    "file_read": {"cost": 0.001},
    "file_write": {"cost": 0.001},
    "code_execute": {"cost": 0.01},
    "verify": {"cost": 0.02},
}
# Flat cost of one verifier pass.
VERIFIER_COST = 0.02
|
|
| print("="*80) |
| print("ACO FULL SYSTEM BENCHMARK EVALUATION") |
| print("="*80) |
|
|
| |
| classifier=TaskCostClassifier() |
| router=ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl") |
| context_budgeter=ContextBudgeter() |
| tool_gate=ToolCostGate() |
| verifier_budgeter=VerifierBudgeter() |
| retry_optimizer=RetryOptimizer() |
| meta_tool_miner=MetaToolMiner() |
| doom_detector=DoomDetector() |
|
|
| |
# Deterministic RNG so benchmark runs are reproducible.
rng=random.Random(42)
N=2000
results_aco=[]
results_frontier=[]
results_heuristic=[]
results_cheap=[]


# Simulate N episodes. Each episode runs the ACO pipeline and scores three
# baseline routers (always-frontier, difficulty heuristic, always-cheap) on
# the same task. NOTE: the sequence of rng calls must stay in exactly this
# order, or the seeded results change.
for i in range(N):
    tt=rng.choice(list(TASKS.keys()))
    req=rng.choice(TASKS[tt])

    # ACO pipeline: classify the request, route it to a tier, budget context.
    pred=classifier.classify(req)

    routing=router.route(req, pred["task_type"], pred["difficulty"], pred)

    budget=context_budgeter.budget(pred["task_type"],pred["difficulty"],pred["needs_retrieval"],pred["needs_tools"])

    # Gate each candidate tool; the gate is only consulted when the classifier
    # flags tool need or the task type plausibly requires tools.
    tool_decisions={}
    for tool in TOOL_LIST:
        if pred["needs_tools"] or tt in ("coding","tool_heavy","retrieval_heavy","research"):
            td=tool_gate.gate(tool,{"query":req},tt,1,5,routing.confidence)
            tool_decisions[tool]=td

    vd=verifier_budgeter.should_verify(tt,pred["risk"],routing.confidence,False,False,routing.tier)

    # Simulated success: tier strength raised to a difficulty-scaled exponent.
    ps=TIER_STR[routing.tier]**(pred["difficulty"]*0.6)
    success=rng.random()<ps

    model_cost=TIER_COST[routing.tier]
    # Tools missing from the estimate table fall back to a 0.02 cost.
    tool_cost=sum(TOOL_COST_ESTIMATES.get(t,{}).get("cost",0.02) for t,td in tool_decisions.items() if td.action=="use")
    ver_cost=VERIFIER_COST if vd.should_verify else 0
    total_cost=model_cost+tool_cost+ver_cost

    results_aco.append({"tt":tt,"tier":routing.tier,"success":success,"cost":total_cost,
                        "model_cost":model_cost,"tool_cost":tool_cost,"ver_cost":ver_cost,
                        "context_tokens":budget.total_tokens,"verified":vd.should_verify,
                        "tools_used":sum(1 for td in tool_decisions.values() if td.action=="use"),
                        "escalated":routing.escalated,"downgraded":routing.downgraded})

    # Baseline 1: always use the frontier tier (4) and always pay verification.
    # NOTE(review): all baselines reuse the ACO run's tool_cost (and the
    # heuristic also reuses ver_cost), i.e. tool spend is held constant across
    # arms — confirm this is the intended comparison.
    ps_f=TIER_STR[4]**(pred["difficulty"]*0.6)
    s_f=rng.random()<ps_f
    results_frontier.append({"tt":tt,"tier":4,"success":s_f,"cost":1.0+tool_cost+VERIFIER_COST})

    # Baseline 2: heuristic tier = difficulty+1, clamped to 5 and raised to
    # the per-task risk floor.
    h_tier=min(pred["difficulty"]+1,5)
    h_tier=max(h_tier,TASK_FLOOR.get(tt,2))
    ps_h=TIER_STR[h_tier]**(pred["difficulty"]*0.6)
    s_h=rng.random()<ps_h
    results_heuristic.append({"tt":tt,"tier":h_tier,"success":s_h,"cost":TIER_COST[h_tier]+tool_cost+ver_cost})

    # Baseline 3: always the cheapest tier, never verify.
    ps_c=TIER_STR[1]**(pred["difficulty"]*0.6)
    s_c=rng.random()<ps_c
    results_cheap.append({"tt":tt,"tier":1,"success":s_c,"cost":0.05+tool_cost})

    # Reset per-run verifier state between episodes.
    verifier_budgeter.reset_run()
|
|
|
|
| |
def compute_metrics(results, name):
    """Aggregate per-episode benchmark records into summary statistics.

    Args:
        results: non-empty list of per-episode dicts. Each must contain
            "success" (bool) and "cost" (float). Optional keys —
            "model_cost", "tool_cost", "ver_cost", "context_tokens",
            "verified", "tools_used", "escalated", "downgraded" — default to
            values appropriate for the simpler baseline routers (whole cost
            counted as model cost, 8000-token context, always verified).
        name: label stored under "name" in the returned dict.

    Returns:
        Dict of averaged rates/costs and raw counts for the run.

    Raises:
        ValueError: if `results` is empty (averages would be undefined).
    """
    n = len(results)
    if n == 0:
        raise ValueError("results must be non-empty")
    succ = sum(1 for r in results if r["success"])
    cost = sum(r["cost"] for r in results)
    # Baselines lack a cost breakdown: treat their whole cost as model cost.
    model_cost = sum(r.get("model_cost", r["cost"]) for r in results)
    tool_cost = sum(r.get("tool_cost", 0) for r in results)
    ver_cost = sum(r.get("ver_cost", 0) for r in results)
    ctx = sum(r.get("context_tokens", 8000) for r in results) / n
    verified = sum(1 for r in results if r.get("verified", True))
    tools = sum(r.get("tools_used", 0) for r in results) / n
    escalations = sum(1 for r in results if r.get("escalated", False))
    downgrades = sum(1 for r in results if r.get("downgraded", False))
    return {
        "name": name,
        "success_rate": succ / n,
        "avg_cost": cost / n,
        "model_cost": model_cost / n,
        "tool_cost": tool_cost / n,
        "ver_cost": ver_cost / n,
        "avg_context_tokens": ctx,
        "verifications": verified,
        "avg_tools": tools,
        "escalations": escalations,
        "downgrades": downgrades,
    }
|
|
| m=compute_metrics(results_aco,"aco_v8") |
| m_f=compute_metrics(results_frontier,"always_frontier") |
| m_h=compute_metrics(results_heuristic,"heuristic") |
| m_c=compute_metrics(results_cheap,"always_cheap") |
|
|
| print(f"\n{'Router':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'ModelCost':>10} {'ToolCost':>10} {'VerCost':>10} {'Context':>10} {'Verifs':>8}") |
| print("-"*100) |
| for r in [m_f,m_h,m,m_c]: |
| cr=(1-r["avg_cost"]/m_f["avg_cost"])*100 |
| print(f"{r['name']:<20} {r['success_rate']:>10.3f} {r['avg_cost']:>10.4f} {cr:>9.1f}% {r['model_cost']:>10.4f} {r['tool_cost']:>10.4f} {r['ver_cost']:>10.4f} {r['avg_context_tokens']:>10.0f} {r['verifications']:>8d}") |
|
|
| |
| print(f"\n\nPer-task breakdown:") |
| for tt in sorted(set(r["tt"] for r in results_aco)): |
| aco_tt=[r for r in results_aco if r["tt"]==tt] |
| front_tt=[r for r in results_frontier if r["tt"]==tt] |
| n_tt=len(aco_tt) |
| a_s=sum(1 for r in aco_tt if r["success"])/n_tt |
| a_c=sum(r["cost"] for r in aco_tt)/n_tt |
| f_c=sum(r["cost"] for r in front_tt)/n_tt |
| f_s=sum(1 for r in front_tt if r["success"])/n_tt |
| cr=(1-a_c/f_c)*100 |
| print(f" {tt:<20} n={n_tt:>4} aco_success={a_s:.3f} frontier_success={f_s:.3f} aco_cost={a_c:.4f} costRed={cr:.1f}%") |
|
|
| |
| print(f"\n\nCost-Quality Frontier:") |
| frontier_points=[] |
| for r in [m_c,m_h,m,m_f]: |
| frontier_points.append((r["avg_cost"],r["success_rate"],r["name"])) |
| frontier_points.sort(key=lambda x:x[0]) |
| for cost,succ,name in frontier_points: |
| print(f" {name:<20} cost={cost:.4f} success={succ:.3f}") |
|
|
| |
| print(f"\n\nKEY FINDINGS:") |
| print(f" ACO v8 success rate: {m['success_rate']:.3f}") |
| print(f" ACO v8 cost reduction: {(1-m['avg_cost']/m_f['avg_cost'])*100:.1f}%") |
| print(f" ACO v8 avg context: {m['avg_context_tokens']:.0f} tokens") |
| print(f" ACO v8 verifications: {m['verifications']}/{N}") |
| print(f" Escalations: {m['escalations']} ({m['escalations']/N*100:.1f}%)") |
| print(f" Downgrades: {m['downgrades']} ({m['downgrades']/N*100:.1f}%)") |
|
|
| |
| with open("/app/aco_benchmark_results.json","w") as f: |
| json.dump({"aco_v8":m,"frontier":m_f,"heuristic":m_h,"cheap":m_c},f,indent=2) |
| print(f"\nSaved to /app/aco_benchmark_results.json") |
| print("DONE!") |
|
|