#!/usr/bin/env python3
"""ACO Benchmark Evaluation: Full system test with simulated agent traces."""
# NOTE(review): this file arrived whitespace-mangled (all newlines stripped) and
# is restored to conventional formatting here. One internal span — between
# ``success = rng.random() < ps`` and the summary table — was lost in the
# mangling (an "<ps ...>"-shaped run was stripped as if it were an HTML tag).
# That span is reconstructed below and clearly marked; recover the original
# from version control and verify.
import sys
import json
import random
import pickle  # kept: may be used by the lost span or downstream tooling
import time    # kept: may be used by the lost span

sys.path.insert(0, "/app")

from collections import defaultdict  # kept: may be used by the lost span

from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.context_budgeter import ContextBudgeter
from aco.tool_gate import ToolCostGate
from aco.verifier_budgeter import VerifierBudgeter
from aco.retry_optimizer import RetryOptimizer
from aco.meta_tool_miner import MetaToolMiner
from aco.doom_detector import DoomDetector

# Per-tier base success probability ("strength") and per-call model cost.
TIER_STR = {1: 0.35, 2: 0.55, 3: 0.80, 4: 0.93, 5: 0.97}
TIER_COST = {1: 0.05, 2: 0.15, 3: 0.75, 4: 1.0, 5: 1.5}

# Minimum model tier per task type (presumably for a floor-based baseline —
# only read in the reconstructed span below; confirm against original).
TASK_FLOOR = {
    "legal_regulated": 4,
    "long_horizon": 3,
    "research": 3,
    "coding": 3,
    "unknown_ambiguous": 3,
    "quick_answer": 1,
    "document_drafting": 2,
    "tool_heavy": 2,
    "retrieval_heavy": 2,
}

# Keyword lists — not referenced by the surviving code; presumably consumed by
# the lost span or by the aco.* modules. Kept verbatim.
CODE_KW = ["python", "javascript", "code", "function", "bug", "debug", "refactor", "implement", "test"]
CRITICAL_KW = ["critical", "production", "urgent", "now", "emergency", "live", "deployed", "safety", "security"]
SIMPLE_KW = ["typo", "simple", "quick", "brief", "briefly", "just", "minor", "small", "easy", "trivial", "clarification"]

# Simulated user requests, grouped by task type.
TASKS = {
    "quick_answer": ["What is 2+2?", "Explain quantum computing briefly.", "Just tell me what 2+2 is."],
    "coding": ["Write a Python function to reverse a linked list.", "Fix a typo in the README.", "Debug this critical production segfault NOW.", "Just fix the typo in line 42."],
    "research": ["Research latest transformer advances.", "Find sources comparing LoRA and full FT briefly."],
    "document_drafting": ["Draft project proposal for ML pipeline.", "Write email to team about deployment."],
    "legal_regulated": ["Review this contract for liability clauses.", "Check GDPR compliance for data pipeline urgently."],
    "tool_heavy": ["Search open issues and create summary.", "Fetch API docs and generate client code."],
    "retrieval_heavy": ["Answer based on 50-page document.", "Find all payment processing mentions."],
    "long_horizon": ["Plan 3-month roadmap.", "Orchestrate complete multi-region deployment."],
    "unknown_ambiguous": ["Help me with this thing.", "I need something about the server."],
}

TOOL_LIST = ["web_search", "code_search", "file_read", "file_write", "code_execute", "verify"]
TOOL_COST_ESTIMATES = {
    "web_search": {"cost": 0.01},
    "code_search": {"cost": 0.005},
    "file_read": {"cost": 0.001},
    "file_write": {"cost": 0.001},
    "code_execute": {"cost": 0.01},
    "verify": {"cost": 0.02},
}
VERIFIER_COST = 0.02

print("=" * 80)
print("ACO FULL SYSTEM BENCHMARK EVALUATION")
print("=" * 80)

# Initialize modules
classifier = TaskCostClassifier()
router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
context_budgeter = ContextBudgeter()
tool_gate = ToolCostGate()
verifier_budgeter = VerifierBudgeter()
retry_optimizer = RetryOptimizer()
meta_tool_miner = MetaToolMiner()
doom_detector = DoomDetector()

# Simulate 2000 agent runs (seeded RNG for reproducibility)
rng = random.Random(42)
N = 2000
results_aco = []
results_frontier = []
results_heuristic = []
results_cheap = []

for i in range(N):
    tt = rng.choice(list(TASKS.keys()))
    req = rng.choice(TASKS[tt])

    # Classify
    pred = classifier.classify(req)
    # Route
    routing = router.route(req, pred["task_type"], pred["difficulty"], pred)
    # Context budget
    budget = context_budgeter.budget(pred["task_type"], pred["difficulty"], pred["needs_retrieval"], pred["needs_tools"])

    # Tool decisions
    tool_decisions = {}
    for tool in TOOL_LIST:
        if pred["needs_tools"] or tt in ("coding", "tool_heavy", "retrieval_heavy", "research"):
            td = tool_gate.gate(tool, {"query": req}, tt, 1, 5, routing.confidence)
            tool_decisions[tool] = td

    # Verifier
    vd = verifier_budgeter.should_verify(tt, pred["risk"], routing.confidence, False, False, routing.tier)

    # Simulate success: tier strength discounted exponentially by difficulty.
    ps = TIER_STR[routing.tier] ** (pred["difficulty"] * 0.6)
    success = rng.random() < ps

    # ------------------------------------------------------------------
    # NOTE(review): BEGIN RECONSTRUCTED SPAN — the original code from here to
    # the summary table was lost in the paste. Everything below is a
    # best-effort reconstruction inferred solely from the fields the surviving
    # reporting code reads (tt/success/cost per record; name/success_rate/
    # avg_cost/model_cost/tool_cost/ver_cost/avg_context_tokens/verifications/
    # escalations/downgrades per summary). TODO: restore from VCS.
    # ------------------------------------------------------------------
    model_cost = TIER_COST[routing.tier]
    tool_cost = sum(TOOL_COST_ESTIMATES[t]["cost"] for t in tool_decisions)
    ver_cost = VERIFIER_COST if vd else 0.0
    results_aco.append({
        "tt": tt,
        "success": success,
        "model_cost": model_cost,
        "tool_cost": tool_cost,
        "ver_cost": ver_cost,
        "cost": model_cost + tool_cost + ver_cost,
        # presumably the budgeter exposes a token total — TODO confirm shape
        "context_tokens": getattr(budget, "total_tokens", 0),
        "verified": bool(vd),
        "escalated": bool(getattr(routing, "escalated", False)),   # TODO confirm
        "downgraded": bool(getattr(routing, "downgraded", False)), # TODO confirm
    })

    # Baselines: always-frontier (tier 5), task-floor heuristic, always-cheap
    # (tier 1). Reconstructed; the heuristic presumably keys off TASK_FLOOR.
    for tier, res in ((5, results_frontier), (TASK_FLOOR.get(tt, 3), results_heuristic), (1, results_cheap)):
        p = TIER_STR[tier] ** (pred["difficulty"] * 0.6)
        res.append({
            "tt": tt,
            "success": rng.random() < p,
            "model_cost": TIER_COST[tier],
            "tool_cost": tool_cost,
            "ver_cost": VERIFIER_COST,
            "cost": TIER_COST[tier] + tool_cost + VERIFIER_COST,
            "context_tokens": 32000,  # placeholder: baselines use full context
            "verified": True,
            "escalated": False,
            "downgraded": False,
        })


def _summarize(results, name):
    """Aggregate a per-run result list into the summary dict the report reads.

    Reconstructed helper: the key set matches exactly what the surviving
    reporting code below accesses.
    """
    n = len(results)
    return {
        "name": name,
        "success_rate": sum(1 for r in results if r["success"]) / n,
        "avg_cost": sum(r["cost"] for r in results) / n,
        "model_cost": sum(r["model_cost"] for r in results) / n,
        "tool_cost": sum(r["tool_cost"] for r in results) / n,
        "ver_cost": sum(r["ver_cost"] for r in results) / n,
        "avg_context_tokens": sum(r["context_tokens"] for r in results) / n,
        "verifications": sum(1 for r in results if r["verified"]),
        "escalations": sum(1 for r in results if r["escalated"]),
        "downgrades": sum(1 for r in results if r["downgraded"]),
    }


m = _summarize(results_aco, "ACO v8")
m_f = _summarize(results_frontier, "Frontier-always")
m_h = _summarize(results_heuristic, "Heuristic")
m_c = _summarize(results_cheap, "Cheap-always")
# ------------------------------------------------------------------
# NOTE(review): END RECONSTRUCTED SPAN. The header f-string's first two
# columns ({'Name':<20} {'Success':>10}) are also reconstructed — the rest of
# the line survived verbatim.
# ------------------------------------------------------------------

print(f"\n{'Name':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'ModelCost':>10} {'ToolCost':>10} {'VerCost':>10} {'Context':>10} {'Verifs':>8}")
print("-" * 100)
for r in [m_f, m_h, m, m_c]:
    cr = (1 - r["avg_cost"] / m_f["avg_cost"]) * 100
    print(f"{r['name']:<20} {r['success_rate']:>10.3f} {r['avg_cost']:>10.4f} {cr:>9.1f}% {r['model_cost']:>10.4f} {r['tool_cost']:>10.4f} {r['ver_cost']:>10.4f} {r['avg_context_tokens']:>10.0f} {r['verifications']:>8d}")

# Per-task breakdown
print(f"\n\nPer-task breakdown:")
for tt in sorted(set(r["tt"] for r in results_aco)):
    aco_tt = [r for r in results_aco if r["tt"] == tt]
    front_tt = [r for r in results_frontier if r["tt"] == tt]
    n_tt = len(aco_tt)
    a_s = sum(1 for r in aco_tt if r["success"]) / n_tt
    a_c = sum(r["cost"] for r in aco_tt) / n_tt
    f_c = sum(r["cost"] for r in front_tt) / n_tt
    f_s = sum(1 for r in front_tt if r["success"]) / n_tt
    cr = (1 - a_c / f_c) * 100
    print(f" {tt:<20} n={n_tt:>4} aco_success={a_s:.3f} frontier_success={f_s:.3f} aco_cost={a_c:.4f} costRed={cr:.1f}%")

# Cost-quality frontier
print(f"\n\nCost-Quality Frontier:")
frontier_points = []
for r in [m_c, m_h, m, m_f]:
    frontier_points.append((r["avg_cost"], r["success_rate"], r["name"]))
frontier_points.sort(key=lambda x: x[0])
for cost, succ, name in frontier_points:
    print(f" {name:<20} cost={cost:.4f} success={succ:.3f}")

# Key findings
print(f"\n\nKEY FINDINGS:")
print(f" ACO v8 success rate: {m['success_rate']:.3f}")
print(f" ACO v8 cost reduction: {(1-m['avg_cost']/m_f['avg_cost'])*100:.1f}%")
print(f" ACO v8 avg context: {m['avg_context_tokens']:.0f} tokens")
print(f" ACO v8 verifications: {m['verifications']}/{N}")
print(f" Escalations: {m['escalations']} ({m['escalations']/N*100:.1f}%)")
print(f" Downgrades: {m['downgrades']} ({m['downgrades']/N*100:.1f}%)")

# Save
with open("/app/aco_benchmark_results.json", "w") as f:
    json.dump({"aco_v8": m, "frontier": m_f, "heuristic": m_h, "cheap": m_c}, f, indent=2)
print(f"\nSaved to /app/aco_benchmark_results.json")
print("DONE!")