File size: 8,512 Bytes
4c6ae13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python3
"""ACO Benchmark Evaluation: Full system test with simulated agent traces."""
import sys,json,random,pickle,time
sys.path.insert(0,"/app")
from collections import defaultdict

# Per-tier success "strength": raised to (difficulty * 0.6) in the loop below
# to get the probability a run at that tier succeeds.
TIER_STR={1:0.35,2:0.55,3:0.80,4:0.93,5:0.97}
# Per-tier model cost in arbitrary cost units (same units as tool/verifier costs).
TIER_COST={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}
# Minimum model tier the heuristic baseline allows for each task type.
TASK_FLOOR={"legal_regulated":4,"long_horizon":3,"research":3,"coding":3,
             "unknown_ambiguous":3,"quick_answer":1,"document_drafting":2,
             "tool_heavy":2,"retrieval_heavy":2}

# NOTE(review): these keyword lists are not referenced anywhere in this script —
# presumably leftovers from an earlier inline classifier; confirm before removal.
CODE_KW=["python","javascript","code","function","bug","debug","refactor","implement","test"]
CRITICAL_KW=["critical","production","urgent","now","emergency","live","deployed","safety","security"]
SIMPLE_KW=["typo","simple","quick","brief","briefly","just","minor","small","easy","trivial","clarification"]

# Project-local ACO modules (made importable by the sys.path insert above).
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.context_budgeter import ContextBudgeter
from aco.tool_gate import ToolCostGate
from aco.verifier_budgeter import VerifierBudgeter
from aco.retry_optimizer import RetryOptimizer
from aco.meta_tool_miner import MetaToolMiner
from aco.doom_detector import DoomDetector

# Canned request texts, keyed by ground-truth task type; the loop samples
# uniformly from these to simulate incoming agent requests.
TASKS={
  "quick_answer":["What is 2+2?","Explain quantum computing briefly.","Just tell me what 2+2 is."],
  "coding":["Write a Python function to reverse a linked list.","Fix a typo in the README.","Debug this critical production segfault NOW.","Just fix the typo in line 42."],
  "research":["Research latest transformer advances.","Find sources comparing LoRA and full FT briefly."],
  "document_drafting":["Draft project proposal for ML pipeline.","Write email to team about deployment."],
  "legal_regulated":["Review this contract for liability clauses.","Check GDPR compliance for data pipeline urgently."],
  "tool_heavy":["Search open issues and create summary.","Fetch API docs and generate client code."],
  "retrieval_heavy":["Answer based on 50-page document.","Find all payment processing mentions."],
  "long_horizon":["Plan 3-month roadmap.","Orchestrate complete multi-region deployment."],
  "unknown_ambiguous":["Help me with this thing.","I need something about the server."],
}

# Tools offered to the gate on every run, with per-call cost estimates.
TOOL_LIST=["web_search","code_search","file_read","file_write","code_execute","verify"]
TOOL_COST_ESTIMATES={"web_search":{"cost":0.01},"code_search":{"cost":0.005},"file_read":{"cost":0.001},"file_write":{"cost":0.001},"code_execute":{"cost":0.01},"verify":{"cost":0.02}}
VERIFIER_COST=0.02

print("="*80)
print("ACO FULL SYSTEM BENCHMARK EVALUATION")
print("="*80)

# Initialize modules
# The router loads a pretrained model bundle; the rest use their defaults.
classifier=TaskCostClassifier()
router=ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
context_budgeter=ContextBudgeter()
tool_gate=ToolCostGate()
verifier_budgeter=VerifierBudgeter()
retry_optimizer=RetryOptimizer()
meta_tool_miner=MetaToolMiner()
doom_detector=DoomDetector()
# NOTE(review): retry_optimizer, meta_tool_miner, and doom_detector are
# instantiated but never exercised in this benchmark — confirm if intentional.

# Simulate 2000 agent runs
rng=random.Random(42)  # fixed seed so the benchmark is reproducible
N=2000
results_aco=[]
results_frontier=[]
results_heuristic=[]
results_cheap=[]

for i in range(N):
    # Sample a task type, then a concrete request text of that type.
    tt=rng.choice(list(TASKS.keys()))
    req=rng.choice(TASKS[tt])
    
    # Classify: predicts task_type, difficulty, risk, needs_retrieval, needs_tools.
    pred=classifier.classify(req)
    # Route: picks a model tier plus escalation/downgrade flags and a confidence.
    routing=router.route(req, pred["task_type"], pred["difficulty"], pred)
    # Context budget: token allowance derived from type/difficulty/needs.
    budget=context_budgeter.budget(pred["task_type"],pred["difficulty"],pred["needs_retrieval"],pred["needs_tools"])
    # Tool decisions: gate each tool only when the classifier (or the
    # ground-truth task type) suggests tools are needed at all.
    tool_decisions={}
    for tool in TOOL_LIST:
        if pred["needs_tools"] or tt in ("coding","tool_heavy","retrieval_heavy","research"):
            td=tool_gate.gate(tool,{"query":req},tt,1,5,routing.confidence)
            tool_decisions[tool]=td
    # Verifier: decide whether this run pays for a verification pass.
    vd=verifier_budgeter.should_verify(tt,pred["risk"],routing.confidence,False,False,routing.tier)
    # Simulate success: tier strength decays exponentially with difficulty.
    ps=TIER_STR[routing.tier]**(pred["difficulty"]*0.6)
    success=rng.random()<ps
    # Compute cost: model tier + gated tools actually used + optional verifier.
    model_cost=TIER_COST[routing.tier]
    tool_cost=sum(TOOL_COST_ESTIMATES.get(t,{}).get("cost",0.02) for t,td in tool_decisions.items() if td.action=="use")
    ver_cost=VERIFIER_COST if vd.should_verify else 0
    total_cost=model_cost+tool_cost+ver_cost
    
    results_aco.append({"tt":tt,"tier":routing.tier,"success":success,"cost":total_cost,
                       "model_cost":model_cost,"tool_cost":tool_cost,"ver_cost":ver_cost,
                       "context_tokens":budget.total_tokens,"verified":vd.should_verify,
                       "tools_used":sum(1 for td in tool_decisions.values() if td.action=="use"),
                       "escalated":routing.escalated,"downgraded":routing.downgraded})
    
    # Baseline: always frontier (tier 4, always verifies).
    # NOTE(review): baselines reuse the ACO run's tool_cost (and, for the
    # heuristic, ver_cost) rather than making their own tool/verifier
    # decisions — presumably to isolate the model-tier cost difference.
    ps_f=TIER_STR[4]**(pred["difficulty"]*0.6)
    s_f=rng.random()<ps_f
    results_frontier.append({"tt":tt,"tier":4,"success":s_f,"cost":1.0+tool_cost+VERIFIER_COST})
    
    # Baseline: heuristic — difficulty+1 tier, raised to the per-task floor.
    h_tier=min(pred["difficulty"]+1,5)
    h_tier=max(h_tier,TASK_FLOOR.get(tt,2))
    ps_h=TIER_STR[h_tier]**(pred["difficulty"]*0.6)
    s_h=rng.random()<ps_h
    results_heuristic.append({"tt":tt,"tier":h_tier,"success":s_h,"cost":TIER_COST[h_tier]+tool_cost+ver_cost})
    
    # Baseline: always cheap (tier 1, never verifies).
    ps_c=TIER_STR[1]**(pred["difficulty"]*0.6)
    s_c=rng.random()<ps_c
    results_cheap.append({"tt":tt,"tier":1,"success":s_c,"cost":0.05+tool_cost})
    
    # Reset per-run verifier budget state before the next simulated run.
    verifier_budgeter.reset_run()


# Compute metrics
def compute_metrics(results, name):
    """Aggregate per-run result records into one summary-metrics dict.

    Parameters
    ----------
    results : list[dict]
        Per-run records. Required keys: "success" (bool) and "cost" (float).
        Optional keys — "model_cost", "tool_cost", "ver_cost",
        "context_tokens", "verified", "tools_used", "escalated",
        "downgraded" — are only present in ACO records; baseline records
        fall back to the defaults below.
    name : str
        Label stored under the "name" key of the returned dict.

    Returns
    -------
    dict
        Averaged rates/costs plus summed event counts. For an empty
        `results` list, all numeric fields are zero (previously this
        raised ZeroDivisionError).
    """
    n = len(results)
    if n == 0:
        # Guard: averaging below divides by n; return neutral metrics instead
        # of crashing on an empty result set.
        return {"name": name, "success_rate": 0.0, "avg_cost": 0.0,
                "model_cost": 0.0, "tool_cost": 0.0, "ver_cost": 0.0,
                "avg_context_tokens": 0.0, "verifications": 0,
                "avg_tools": 0.0, "escalations": 0, "downgrades": 0}
    succ = sum(1 for r in results if r["success"])
    cost = sum(r["cost"] for r in results)
    # Baseline records carry no model/tool/verifier breakdown: their whole
    # run cost counts as model cost and the other components default to 0.
    model_cost = sum(r.get("model_cost", r["cost"]) for r in results)
    tool_cost = sum(r.get("tool_cost", 0) for r in results)
    ver_cost = sum(r.get("ver_cost", 0) for r in results)
    ctx = sum(r.get("context_tokens", 8000) for r in results) / n
    # Records without a "verified" flag are counted as verified.
    verified = sum(1 for r in results if r.get("verified", True))
    tools = sum(r.get("tools_used", 0) for r in results) / n
    escalations = sum(1 for r in results if r.get("escalated", False))
    downgrades = sum(1 for r in results if r.get("downgraded", False))
    return {"name": name, "success_rate": succ / n, "avg_cost": cost / n,
            "model_cost": model_cost / n, "tool_cost": tool_cost / n, "ver_cost": ver_cost / n,
            "avg_context_tokens": ctx, "verifications": verified,
            "avg_tools": tools, "escalations": escalations, "downgrades": downgrades}

# Summarize each strategy's runs into one metrics dict.
m=compute_metrics(results_aco,"aco_v8")
m_f=compute_metrics(results_frontier,"always_frontier")
m_h=compute_metrics(results_heuristic,"heuristic")
m_c=compute_metrics(results_cheap,"always_cheap")

# Fixed-width comparison table; "CostRed" is cost reduction relative to
# the always-frontier baseline (negative means more expensive).
print(f"\n{'Router':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'ModelCost':>10} {'ToolCost':>10} {'VerCost':>10} {'Context':>10} {'Verifs':>8}")
print("-"*100)
for r in [m_f,m_h,m,m_c]:
    cr=(1-r["avg_cost"]/m_f["avg_cost"])*100
    print(f"{r['name']:<20} {r['success_rate']:>10.3f} {r['avg_cost']:>10.4f} {cr:>9.1f}% {r['model_cost']:>10.4f} {r['tool_cost']:>10.4f} {r['ver_cost']:>10.4f} {r['avg_context_tokens']:>10.0f} {r['verifications']:>8d}")

# Per-task breakdown: compare ACO vs always-frontier success and cost per
# task type (n_tt is always > 0 since task types are taken from results_aco).
print(f"\n\nPer-task breakdown:")
for tt in sorted(set(r["tt"] for r in results_aco)):
    aco_tt=[r for r in results_aco if r["tt"]==tt]
    front_tt=[r for r in results_frontier if r["tt"]==tt]
    n_tt=len(aco_tt)
    a_s=sum(1 for r in aco_tt if r["success"])/n_tt
    a_c=sum(r["cost"] for r in aco_tt)/n_tt
    f_c=sum(r["cost"] for r in front_tt)/n_tt
    f_s=sum(1 for r in front_tt if r["success"])/n_tt
    # Cost reduction of ACO relative to frontier for this task type.
    cr=(1-a_c/f_c)*100
    print(f"  {tt:<20} n={n_tt:>4} aco_success={a_s:.3f} frontier_success={f_s:.3f} aco_cost={a_c:.4f} costRed={cr:.1f}%")

# Cost-quality frontier: list each strategy as a (cost, success) point,
# ordered from cheapest to most expensive.
print(f"\n\nCost-Quality Frontier:")
frontier_points = sorted(
    ((metrics["avg_cost"], metrics["success_rate"], metrics["name"])
     for metrics in (m_c, m_h, m, m_f)),
    key=lambda point: point[0],
)
for cost,succ,name in frontier_points:
    print(f"  {name:<20} cost={cost:.4f} success={succ:.3f}")

# Key findings: headline numbers for the ACO strategy vs the frontier baseline.
print(f"\n\nKEY FINDINGS:")
print(f"  ACO v8 success rate: {m['success_rate']:.3f}")
print(f"  ACO v8 cost reduction: {(1-m['avg_cost']/m_f['avg_cost'])*100:.1f}%")
print(f"  ACO v8 avg context: {m['avg_context_tokens']:.0f} tokens")
print(f"  ACO v8 verifications: {m['verifications']}/{N}")
print(f"  Escalations: {m['escalations']} ({m['escalations']/N*100:.1f}%)")
print(f"  Downgrades: {m['downgrades']} ({m['downgrades']/N*100:.1f}%)")

# Save all four strategies' summary metrics as JSON for later analysis.
with open("/app/aco_benchmark_results.json","w") as f:
    json.dump({"aco_v8":m,"frontier":m_f,"heuristic":m_h,"cheap":m_c},f,indent=2)
print(f"\nSaved to /app/aco_benchmark_results.json")
print("DONE!")