#!/usr/bin/env python3
"""Comprehensive benchmark with execution-feedback routing."""
import sys, json, random
sys.path.insert(0, "/app")
from collections import defaultdict

# Base success probability ("strength") of each model tier.
TIER_STR = {1: 0.35, 2: 0.55, 3: 0.80, 4: 0.93, 5: 0.97}
# Relative cost per call for each tier.
TIER_COST = {1: 0.05, 2: 0.15, 3: 0.75, 4: 1.0, 5: 1.5}
# Minimum tier considered safe for each task type.
TASK_FLOOR = {"legal_regulated": 4, "long_horizon": 3, "research": 3, "coding": 3,
              "unknown_ambiguous": 3, "quick_answer": 1, "document_drafting": 2,
              "tool_heavy": 2, "retrieval_heavy": 2}

from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.execution_feedback import ExecutionFeedbackRouter, FeedbackSignal

TASKS = {
  "quick_answer":["What is 2+2?","Explain quantum computing briefly.",
      "What is the capital of France?","Convert 100F to Celsius.",
      "Small clarification on this formula."],
  "coding":["Write a Python function to reverse a linked list.",
      "Fix a typo in the README.","Debug this critical production segfault NOW.",
      "Just fix the typo in line 42.","Refactor auth module to JWT.",
      "Implement LRU cache in Go."],
  "research":["Research latest transformer advances.",
      "Find sources comparing LoRA and full FT briefly.",
      "Investigate data center climate impact."],
  "document_drafting":["Draft project proposal for ML pipeline.",
      "Write email to team about deployment."],
  "legal_regulated":["Review this contract for liability clauses.",
      "Check GDPR compliance for data pipeline urgently."],
  "tool_heavy":["Search open issues and create summary.",
      "Fetch API docs and generate client code."],
  "retrieval_heavy":["Answer based on 50-page document.",
      "Find all payment processing mentions."],
  "long_horizon":["Plan 3-month roadmap.",
      "Orchestrate complete multi-region deployment."],
  "unknown_ambiguous":["Help me with this thing.",
      "I need something about the server."],
}
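# Each sampled prompt is classified by TaskCostClassifier before routing;
# every method receives the true task type plus the classifier's prediction
# (including pred["difficulty"]), and nothing else about the request.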

print("="*80)
print("ACO v9 BENCHMARK: EXECUTION-FEEDBACK + DYNAMIC DIFFICULTY")
print("="*80)

classifier = TaskCostClassifier()
router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
ef_router = ExecutionFeedbackRouter(tier_costs=TIER_COST, task_floors=TASK_FLOOR)
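# Pipeline under test: the classifier predicts task type and difficulty, the
# v8 router picks an initial tier, and the feedback router inspects output
# logprobs to decide whether an attempt should be escalated.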

rng = random.Random(42)
N = 3000

def sim_logprobs(tier, difficulty, success, rng):
    """Simulate token logprobs based on tier and difficulty."""
    n_tokens = rng.randint(20, 200)
    # Weaker tiers emit lower (less confident) logprobs; difficulty pushes
    # them lower still, since base_lp is negative.
    base_lp = {1: -3.5, 2: -2.5, 3: -1.5, 4: -0.7, 5: -0.3}[tier]
    base_lp *= (1 + difficulty * 0.15)
    lps = []
    for _ in range(n_tokens):
        noise = rng.gauss(0, 1.0 + difficulty * 0.3)
        # Failed generations get noisier logprobs; that extra scatter is the
        # signal the execution-feedback router keys on.
        if success:
            lps.append(base_lp + noise * 0.3)
        else:
            lps.append(base_lp + noise * 0.8)
    return lps
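# Worked example: sim_logprobs(1, 4, False, rng) centres its 20-200 logprobs
# around -3.5 * (1 + 4 * 0.15) = -5.6 with failure-level scatter
# (sigma = (1.0 + 4 * 0.3) * 0.8 = 1.76), exactly the weak, noisy profile
# the feedback router is meant to escalate on.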

def eval_method(name, route_fn):
    succ, cost, unsafe = 0, 0.0, 0
    per_tt = defaultdict(lambda: {"succ":0,"cost":0.0,"n":0})
    for i in range(N):
        tt = rng.choice(list(TASKS.keys()))
        req = rng.choice(TASKS[tt])
        pred = classifier.classify(req)
        tier, s, c, u = route_fn(req, tt, pred)
        if s: succ += 1
        cost += c
        if u: unsafe += 1
        per_tt[tt]["succ"] += (1 if s else 0)
        per_tt[tt]["cost"] += c
        per_tt[tt]["n"] += 1
    return {"name":name,"success":succ/N,"avg_cost":cost/N,"unsafe":unsafe/N,"per_tt":dict(per_tt)}

# Method A: always route to the frontier tier (tier 4)
def route_frontier(req, tt, pred):
    ps = TIER_STR[4] ** (pred["difficulty"] * 0.6)
    return 4, rng.random() < ps, TIER_COST[4], False

# Method B: heuristic (tier = difficulty + 1, clamped to the task floor)
def route_heuristic(req, tt, pred):
    h = min(pred["difficulty"] + 1, 5)
    h = max(h, TASK_FLOOR.get(tt, 2))
    ps = TIER_STR[h] ** (pred["difficulty"] * 0.6)
    s = rng.random() < ps
    # Unsafe = a sub-frontier tier that failed. Reuse the single success draw
    # so the flag matches the outcome actually recorded (the original drew a
    # second, independent random number here).
    return h, s, TIER_COST[h], (h < 4 and not s)

# Method C: v8 router (no feedback)
def route_v8(req, tt, pred):
    r = router.route(req, tt, pred["difficulty"], pred)
    ps = TIER_STR[r.tier] ** (r.dynamic_difficulty * 0.6)
    return r.tier, rng.random() < ps, TIER_COST[r.tier], r.escalated

# Method D: v9 = v8 router + execution-feedback cascade
def route_v9(req, tt, pred):
    r = router.route(req, tt, pred["difficulty"], pred)
    initial_tier = r.tier
    ps_initial = TIER_STR[initial_tier] ** (r.dynamic_difficulty * 0.6)
    initial_success = rng.random() < ps_initial
    # Inspect the (simulated) logprobs of the first attempt; if the signal
    # says escalate, retry one tier up, never below the task floor.
    lps = sim_logprobs(initial_tier, r.dynamic_difficulty, initial_success, rng)
    signal = ef_router.analyze_output(lps, task_type=tt, current_tier=initial_tier)
    if signal.should_escalate and initial_tier < 5:
        final_tier = min(initial_tier + 1, 5)
        final_tier = max(final_tier, TASK_FLOOR.get(tt, 1))
        ps_final = TIER_STR[final_tier] ** (r.dynamic_difficulty * 0.6)
        final_success = rng.random() < ps_final
        # The cascade pays for both the first attempt and the retry.
        total_cost = TIER_COST[initial_tier] + TIER_COST[final_tier]
        return final_tier, final_success, total_cost, False
    return initial_tier, initial_success, TIER_COST[initial_tier], False
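# Escalation economics: a tier 2 -> 3 bump costs 0.15 + 0.75 = 0.90 in total,
# still under one tier-4 call (1.0), so the cascade can come out ahead when
# the feedback signal flags real failures more often than it fires spuriously.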

# Method E: oracle
def route_oracle(req, tt, pred):
    for t in range(1, 6):
        ps = TIER_STR[t] ** (pred["difficulty"] * 0.6)
        if rng.random() < ps:
            return t, True, TIER_COST[t], False
    return 5, False, TIER_COST[5], False
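# The oracle is an upper bound, not a competitor: it observes each tier's
# outcome before paying and bills only the cheapest tier that succeeds, so
# no realizable router should be expected to match its cost-quality point.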

# Method F: always cheapest tier
def route_cheap(req, tt, pred):
    ps = TIER_STR[1] ** (pred["difficulty"] * 0.6)
    s = rng.random() < ps
    # Reuse the single success draw for the unsafe flag (the original drew a
    # second, independent random number, decoupling the flag from the outcome).
    return 1, s, TIER_COST[1], (not s)

# Run all methods
print(f"\n[1] Running {N} simulated traces per method...")
results = {}
for name, fn in [("always_frontier", route_frontier), ("always_cheap", route_cheap),
                 ("heuristic", route_heuristic), ("v8_router", route_v8),
                 ("v9_feedback", route_v9), ("oracle", route_oracle)]:
    rng.seed(42)  # reseed per method for reproducible, comparable runs
    results[name] = eval_method(name, fn)

# Print comparison
print(f"\n\n{'Method':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'Unsafe':>10}")
print("-"*60)
fc = results["always_frontier"]["avg_cost"]
for name in ["oracle","always_frontier","v9_feedback","v8_router","heuristic","always_cheap"]:
    r = results[name]
    cr = (1-r["avg_cost"]/fc)*100
    print(f"{name:<20} {r['success']:>10.3f} {r['avg_cost']:>10.4f} {cr:>9.1f}% {r['unsafe']:>10.3f}")

# Per-task comparison
print(f"\n\n[2] Per-task success rate comparison:")
print(f"{'Task':<20} {'Frontier':>10} {'v8':>10} {'v9_feedback':>12} {'CostRed_v9':>12}")
print("-"*65)
for tt in sorted(set(k for r in results.values() for k in r["per_tt"])):
    f_r = results["always_frontier"]["per_tt"].get(tt,{"succ":0,"n":1,"cost":0})
    v8_r = results["v8_router"]["per_tt"].get(tt,{"succ":0,"n":1,"cost":0})
    v9_r = results["v9_feedback"]["per_tt"].get(tt,{"succ":0,"n":1,"cost":0})
    f_sr = f_r["succ"]/max(f_r["n"],1)
    v8_sr = v8_r["succ"]/max(v8_r["n"],1)
    v9_sr = v9_r["succ"]/max(v9_r["n"],1)
    f_c = f_r["cost"]/max(f_r["n"],1)
    v9_c = v9_r["cost"]/max(v9_r["n"],1)
    cr = (1-v9_c/f_c)*100 if f_c > 0 else 0
    print(f"{tt:<20} {f_sr:>10.3f} {v8_sr:>10.3f} {v9_sr:>12.3f} {cr:>11.1f}%")

# Cost-quality frontier
print(f"\n\n[3] Cost-Quality Frontier:")
for name in ["always_cheap","v8_router","v9_feedback","heuristic","always_frontier","oracle"]:
    r = results[name]
    cr = (1-r["avg_cost"]/fc)*100
    print(f"  {name:<20} success={r['success']:.3f} cost={r['avg_cost']:.4f} costRed={cr:.1f}%")

# Key metrics
v9 = results["v9_feedback"]
v8 = results["v8_router"]
fr = results["always_frontier"]
v9_cr = (1-v9["avg_cost"]/fr["avg_cost"])*100
v8_cr = (1-v8["avg_cost"]/fr["avg_cost"])*100
quality_gap_v9 = fr["success"] - v9["success"]
quality_gap_v8 = fr["success"] - v8["success"]

print(f"\n\n[4] KEY RESULTS:")
print(f"  v8 success:     {v8['success']:.3f} (gap vs frontier: {quality_gap_v8:.3f})")
print(f"  v9 success:     {v9['success']:.3f} (gap vs frontier: {quality_gap_v9:.3f})")
print(f"  v8 costRed:     {v8_cr:.1f}%")
print(f"  v9 costRed:     {v9_cr:.1f}%")
print(f"  Quality gain v9 vs v8: {v9['success']-v8['success']:+.3f}")
print(f"  Cost increase v9 vs v8: {v9['avg_cost']-v8['avg_cost']:+.4f}")

# Save headline metrics per method
save_data = {name: {"success": r["success"], "avg_cost": r["avg_cost"], "unsafe": r["unsafe"]}
             for name, r in results.items()}
with open("/app/benchmark_v9_results.json", "w") as f:
    json.dump(save_data, f, indent=2)
print("\nSaved to /app/benchmark_v9_results.json")
print("DONE!")