#!/usr/bin/env python3
"""Comprehensive benchmark with execution-feedback routing."""
import sys, json, random
sys.path.insert(0, "/app")
from collections import defaultdict
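# TIER_STR: per-tier success strength (tier 1 = cheapest, tier 5 = strongest).
# TIER_COST: relative cost per call at each tier.
# TASK_FLOOR: minimum tier a task type may be routed to.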
TIER_STR = {1: 0.35, 2: 0.55, 3: 0.80, 4: 0.93, 5: 0.97}
TIER_COST = {1: 0.05, 2: 0.15, 3: 0.75, 4: 1.0, 5: 1.5}
TASK_FLOOR = {"legal_regulated": 4, "long_horizon": 3, "research": 3, "coding": 3,
              "unknown_ambiguous": 3, "quick_answer": 1, "document_drafting": 2,
              "tool_heavy": 2, "retrieval_heavy": 2}
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.execution_feedback import ExecutionFeedbackRouter, FeedbackSignal
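# Synthetic request pool: representative prompts per task type.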
TASKS = {
    "quick_answer": ["What is 2+2?", "Explain quantum computing briefly.",
        "What is the capital of France?", "Convert 100F to Celsius.",
        "Small clarification on this formula."],
    "coding": ["Write a Python function to reverse a linked list.",
        "Fix a typo in the README.", "Debug this critical production segfault NOW.",
        "Just fix the typo in line 42.", "Refactor auth module to JWT.",
        "Implement LRU cache in Go."],
    "research": ["Research latest transformer advances.",
        "Find sources comparing LoRA and full FT briefly.",
        "Investigate data center climate impact."],
    "document_drafting": ["Draft project proposal for ML pipeline.",
        "Write email to team about deployment."],
    "legal_regulated": ["Review this contract for liability clauses.",
        "Check GDPR compliance for data pipeline urgently."],
    "tool_heavy": ["Search open issues and create summary.",
        "Fetch API docs and generate client code."],
    "retrieval_heavy": ["Answer based on 50-page document.",
        "Find all payment processing mentions."],
    "long_horizon": ["Plan 3-month roadmap.",
        "Orchestrate complete multi-region deployment."],
    "unknown_ambiguous": ["Help me with this thing.",
        "I need something about the server."],
}
print("="*80)
print("ACO v9 BENCHMARK: EXECUTION-FEEDBACK + DYNAMIC DIFFICULTY")
print("="*80)
classifier = TaskCostClassifier()
router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
ef_router = ExecutionFeedbackRouter(tier_costs=TIER_COST, task_floors=TASK_FLOOR)
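# ef_router inspects simulated token logprobs after a run and signals whether
# the output looks shaky enough to warrant escalating one tier (Method D).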
rng = random.Random(42)
N = 3000
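# Shared success model: P(success) = TIER_STR[tier] ** (difficulty * 0.6),
# so stronger tiers succeed more often and harder tasks fail more often.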
def sim_logprobs(tier, difficulty, success, rng):
    """Simulate token logprobs based on tier and difficulty."""
    n_tokens = rng.randint(20, 200)
    base_lp = {1: -3.5, 2: -2.5, 3: -1.5, 4: -0.7, 5: -0.3}[tier]
    base_lp *= (1 + difficulty * 0.15)  # harder tasks -> more negative logprobs
    lps = []
    for _ in range(n_tokens):
        noise = rng.gauss(0, 1.0 + difficulty * 0.3)
        # Failed runs produce noisier, less confident token distributions.
        if success:
            lps.append(base_lp + noise * 0.3)
        else:
            lps.append(base_lp + noise * 0.8)
    return lps
def eval_method(name, route_fn):
    succ = 0; cost = 0.0; unsafe = 0
    per_tt = defaultdict(lambda: {"succ": 0, "cost": 0.0, "n": 0})
    for _ in range(N):
        tt = rng.choice(list(TASKS.keys()))
        req = rng.choice(TASKS[tt])
        pred = classifier.classify(req)
        tier, s, c, u = route_fn(req, tt, pred)
        if s: succ += 1
        cost += c
        if u: unsafe += 1
        per_tt[tt]["succ"] += 1 if s else 0
        per_tt[tt]["cost"] += c
        per_tt[tt]["n"] += 1
    return {"name": name, "success": succ / N, "avg_cost": cost / N,
            "unsafe": unsafe / N, "per_tt": dict(per_tt)}
# Method A: always frontier (tier 4)
def route_frontier(req, tt, pred):
    ps = TIER_STR[4] ** (pred["difficulty"] * 0.6)
    return 4, rng.random() < ps, TIER_COST[4], False
# Method B: heuristic: difficulty + 1, clamped by the task floor
def route_heuristic(req, tt, pred):
    h = min(pred["difficulty"] + 1, 5)
    h = max(h, TASK_FLOOR.get(tt, 2))
    ps = TIER_STR[h] ** (pred["difficulty"] * 0.6)
    ok = rng.random() < ps  # single draw so the unsafe flag matches the outcome
    return h, ok, TIER_COST[h], (h < 4 and not ok)
# Method C: v8 router (no feedback); its escalation events land in the unsafe column
def route_v8(req, tt, pred):
    r = router.route(req, tt, pred["difficulty"], pred)
    ps = TIER_STR[r.tier] ** (r.dynamic_difficulty * 0.6)
    return r.tier, rng.random() < ps, TIER_COST[r.tier], r.escalated
# Method D: v9 = v8 router + execution-feedback cascade
def route_v9(req, tt, pred):
    r = router.route(req, tt, pred["difficulty"], pred)
    initial_tier = r.tier
    ps_initial = TIER_STR[initial_tier] ** (r.dynamic_difficulty * 0.6)
    initial_success = rng.random() < ps_initial
    lps = sim_logprobs(initial_tier, r.dynamic_difficulty, initial_success, rng)
    signal = ef_router.analyze_output(lps, task_type=tt, current_tier=initial_tier)
    if signal.should_escalate and initial_tier < 5:
        # Retry one tier up (respecting the task floor) and pay for both attempts.
        final_tier = min(initial_tier + 1, 5)
        final_tier = max(final_tier, TASK_FLOOR.get(tt, 1))
        ps_final = TIER_STR[final_tier] ** (r.dynamic_difficulty * 0.6)
        final_success = rng.random() < ps_final
        total_cost = TIER_COST[initial_tier] + TIER_COST[final_tier]
        return final_tier, final_success, total_cost, False
    return initial_tier, initial_success, TIER_COST[initial_tier], False
# Method E: oracle: charged only for the cheapest tier whose draw succeeds
def route_oracle(req, tt, pred):
    for t in range(1, 6):
        ps = TIER_STR[t] ** (pred["difficulty"] * 0.6)
        if rng.random() < ps:
            return t, True, TIER_COST[t], False
    return 5, False, TIER_COST[5], False
# Method F: always cheap
def route_cheap(req, tt, pred):
    ps = TIER_STR[1] ** (pred["difficulty"] * 0.6)
    ok = rng.random() < ps  # single draw so the unsafe flag matches the outcome
    return 1, ok, TIER_COST[1], not ok
# Run all methods
print(f"\n[1] Running {N} simulated traces per method...")
results = {}
for name, fn in [("always_frontier", route_frontier), ("always_cheap", route_cheap),
                 ("heuristic", route_heuristic), ("v8_router", route_v8),
                 ("v9_feedback", route_v9), ("oracle", route_oracle)]:
    rng_state = rng.getstate()
    rng.seed(42)  # every method replays the same seeded trace stream
    results[name] = eval_method(name, fn)
    rng.setstate(rng_state)
# Print comparison
print(f"\n\n{'Method':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'Unsafe':>10}")
print("-"*60)
fc = results["always_frontier"]["avg_cost"]
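# CostRed = cost reduction relative to the always-frontier baseline.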
for name in ["oracle","always_frontier","v9_feedback","v8_router","heuristic","always_cheap"]:
r = results[name]
cr = (1-r["avg_cost"]/fc)*100
print(f"{name:<20} {r['success']:>10.3f} {r['avg_cost']:>10.4f} {cr:>9.1f}% {r['unsafe']:>10.3f}")
# Per-task comparison
print(f"\n\n[2] Per-task success rate comparison:")
print(f"{'Task':<20} {'Frontier':>10} {'v8':>10} {'v9_feedback':>12} {'CostRed_v9':>12}")
print("-"*65)
for tt in sorted(set(k for r in results.values() for k in r["per_tt"])):
    f_r = results["always_frontier"]["per_tt"].get(tt, {"succ": 0, "n": 1, "cost": 0})
    v8_r = results["v8_router"]["per_tt"].get(tt, {"succ": 0, "n": 1, "cost": 0})
    v9_r = results["v9_feedback"]["per_tt"].get(tt, {"succ": 0, "n": 1, "cost": 0})
    f_sr = f_r["succ"] / max(f_r["n"], 1)
    v8_sr = v8_r["succ"] / max(v8_r["n"], 1)
    v9_sr = v9_r["succ"] / max(v9_r["n"], 1)
    f_c = f_r["cost"] / max(f_r["n"], 1)
    v9_c = v9_r["cost"] / max(v9_r["n"], 1)
    cr = (1 - v9_c / f_c) * 100 if f_c > 0 else 0
    print(f"{tt:<20} {f_sr:>10.3f} {v8_sr:>10.3f} {v9_sr:>12.3f} {cr:>11.1f}%")
# Cost-quality frontier
print(f"\n\n[3] Cost-Quality Frontier:")
for name in ["always_cheap","v8_router","v9_feedback","heuristic","always_frontier","oracle"]:
r = results[name]
cr = (1-r["avg_cost"]/fc)*100
print(f" {name:<20} success={r['success']:.3f} cost={r['avg_cost']:.4f} costRed={cr:.1f}%")
# Key metrics
v9 = results["v9_feedback"]
v8 = results["v8_router"]
fr = results["always_frontier"]
v9_cr = (1-v9["avg_cost"]/fr["avg_cost"])*100
v8_cr = (1-v8["avg_cost"]/fr["avg_cost"])*100
quality_gap_v9 = fr["success"] - v9["success"]
quality_gap_v8 = fr["success"] - v8["success"]
print(f"\n\n[4] KEY RESULTS:")
print(f" v8 success: {v8['success']:.3f} (gap vs frontier: {quality_gap_v8:.3f})")
print(f" v9 success: {v9['success']:.3f} (gap vs frontier: {quality_gap_v9:.3f})")
print(f" v8 costRed: {v8_cr:.1f}%")
print(f" v9 costRed: {v9_cr:.1f}%")
print(f" Quality gain v9 vs v8: {v9['success']-v8['success']:+.3f}")
print(f" Cost increase v9 vs v8: {v9['avg_cost']-v8['avg_cost']:+.4f}")
# Save
with open("/app/benchmark_v9_results.json","w") as f:
save_data = {}
for name, r in results.items():
save_data[name] = {"success":r["success"],"avg_cost":r["avg_cost"],"unsafe":r["unsafe"]}
json.dump(save_data, f, indent=2)
print(f"\nSaved to /app/benchmark_v9_results.json")
print("DONE!")