#!/usr/bin/env python3
"""FINAL v9 benchmark: v8 router + selective execution feedback.
Strategy: Use v8 router for initial tier selection. Then for tasks
where v8 chose tier < 4 (not frontier), run cheap model first and
use output confidence to decide whether to escalate. This catches
the ~2% of cases where cheap model fails but could be saved by
escalation, without running double-inference everywhere.
"""
import sys,random,math,pickle,json
sys.path.insert(0,"/app")
from collections import defaultdict
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.execution_feedback import ExecutionFeedbackRouter
# Per-tier base success "strength": raised to (difficulty * 0.6) below to get
# the simulated probability that a model of that tier solves a task.
TIER_STR={1:0.35,2:0.55,3:0.80,4:0.93,5:0.97}
# Relative per-call inference cost of each tier (tier 4 = frontier = 1.0).
TIER_COST={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}
# Minimum tier enforced per task type when escalating after weak feedback.
TASK_FLOOR={"legal_regulated":4,"long_horizon":3,"research":3,"coding":3,
"unknown_ambiguous":3,"quick_answer":1,"document_drafting":2,
"tool_heavy":2,"retrieval_heavy":2}
# Small pool of representative request strings per task type; the benchmark
# loops sample uniformly from these with a fixed seed.
TASKS = {
"quick_answer":["What is 2+2?","Explain quantum computing briefly.","Convert 100F to Celsius."],
"coding":["Write Python function to reverse linked list.","Fix typo in README.",
"Debug critical production segfault NOW.","Just fix typo in line 42."],
"research":["Research latest transformer advances."],
"document_drafting":["Draft project proposal for ML pipeline."],
"legal_regulated":["Review this contract for liability clauses."],
"tool_heavy":["Search open issues and create summary."],
"retrieval_heavy":["Answer based on 50-page document."],
"long_horizon":["Plan 3-month roadmap."],
"unknown_ambiguous":["Help me with this thing."],
}
# Project components: classifier predicts task type/difficulty; router picks a tier.
classifier=TaskCostClassifier()
# NOTE(review): assumes the pickled v8 bundle exists at this path — verify in the image.
router=ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
def sim_lps_correlated(tier, diff, success, rng):
    """Simulate per-token logprobs whose confidence correlates with outcome.

    Successful outputs draw from a distribution centered near 0 (confident);
    failed outputs draw from a lower, noisier one. RNG consumption order is
    fixed: one randint for the length, then one gauss per token.

    Args:
        tier: model tier, 1-5 (keys of the mean tables below).
        diff: task difficulty; widens the noise spread.
        success: whether the simulated generation succeeded.
        rng: a random.Random instance (caller controls seeding).

    Returns:
        List of 20-150 simulated token logprobs (floats).
    """
    token_count = rng.randint(20, 150)
    if success:
        # Confident output: mean logprob close to 0, tight spread.
        mean_lp = {1: -1.5, 2: -1.0, 3: -0.6, 4: -0.3, 5: -0.2}[tier]
        spread = 0.3 + diff * 0.05
    else:
        # Uncertain output: lower mean logprob, wider spread.
        mean_lp = {1: -4.0, 2: -3.0, 3: -2.0, 4: -1.5, 5: -1.0}[tier]
        spread = 0.8 + diff * 0.2
    return [mean_lp + rng.gauss(0, spread) for _ in range(token_count)]
N=3000  # tasks per simulated benchmark run
results = {}
rng = random.Random(42)
# Method 1: always frontier
# Baseline: send every request to tier 4. Reseeding with 42 before each method
# gives all methods the identical task sequence, so results are comparable.
rng.seed(42); succ=0; cost=0.0
for i in range(N):
    tt=rng.choice(list(TASKS.keys())); req=rng.choice(TASKS[tt])
    pred=classifier.classify(req)
    # Simulated success probability at the frontier tier for this difficulty.
    ps=TIER_STR[4]**(pred["difficulty"]*0.6)
    if rng.random()<ps: succ+=1
    cost+=TIER_COST[4]
results["frontier"]={"success":succ/N,"cost":cost/N}
# Method 2: v8 router
# Router alone: classifier -> v8 router -> single call at the chosen tier,
# with no execution feedback or escalation.
rng.seed(42); succ=0; cost=0.0
for i in range(N):
    tt=rng.choice(list(TASKS.keys())); req=rng.choice(TASKS[tt])
    pred=classifier.classify(req)
    r=router.route(req,tt,pred["difficulty"],pred)
    # Success probability at the routed tier, using the router's refined difficulty.
    ps=TIER_STR[r.tier]**(r.dynamic_difficulty*0.6)
    if rng.random()<ps: succ+=1
    cost+=TIER_COST[r.tier]
results["v8"]={"success":succ/N,"cost":cost/N}
# Method 3: v9 = v8 + feedback on non-frontier tiers
# Only use feedback when v8 selected tier < 4
# Grid-search the feedback detector over entropy / low-confidence thresholds.
for ent_thr in [2.0,2.5,3.0]:
    for lc_thr in [0.05,0.10,0.15]:
        ef=ExecutionFeedbackRouter(entropy_threshold=ent_thr,
            low_conf_ratio_threshold=lc_thr, tier_costs=TIER_COST)
        rng.seed(42); succ=0; cost=0.0; escalated_count=0
        for i in range(N):
            tt=rng.choice(list(TASKS.keys())); req=rng.choice(TASKS[tt])
            pred=classifier.classify(req)
            r=router.route(req,tt,pred["difficulty"],pred)
            tier=r.tier; diff=r.dynamic_difficulty
            # Only use feedback for non-frontier tiers
            if tier < 4:
                ps=TIER_STR[tier]**(diff*0.6)
                initial_success=rng.random()<ps
                # Simulated logprobs correlate with the (hidden) outcome, so the
                # feedback router can detect likely failures from confidence.
                lps=sim_lps_correlated(tier,diff,initial_success,rng)
                signal=ef.analyze_output(lps,task_type=tt,current_tier=tier)
                if signal.should_escalate:
                    # Escalate one tier, then clamp up to the task-type floor.
                    final_tier=min(tier+1,5)
                    final_tier=max(final_tier,TASK_FLOOR.get(tt,1))
                    ps2=TIER_STR[final_tier]**(diff*0.6)
                    # Pay for both the cheap attempt and the escalated rerun.
                    c=TIER_COST[tier]+TIER_COST[final_tier]
                    # NOTE(review): escalation discards the first draft even when
                    # initial_success was True — confirm this is the intended model.
                    if rng.random()<ps2: succ+=1
                    escalated_count+=1
                else:
                    c=TIER_COST[tier]
                    if initial_success: succ+=1
            else:
                # Frontier-tier picks skip feedback: single call, single cost.
                ps=TIER_STR[tier]**(diff*0.6)
                if rng.random()<ps: succ+=1
                c=TIER_COST[tier]
            cost+=c
        name=f"v9_e{ent_thr}_lc{lc_thr}"
        results[name]={"success":succ/N,"cost":cost/N,
            "escalated":escalated_count,"ent_thr":ent_thr,"lc_thr":lc_thr}
print("="*80)
print("FINAL v9 RESULTS: v8 ROUTER + SELECTIVE FEEDBACK")
print("="*80)
fc=results["frontier"]["cost"]  # frontier avg cost: baseline for cost reduction
# NOTE(review): 0.901 looks like a precomputed frontier-quality reference used as
# the "Gap" baseline — confirm it matches results["frontier"]["success"] at seed 42.
print(f"\n{'Method':<25} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'Gap':>8}")
print("-"*65)
print(f"{'always_frontier':<25} {results['frontier']['success']:>10.3f} {results['frontier']['cost']:>10.4f} {'0.0%':>10} {0.901-results['frontier']['success']:>8.3f}")
print(f"{'v8_router':<25} {results['v8']['success']:>10.3f} {results['v8']['cost']:>10.4f} {(1-results['v8']['cost']/fc)*100:>9.1f}% {0.901-results['v8']['success']:>8.3f}")
best_v9=None
# Print v9 variants best-success-first and track the best by a combined score.
for name,r in sorted(results.items(),key=lambda x:-x[1]["success"]):
    if name.startswith("v9"):
        cr=(1-r["cost"]/fc)*100
        gap=0.901-r["success"]
        print(f"{name:<25} {r['success']:>10.3f} {r['cost']:>10.4f} {cr:>9.1f}% {gap:>8.3f}")
        # Pick the one closest to frontier quality with most savings
        # (heuristic: success weighted 4x more heavily than cost).
        score=r["success"]*20-r["cost"]*5
        if best_v9 is None or score>best_v9[0]:
            best_v9=(score,name,r)
print(f"\n\nBest v9: {best_v9[1]}")
print(f"  success={best_v9[2]['success']:.3f}, cost={best_v9[2]['cost']:.4f}")
print(f"  costRed={(1-best_v9[2]['cost']/fc)*100:.1f}%")
print(f"  quality_gap={0.901-best_v9[2]['success']:.3f}")
print(f"  v8 gap={0.901-results['v8']['success']:.3f}")
print(f"  Improvement: {best_v9[2]['success']-results['v8']['success']:+.3f} success, {best_v9[2]['cost']-results['v8']['cost']:+.4f} cost")
# default=str stringifies any non-JSON-serializable values (e.g. float keys in names).
with open("/app/benchmark_v9_final.json","w") as f:
    json.dump(results,f,indent=2,default=str)
print("DONE!")