| |
| """FINAL v9 benchmark: v8 router + selective execution feedback. |
| |
| Strategy: Use v8 router for initial tier selection. Then for tasks |
| where v8 chose tier < 4 (not frontier), run cheap model first and |
| use output confidence to decide whether to escalate. This catches |
| the ~2% of cases where cheap model fails but could be saved by |
| escalation, without running double-inference everywhere. |
| """ |
| import sys,random,math,pickle,json |
| sys.path.insert(0,"/app") |
| from collections import defaultdict |
| from aco.classifier import TaskCostClassifier |
| from aco.router import ModelCascadeRouter |
| from aco.execution_feedback import ExecutionFeedbackRouter |
|
|
# Baseline success strength per model tier (1 = cheapest, 5 = strongest).
# Effective success probability is TIER_STR[t] ** (difficulty * 0.6).
TIER_STR = {1: 0.35, 2: 0.55, 3: 0.80, 4: 0.93, 5: 0.97}

# Relative inference cost per tier (frontier tier 4 normalized to 1.0).
TIER_COST = {1: 0.05, 2: 0.15, 3: 0.75, 4: 1.0, 5: 1.5}

# Minimum tier allowed per task type; escalation never lands below this floor.
TASK_FLOOR = {
    "legal_regulated": 4,
    "long_horizon": 3,
    "research": 3,
    "coding": 3,
    "unknown_ambiguous": 3,
    "quick_answer": 1,
    "document_drafting": 2,
    "tool_heavy": 2,
    "retrieval_heavy": 2,
}
|
|
# Representative prompts for each task type; each benchmark arm samples a
# task type uniformly, then a prompt uniformly from that type's list.
TASKS = {
    "quick_answer": [
        "What is 2+2?",
        "Explain quantum computing briefly.",
        "Convert 100F to Celsius.",
    ],
    "coding": [
        "Write Python function to reverse linked list.",
        "Fix typo in README.",
        "Debug critical production segfault NOW.",
        "Just fix typo in line 42.",
    ],
    "research": ["Research latest transformer advances."],
    "document_drafting": ["Draft project proposal for ML pipeline."],
    "legal_regulated": ["Review this contract for liability clauses."],
    "tool_heavy": ["Search open issues and create summary."],
    "retrieval_heavy": ["Answer based on 50-page document."],
    "long_horizon": ["Plan 3-month roadmap."],
    "unknown_ambiguous": ["Help me with this thing."],
}
|
|
# Difficulty/task-type classifier shared by all benchmark arms below.
classifier=TaskCostClassifier()
# v8 cascade router loaded from its pickled model bundle; used both standalone
# (arm 2) and as the initial-tier picker for the v9 feedback sweep.
router=ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
|
|
def sim_lps_correlated(tier, diff, success, rng):
    """Simulate per-token logprobs whose confidence correlates with success.

    Successful outputs draw from a higher (less negative) mean with tighter
    noise; failed outputs draw from a lower mean with wider noise. This is
    what lets the feedback router distinguish the two cases.

    Args:
        tier: model tier in 1..5 (higher tiers produce more confident tokens).
        diff: task difficulty; widens the noise band.
        success: whether the simulated run succeeded.
        rng: random.Random instance (one randint draw, then n gauss draws).

    Returns:
        List of 20-150 simulated token logprobs.
    """
    token_count = rng.randint(20, 150)
    mean_by_tier = (
        {1: -1.5, 2: -1.0, 3: -0.6, 4: -0.3, 5: -0.2}
        if success
        else {1: -4.0, 2: -3.0, 3: -2.0, 4: -1.5, 5: -1.0}
    )
    spread = (0.3 + diff * 0.05) if success else (0.8 + diff * 0.2)
    mean = mean_by_tier[tier]
    return [mean + rng.gauss(0, spread) for _ in range(token_count)]
|
|
# Number of simulated requests per benchmark arm.
N=3000
# Per-arm metrics keyed by arm name ("frontier", "v8", "v9_e{..}_lc{..}").
results = {}
# Single RNG, re-seeded to 42 before each arm so every arm replays the same
# task stream and results are directly comparable.
rng = random.Random(42)
|
|
| |
# --- Arm 1: always run the frontier model (tier 4), no routing at all. ---
# RNG draw order per task (choice, choice, random) matches the other arms.
rng.seed(42)
succ = 0
cost = 0.0
for _ in range(N):
    task_type = rng.choice(list(TASKS.keys()))
    request = rng.choice(TASKS[task_type])
    pred = classifier.classify(request)
    p_ok = TIER_STR[4] ** (pred["difficulty"] * 0.6)
    if rng.random() < p_ok:
        succ += 1
    cost += TIER_COST[4]
results["frontier"] = {"success": succ / N, "cost": cost / N}
|
|
| |
# --- Arm 2: v8 router alone (no execution feedback). ---
rng.seed(42)
succ = 0
cost = 0.0
for _ in range(N):
    task_type = rng.choice(list(TASKS.keys()))
    request = rng.choice(TASKS[task_type])
    pred = classifier.classify(request)
    decision = router.route(request, task_type, pred["difficulty"], pred)
    p_ok = TIER_STR[decision.tier] ** (decision.dynamic_difficulty * 0.6)
    if rng.random() < p_ok:
        succ += 1
    cost += TIER_COST[decision.tier]
results["v8"] = {"success": succ / N, "cost": cost / N}
|
|
| |
| |
# --- Arm 3: v9 sweep = v8 router + selective execution feedback. ---
# Grid-search the two feedback thresholds:
#   ent_thr - entropy threshold above which output looks uncertain
#   lc_thr  - low-confidence token-ratio threshold
# Each configuration replays the same seeded task stream (seed 42) so all
# arms are directly comparable. NOTE: the RNG draw order inside the loop is
# load-bearing for reproducibility — do not reorder statements.
for ent_thr in [2.0,2.5,3.0]:
    for lc_thr in [0.05,0.10,0.15]:
        ef=ExecutionFeedbackRouter(entropy_threshold=ent_thr,
            low_conf_ratio_threshold=lc_thr, tier_costs=TIER_COST)
        rng.seed(42); succ=0; cost=0.0; escalated_count=0
        for i in range(N):
            tt=rng.choice(list(TASKS.keys())); req=rng.choice(TASKS[tt])
            pred=classifier.classify(req)
            r=router.route(req,tt,pred["difficulty"],pred)
            tier=r.tier; diff=r.dynamic_difficulty
            # Feedback pass only for non-frontier picks: at tier >= 4 there
            # is little headroom to escalate into, so run once and move on.
            if tier < 4:
                ps=TIER_STR[tier]**(diff*0.6)
                initial_success=rng.random()<ps
                # Simulated logprobs correlate with the hidden success bit;
                # that correlation is what the feedback router detects.
                lps=sim_lps_correlated(tier,diff,initial_success,rng)
                signal=ef.analyze_output(lps,task_type=tt,current_tier=tier)
                if signal.should_escalate:
                    # Escalate one tier (capped at 5, floored by task type)
                    # and pay for BOTH the initial and the escalated run.
                    final_tier=min(tier+1,5)
                    final_tier=max(final_tier,TASK_FLOOR.get(tt,1))
                    ps2=TIER_STR[final_tier]**(diff*0.6)
                    c=TIER_COST[tier]+TIER_COST[final_tier]
                    if rng.random()<ps2: succ+=1
                    escalated_count+=1
                else:
                    # Kept the cheap run: single cost, original outcome.
                    c=TIER_COST[tier]
                    if initial_success: succ+=1
            else:
                ps=TIER_STR[tier]**(diff*0.6)
                if rng.random()<ps: succ+=1
                c=TIER_COST[tier]
            cost+=c
        name=f"v9_e{ent_thr}_lc{lc_thr}"
        results[name]={"success":succ/N,"cost":cost/N,
            "escalated":escalated_count,"ent_thr":ent_thr,"lc_thr":lc_thr}
|
|
| print("="*80) |
| print("FINAL v9 RESULTS: v8 ROUTER + SELECTIVE FEEDBACK") |
| print("="*80) |
| fc=results["frontier"]["cost"] |
| print(f"\n{'Method':<25} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'Gap':>8}") |
| print("-"*65) |
| print(f"{'always_frontier':<25} {results['frontier']['success']:>10.3f} {results['frontier']['cost']:>10.4f} {'0.0%':>10} {0.901-results['frontier']['success']:>8.3f}") |
| print(f"{'v8_router':<25} {results['v8']['success']:>10.3f} {results['v8']['cost']:>10.4f} {(1-results['v8']['cost']/fc)*100:>9.1f}% {0.901-results['v8']['success']:>8.3f}") |
|
|
# Print the v9 configurations ranked by success rate, and track the winner
# under a combined score (success weighted 20 vs. cost weighted 5).
# Filtering to v9 rows before sorting is order-equivalent to the reverse:
# the sort is stable, so ties keep insertion order either way.
best_v9 = None
v9_rows = [(nm, res) for nm, res in results.items() if nm.startswith("v9")]
v9_rows.sort(key=lambda row: row[1]["success"], reverse=True)
for nm, res in v9_rows:
    cost_red = (1 - res["cost"] / fc) * 100
    quality_gap = 0.901 - res["success"]
    print(f"{nm:<25} {res['success']:>10.3f} {res['cost']:>10.4f} {cost_red:>9.1f}% {quality_gap:>8.3f}")
    combined = res["success"] * 20 - res["cost"] * 5
    if best_v9 is None or combined > best_v9[0]:
        best_v9 = (combined, nm, res)
|
|
| print(f"\n\nBest v9: {best_v9[1]}") |
| print(f" success={best_v9[2]['success']:.3f}, cost={best_v9[2]['cost']:.4f}") |
| print(f" costRed={(1-best_v9[2]['cost']/fc)*100:.1f}%") |
| print(f" quality_gap={0.901-best_v9[2]['success']:.3f}") |
| print(f" v8 gap={0.901-results['v8']['success']:.3f}") |
| print(f" Improvement: {best_v9[2]['success']-results['v8']['success']:+.3f} success, {best_v9[2]['cost']-results['v8']['cost']:+.4f} cost") |
|
|
| with open("/app/benchmark_v9_final.json","w") as f: |
| json.dump(results,f,indent=2,default=str) |
| print("DONE!") |
|
|