#!/usr/bin/env python3
"""FINAL v9 benchmark: v8 router + selective execution feedback.

Strategy: Use v8 router for initial tier selection. Then for tasks where
v8 chose tier < 4 (not frontier), run cheap model first and use output
confidence to decide whether to escalate. This catches the ~2% of cases
where cheap model fails but could be saved by escalation, without running
double-inference everywhere.
"""
import sys,random,math,pickle,json

# Make the project root importable before pulling in the aco package.
sys.path.insert(0,"/app")
from collections import defaultdict
from aco.classifier import TaskCostClassifier
from aco.router import ModelCascadeRouter
from aco.execution_feedback import ExecutionFeedbackRouter

# Per-tier base success probability (before difficulty attenuation) and
# per-call cost in arbitrary units. Tier 1 = cheapest, tier 5 = priciest.
TIER_STR={1:0.35,2:0.55,3:0.80,4:0.93,5:0.97}
TIER_COST={1:0.05,2:0.15,3:0.75,4:1.0,5:1.5}
# Minimum tier permitted per task category (e.g. regulated legal work is
# never routed below tier 4).
TASK_FLOOR={"legal_regulated":4,"long_horizon":3,"research":3,"coding":3,
            "unknown_ambiguous":3,"quick_answer":1,"document_drafting":2,
            "tool_heavy":2,"retrieval_heavy":2}
# Sample prompts per task category; the simulated workload draws uniformly
# from these.
TASKS = {
    "quick_answer":["What is 2+2?","Explain quantum computing briefly.","Convert 100F to Celsius."],
    "coding":["Write Python function to reverse linked list.","Fix typo in README.",
              "Debug critical production segfault NOW.","Just fix typo in line 42."],
    "research":["Research latest transformer advances."],
    "document_drafting":["Draft project proposal for ML pipeline."],
    "legal_regulated":["Review this contract for liability clauses."],
    "tool_heavy":["Search open issues and create summary."],
    "retrieval_heavy":["Answer based on 50-page document."],
    "long_horizon":["Plan 3-month roadmap."],
    "unknown_ambiguous":["Help me with this thing."],
}

# Project-local components; their APIs are not visible in this file.
# NOTE(review): classify() is assumed to return a dict with at least a
# numeric "difficulty" field (see usage below) — confirm in aco.classifier.
classifier=TaskCostClassifier()
router=ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")

def sim_lps_correlated(tier, diff, success, rng):
    """Better simulation: successful outputs have higher confidence.

    Returns a list of simulated per-token logprobs (20-150 values) drawn
    from a Gaussian whose mean depends on ``tier`` and on whether the
    simulated run succeeded: successful runs get means near 0 (confident),
    failures get lower means and wider noise that grows with ``diff``.

    tier: model tier (1-5), indexes the per-tier base mean below.
    diff: task difficulty; scales the noise std.
    success: whether the simulated output was correct.
    rng: random.Random instance (caller-seeded for reproducibility).
    """
    n=rng.randint(20,150)
    if success:
        # High confidence: mean logprob close to 0
        base={1:-1.5,2:-1.0,3:-0.6,4:-0.3,5:-0.2}[tier]
        noise_std=0.3+diff*0.05
    else:
        # Low confidence: more uncertain, lower logprobs
        base={1:-4.0,2:-3.0,3:-2.0,4:-1.5,5:-1.0}[tier]
        noise_std=0.8+diff*0.2
    return [base+rng.gauss(0,noise_std) for _ in range(n)]

N=3000            # simulated requests per method
results = {}      # method name -> {"success": ..., "cost": ...} (per-request averages)
rng = random.Random(42)

# Method 1: always frontier
# Re-seeded so each method replays the identical task stream.
rng.seed(42); succ=0; cost=0.0
for i in range(N):
    tt=rng.choice(list(TASKS.keys())); req=rng.choice(TASKS[tt])
    pred=classifier.classify(req)
    # Frontier-tier success probability, attenuated by task difficulty.
    ps=TIER_STR[4]**(pred["difficulty"]*0.6)
    # NOTE(review): SOURCE IS CORRUPTED FROM HERE. The text jumps
    # mid-expression from this `if rng.random()` straight into the tail of
    # the results-table header print. The excised chunk presumably held:
    # the rest of Method 1 (success/cost accounting, results["frontier"]),
    # Method 2 (the v8 router baseline, results["v8"]), the v9 variants
    # (results["v9*"]), the definition of `fc` (frontier average cost, used
    # below), and the opening of the header print
    # (print(f"\n{'Method':<25} {'Success':>...). The fused fragment is
    # kept verbatim below; it is NOT valid Python as-is — recover the
    # missing lines from version control before running this script.
    if rng.random()10} {'AvgCost':>10} {'CostRed':>10} {'Gap':>8}")

print("-"*65)
# NOTE(review): 0.901 appears to be a hard-coded reference success rate for
# "always frontier", used to report the quality gap — confirm against the
# missing section above.
print(f"{'always_frontier':<25} {results['frontier']['success']:>10.3f} {results['frontier']['cost']:>10.4f} {'0.0%':>10} {0.901-results['frontier']['success']:>8.3f}")
print(f"{'v8_router':<25} {results['v8']['success']:>10.3f} {results['v8']['cost']:>10.4f} {(1-results['v8']['cost']/fc)*100:>9.1f}% {0.901-results['v8']['success']:>8.3f}")

# Scan the v9 variants (highest success first) and select the "best" one by
# a weighted score favouring success (x20) over cost (x5).
best_v9=None
for name,r in sorted(results.items(),key=lambda x:-x[1]["success"]):
    if name.startswith("v9"):
        cr=(1-r["cost"]/fc)*100
        gap=0.901-r["success"]
        print(f"{name:<25} {r['success']:>10.3f} {r['cost']:>10.4f} {cr:>9.1f}% {gap:>8.3f}")
        # Pick the one closest to frontier quality with most savings
        score=r["success"]*20-r["cost"]*5
        if best_v9 is None or score>best_v9[0]:
            best_v9=(score,name,r)

print(f"\n\nBest v9: {best_v9[1]}")
print(f" success={best_v9[2]['success']:.3f}, cost={best_v9[2]['cost']:.4f}")
print(f" costRed={(1-best_v9[2]['cost']/fc)*100:.1f}%")
print(f" quality_gap={0.901-best_v9[2]['success']:.3f}")
print(f" v8 gap={0.901-results['v8']['success']:.3f}")
print(f" Improvement: {best_v9[2]['success']-results['v8']['success']:+.3f} success, {best_v9[2]['cost']-results['v8']['cost']:+.4f} cost")

# Persist all method results; default=str stringifies any values that are
# not natively JSON-serializable.
with open("/app/benchmark_v9_final.json","w") as f:
    json.dump(results,f,indent=2,default=str)
print("DONE!")