#!/usr/bin/env python3
"""Comprehensive benchmark with execution-feedback routing."""
import json
import math
import pickle
import random
import sys
import time
from collections import defaultdict

sys.path.insert(0, "/app")

from aco.classifier import TaskCostClassifier
from aco.execution_feedback import ExecutionFeedbackRouter, FeedbackSignal
from aco.router import ModelCascadeRouter

# Simulated per-tier success strength and relative cost.
TIER_STR = {1: 0.35, 2: 0.55, 3: 0.80, 4: 0.93, 5: 0.97}
TIER_COST = {1: 0.05, 2: 0.15, 3: 0.75, 4: 1.0, 5: 1.5}

# Minimum safe tier per task type.
TASK_FLOOR = {
    "legal_regulated": 4,
    "long_horizon": 3,
    "research": 3,
    "coding": 3,
    "unknown_ambiguous": 3,
    "quick_answer": 1,
    "document_drafting": 2,
    "tool_heavy": 2,
    "retrieval_heavy": 2,
}

TASKS = {
    "quick_answer": [
        "What is 2+2?",
        "Explain quantum computing briefly.",
        "What is the capital of France?",
        "Convert 100F to Celsius.",
        "Small clarification on this formula.",
    ],
    "coding": [
        "Write a Python function to reverse a linked list.",
        "Fix a typo in the README.",
        "Debug this critical production segfault NOW.",
        "Just fix the typo in line 42.",
        "Refactor auth module to JWT.",
        "Implement LRU cache in Go.",
    ],
    "research": [
        "Research latest transformer advances.",
        "Find sources comparing LoRA and full FT briefly.",
        "Investigate data center climate impact.",
    ],
    "document_drafting": [
        "Draft project proposal for ML pipeline.",
        "Write email to team about deployment.",
    ],
    "legal_regulated": [
        "Review this contract for liability clauses.",
        "Check GDPR compliance for data pipeline urgently.",
    ],
    "tool_heavy": [
        "Search open issues and create summary.",
        "Fetch API docs and generate client code.",
    ],
    "retrieval_heavy": [
        "Answer based on 50-page document.",
        "Find all payment processing mentions.",
    ],
    "long_horizon": [
        "Plan 3-month roadmap.",
        "Orchestrate complete multi-region deployment.",
    ],
    "unknown_ambiguous": [
        "Help me with this thing.",
        "I need something about the server.",
    ],
}

print("=" * 80)
print("ACO v9 BENCHMARK: EXECUTION-FEEDBACK + DYNAMIC DIFFICULTY")
print("=" * 80)

classifier = TaskCostClassifier()
router = ModelCascadeRouter(model_path="/app/router_models/router_bundle_v8.pkl")
ef_router = ExecutionFeedbackRouter(tier_costs=TIER_COST, task_floors=TASK_FLOOR)
rng = random.Random(42)
N = 3000


def sim_logprobs(tier, difficulty, success, rng):
    """Simulate token logprobs based on tier and difficulty."""
    n_tokens = rng.randint(20, 200)
    base_lp = {1: -3.5, 2: -2.5, 3: -1.5, 4: -0.7, 5: -0.3}[tier]
    base_lp *= (1 + difficulty * 0.15)  # harder tasks push logprobs lower
    lps = []
    for _ in range(n_tokens):
        noise = rng.gauss(0, 1.0 + difficulty * 0.3)
        # Failed generations are noisier than successful ones.
        if success:
            lps.append(base_lp + noise * 0.3)
        else:
            lps.append(base_lp + noise * 0.8)
    return lps


def eval_method(name, route_fn):
    """Run N simulated requests through route_fn and aggregate metrics."""
    succ = 0
    cost = 0.0
    unsafe = 0
    per_tt = defaultdict(lambda: {"succ": 0, "cost": 0.0, "n": 0})
    for _ in range(N):
        tt = rng.choice(list(TASKS.keys()))
        req = rng.choice(TASKS[tt])
        pred = classifier.classify(req)
        tier, s, c, u = route_fn(req, tt, pred)
        if s:
            succ += 1
        cost += c
        if u:
            unsafe += 1
        per_tt[tt]["succ"] += 1 if s else 0
        per_tt[tt]["cost"] += c
        per_tt[tt]["n"] += 1
    return {
        "name": name,
        "success": succ / N,
        "avg_cost": cost / N,
        "unsafe": unsafe / N,
        "per_tt": dict(per_tt),
    }


# Method A: always frontier (tier 4)
def route_frontier(req, tt, pred):
    # Success probability decays with predicted difficulty.
    ps = TIER_STR[4] ** (pred["difficulty"] * 0.6)
    return 4, rng.random() < ps, TIER_COST[4], False
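
# ---------------------------------------------------------------------------
# The remaining routing methods for the result keys used below ("always_cheap",
# "heuristic", "v8_router", "v9_feedback", "oracle") are missing from this
# file. What follows is a minimal reconstruction sketch, NOT the original
# implementations: each function only follows route_frontier's
# (tier, success, cost, unsafe) contract. The real code presumably consulted
# `router` (ModelCascadeRouter) and `ef_router` (ExecutionFeedbackRouter /
# FeedbackSignal), but their call signatures are not visible here, so tier
# selection is simulated directly from the classifier prediction instead.
# The difficulty-to-tier mapping (assuming difficulty in [0, 1]) and the
# -2.0 escalation threshold are placeholders, not the original tuning.
# ---------------------------------------------------------------------------

def _p_success(tier, pred):
    # Shared success model, mirroring route_frontier: tier strength decays
    # with predicted difficulty.
    return TIER_STR[tier] ** (pred["difficulty"] * 0.6)


def route_cheap(req, tt, pred):
    # Always the cheapest tier; flagged unsafe whenever it sits below the
    # task-type safety floor.
    return 1, rng.random() < _p_success(1, pred), TIER_COST[1], 1 < TASK_FLOOR.get(tt, 1)


def route_heuristic(req, tt, pred):
    # Static heuristic: route straight to the task-type floor tier.
    tier = TASK_FLOOR.get(tt, 2)
    return tier, rng.random() < _p_success(tier, pred), TIER_COST[tier], False


def route_v8(req, tt, pred):
    # v8 learned router, approximated by mapping predicted difficulty onto a
    # tier and clamping to the safety floor.
    tier = max(TASK_FLOOR.get(tt, 1), min(5, round(1 + pred["difficulty"] * 4)))
    return tier, rng.random() < _p_success(tier, pred), TIER_COST[tier], False


def route_v9(req, tt, pred):
    # v9 execution-feedback routing, approximated as: start at the v8 tier,
    # inspect simulated logprobs, and escalate one tier (paying both costs)
    # when the mean logprob looks weak.
    tier = max(TASK_FLOOR.get(tt, 1), min(5, round(1 + pred["difficulty"] * 4)))
    ok = rng.random() < _p_success(tier, pred)
    cost = TIER_COST[tier]
    lps = sim_logprobs(tier, pred["difficulty"], ok, rng)
    if tier < 5 and sum(lps) / len(lps) < -2.0:
        tier += 1
        ok = rng.random() < _p_success(tier, pred)
        cost += TIER_COST[tier]
    return tier, ok, cost, False


def route_oracle(req, tt, pred):
    # Oracle upper bound: cheapest tier at or above the floor that succeeds,
    # falling back to the frontier tier if none do.
    for tier in range(TASK_FLOOR.get(tt, 1), 5):
        if rng.random() < _p_success(tier, pred):
            return tier, True, TIER_COST[tier], False
    return 4, rng.random() < _p_success(4, pred), TIER_COST[4], False


results = {
    name: eval_method(name, fn)
    for name, fn in [
        ("oracle", route_oracle),
        ("always_frontier", route_frontier),
        ("v9_feedback", route_v9),
        ("v8_router", route_v8),
        ("heuristic", route_heuristic),
        ("always_cheap", route_cheap),
    ]
}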
# Overall comparison (header reconstructed to match the row format below)
print(f"\n\n[1] Overall comparison:")
print(f"{'Method':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10} {'Unsafe':>10}")
print("-" * 60)
fc = results["always_frontier"]["avg_cost"]
for name in ["oracle", "always_frontier", "v9_feedback", "v8_router",
             "heuristic", "always_cheap"]:
    r = results[name]
    cr = (1 - r["avg_cost"] / fc) * 100
    print(f"{name:<20} {r['success']:>10.3f} {r['avg_cost']:>10.4f} "
          f"{cr:>9.1f}% {r['unsafe']:>10.3f}")

# Per-task comparison
print(f"\n\n[2] Per-task success rate comparison:")
print(f"{'Task':<20} {'Frontier':>10} {'v8':>10} {'v9_feedback':>12} {'CostRed_v9':>12}")
print("-" * 65)
for tt in sorted(set(k for r in results.values() for k in r["per_tt"])):
    f_r = results["always_frontier"]["per_tt"].get(tt, {"succ": 0, "n": 1, "cost": 0})
    v8_r = results["v8_router"]["per_tt"].get(tt, {"succ": 0, "n": 1, "cost": 0})
    v9_r = results["v9_feedback"]["per_tt"].get(tt, {"succ": 0, "n": 1, "cost": 0})
    f_sr = f_r["succ"] / max(f_r["n"], 1)
    v8_sr = v8_r["succ"] / max(v8_r["n"], 1)
    v9_sr = v9_r["succ"] / max(v9_r["n"], 1)
    f_c = f_r["cost"] / max(f_r["n"], 1)
    v9_c = v9_r["cost"] / max(v9_r["n"], 1)
    cr = (1 - v9_c / f_c) * 100 if f_c > 0 else 0
    print(f"{tt:<20} {f_sr:>10.3f} {v8_sr:>10.3f} {v9_sr:>12.3f} {cr:>11.1f}%")

# Cost-quality frontier
print(f"\n\n[3] Cost-Quality Frontier:")
for name in ["always_cheap", "v8_router", "v9_feedback", "heuristic",
             "always_frontier", "oracle"]:
    r = results[name]
    cr = (1 - r["avg_cost"] / fc) * 100
    print(f" {name:<20} success={r['success']:.3f} cost={r['avg_cost']:.4f} costRed={cr:.1f}%")

# Key metrics
v9 = results["v9_feedback"]
v8 = results["v8_router"]
fr = results["always_frontier"]
v9_cr = (1 - v9["avg_cost"] / fr["avg_cost"]) * 100
v8_cr = (1 - v8["avg_cost"] / fr["avg_cost"]) * 100
quality_gap_v9 = fr["success"] - v9["success"]
quality_gap_v8 = fr["success"] - v8["success"]
print(f"\n\n[4] KEY RESULTS:")
print(f" v8 success: {v8['success']:.3f} (gap vs frontier: {quality_gap_v8:.3f})")
print(f" v9 success: {v9['success']:.3f} (gap vs frontier: {quality_gap_v9:.3f})")
print(f" v8 costRed: {v8_cr:.1f}%")
print(f" v9 costRed: {v9_cr:.1f}%")
print(f" Quality gain v9 vs v8: {v9['success'] - v8['success']:+.3f}")
print(f" Cost increase v9 vs v8: {v9['avg_cost'] - v8['avg_cost']:+.4f}")

# Save summary metrics (per-task detail is omitted from the JSON)
with open("/app/benchmark_v9_results.json", "w") as f:
    save_data = {}
    for name, r in results.items():
        save_data[name] = {
            "success": r["success"],
            "avg_cost": r["avg_cost"],
            "unsafe": r["unsafe"],
        }
    json.dump(save_data, f, indent=2)
print(f"\nSaved to /app/benchmark_v9_results.json")
print("DONE!")