# ── Part 4: Print results ──
# Prints a comparison table of routing policies against the 'frontier'
# baseline, the tier distribution produced by route_bert over all traces,
# the success-rate gap of each learned router vs frontier, and finally
# uploads a JSON summary to the Hugging Face Hub.
#
# Reads from the enclosing script: `policies` (name -> dict with 'success',
# 'cost' and 'n' totals), `traces` (instance id -> per-model results, each
# carrying a 'problem' entry), `route_bert` and `defaultdict`.


def _policy_metrics(record, baseline_cost):
    """Return (success_rate, avg_cost, cost_reduction_pct) for one policy.

    `record` is a dict with 'success', 'cost' and 'n' totals. Rates fall
    back to 0 when the policy has no samples, and the cost reduction falls
    back to 0 when the baseline cost is not positive, so no division by
    zero can occur.
    """
    n = record['n']
    success_rate = record['success'] / n if n > 0 else 0
    avg_cost = record['cost'] / n if n > 0 else 0
    cost_reduction = (1 - avg_cost / baseline_cost) * 100 if baseline_cost > 0 else 0
    return success_rate, avg_cost, cost_reduction


print(f"\n\n{'='*70}")
print("BERT vs XGBoost ROUTER COMPARISON ON SWE-BENCH")
print(f"{'='*70}")

# The frontier policy is the baseline every other policy is compared to.
# Guard n == 0 the same way the per-policy rates are guarded.
fr = policies['frontier']
fr_cost = fr['cost'] / fr['n'] if fr['n'] > 0 else 0
fr_succ = fr['success'] / fr['n'] if fr['n'] > 0 else 0

print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-"*52)

order = ['oracle', 'bert_feedback', 'v11_feedback', 'bert', 'v11_xgboost',
         'frontier', 'always_cheap']
for name in order:
    if name not in policies:
        continue
    sr, ac, cr = _policy_metrics(policies[name], fr_cost)
    print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")

# BERT tier distribution
print("\n\nBERT tier distribution:")
bert_tiers = defaultdict(int)
for model_results in traces.values():
    # Every model's result for an instance carries the same problem, so the
    # first one is used.  (assumption from usage — TODO confirm)
    problem = next(iter(model_results.values()))['problem']
    # route_bert returns a pair; only the first element (the tier) is counted.
    tier, _ = route_bert(problem)
    bert_tiers[tier] += 1
for tier in sorted(bert_tiers):
    print(f" Tier {tier}: {bert_tiers[tier]}")

# Quality gap analysis
print("\n\nQuality gap vs frontier:")
for name in ['bert', 'bert_feedback', 'v11_xgboost', 'v11_feedback']:
    # Skip policies that were not evaluated, matching the table above,
    # instead of failing with KeyError.
    if name not in policies:
        continue
    sr, _, _ = _policy_metrics(policies[name], fr_cost)
    gap = (sr - fr_succ) * 100
    print(f" {name}: {gap:+.1f}pp vs frontier")

# Save results
results = {}
for name, r in policies.items():
    sr, ac, cr = _policy_metrics(r, fr_cost)
    results[name] = {"success": round(sr, 4),
                     "avg_cost": round(ac, 4),
                     "costRed": round(cr, 1)}

# Upload results to Hub
from huggingface_hub import HfApi
import json
import os
import tempfile

api = HfApi()
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
    json.dump(results, f, indent=2)
# The file is closed (and therefore fully flushed) before the upload reads
# it by path; uploading while it is still open could send truncated JSON.
try:
    api.upload_file(path_or_fileobj=f.name,
                    path_in_repo="eval/bert_vs_xgboost_results.json",
                    repo_id="narcolepticchicken/agent-cost-optimizer",
                    repo_type="model")
finally:
    # delete=False leaves cleanup to us; remove the file even if the
    # upload raises.
    os.unlink(f.name)

print(f"\nResults saved to eval/bert_vs_xgboost_results.json")
print("DONE!")