|
|
| |
# Frontier baseline: the totals recorded for the 'frontier' policy divided by
# its `n`. Every cost-reduction and quality-gap figure below is measured
# against these two values.
fr = policies['frontier']
# Guard the divisions the same way the per-policy rows are guarded
# (`n > 0`, else 0) so an empty frontier run yields zeros instead of
# aborting the whole report with ZeroDivisionError.
fr_cost = fr['cost'] / fr['n'] if fr['n'] > 0 else 0
fr_succ = fr['success'] / fr['n'] if fr['n'] > 0 else 0
|
|
# Comparison table: success rate, average cost, and cost reduction relative
# to the frontier baseline, one row per policy present in `policies`.
print(f"\n\n{'='*70}")
print("BERT vs XGBoost ROUTER COMPARISON ON SWE-BENCH")
print(f"{'='*70}")
print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-"*52)

# A v11 row falls back to the matching v10 entry when only the v10 entry
# exists; the row is still printed under the v11 label.
v10_fallback = {'v11_xgboost': 'v10_xgboost', 'v11_feedback': 'v10_feedback'}
table_rows = ('oracle', 'bert_feedback', 'v11_feedback', 'bert',
              'v11_xgboost', 'frontier', 'always_cheap')

for name in table_rows:
    policy_key = name
    older = v10_fallback.get(name)
    if older is not None and name not in policies and older in policies:
        policy_key = older
    if policy_key not in policies:
        continue  # neither the policy nor its fallback is present: no row

    stats = policies[policy_key]
    if stats['n'] > 0:
        sr = stats['success'] / stats['n']
        ac = stats['cost'] / stats['n']
    else:
        sr = ac = 0

    # Percentage saved versus the frontier's average cost; 0 when the
    # frontier cost is not positive.
    if fr_cost > 0:
        cr = (1 - ac / fr_cost) * 100
    else:
        cr = 0
    print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")
|
|
# Quality gap: success-rate difference versus the frontier baseline,
# reported in percentage points for the four router policies listed below.
print(f"\nQuality gap vs frontier:")

# Same substitution rule as the comparison table: use the v10 entry when the
# v11 one is missing, but keep reporting under the v11 label.
gap_fallback = {'v11_xgboost': 'v10_xgboost', 'v11_feedback': 'v10_feedback'}

for name in ('bert', 'bert_feedback', 'v11_xgboost', 'v11_feedback'):
    policy_key = name
    older = gap_fallback.get(name)
    if older is not None and name not in policies and older in policies:
        policy_key = older
    if policy_key not in policies:
        continue  # neither the policy nor its fallback is present

    stats = policies[policy_key]
    if stats['n'] > 0:
        sr = stats['success'] / stats['n']
    else:
        sr = 0
    gap = (sr - fr_succ) * 100
    print(f" {name}: {gap:+.1f}pp vs frontier")
|
|
| |
# Diagnostic: which tier route_bert picks for the first 100 traced tasks, and
# summary statistics of the per-tier probabilities it returns.
from itertools import islice

print(f"\nBERT tier distribution (first 100 tasks):")
bert_tiers = defaultdict(int)
bert_probs_dist = defaultdict(list)
# islice stops after 100 entries without first copying every trace into a
# list; the task id is not needed, so only the values are iterated.
for model_results in islice(traces.values(), 100):
    # The problem text is read from the first per-model result of the task.
    first_result = next(iter(model_results.values()), None)
    if first_result is None:
        # A task with no model results has nothing to route; skip it rather
        # than letting StopIteration abort the whole report.
        continue
    t, _conf, probs = route_bert(first_result['problem'])
    bert_tiers[t] += 1
    for tier, p in probs.items():
        bert_probs_dist[tier].append(p)

print(f" Tier routing counts:")
for t in sorted(bert_tiers):
    print(f" Tier {t}: {bert_tiers[t]}")

print(f" Per-tier P(success) stats:")
for t in sorted(bert_probs_dist):
    ps = bert_probs_dist[t]
    print(f" Tier {t}: mean={np.mean(ps):.3f}, std={np.std(ps):.3f}, min={np.min(ps):.3f}, max={np.max(ps):.3f}")
|
|
| |
# Collect the per-policy summary (rounded success rate, average cost, and
# percentage cost reduction versus the frontier) into a plain dict.
results = {}
for policy_name, stats in policies.items():
    n = stats['n']
    if n > 0:
        success_rate = stats['success'] / n
        avg_cost = stats['cost'] / n
    else:
        success_rate = avg_cost = 0

    if fr_cost > 0:
        cost_reduction = (1 - avg_cost / fr_cost) * 100
    else:
        cost_reduction = 0

    results[policy_name] = {
        "success": round(success_rate, 4),
        "avg_cost": round(avg_cost, 4),
        "costRed": round(cost_reduction, 1),
    }
|
|
# Publish the summary to the Hugging Face Hub. The upload is best-effort: on
# any failure the reason is printed and the JSON is dumped to stdout instead.
# `os` is imported here so the temp-file cleanup does not depend on an import
# made elsewhere in the file.
import json
import os
import tempfile

try:
    from huggingface_hub import HfApi

    api = HfApi()
    # delete=False keeps the file on disk after it is closed so it can be
    # uploaded by path; the `finally` below removes it again.
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
    try:
        with tmp:
            json.dump(results, tmp, indent=2)
        # The file is closed (and therefore flushed) before it is uploaded.
        api.upload_file(path_or_fileobj=tmp.name, path_in_repo="eval/bert_vs_xgboost_results.json",
                        repo_id="narcolepticchicken/agent-cost-optimizer", repo_type="model")
    finally:
        # Remove the temp file whether or not the dump/upload succeeded, so a
        # failed upload does not leave it behind.
        os.unlink(tmp.name)
    print(f"\nResults uploaded to eval/bert_vs_xgboost_results.json on Hub")
except Exception as e:
    # Deliberately broad: any failure in the import, dump, upload or cleanup
    # falls back to printing the results.
    print(f"\nCould not upload to Hub: {e}")
    print(f"Results JSON:\n{json.dumps(results, indent=2)}")


print("\nDONE!")
|
|