# File size: 3,353 Bytes


# ── Results ──
# The 'frontier' policy is the baseline: its average cost is the denominator
# for cost-reduction figures and its success rate is the reference for the
# quality gap.  Guard against an empty run (n == 0) the same way the
# per-policy statistics do, so reporting degrades to zeros instead of raising
# ZeroDivisionError.
fr = policies['frontier']
fr_cost = fr['cost'] / fr['n'] if fr['n'] > 0 else 0
fr_succ = fr['success'] / fr['n'] if fr['n'] > 0 else 0

# ── Side-by-side policy table ──
# Rows are labelled with the v11 names; when a v11 policy is absent from
# `policies` but its v10 counterpart is present, the v10 numbers are printed
# under the v11 label.
rule = '=' * 70
print(f"\n\n{rule}")
print("BERT vs XGBoost ROUTER COMPARISON ON SWE-BENCH")
print(f"{rule}")
print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-" * 52)

table_rows = ('oracle', 'bert_feedback', 'v11_feedback', 'bert',
              'v11_xgboost', 'frontier', 'always_cheap')
v10_fallback = {'v11_xgboost': 'v10_xgboost', 'v11_feedback': 'v10_feedback'}

for label in table_rows:
    # First key present in `policies` wins: the label itself, then its v10
    # stand-in (if it has one).  Rows with no data at all are skipped.
    candidates = (label, v10_fallback[label]) if label in v10_fallback else (label,)
    key = next((k for k in candidates if k in policies), None)
    if key is None:
        continue

    stats = policies[key]
    runs = stats['n']
    if runs > 0:
        success_rate = stats['success'] / runs
        avg_cost = stats['cost'] / runs
    else:
        success_rate = avg_cost = 0

    # Cost reduction is relative to the frontier's average cost, in percent.
    if fr_cost > 0:
        cost_red = (1 - avg_cost / fr_cost) * 100
    else:
        cost_red = 0
    print(f"{label:<20} {success_rate:>10.3f} {avg_cost:>10.4f} {cost_red:>9.1f}%")

# Success-rate difference of each router against the frontier baseline,
# printed in percentage points (positive means the router's success rate is
# higher than the frontier's).
print("\nQuality gap vs frontier:")
legacy_names = {'v11_xgboost': 'v10_xgboost', 'v11_feedback': 'v10_feedback'}
for router in ('bert', 'bert_feedback', 'v11_xgboost', 'v11_feedback'):
    # Fall back to the v10 entry only when the v11 one is missing and the
    # v10 one exists; the printed label stays the v11 name either way.
    lookup = router
    stand_in = legacy_names.get(router)
    if stand_in is not None and router not in policies and stand_in in policies:
        lookup = stand_in
    if lookup in policies:
        entry = policies[lookup]
        rate = entry['success'] / entry['n'] if entry['n'] > 0 else 0
        print(f"  {router}: {(rate - fr_succ) * 100:+.1f}pp vs frontier")

# BERT tier distribution
# Runs the BERT router over at most the first 100 entries of `traces`, counts
# how often each tier is chosen, and collects the per-tier values returned in
# the router's third result (reported below as P(success)).
print("\nBERT tier distribution (first 100 tasks):")
bert_tiers = defaultdict(int)
bert_probs_dist = defaultdict(list)
sample = list(traces.items())[:100]
for _task_id, per_model in sample:
    # Only the first model's record is consulted for the problem text;
    # presumably every record of a task carries the same 'problem' — confirm.
    first_record = next(iter(per_model.values()))
    chosen_tier, _confidence, tier_probs = route_bert(first_record['problem'])
    bert_tiers[chosen_tier] += 1
    for tier_id, prob in tier_probs.items():
        bert_probs_dist[tier_id].append(prob)

print("  Tier routing counts:")
for tier_id in sorted(bert_tiers):
    print(f"    Tier {tier_id}: {bert_tiers[tier_id]}")

print("  Per-tier P(success) stats:")
for tier_id in sorted(bert_probs_dist):
    values = bert_probs_dist[tier_id]
    summary = (f"mean={np.mean(values):.3f}, std={np.std(values):.3f}, "
               f"min={np.min(values):.3f}, max={np.max(values):.3f}")
    print(f"    Tier {tier_id}: {summary}")

# Summarise every policy, then try to publish the summary to the Hub.
# The upload is best-effort: on any failure the JSON is printed instead so the
# numbers are never lost.
import json
import os
import tempfile

results = {}
for name, r in policies.items():
    sr = r['success'] / r['n'] if r['n'] > 0 else 0
    ac = r['cost'] / r['n'] if r['n'] > 0 else 0
    cr = (1 - ac / fr_cost) * 100 if fr_cost > 0 else 0
    results[name] = {"success": round(sr, 4), "avg_cost": round(ac, 4), "costRed": round(cr, 1)}

tmp_path = None
try:
    from huggingface_hub import HfApi
    api = HfApi()
    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
        # Record the path first so the file is cleaned up even if dumping fails.
        tmp_path = f.name
        json.dump(results, f, indent=2)
    # The file must be closed (and therefore flushed) before uploading by
    # path; uploading inside the `with` could send a truncated or empty file.
    api.upload_file(path_or_fileobj=tmp_path, path_in_repo="eval/bert_vs_xgboost_results.json",
                    repo_id="narcolepticchicken/agent-cost-optimizer", repo_type="model")
    print(f"\nResults uploaded to eval/bert_vs_xgboost_results.json on Hub")
except Exception as e:
    # Deliberately broad: missing package, auth errors and network failures
    # all fall back to printing the results locally.
    print(f"\nCould not upload to Hub: {e}")
    print(f"Results JSON:\n{json.dumps(results, indent=2)}")
finally:
    # Remove the temp file whether or not the upload succeeded.
    if tmp_path is not None and os.path.exists(tmp_path):
        os.unlink(tmp_path)

print("\nDONE!")