narcolepticchicken committed on
Commit
617b314
·
verified ·
1 Parent(s): 7a843a8

Upload eval/eval_bert_part4.py

Browse files
Files changed (1) hide show
  1. eval/eval_bert_part4.py +61 -0
eval/eval_bert_part4.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# ── Part 4: Print results ──
# NOTE(review): `policies` is defined outside this part; it is read here as a
# mapping of policy name -> totals with 'success', 'cost' and 'n' keys.

print(f"\n\n{'='*70}")
print("BERT vs XGBoost ROUTER COMPARISON ON SWE-BENCH")
print(f"{'='*70}")

# The frontier policy is the baseline that every cost reduction and quality
# gap is measured against.
fr = policies['frontier']
# Guard n == 0 the same way the per-policy rows do, so an empty baseline
# yields 0 instead of raising ZeroDivisionError.
fr_cost = fr['cost'] / fr['n'] if fr['n'] > 0 else 0
fr_succ = fr['success'] / fr['n'] if fr['n'] > 0 else 0

print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
print("-"*52)
# Fixed display order for the table rows.
order = ['oracle','bert_feedback','v11_feedback','bert','v11_xgboost','frontier','always_cheap']
for name in order:
    # Policies that are absent from `policies` are left out of the table.
    if name not in policies:
        continue
    r = policies[name]
    sr = r['success']/r['n'] if r['n'] > 0 else 0
    ac = r['cost']/r['n'] if r['n'] > 0 else 0
    # Cost reduction relative to the frontier average cost, as a percentage.
    cr = (1 - ac/fr_cost)*100 if fr_cost > 0 else 0
    print(f"{name:<20} {sr:>10.3f} {ac:>10.4f} {cr:>9.1f}%")

# BERT tier distribution
# Count how many traced instances route_bert assigns to each tier.
print(f"\n\nBERT tier distribution:")
bert_tiers = defaultdict(int)
for per_model in traces.values():
    # The problem is read from the first model entry of each instance.
    first_entry = next(iter(per_model.values()))
    # route_bert returns a pair; only the tier (first item) is tallied here.
    tier, _ = route_bert(first_entry['problem'])
    bert_tiers[tier] += 1
for tier in sorted(bert_tiers):
    print(f" Tier {tier}: {bert_tiers[tier]}")

# Quality gap analysis
# Success-rate difference of each router versus the frontier baseline, in
# percentage points (positive means a higher success rate than frontier).
print(f"\n\nQuality gap vs frontier:")
for name in ['bert','bert_feedback','v11_xgboost','v11_feedback']:
    # Skip routers that are absent from `policies`, as the comparison table
    # does, rather than raising KeyError.
    if name not in policies:
        continue
    r = policies[name]
    sr = r['success']/r['n'] if r['n'] > 0 else 0
    gap = (sr - fr_succ) * 100
    print(f" {name}: {gap:+.1f}pp vs frontier")

# Save results
# Build one summary row per policy: success rate, average cost, and cost
# reduction relative to the frontier average cost (percent).
results = {}
for policy_name, totals in policies.items():
    if totals['n'] > 0:
        success_rate = totals['success'] / totals['n']
    else:
        success_rate = 0
    if totals['n'] > 0:
        avg_cost = totals['cost'] / totals['n']
    else:
        avg_cost = 0
    if fr_cost > 0:
        cost_reduction = (1 - avg_cost / fr_cost) * 100
    else:
        cost_reduction = 0
    results[policy_name] = {
        "success": round(success_rate, 4),
        "avg_cost": round(avg_cost, 4),
        "costRed": round(cost_reduction, 1),
    }

# Upload results to Hub
import json
import os
import tempfile

from huggingface_hub import HfApi

api = HfApi()
# delete=False keeps the file on disk after it is closed so it can be handed
# to upload_file by path; it is removed explicitly in the `finally` below.
f = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
try:
    with f:
        json.dump(results, f, indent=2)
    # Upload only after the `with` has flushed and closed the file, so the
    # complete JSON is on disk when it is read back by path.
    api.upload_file(path_or_fileobj=f.name, path_in_repo="eval/bert_vs_xgboost_results.json",
                    repo_id="narcolepticchicken/agent-cost-optimizer", repo_type="model")
finally:
    # Remove the temp file even if serialization or the upload raises.
    os.unlink(f.name)

print(f"\nResults saved to eval/bert_vs_xgboost_results.json")
print("DONE!")