narcolepticchicken committed on
Commit
8a5c26f
·
verified ·
1 Parent(s): 3bed4ff

Upload eval/eval_bert_partD.py

Browse files
Files changed (1) hide show
  1. eval/eval_bert_partD.py +45 -18
eval/eval_bert_partD.py CHANGED
@@ -10,8 +10,15 @@ print(f"{'='*70}")
10
  print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
11
  print("-"*52)
12
  for name in ['oracle','bert_feedback','v11_feedback','bert','v11_xgboost','frontier','always_cheap']:
13
- if name not in policies: continue
14
- r = policies[name]
 
 
 
 
 
 
 
15
  sr = r['success']/r['n'] if r['n'] > 0 else 0
16
  ac = r['cost']/r['n'] if r['n'] > 0 else 0
17
  cr = (1-ac/fr_cost)*100 if fr_cost > 0 else 0
@@ -19,22 +26,37 @@ for name in ['oracle','bert_feedback','v11_feedback','bert','v11_xgboost','front
19
 
20
  print(f"\nQuality gap vs frontier:")
21
  for name in ['bert','bert_feedback','v11_xgboost','v11_feedback']:
22
- r = policies[name]
 
 
 
 
 
 
 
23
  sr = r['success']/r['n'] if r['n'] > 0 else 0
24
  gap = (sr - fr_succ) * 100
25
  print(f" {name}: {gap:+.1f}pp vs frontier")
26
 
27
  # BERT tier distribution
28
- print(f"\nBERT tier distribution:")
29
  bert_tiers = defaultdict(int)
30
- for iid, model_results in list(traces.items())[:500]:
 
31
  problem = next(iter(model_results.values()))['problem']
32
- t, c = route_bert(problem)
33
  bert_tiers[t] += 1
 
 
 
34
  for t in sorted(bert_tiers):
35
- print(f" Tier {t}: {bert_tiers[t]}")
 
 
 
 
36
 
37
- # Save results
38
  results = {}
39
  for name, r in policies.items():
40
  sr = r['success']/r['n'] if r['n'] > 0 else 0
@@ -42,13 +64,18 @@ for name, r in policies.items():
42
  cr = (1-ac/fr_cost)*100 if fr_cost > 0 else 0
43
  results[name] = {"success": round(sr, 4), "avg_cost": round(ac, 4), "costRed": round(cr, 1)}
44
 
45
- from huggingface_hub import HfApi
46
- import tempfile
47
- api = HfApi()
48
- with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
49
- json.dump(results, f, indent=2)
50
- api.upload_file(path_or_fileobj=f.name, path_in_repo="eval/bert_vs_xgboost_results.json",
51
- repo_id="narcolepticchicken/agent-cost-optimizer", repo_type="model")
52
- os.unlink(f.name)
53
- print(f"\nResults uploaded to eval/bert_vs_xgboost_results.json")
54
- print("DONE!")
 
 
 
 
 
 
10
  print(f"\n{'Policy':<20} {'Success':>10} {'AvgCost':>10} {'CostRed':>10}")
11
  print("-"*52)
12
  for name in ['oracle','bert_feedback','v11_feedback','bert','v11_xgboost','frontier','always_cheap']:
13
+ # Map v11 names to v10 if v11 not available
14
+ actual_name = name
15
+ if name == 'v11_xgboost' and 'v11_xgboost' not in policies and 'v10_xgboost' in policies:
16
+ actual_name = 'v10_xgboost'
17
+ if name == 'v11_feedback' and 'v11_feedback' not in policies and 'v10_feedback' in policies:
18
+ actual_name = 'v10_feedback'
19
+ if actual_name not in policies:
20
+ continue
21
+ r = policies[actual_name]
22
  sr = r['success']/r['n'] if r['n'] > 0 else 0
23
  ac = r['cost']/r['n'] if r['n'] > 0 else 0
24
  cr = (1-ac/fr_cost)*100 if fr_cost > 0 else 0
 
26
 
27
  print(f"\nQuality gap vs frontier:")
28
  for name in ['bert','bert_feedback','v11_xgboost','v11_feedback']:
29
+ actual_name = name
30
+ if name == 'v11_xgboost' and 'v11_xgboost' not in policies and 'v10_xgboost' in policies:
31
+ actual_name = 'v10_xgboost'
32
+ if name == 'v11_feedback' and 'v11_feedback' not in policies and 'v10_feedback' in policies:
33
+ actual_name = 'v10_feedback'
34
+ if actual_name not in policies:
35
+ continue
36
+ r = policies[actual_name]
37
  sr = r['success']/r['n'] if r['n'] > 0 else 0
38
  gap = (sr - fr_succ) * 100
39
  print(f" {name}: {gap:+.1f}pp vs frontier")
40
 
41
  # BERT tier distribution
42
+ print(f"\nBERT tier distribution (first 100 tasks):")
43
  bert_tiers = defaultdict(int)
44
+ bert_probs_dist = defaultdict(list)
45
+ for iid, model_results in list(traces.items())[:100]:
46
  problem = next(iter(model_results.values()))['problem']
47
+ t, conf, probs = route_bert(problem)
48
  bert_tiers[t] += 1
49
+ for tier, p in probs.items():
50
+ bert_probs_dist[tier].append(p)
51
+ print(f" Tier routing counts:")
52
  for t in sorted(bert_tiers):
53
+ print(f" Tier {t}: {bert_tiers[t]}")
54
+ print(f" Per-tier P(success) stats:")
55
+ for t in sorted(bert_probs_dist):
56
+ ps = bert_probs_dist[t]
57
+ print(f" Tier {t}: mean={np.mean(ps):.3f}, std={np.std(ps):.3f}, min={np.min(ps):.3f}, max={np.max(ps):.3f}")
58
 
59
+ # Save results to local file and try to upload to Hub
60
  results = {}
61
  for name, r in policies.items():
62
  sr = r['success']/r['n'] if r['n'] > 0 else 0
 
64
  cr = (1-ac/fr_cost)*100 if fr_cost > 0 else 0
65
  results[name] = {"success": round(sr, 4), "avg_cost": round(ac, 4), "costRed": round(cr, 1)}
66
 
67
+ import tempfile, json
68
+ try:
69
+ from huggingface_hub import HfApi
70
+ api = HfApi()
71
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
72
+ json.dump(results, f, indent=2)
73
+ api.upload_file(path_or_fileobj=f.name, path_in_repo="eval/bert_vs_xgboost_results.json",
74
+ repo_id="narcolepticchicken/agent-cost-optimizer", repo_type="model")
75
+ os.unlink(f.name)
76
+ print(f"\nResults uploaded to eval/bert_vs_xgboost_results.json on Hub")
77
+ except Exception as e:
78
+ print(f"\nCould not upload to Hub: {e}")
79
+ print(f"Results JSON:\n{json.dumps(results, indent=2)}")
80
+
81
+ print("\nDONE!")