narcolepticchicken committed on
Commit
de4dd10
·
verified ·
1 Parent(s): 9ced75f

Upload training/bfcl_eval.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. training/bfcl_eval.py +102 -0
training/bfcl_eval.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""BFCL v3 Function-Calling Benchmark: Evaluate ACO tool-use routing."""
import json
from collections import defaultdict

from datasets import load_dataset

print("=" * 80)
print("BFCL v3 FUNCTION-CALLING BENCHMARK")
print("=" * 80)

# [1] Load the BFCL v3 trajectory dataset (one row per model/task attempt).
print("\n[1] Loading BFCL v3 trajectories...")
ds = load_dataset('bespokelabs/bfcl-v3-02-27-metrics-trajectories', split='train')
print(f" Loaded {len(ds)} rows")

# [2] Aggregate, per model: attempt count, successful ("valid") count, and
# cumulative tool-error count across all of that model's rows.
model_stats = defaultdict(lambda: {"valid": 0, "total": 0, "tool_errors": 0})
for row in ds:
    m = row['model_name']
    model_stats[m]["total"] += 1
    if row['valid']:
        model_stats[m]["valid"] += 1
    model_stats[m]["tool_errors"] += row.get('num_tool_errors', 0)

# Print the 20 highest-volume models with their success rates.
print(f"\n[2] Model success rates (top 20 by volume):")
sorted_models = sorted(model_stats.items(), key=lambda x: -x[1]["total"])[:20]
print(f"{'Model':<40} {'Valid':>8} {'Total':>8} {'Rate':>8} {'ToolErr':>8}")
print("-" * 75)
for m, s in sorted_models:
    rate = s["valid"] / max(s["total"], 1)  # max() guards an empty bucket
    print(f"{m:<40} {s['valid']:>8} {s['total']:>8} {rate:>8.3f} {s['tool_errors']:>8}")

# [3] Re-group attempts by task id so the same task can be compared across
# models: task_results[task_id][model_name] -> per-attempt outcome summary.
print(f"\n[3] Task-level routing analysis...")
task_results = defaultdict(dict)
for row in ds:
    task_results[row['id']][row['model_name']] = {
        'valid': row['valid'],
        'tool_errors': row.get('num_tool_errors', 0),
        'category': row.get('category', ''),
    }

# Per-model success rate over the whole dataset (not just the top-20 slice).
all_model_rates = {m: s["valid"] / max(s["total"], 1) for m, s in model_stats.items()}
# Tier assignment based on success rate
def get_tier(rate):
    """Map a success *rate* in [0, 1] to a routing tier (1=tiny .. 4=frontier)."""
    # Descending threshold table: first threshold the rate meets wins.
    for threshold, tier in ((0.85, 4), (0.70, 3), (0.50, 2)):
        if rate >= threshold:
            return tier
    return 1  # tiny
model_tiers = {m: get_tier(r) for m, r in all_model_rates.items()}

# For each task, the optimal tier is the cheapest tier containing a model
# that solved it; count tasks solvable below the frontier tier (4).
opt_tier_dist = defaultdict(int)
savings_opportunity = 0
for tid, results in task_results.items():
    # Models missing from model_tiers default to medium (3) — TODO confirm.
    successful_tiers = [
        model_tiers.get(m, 3) for m, r in results.items() if r['valid']
    ]
    if successful_tiers:
        opt = min(successful_tiers)
        opt_tier_dist[opt] += 1
        if opt < 4:  # could have used cheaper than frontier
            savings_opportunity += 1

total_tasks = len(task_results)
# Guard all rate divisions with max(..., 1), consistent with the per-model
# rate computation, so an empty dataset prints zeros instead of raising
# ZeroDivisionError.
print(f" Total unique tasks: {total_tasks}")
print(f" Tasks solvable cheaper than frontier: {savings_opportunity} ({savings_opportunity/max(total_tasks,1)*100:.1f}%)")
print(f" Optimal tier distribution:")
for tier in sorted(opt_tier_dist.keys()):
    print(f" Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/max(total_tasks,1)*100:.1f}%)")

# [4] Aggregate tool-use error totals across all models.
print(f"\n[4] Tool-use cost analysis...")
total_tool_errors = sum(s["tool_errors"] for s in model_stats.values())
total_calls = sum(s["total"] for s in model_stats.values())
print(f" Total tool errors: {total_tool_errors}")
print(f" Tool error rate: {total_tool_errors/max(total_calls,1)*100:.2f}%")

# Top-10 error categories by frequency (rows with a falsy error_type skipped).
error_cats = defaultdict(int)
for row in ds:
    if row.get('error_type'):
        error_cats[row['error_type']] += 1
print(f"\n Error categories:")
for cat, count in sorted(error_cats.items(), key=lambda x: -x[1])[:10]:
    print(f" {cat}: {count}")

# [5] Persist a machine-readable summary for downstream analysis.
results = {
    "total_tasks": total_tasks,
    "savings_opportunity_pct": savings_opportunity / max(total_tasks, 1) * 100,
    "opt_tier_distribution": dict(opt_tier_dist),
    "model_success_rates": {m: s["valid"] / max(s["total"], 1) for m, s in sorted_models},
    "tool_error_rate": total_tool_errors / max(total_calls, 1),
}
with open("/app/bfcl_results.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"\nSaved to /app/bfcl_results.json")
print("DONE!")