Upload training/bfcl_eval.py with huggingface_hub
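The commit title indicates the file was pushed with huggingface_hub. For reference, a minimal sketch of how such an upload is typically done with the library's HfApi.upload_file; the repo_id here is a placeholder, not taken from this commit:

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from `huggingface-cli login` by default
    api.upload_file(
        path_or_fileobj="training/bfcl_eval.py",   # local file to push
        path_in_repo="training/bfcl_eval.py",      # destination path in the repo
        repo_id="your-username/your-repo",         # placeholder: actual repo unknown
        commit_message="Upload training/bfcl_eval.py with huggingface_hub",
    )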
training/bfcl_eval.py
ADDED
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""BFCL v3 Function-Calling Benchmark: Evaluate ACO tool-use routing."""
+import json
+from collections import defaultdict
+from datasets import load_dataset
+
+print("=" * 80)
+print("BFCL v3 FUNCTION-CALLING BENCHMARK")
+print("=" * 80)
+
+# Load the BFCL v3 trajectories dataset
+print("\n[1] Loading BFCL v3 trajectories...")
+ds = load_dataset('bespokelabs/bfcl-v3-02-27-metrics-trajectories', split='train')
+print(f"    Loaded {len(ds)} rows")
+
+# Aggregate per-model success counts and tool-error totals
+model_stats = defaultdict(lambda: {"valid": 0, "total": 0, "tool_errors": 0})
+for row in ds:
+    m = row['model_name']
+    model_stats[m]["total"] += 1
+    if row['valid']: model_stats[m]["valid"] += 1
+    model_stats[m]["tool_errors"] += row.get('num_tool_errors', 0)
+
+# Print per-model success rates
+print("\n[2] Model success rates (top 20 by volume):")
+sorted_models = sorted(model_stats.items(), key=lambda x: -x[1]["total"])[:20]
+print(f"{'Model':<40} {'Valid':>8} {'Total':>8} {'Rate':>8} {'ToolErr':>8}")
+print("-" * 75)
+for m, s in sorted_models:
+    rate = s["valid"] / max(s["total"], 1)
+    print(f"{m:<40} {s['valid']:>8} {s['total']:>8} {rate:>8.3f} {s['tool_errors']:>8}")
+
+# Group results by task ID to find routing opportunities
+print("\n[3] Task-level routing analysis...")
+task_results = defaultdict(dict)
+for row in ds:
+    task_results[row['id']][row['model_name']] = {
+        'valid': row['valid'],
+        'tool_errors': row.get('num_tool_errors', 0),
+        'category': row.get('category', ''),
+    }
+
+# Classify models into cost tiers based on overall success rate
+all_model_rates = {m: s["valid"] / max(s["total"], 1) for m, s in model_stats.items()}
+
+def get_tier(rate):
+    if rate >= 0.85: return 4  # frontier
+    if rate >= 0.70: return 3  # medium
+    if rate >= 0.50: return 2  # cheap
+    return 1                   # tiny
+
+model_tiers = {m: get_tier(r) for m, r in all_model_rates.items()}
+
+# For each task, the optimal tier is the cheapest tier that succeeds
+opt_tier_dist = defaultdict(int)
+savings_opportunity = 0
+for tid, task_runs in task_results.items():
+    successful_tiers = []
+    for m, r in task_runs.items():
+        if r['valid']:
+            successful_tiers.append(model_tiers.get(m, 3))
+    if successful_tiers:
+        opt = min(successful_tiers)
+        opt_tier_dist[opt] += 1
+        if opt < 4:  # could have used something cheaper than frontier
+            savings_opportunity += 1
+
+total_tasks = len(task_results)
+print(f"    Total unique tasks: {total_tasks}")
+print(f"    Tasks solvable cheaper than frontier: {savings_opportunity} ({savings_opportunity/total_tasks*100:.1f}%)")
+print("    Optimal tier distribution:")
+for tier in sorted(opt_tier_dist.keys()):
+    print(f"      Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/total_tasks*100:.1f}%)")
+
+# Tool-error analysis
+print("\n[4] Tool-use cost analysis...")
+total_tool_errors = sum(s["tool_errors"] for s in model_stats.values())
+total_calls = sum(s["total"] for s in model_stats.values())
+print(f"    Total tool errors: {total_tool_errors}")
+print(f"    Tool error rate: {total_tool_errors/max(total_calls,1)*100:.2f}%")
+
+# Tally error categories
+error_cats = defaultdict(int)
+for row in ds:
+    if row.get('error_type'):
+        error_cats[row['error_type']] += 1
+print("\n    Error categories:")
+for cat, count in sorted(error_cats.items(), key=lambda x: -x[1])[:10]:
+    print(f"      {cat}: {count}")
+
+# Save results
+results = {
+    "total_tasks": total_tasks,
+    "savings_opportunity_pct": savings_opportunity / total_tasks * 100,
+    "opt_tier_distribution": dict(opt_tier_dist),
+    "model_success_rates": {m: s["valid"] / max(s["total"], 1) for m, s in sorted_models},
+    "tool_error_rate": total_tool_errors / max(total_calls, 1),
+}
+with open("/app/bfcl_results.json", "w") as f:
+    json.dump(results, f, indent=2)
+print("\nSaved to /app/bfcl_results.json")
+print("DONE!")
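Once the script has run, the saved metrics can be consumed downstream. A minimal sketch, assuming the /app/bfcl_results.json path and the key names written by the script above:

    import json

    with open("/app/bfcl_results.json") as f:
        metrics = json.load(f)

    # Fraction of tasks where a sub-frontier tier (1-3) already produces a valid result
    print(f"Savings opportunity: {metrics['savings_opportunity_pct']:.1f}%")
    print(f"Tool error rate: {metrics['tool_error_rate']*100:.2f}%")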