#!/usr/bin/env python3
"""BFCL v3 Function-Calling Benchmark: Evaluate ACO tool-use routing."""
import sys,json
from collections import defaultdict
from datasets import load_dataset
print("="*80)
print("BFCL v3 FUNCTION-CALLING BENCHMARK")
print("="*80)
# Load BFCL
print("\n[1] Loading BFCL v3 trajectories...")
ds = load_dataset('bespokelabs/bfcl-v3-02-27-metrics-trajectories', split='train')
print(f" Loaded {len(ds)} rows")
# [2] Aggregate per-model validity and tool-error counts across all rows.
model_stats = defaultdict(lambda: {"valid": 0, "total": 0, "tool_errors": 0})
for row in ds:
    stats = model_stats[row['model_name']]
    stats["total"] += 1
    if row['valid']:
        stats["valid"] += 1
    stats["tool_errors"] += row.get('num_tool_errors', 0)

# Report the 20 highest-volume models with their success rates.
# NOTE: sorted_models is reused later when the summary JSON is built.
print(f"\n[2] Model success rates (top 20 by volume):")
sorted_models = sorted(model_stats.items(), key=lambda kv: -kv[1]["total"])[:20]
print(f"{'Model':<40} {'Valid':>8} {'Total':>8} {'Rate':>8} {'ToolErr':>8}")
print("-" * 75)
for name, stats in sorted_models:
    success_rate = stats["valid"] / max(stats["total"], 1)
    print(f"{name:<40} {stats['valid']:>8} {stats['total']:>8} {success_rate:>8.3f} {stats['tool_errors']:>8}")
# [3] Re-key the rows by task id so per-task model outcomes can be compared.
print(f"\n[3] Task-level routing analysis...")
task_results = defaultdict(dict)
for row in ds:
    outcome = {
        'valid': row['valid'],
        'tool_errors': row.get('num_tool_errors', 0),
        'category': row.get('category', ''),
    }
    task_results[row['id']][row['model_name']] = outcome
# Success rate per model; max() guards against a zero denominator.
all_model_rates = {
    model: stats["valid"] / max(stats["total"], 1)
    for model, stats in model_stats.items()
}
def get_tier(rate):
    """Map a success rate in [0, 1] to a cost tier (1=tiny ... 4=frontier)."""
    # Cutoffs checked highest-first: frontier, medium, cheap.
    for cutoff, tier in ((0.85, 4), (0.70, 3), (0.50, 2)):
        if rate >= cutoff:
            return tier
    return 1  # tiny
# Resolve every model to its tier once, up front.
model_tiers = {model: get_tier(score) for model, score in all_model_rates.items()}
# For each task, the "optimal" tier is the cheapest tier whose model produced
# a valid result; any optimum below 4 means the frontier tier was unnecessary.
opt_tier_dist = defaultdict(int)
savings_opportunity = 0
for tid, results in task_results.items():
    # Tiers of every model that solved this task (default tier 3 for a model
    # somehow absent from model_tiers — should not happen in practice).
    successful_tiers = [
        model_tiers.get(m, 3) for m, r in results.items() if r['valid']
    ]
    if successful_tiers:
        opt = min(successful_tiers)
        opt_tier_dist[opt] += 1
        if opt < 4:  # a cheaper-than-frontier model solved this task
            savings_opportunity += 1
total_tasks = len(task_results)
# Bug fix: guard the denominator so an empty dataset cannot raise
# ZeroDivisionError (same max() idiom used elsewhere in this script).
denom = max(total_tasks, 1)
print(f" Total unique tasks: {total_tasks}")
print(f" Tasks solvable cheaper than frontier: {savings_opportunity} ({savings_opportunity/denom*100:.1f}%)")
print(f" Optimal tier distribution:")
for tier in sorted(opt_tier_dist.keys()):
    print(f" Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/denom*100:.1f}%)")
# [4] Aggregate tool-use errors across all models.
print(f"\n[4] Tool-use cost analysis...")
total_tool_errors = sum(stats["tool_errors"] for stats in model_stats.values())
total_calls = sum(stats["total"] for stats in model_stats.values())
print(f" Total tool errors: {total_tool_errors}")
print(f" Tool error rate: {total_tool_errors/max(total_calls,1)*100:.2f}%")

# Tally error categories and show the ten most common.
error_cats = defaultdict(int)
for row in ds:
    err = row.get('error_type')
    if err:
        error_cats[err] += 1
print(f"\n Error categories:")
for cat, count in sorted(error_cats.items(), key=lambda kv: -kv[1])[:10]:
    print(f" {cat}: {count}")
# [5] Persist summary metrics as JSON for downstream tooling.
results = {
    "total_tasks": total_tasks,
    # Bug fix: max() guard prevents ZeroDivisionError on an empty dataset,
    # matching the guards used for the other ratios below.
    "savings_opportunity_pct": savings_opportunity / max(total_tasks, 1) * 100,
    "opt_tier_distribution": dict(opt_tier_dist),
    # Success rates only for the top-20-by-volume models reported above.
    "model_success_rates": {m: s["valid"] / max(s["total"], 1) for m, s in sorted_models},
    "tool_error_rate": total_tool_errors / max(total_calls, 1),
}
with open("/app/bfcl_results.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"\nSaved to /app/bfcl_results.json")
print("DONE!")