"""BFCL v3 Function-Calling Benchmark: Evaluate ACO tool-use routing."""
import json
import sys
from collections import Counter, defaultdict

from datasets import load_dataset

# Console banner for the report.
print("=" * 80)
print("BFCL v3 FUNCTION-CALLING BENCHMARK")
print("=" * 80)

# [1] Load the "train" split of the BFCL v3 trajectory dataset through
# `datasets.load_dataset`. Every later section iterates over `ds` row by row.
print("\n[1] Loading BFCL v3 trajectories...")
ds = load_dataset("bespokelabs/bfcl-v3-02-27-metrics-trajectories", split="train")
print(f" Loaded {len(ds)} rows")

# Per-model tallies keyed by `model_name`:
#   "total"       - number of rows seen for the model
#   "valid"       - number of those rows whose `valid` field is truthy
#   "tool_errors" - sum of the rows' `num_tool_errors`
model_stats = defaultdict(lambda: {"valid": 0, "total": 0, "tool_errors": 0})
for row in ds:
    stats = model_stats[row["model_name"]]
    stats["total"] += 1
    if row["valid"]:
        stats["valid"] += 1
    # `or 0` covers both a missing field and a present-but-None value, which
    # would otherwise make the `+=` raise TypeError.
    stats["tool_errors"] += row.get("num_tool_errors") or 0

# [2] Success-rate table for the 20 models with the most rows.
print("\n[2] Model success rates (top 20 by volume):")
# The negated key orders by descending row count; `sorted` is stable, so
# models with equal counts keep their first-seen order.
sorted_models = sorted(model_stats.items(), key=lambda x: -x[1]["total"])[:20]
print(f"{'Model':<40} {'Valid':>8} {'Total':>8} {'Rate':>8} {'ToolErr':>8}")
print("-" * 75)
for m, s in sorted_models:
    # max(..., 1) keeps the division safe even for a zero total.
    rate = s["valid"] / max(s["total"], 1)
    print(f"{m:<40} {s['valid']:>8} {s['total']:>8} {rate:>8.3f} {s['tool_errors']:>8}")

# [3] Regroup the rows by task: task id -> model name -> outcome of that model
# on that task. If the same (id, model_name) pair occurs more than once, the
# later row overwrites the earlier one.
print("\n[3] Task-level routing analysis...")
task_results = defaultdict(dict)
for row in ds:
    task_results[row["id"]][row["model_name"]] = {
        "valid": row["valid"],
        "tool_errors": row.get("num_tool_errors", 0),
        "category": row.get("category", ""),
    }

# Overall success rate per model (valid rows / total rows), covering every
# model in `model_stats`, not only the top 20 printed in section [2].
all_model_rates = {m: s["valid"] / max(s["total"], 1) for m, s in model_stats.items()}

def get_tier(rate: float) -> int:
    """Map a success rate to a tier number between 1 and 4.

    Thresholds are inclusive lower bounds: a rate of at least 0.85 gives
    tier 4, at least 0.70 gives tier 3, at least 0.50 gives tier 2, and
    anything lower gives tier 1.

    Args:
        rate: Fraction of valid results, expected in the range 0.0-1.0.

    Returns:
        The tier as an integer from 1 (lowest rate) to 4 (highest rate).
    """
    if rate >= 0.85:
        return 4
    if rate >= 0.70:
        return 3
    if rate >= 0.50:
        return 2
    return 1

# Tier of every model, derived from its overall success rate.
# NOTE(review): the tier is computed from success rate only; the report below
# treats a lower tier as "cheaper" - confirm that proxy is intended.
model_tiers = {m: get_tier(r) for m, r in all_model_rates.items()}

# For each task, find the lowest tier among the models that solved it.
opt_tier_dist = defaultdict(int)  # lowest solving tier -> number of tasks
savings_opportunity = 0  # tasks solved by at least one model below tier 4
for per_model in task_results.values():
    # A model missing from `model_tiers` falls back to tier 3.
    successful_tiers = [
        model_tiers.get(model, 3)
        for model, outcome in per_model.items()
        if outcome["valid"]
    ]
    if not successful_tiers:
        # No model solved this task, so it is left out of the distribution.
        continue
    opt = min(successful_tiers)
    opt_tier_dist[opt] += 1
    if opt < 4:
        savings_opportunity += 1

# Report the routing results. Percentages use the count of all unique tasks as
# the denominator, so tasks that no model solved are included in it and the
# tier percentages need not add up to 100.
total_tasks = len(task_results)
print(f" Total unique tasks: {total_tasks}")
# max(total_tasks, 1) mirrors the guard used for the other rates in this
# script, so an empty dataset prints 0.0% instead of raising ZeroDivisionError.
print(f" Tasks solvable cheaper than frontier: {savings_opportunity} ({savings_opportunity / max(total_tasks, 1) * 100:.1f}%)")
print(" Optimal tier distribution:")
for tier in sorted(opt_tier_dist):
    print(f" Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier] / max(total_tasks, 1) * 100:.1f}%)")

# [4] Aggregate tool-error totals across all models.
print("\n[4] Tool-use cost analysis...")
total_tool_errors = sum(s["tool_errors"] for s in model_stats.values())
total_calls = sum(s["total"] for s in model_stats.values())
print(f" Total tool errors: {total_tool_errors}")
# NOTE(review): this is tool errors per row, times 100; if a single row can
# carry more than one tool error the figure can exceed 100%.
print(f" Tool error rate: {total_tool_errors / max(total_calls, 1) * 100:.2f}%")

# Frequency of each non-empty `error_type`; rows without one are skipped.
error_cats = Counter(row["error_type"] for row in ds if row.get("error_type"))
print("\n Error categories:")
# most_common(10) lists the ten most frequent types, ties in first-seen order.
for cat, count in error_cats.most_common(10):
    print(f" {cat}: {count}")

# Summary written to disk. `model_success_rates` covers only the models in
# `sorted_models` (the top 20 by row count), not every model in the dataset.
results = {
    "total_tasks": total_tasks,
    # Guarded like the other rates so an empty dataset yields 0.0 instead of
    # raising ZeroDivisionError.
    "savings_opportunity_pct": savings_opportunity / max(total_tasks, 1) * 100,
    # Integer tier keys are written as strings by the JSON encoder.
    "opt_tier_distribution": dict(opt_tier_dist),
    "model_success_rates": {m: s["valid"] / max(s["total"], 1) for m, s in sorted_models},
    "tool_error_rate": total_tool_errors / max(total_calls, 1),
}
output_path = "/app/bfcl_results.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
print(f"\nSaved to {output_path}")
print("DONE!")