#!/usr/bin/env python3
"""BFCL v3 Function-Calling Benchmark: Evaluate ACO tool-use routing.

Loads the BFCL v3 trajectory dataset, computes per-model success rates,
derives a cost tier per model, and estimates how many tasks could have
been routed to a cheaper-than-frontier tier. Results are printed and
saved to /app/bfcl_results.json.
"""
import sys
import json
from collections import defaultdict

from datasets import load_dataset

print("=" * 80)
print("BFCL v3 FUNCTION-CALLING BENCHMARK")
print("=" * 80)

# [1] Load BFCL trajectories (network I/O via the Hugging Face hub).
print("\n[1] Loading BFCL v3 trajectories...")
ds = load_dataset('bespokelabs/bfcl-v3-02-27-metrics-trajectories', split='train')
print(f" Loaded {len(ds)} rows")

# Aggregate per-model counts: valid runs, total runs, and tool errors.
model_stats = defaultdict(lambda: {"valid": 0, "total": 0, "tool_errors": 0})
for row in ds:
    m = row['model_name']
    model_stats[m]["total"] += 1
    if row['valid']:
        model_stats[m]["valid"] += 1
    # num_tool_errors may be absent on some rows; treat missing as 0.
    model_stats[m]["tool_errors"] += row.get('num_tool_errors', 0)

# [2] Print per-model success rates for the 20 highest-volume models.
print(f"\n[2] Model success rates (top 20 by volume):")
sorted_models = sorted(model_stats.items(), key=lambda x: -x[1]["total"])[:20]
print(f"{'Model':<40} {'Valid':>8} {'Total':>8} {'Rate':>8} {'ToolErr':>8}")
print("-" * 75)
for m, s in sorted_models:
    rate = s["valid"] / max(s["total"], 1)  # guard: model with 0 rows
    print(f"{m:<40} {s['valid']:>8} {s['total']:>8} {rate:>8.3f} {s['tool_errors']:>8}")

# [3] Group rows by task id so each task maps model -> outcome.
print(f"\n[3] Task-level routing analysis...")
task_results = defaultdict(dict)
for row in ds:
    task_results[row['id']][row['model_name']] = {
        'valid': row['valid'],
        'tool_errors': row.get('num_tool_errors', 0),
        'category': row.get('category', ''),
    }

# Overall success rate per model, used to assign cost tiers.
all_model_rates = {m: s["valid"] / max(s["total"], 1) for m, s in model_stats.items()}


def get_tier(rate):
    """Map a model success rate to a cost tier (1 = tiny ... 4 = frontier)."""
    if rate >= 0.85:
        return 4  # frontier
    if rate >= 0.70:
        return 3  # medium
    if rate >= 0.50:
        return 2  # cheap
    return 1  # tiny


model_tiers = {m: get_tier(r) for m, r in all_model_rates.items()}

# For each task: the optimal tier is the cheapest tier that succeeded.
opt_tier_dist = defaultdict(int)
savings_opportunity = 0
for tid, per_model in task_results.items():
    successful_tiers = []
    for m, r in per_model.items():
        if r['valid']:
            # Unknown models (no stats) default to the medium tier (3).
            successful_tiers.append(model_tiers.get(m, 3))
    if successful_tiers:
        opt = min(successful_tiers)
        opt_tier_dist[opt] += 1
        if opt < 4:  # could have used cheaper than frontier
            savings_opportunity += 1

total_tasks = len(task_results)
# Guard the percentage denominators against an empty dataset, consistent
# with the max(..., 1) pattern used for the rate computations above.
task_denom = max(total_tasks, 1)
print(f" Total unique tasks: {total_tasks}")
print(f" Tasks solvable cheaper than frontier: {savings_opportunity} ({savings_opportunity/task_denom*100:.1f}%)")
print(f" Optimal tier distribution:")
for tier in sorted(opt_tier_dist.keys()):
    print(f" Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/task_denom*100:.1f}%)")

# [4] Aggregate tool-use error statistics across all models.
print(f"\n[4] Tool-use cost analysis...")
total_tool_errors = sum(s["tool_errors"] for s in model_stats.values())
total_calls = sum(s["total"] for s in model_stats.values())
print(f" Total tool errors: {total_tool_errors}")
print(f" Tool error rate: {total_tool_errors/max(total_calls,1)*100:.2f}%")

# Count rows per error_type (rows with a falsy/missing error_type are skipped).
error_cats = defaultdict(int)
for row in ds:
    if row.get('error_type'):
        error_cats[row['error_type']] += 1
print(f"\n Error categories:")
for cat, count in sorted(error_cats.items(), key=lambda x: -x[1])[:10]:
    print(f" {cat}: {count}")

# Persist the summary metrics as JSON for downstream consumers.
results = {
    "total_tasks": total_tasks,
    "savings_opportunity_pct": savings_opportunity / task_denom * 100,
    "opt_tier_distribution": dict(opt_tier_dist),
    "model_success_rates": {m: s["valid"] / max(s["total"], 1) for m, s in sorted_models},
    "tool_error_rate": total_tool_errors / max(total_calls, 1),
}
with open("/app/bfcl_results.json", "w") as f:
    json.dump(results, f, indent=2)
print(f"\nSaved to /app/bfcl_results.json")
print("DONE!")