Upload training/bfcl_eval.py with huggingface_hub
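The commit title indicates the file was pushed with huggingface_hub. For reference, a minimal sketch of how such an upload is typically done with the library's HfApi.upload_file; the repo_id here is a placeholder, not taken from this commit:

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from `huggingface-cli login` by default
    api.upload_file(
        path_or_fileobj="training/bfcl_eval.py",   # local file to push
        path_in_repo="training/bfcl_eval.py",      # destination path in the repo
        repo_id="your-username/your-repo",         # placeholder: actual repo unknown
        commit_message="Upload training/bfcl_eval.py with huggingface_hub",
    )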
training/bfcl_eval.py
ADDED
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""BFCL v3 Function-Calling Benchmark: Evaluate ACO tool-use routing."""
+import json
+from collections import defaultdict
+from datasets import load_dataset
+
+print("=" * 80)
+print("BFCL v3 FUNCTION-CALLING BENCHMARK")
+print("=" * 80)
+
+# Load the BFCL v3 trajectories dataset
+print("\n[1] Loading BFCL v3 trajectories...")
+ds = load_dataset('bespokelabs/bfcl-v3-02-27-metrics-trajectories', split='train')
+print(f"    Loaded {len(ds)} rows")
+
+# Aggregate per-model success counts and tool-error totals
+model_stats = defaultdict(lambda: {"valid": 0, "total": 0, "tool_errors": 0})
+for row in ds:
+    m = row['model_name']
+    model_stats[m]["total"] += 1
+    if row['valid']: model_stats[m]["valid"] += 1
+    model_stats[m]["tool_errors"] += row.get('num_tool_errors', 0)
+
+# Print per-model success rates
+print("\n[2] Model success rates (top 20 by volume):")
+sorted_models = sorted(model_stats.items(), key=lambda x: -x[1]["total"])[:20]
+print(f"{'Model':<40} {'Valid':>8} {'Total':>8} {'Rate':>8} {'ToolErr':>8}")
+print("-" * 75)
+for m, s in sorted_models:
+    rate = s["valid"] / max(s["total"], 1)
+    print(f"{m:<40} {s['valid']:>8} {s['total']:>8} {rate:>8.3f} {s['tool_errors']:>8}")
+
+# Group results by task ID to find routing opportunities
+print("\n[3] Task-level routing analysis...")
+task_results = defaultdict(dict)
+for row in ds:
+    task_results[row['id']][row['model_name']] = {
+        'valid': row['valid'],
+        'tool_errors': row.get('num_tool_errors', 0),
+        'category': row.get('category', ''),
+    }
+
+# Classify models into cost tiers based on overall success rate
+all_model_rates = {m: s["valid"] / max(s["total"], 1) for m, s in model_stats.items()}
+
+def get_tier(rate):
+    if rate >= 0.85: return 4  # frontier
+    if rate >= 0.70: return 3  # medium
+    if rate >= 0.50: return 2  # cheap
+    return 1                   # tiny
+
+model_tiers = {m: get_tier(r) for m, r in all_model_rates.items()}
+
+# For each task, the optimal tier is the cheapest tier that succeeds
+opt_tier_dist = defaultdict(int)
+savings_opportunity = 0
+for tid, task_runs in task_results.items():
+    successful_tiers = []
+    for m, r in task_runs.items():
+        if r['valid']:
+            successful_tiers.append(model_tiers.get(m, 3))
+    if successful_tiers:
+        opt = min(successful_tiers)
+        opt_tier_dist[opt] += 1
+        if opt < 4:  # could have used something cheaper than frontier
+            savings_opportunity += 1
+
+total_tasks = len(task_results)
+print(f"    Total unique tasks: {total_tasks}")
+print(f"    Tasks solvable cheaper than frontier: {savings_opportunity} ({savings_opportunity/total_tasks*100:.1f}%)")
+print("    Optimal tier distribution:")
+for tier in sorted(opt_tier_dist.keys()):
+    print(f"      Tier {tier}: {opt_tier_dist[tier]} ({opt_tier_dist[tier]/total_tasks*100:.1f}%)")
+
+# Tool-error analysis
+print("\n[4] Tool-use cost analysis...")
+total_tool_errors = sum(s["tool_errors"] for s in model_stats.values())
+total_calls = sum(s["total"] for s in model_stats.values())
+print(f"    Total tool errors: {total_tool_errors}")
+print(f"    Tool error rate: {total_tool_errors/max(total_calls,1)*100:.2f}%")
+
+# Tally error categories
+error_cats = defaultdict(int)
+for row in ds:
+    if row.get('error_type'):
+        error_cats[row['error_type']] += 1
+print("\n    Error categories:")
+for cat, count in sorted(error_cats.items(), key=lambda x: -x[1])[:10]:
+    print(f"      {cat}: {count}")
+
+# Save results
+results = {
+    "total_tasks": total_tasks,
+    "savings_opportunity_pct": savings_opportunity / total_tasks * 100,
+    "opt_tier_distribution": dict(opt_tier_dist),
+    "model_success_rates": {m: s["valid"] / max(s["total"], 1) for m, s in sorted_models},
+    "tool_error_rate": total_tool_errors / max(total_calls, 1),
+}
+with open("/app/bfcl_results.json", "w") as f:
+    json.dump(results, f, indent=2)
+print("\nSaved to /app/bfcl_results.json")
+print("DONE!")
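Once the script has run, the saved metrics can be consumed downstream. A minimal sketch, assuming the /app/bfcl_results.json path and the key names written by the script above:

    import json

    with open("/app/bfcl_results.json") as f:
        metrics = json.load(f)

    # Fraction of tasks where a sub-frontier tier (1-3) already produces a valid result
    print(f"Savings opportunity: {metrics['savings_opportunity_pct']:.1f}%")
    print(f"Tool error rate: {metrics['tool_error_rate']*100:.2f}%")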