Upload eval/bfcl_results.json with huggingface_hub
Browse files- eval/bfcl_results.json +31 -0
eval/bfcl_results.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"total_tasks": 800,
|
| 3 |
+
"savings_opportunity_pct": 84.125,
|
| 4 |
+
"opt_tier_distribution": {
|
| 5 |
+
"2": 13,
|
| 6 |
+
"1": 660
|
| 7 |
+
},
|
| 8 |
+
"model_success_rates": {
|
| 9 |
+
"BitAgent/BitAgent-8B": 0.385,
|
| 10 |
+
"NousResearch/Hermes-2-Pro-Llama-3-8B": 0.02375,
|
| 11 |
+
"NousResearch/Hermes-2-Pro-Mistral-7B": 0.02625,
|
| 12 |
+
"Qwen/QwQ-32B-Preview": 0.0,
|
| 13 |
+
"Qwen/Qwen2-1.5B-Instruct": 0.005,
|
| 14 |
+
"Qwen/Qwen2-7B-Instruct": 0.0325,
|
| 15 |
+
"Qwen/Qwen2.5-1.5B-Instruct": 0.01125,
|
| 16 |
+
"Qwen/Qwen2.5-7B-Instruct": 0.07625,
|
| 17 |
+
"THUDM/glm-4-9b-chat": 0.035,
|
| 18 |
+
"Team-ACE/ToolACE-8B": 0.0775,
|
| 19 |
+
"ZJared/Haha-7B": 0.10375,
|
| 20 |
+
"claude-3-5-sonnet-20241022": 0.075,
|
| 21 |
+
"claude-3-opus-20240229": 0.07125,
|
| 22 |
+
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.00125,
|
| 23 |
+
"gemini-1.5-flash-001": 0.195,
|
| 24 |
+
"gemini-1.5-flash-002": 0.125,
|
| 25 |
+
"gemini-1.5-pro-001-FC": 0.16,
|
| 26 |
+
"gemini-1.5-pro-001": 0.18875,
|
| 27 |
+
"gemini-1.5-pro-002-FC": 0.21625,
|
| 28 |
+
"gemini-2.0-flash-001-FC": 0.17875
|
| 29 |
+
},
|
| 30 |
+
"tool_error_rate": 1.8579889572641257
|
| 31 |
+
}
|