agent-cost-optimizer / eval /benchmark_results.json
narcolepticchicken's picture
Upload eval/benchmark_results.json with huggingface_hub
95c42d4 verified
{
"aco_v8": {
"name": "aco_v8",
"success_rate": 0.796,
"avg_cost": 0.7781665000000001,
"model_cost": 0.7544249999999999,
"tool_cost": 0.0213615,
"ver_cost": 0.0023799999999999997,
"avg_context_tokens": 9352.864,
"verifications": 238,
"avg_tools": 2.727,
"escalations": 0,
"downgrades": 0
},
"frontier": {
"name": "always_frontier",
"success_rate": 0.91,
"avg_cost": 1.0413615,
"model_cost": 1.0413615,
"tool_cost": 0.0,
"ver_cost": 0.0,
"avg_context_tokens": 8000.0,
"verifications": 2000,
"avg_tools": 0.0,
"escalations": 0,
"downgrades": 0
},
"heuristic": {
"name": "heuristic",
"success_rate": 0.845,
"avg_cost": 0.9203665,
"model_cost": 0.9203665,
"tool_cost": 0.0,
"ver_cost": 0.0,
"avg_context_tokens": 8000.0,
"verifications": 2000,
"avg_tools": 0.0,
"escalations": 0,
"downgrades": 0
},
"cheap": {
"name": "always_cheap",
"success_rate": 0.2985,
"avg_cost": 0.07136150000000001,
"model_cost": 0.07136150000000001,
"tool_cost": 0.0,
"ver_cost": 0.0,
"avg_context_tokens": 8000.0,
"verifications": 2000,
"avg_tools": 0.0,
"escalations": 0,
"downgrades": 0
}
}