agent-cost-optimizer / router_models /v6_eval_results.json
narcolepticchicken's picture
Upload router_models/v6_eval_results.json with huggingface_hub
391292b verified
{
"always_frontier": {
"success": 0.8935,
"avg_cost": 1.0,
"unsafe_rate": 0.023,
"false_done": 0.0835,
"tier_dist": {
"4": 2000
},
"escalations": 878,
"downgrades": 420,
"heuristic_only": 702
},
"always_cheap": {
"success": 0.2225,
"avg_cost": 0.04999999999999823,
"unsafe_rate": 0.7775,
"false_done": 0.0,
"tier_dist": {
"1": 2000
},
"escalations": 0,
"downgrades": 2000,
"heuristic_only": 0
},
"heuristic_diff+1": {
"success": 0.8405,
"avg_cost": 0.9271500000000037,
"unsafe_rate": 0.047,
"false_done": 0.1125,
"tier_dist": {
"4": 702,
"3": 651,
"5": 420,
"2": 227
},
"escalations": 0,
"downgrades": 0,
"heuristic_only": 2000
},
"heuristic_floor": {
"success": 0.5875,
"avg_cost": 0.5013749999999922,
"unsafe_rate": 0.28,
"false_done": 0.1325,
"tier_dist": {
"3": 913,
"2": 651,
"4": 209,
"1": 227
},
"escalations": 0,
"downgrades": 2000,
"heuristic_only": 0
},
"oracle": {
"success": 0.998,
"avg_cost": 0.47694999999999094,
"unsafe_rate": 0.0,
"false_done": 0.002,
"tier_dist": {
"3": 644,
"5": 46,
"2": 571,
"4": 294,
"1": 445
},
"escalations": 94,
"downgrades": 1486,
"heuristic_only": 420
},
"hybrid_s0.25_d0.70": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.25_d0.75": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.25_d0.80": {
"success": 0.8225,
"avg_cost": 0.8416500000000027,
"unsafe_rate": 0.058,
"false_done": 0.1195,
"tier_dist": {
"4": 1044,
"3": 651,
"5": 78,
"2": 227
},
"escalations": 0,
"downgrades": 342,
"heuristic_only": 1658
},
"hybrid_s0.25_d0.85": {
"success": 0.838,
"avg_cost": 0.9084000000000035,
"unsafe_rate": 0.048,
"false_done": 0.114,
"tier_dist": {
"4": 777,
"3": 651,
"5": 345,
"2": 227
},
"escalations": 0,
"downgrades": 75,
"heuristic_only": 1925
},
"hybrid_s0.30_d0.70": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.30_d0.75": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.30_d0.80": {
"success": 0.8225,
"avg_cost": 0.8416500000000027,
"unsafe_rate": 0.058,
"false_done": 0.1195,
"tier_dist": {
"4": 1044,
"3": 651,
"5": 78,
"2": 227
},
"escalations": 0,
"downgrades": 342,
"heuristic_only": 1658
},
"hybrid_s0.30_d0.85": {
"success": 0.838,
"avg_cost": 0.9084000000000035,
"unsafe_rate": 0.048,
"false_done": 0.114,
"tier_dist": {
"4": 777,
"3": 651,
"5": 345,
"2": 227
},
"escalations": 0,
"downgrades": 75,
"heuristic_only": 1925
},
"hybrid_s0.35_d0.70": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.35_d0.75": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.35_d0.80": {
"success": 0.8225,
"avg_cost": 0.8416500000000027,
"unsafe_rate": 0.058,
"false_done": 0.1195,
"tier_dist": {
"4": 1044,
"3": 651,
"5": 78,
"2": 227
},
"escalations": 0,
"downgrades": 342,
"heuristic_only": 1658
},
"hybrid_s0.35_d0.85": {
"success": 0.838,
"avg_cost": 0.9084000000000035,
"unsafe_rate": 0.048,
"false_done": 0.114,
"tier_dist": {
"4": 777,
"3": 651,
"5": 345,
"2": 227
},
"escalations": 0,
"downgrades": 75,
"heuristic_only": 1925
},
"hybrid_s0.40_d0.70": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.40_d0.75": {
"success": 0.8175,
"avg_cost": 0.8221500000000023,
"unsafe_rate": 0.0595,
"false_done": 0.123,
"tier_dist": {
"4": 1122,
"3": 651,
"2": 227
},
"escalations": 0,
"downgrades": 420,
"heuristic_only": 1580
},
"hybrid_s0.40_d0.80": {
"success": 0.8225,
"avg_cost": 0.8416500000000027,
"unsafe_rate": 0.058,
"false_done": 0.1195,
"tier_dist": {
"4": 1044,
"3": 651,
"5": 78,
"2": 227
},
"escalations": 0,
"downgrades": 342,
"heuristic_only": 1658
},
"hybrid_s0.40_d0.85": {
"success": 0.838,
"avg_cost": 0.9084000000000035,
"unsafe_rate": 0.048,
"false_done": 0.114,
"tier_dist": {
"4": 777,
"3": 651,
"5": 345,
"2": 227
},
"escalations": 0,
"downgrades": 75,
"heuristic_only": 1925
}
}