speculative-tool-actions / eval_results_v2.json
narcolepticchicken's picture
Upload eval_results_v2.json
26e44c5 verified
{
"metrics": {
"A": {"accuracy": 0.400, "avg_cost": 1.000, "xRandom": 3.6, "xMaj": 1.7},
"B": {"accuracy": 0.510, "avg_cost": 0.150, "xRandom": 4.6, "xMaj": 2.1},
"C": {"accuracy": 0.400, "avg_cost": 1.250, "accept_rate": 0.000, "xRandom": 3.6, "xMaj": 1.7},
"D": {"accuracy": 0.510, "avg_cost": 0.250, "accept_rate": 0.215, "mean_score": -1.515, "xRandom": 4.6, "xMaj": 2.1},
"E": {"accuracy": 0.420, "avg_cost": 0.750, "xRandom": 3.8, "xMaj": 1.8}
},
"baselines": {
"random": 0.111,
"majority": 0.240,
"majority_class": "final_answer"
},
"n": 200,
"distribution": {
"ask_clarification": 20, "final_answer": 48, "file_read": 27,
"tool_call": 31, "repair": 10, "retrieval": 27,
"file_write": 18, "verifier": 14, "BLOCKED": 5
},
"pareto_frontier": [{"config": "B", "cost": 0.150, "accuracy": 0.510}],
"winner": "B",
"conclusion": "Cheap fine-tuned proposer alone is optimal. Speculative routing fails when verifiers cannot discriminate quality.",
"hardware": "A-D: dual A10G (a10g-largex2), E: single A10G"
}