{ "metrics": { "A": {"accuracy": 0.400, "avg_cost": 1.000, "xRandom": 3.6, "xMaj": 1.7}, "B": {"accuracy": 0.510, "avg_cost": 0.150, "xRandom": 4.6, "xMaj": 2.1}, "C": {"accuracy": 0.400, "avg_cost": 1.250, "accept_rate": 0.000, "xRandom": 3.6, "xMaj": 1.7}, "D": {"accuracy": 0.510, "avg_cost": 0.250, "accept_rate": 0.215, "mean_score": -1.515, "xRandom": 4.6, "xMaj": 2.1}, "E": {"accuracy": 0.420, "avg_cost": 0.750, "xRandom": 3.8, "xMaj": 1.8} }, "baselines": { "random": 0.111, "majority": 0.240, "majority_class": "final_answer" }, "n": 200, "distribution": { "ask_clarification": 20, "final_answer": 48, "file_read": 27, "tool_call": 31, "repair": 10, "retrieval": 27, "file_write": 18, "verifier": 14, "BLOCKED": 5 }, "pareto_frontier": [{"config": "B", "cost": 0.150, "accuracy": 0.510}], "winner": "B", "conclusion": "Cheap fine-tuned proposer alone is optimal. Speculative routing fails when verifiers cannot discriminate quality.", "hardware": "A-D: dual A10G (a10g-largex2), E: single A10G" }