{
  "metrics": {
    "A": {"accuracy": 0.400, "avg_cost": 1.000, "xRandom": 3.6, "xMaj": 1.7},
    "B": {"accuracy": 0.510, "avg_cost": 0.150, "xRandom": 4.6, "xMaj": 2.1},
    "C": {"accuracy": 0.400, "avg_cost": 1.250, "accept_rate": 0.000, "xRandom": 3.6, "xMaj": 1.7},
    "D": {"accuracy": 0.510, "avg_cost": 0.250, "accept_rate": 0.215, "mean_score": -1.515, "xRandom": 4.6, "xMaj": 2.1},
    "E": {"accuracy": 0.420, "avg_cost": 0.750, "xRandom": 3.8, "xMaj": 1.8}
  },
  "baselines": {
    "random": 0.111,
    "majority": 0.240,
    "majority_class": "final_answer"
  },
  "n": 200,
  "distribution": {
    "ask_clarification": 20, "final_answer": 48, "file_read": 27,
    "tool_call": 31, "repair": 10, "retrieval": 27,
    "file_write": 18, "verifier": 14, "BLOCKED": 5
  },
  "pareto_frontier": [{"config": "B", "cost": 0.150, "accuracy": 0.510}],
  "winner": "B",
  "conclusion": "Cheap fine-tuned proposer alone is optimal. Speculative routing fails when verifiers cannot discriminate quality.",
  "hardware": "A-D: dual A10G (a10g-largex2), E: single A10G"
}
|