| { | |
| "metrics": { | |
| "A": { | |
| "config": "A", | |
| "accuracy": 0.16, | |
| "avg_cost": 1.0, | |
| "safety": 0.975, | |
| "n": 200, | |
| "by_action": { | |
| "tool_call": 1.0, | |
| "retrieval": 0.037, | |
| "file_read": 0.0, | |
| "file_write": 0.0, | |
| "repair": 0.0, | |
| "verifier": 0.0, | |
| "ask_clarification": 0.0, | |
| "final_answer": 0.0, | |
| "BLOCKED": 0.0 | |
| } | |
| }, | |
| "B": { | |
| "config": "B", | |
| "accuracy": 0.19, | |
| "avg_cost": 0.15, | |
| "safety": 0.975, | |
| "n": 200, | |
| "by_action": { | |
| "tool_call": 0.871, | |
| "retrieval": 0.0, | |
| "file_read": 0.037, | |
| "file_write": 0.0, | |
| "repair": 0.0, | |
| "verifier": 0.0, | |
| "ask_clarification": 0.5, | |
| "final_answer": 0.0, | |
| "BLOCKED": 0.0 | |
| } | |
| }, | |
| "C": { | |
| "config": "C", | |
| "accuracy": 0.19, | |
| "avg_cost": 0.285, | |
| "safety": 0.975, | |
| "n": 200, | |
| "by_action": { | |
| "tool_call": 0.871, | |
| "retrieval": 0.0, | |
| "file_read": 0.037, | |
| "file_write": 0.0, | |
| "repair": 0.0, | |
| "verifier": 0.0, | |
| "ask_clarification": 0.5, | |
| "final_answer": 0.0, | |
| "BLOCKED": 0.0 | |
| }, | |
| "accept_rate": 0.965 | |
| }, | |
| "D": { | |
| "config": "D", | |
| "accuracy": 0.19, | |
| "avg_cost": 0.25, | |
| "safety": 0.975, | |
| "n": 200, | |
| "by_action": { | |
| "tool_call": 0.871, | |
| "retrieval": 0.0, | |
| "file_read": 0.037, | |
| "file_write": 0.0, | |
| "repair": 0.0, | |
| "verifier": 0.0, | |
| "ask_clarification": 0.5, | |
| "final_answer": 0.0, | |
| "BLOCKED": 0.0 | |
| }, | |
| "accept_rate": 0.0, | |
| "mean_score": -2.638, | |
| "min_score": -4.656, | |
| "max_score": -0.875 | |
| }, | |
| "E": { | |
| "config": "E", | |
| "accuracy": 0.17, | |
| "avg_cost": 0.75, | |
| "safety": 0.975, | |
| "n": 200, | |
| "by_action": { | |
| "tool_call": 0.806, | |
| "retrieval": 0.0, | |
| "file_read": 0.037, | |
| "file_write": 0.0, | |
| "repair": 0.0, | |
| "verifier": 0.0, | |
| "ask_clarification": 0.4, | |
| "final_answer": 0.0, | |
| "BLOCKED": 0.0 | |
| }, | |
| "accept_rate": 1.0 | |
| } | |
| }, | |
| "config": { | |
| "cheap_model": "narcolepticchicken/speculative-proposer-qwen3-1.7b", | |
| "verifier_model": "narcolepticchicken/speculative-verifier-qwen3-4b", | |
| "strong_model": "Qwen/Qwen3-8B", | |
| "eval_dataset": "narcolepticchicken/speculative-actions-eval", | |
| "n_examples": 200, | |
| "reward_threshold": 0.0 | |
| }, | |
| "action_distribution": { | |
| "ask_clarification": 20, | |
| "final_answer": 48, | |
| "file_read": 27, | |
| "tool_call": 31, | |
| "repair": 10, | |
| "retrieval": 27, | |
| "file_write": 18, | |
| "verifier": 14, | |
| "BLOCKED": 5 | |
| } | |
| } |