| { |
| "meta": { |
| "model": "meta-llama/Llama-2-7b-chat-hf", |
| "device": "cuda", |
| "dtype": "fp32", |
| "layer": 10, |
| "layers_path": "model.layers", |
| "seed": 123, |
| "task": "gsm8k", |
| "eval_mode": "gen_math", |
| "eval_meta": { |
| "subspace_split": null, |
| "eval_split": "test", |
| "available_splits": [ |
| "train", |
| "test" |
| ], |
| "hf_id": "gsm8k/main" |
| }, |
| "n_eval_loaded": 256, |
| "n_scanned": 256, |
| "base_acc_scan": 0.0390625, |
| "ablt_acc_scan": 0.03125, |
| "flips_total": 9, |
| "flips_used": 9, |
| "patch_steps": [ |
| 0, |
| 1, |
| 2, |
| 3 |
| ], |
| "patch_n_steps": 4, |
| "Qs_path": "Q_shared_layer10.npy", |
| "Qs_shape": [ |
| 4096, |
| 97 |
| ], |
| "gold_text_prefix": " ", |
| "dist_text_prefix": " ", |
| "gold_max_tokens": 0, |
| "distractor_mode": "next_gold", |
| "answer_prefix_effective": "\nLet's think step by step.\nFinal answer (number only):", |
| "max_new_tokens_effective": 64, |
| "run_coeff_controls": false, |
| "use_benchmark_loader": true, |
| "hf_id": "", |
| "hf_split": "test" |
| }, |
| "summary_on_flips": { |
| "patched_self": { |
| "n": 9, |
| "rescued": 8, |
| "rescued_pct": 88.88888888888889 |
| }, |
| "control_time_shuffled": { |
| "n": 9, |
| "rescued": 7, |
| "rescued_pct": 77.77777777777777 |
| }, |
| "control_shared_randvec": { |
| "n": 9, |
| "rescued": 0, |
| "rescued_pct": 0.0 |
| }, |
| "control_rand_subspace": { |
| "n": 9, |
| "rescued": 0, |
| "rescued_pct": 0.0 |
| }, |
| "control_patch_nonshared": { |
| "n": 9, |
| "rescued": 0, |
| "rescued_pct": 0.0 |
| } |
| }, |
| "scan_rows": [ |
| { |
| "ex_id": "gsm8k-test-0", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "55", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 40 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-1", |
| "gold_raw": "80", |
| "baseline": { |
| "pred_answer": "80", |
| "correct": true, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-2", |
| "gold_raw": "12", |
| "baseline": { |
| "pred_answer": "23", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-3", |
| "gold_raw": "140", |
| "baseline": { |
| "pred_answer": "420", |
| "correct": false, |
| "n_gen_tokens": 15 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-4", |
| "gold_raw": "36", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "6", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-5", |
| "gold_raw": "3200", |
| "baseline": { |
| "pred_answer": "1400", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-6", |
| "gold_raw": "38", |
| "baseline": { |
| "pred_answer": "23", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "8", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-7", |
| "gold_raw": "32", |
| "baseline": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-8", |
| "gold_raw": "92", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-9", |
| "gold_raw": "16", |
| "baseline": { |
| "pred_answer": "900", |
| "correct": false, |
| "n_gen_tokens": 21 |
| }, |
| "ablated": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-10", |
| "gold_raw": "45", |
| "baseline": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-11", |
| "gold_raw": "270", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 17 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-12", |
| "gold_raw": "100", |
| "baseline": { |
| "pred_answer": "100", |
| "correct": true, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-13", |
| "gold_raw": "25", |
| "baseline": { |
| "pred_answer": "25", |
| "correct": true, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "25", |
| "correct": true, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-14", |
| "gold_raw": "800", |
| "baseline": { |
| "pred_answer": "350", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-15", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-16", |
| "gold_raw": "7000", |
| "baseline": { |
| "pred_answer": "450", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "400", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-17", |
| "gold_raw": "25", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-18", |
| "gold_raw": "3", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": true, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-19", |
| "gold_raw": "3430", |
| "baseline": { |
| "pred_answer": "300", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "1", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-20", |
| "gold_raw": "106", |
| "baseline": { |
| "pred_answer": "102", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-21", |
| "gold_raw": "80", |
| "baseline": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-22", |
| "gold_raw": "26", |
| "baseline": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 25 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-23", |
| "gold_raw": "750", |
| "baseline": { |
| "pred_answer": "75", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-24", |
| "gold_raw": "9", |
| "baseline": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-25", |
| "gold_raw": "40", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-26", |
| "gold_raw": "14", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-27", |
| "gold_raw": "160", |
| "baseline": { |
| "pred_answer": "128", |
| "correct": false, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 8 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-28", |
| "gold_raw": "6", |
| "baseline": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 38 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-29", |
| "gold_raw": "132", |
| "baseline": { |
| "pred_answer": "80", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "16", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-30", |
| "gold_raw": "8", |
| "baseline": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 12 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-31", |
| "gold_raw": "68", |
| "baseline": { |
| "pred_answer": "50", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-32", |
| "gold_raw": "31", |
| "baseline": { |
| "pred_answer": "320", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "$", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-33", |
| "gold_raw": "100", |
| "baseline": { |
| "pred_answer": "500", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-34", |
| "gold_raw": "1509", |
| "baseline": { |
| "pred_answer": "781", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-35", |
| "gold_raw": "480", |
| "baseline": { |
| "pred_answer": "150", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-36", |
| "gold_raw": "520", |
| "baseline": { |
| "pred_answer": "42.00", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "100.00", |
| "correct": false, |
| "n_gen_tokens": 9 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-37", |
| "gold_raw": "3", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 11 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-38", |
| "gold_raw": "33", |
| "baseline": { |
| "pred_answer": "25", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-39", |
| "gold_raw": "120", |
| "baseline": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-40", |
| "gold_raw": "14", |
| "baseline": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-41", |
| "gold_raw": "20", |
| "baseline": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 7 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-42", |
| "gold_raw": "95200", |
| "baseline": { |
| "pred_answer": "80000", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "40000", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-43", |
| "gold_raw": "77", |
| "baseline": { |
| "pred_answer": "19", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-44", |
| "gold_raw": "81", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-45", |
| "gold_raw": "310", |
| "baseline": { |
| "pred_answer": "210", |
| "correct": false, |
| "n_gen_tokens": 9 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-46", |
| "gold_raw": "100", |
| "baseline": { |
| "pred_answer": "80", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 9 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-47", |
| "gold_raw": "160", |
| "baseline": { |
| "pred_answer": "240", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "8", |
| "correct": false, |
| "n_gen_tokens": 27 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-48", |
| "gold_raw": "25", |
| "baseline": { |
| "pred_answer": "160", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "0.75", |
| "correct": false, |
| "n_gen_tokens": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-49", |
| "gold_raw": "1400", |
| "baseline": { |
| "pred_answer": "800", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "200", |
| "correct": false, |
| "n_gen_tokens": 28 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-50", |
| "gold_raw": "120", |
| "baseline": { |
| "pred_answer": "60", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-51", |
| "gold_raw": "48", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-52", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "21", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-53", |
| "gold_raw": "15400", |
| "baseline": { |
| "pred_answer": "000", |
| "correct": false, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "50", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-54", |
| "gold_raw": "80", |
| "baseline": { |
| "pred_answer": "140", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-55", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-56", |
| "gold_raw": "14", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-57", |
| "gold_raw": "31", |
| "baseline": { |
| "pred_answer": "16", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 8 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-58", |
| "gold_raw": "36", |
| "baseline": { |
| "pred_answer": "150", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-59", |
| "gold_raw": "144", |
| "baseline": { |
| "pred_answer": "160", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 27 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-60", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "5", |
| "correct": true, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "65", |
| "correct": false, |
| "n_gen_tokens": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-61", |
| "gold_raw": "750", |
| "baseline": { |
| "pred_answer": "250", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-62", |
| "gold_raw": "38", |
| "baseline": { |
| "pred_answer": "48", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "60", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-63", |
| "gold_raw": "48", |
| "baseline": { |
| "pred_answer": "56", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-64", |
| "gold_raw": "655", |
| "baseline": { |
| "pred_answer": "1625", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-65", |
| "gold_raw": "800", |
| "baseline": { |
| "pred_answer": "0.5", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-66", |
| "gold_raw": "7300", |
| "baseline": { |
| "pred_answer": "10400", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "2300", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-67", |
| "gold_raw": "48", |
| "baseline": { |
| "pred_answer": "16", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-68", |
| "gold_raw": "4", |
| "baseline": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": true, |
| "n_gen_tokens": 9 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-69", |
| "gold_raw": "15", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": true, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-70", |
| "gold_raw": "23", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-71", |
| "gold_raw": "225", |
| "baseline": { |
| "pred_answer": "150", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-72", |
| "gold_raw": "15", |
| "baseline": { |
| "pred_answer": "13", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-73", |
| "gold_raw": "82", |
| "baseline": { |
| "pred_answer": "32", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-74", |
| "gold_raw": "1218", |
| "baseline": { |
| "pred_answer": "952", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-75", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "10.00", |
| "correct": false, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": true, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-76", |
| "gold_raw": "36", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-77", |
| "gold_raw": "13", |
| "baseline": { |
| "pred_answer": "12.50", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "20.50", |
| "correct": false, |
| "n_gen_tokens": 8 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-78", |
| "gold_raw": "11", |
| "baseline": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-79", |
| "gold_raw": "8", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-80", |
| "gold_raw": "440", |
| "baseline": { |
| "pred_answer": "400", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "1", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-81", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-82", |
| "gold_raw": "45", |
| "baseline": { |
| "pred_answer": "48", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-83", |
| "gold_raw": "54", |
| "baseline": { |
| "pred_answer": "109", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-84", |
| "gold_raw": "6", |
| "baseline": { |
| "pred_answer": "9", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "18", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-85", |
| "gold_raw": "240", |
| "baseline": { |
| "pred_answer": "1200", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-86", |
| "gold_raw": "428", |
| "baseline": { |
| "pred_answer": "400", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "278", |
| "correct": false, |
| "n_gen_tokens": 59 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-87", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "3.5", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": true, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-88", |
| "gold_raw": "255", |
| "baseline": { |
| "pred_answer": "305", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-89", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 9 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-90", |
| "gold_raw": "9", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 23 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-91", |
| "gold_raw": "157", |
| "baseline": { |
| "pred_answer": "175", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "1", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-92", |
| "gold_raw": "56", |
| "baseline": { |
| "pred_answer": "80", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-93", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-94", |
| "gold_raw": "144", |
| "baseline": { |
| "pred_answer": "36", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-95", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-96", |
| "gold_raw": "4", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "24", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-97", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "25", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-98", |
| "gold_raw": "42", |
| "baseline": { |
| "pred_answer": "48", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-99", |
| "gold_raw": "7", |
| "baseline": { |
| "pred_answer": "50", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-100", |
| "gold_raw": "250", |
| "baseline": { |
| "pred_answer": "350", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "150", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-101", |
| "gold_raw": "12", |
| "baseline": { |
| "pred_answer": "11", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-102", |
| "gold_raw": "7", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-103", |
| "gold_raw": "8", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-104", |
| "gold_raw": "26", |
| "baseline": { |
| "pred_answer": "25.00", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "19.5", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-105", |
| "gold_raw": "42", |
| "baseline": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "26", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-106", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "300", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-107", |
| "gold_raw": "14400", |
| "baseline": { |
| "pred_answer": "12000", |
| "correct": false, |
| "n_gen_tokens": 24 |
| }, |
| "ablated": { |
| "pred_answer": "5000000000000000000000000000000000000000000000000000000000000", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-108", |
| "gold_raw": "400", |
| "baseline": { |
| "pred_answer": "450", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-109", |
| "gold_raw": "40", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 60 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-110", |
| "gold_raw": "83", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-111", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "19", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-112", |
| "gold_raw": "80", |
| "baseline": { |
| "pred_answer": "400", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "6", |
| "correct": false, |
| "n_gen_tokens": 57 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-113", |
| "gold_raw": "180", |
| "baseline": { |
| "pred_answer": "72", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "6.0", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-114", |
| "gold_raw": "1450000", |
| "baseline": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "000", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-115", |
| "gold_raw": "15", |
| "baseline": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-116", |
| "gold_raw": "1000", |
| "baseline": { |
| "pred_answer": "10000", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-117", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3.00", |
| "correct": false, |
| "n_gen_tokens": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-118", |
| "gold_raw": "15", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "15", |
| "correct": true, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-119", |
| "gold_raw": "100", |
| "baseline": { |
| "pred_answer": "50", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "500000000000000000000000000000000000000000000000000000000000000", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-120", |
| "gold_raw": "335", |
| "baseline": { |
| "pred_answer": "250", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-121", |
| "gold_raw": "60", |
| "baseline": { |
| "pred_answer": "34", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-122", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-123", |
| "gold_raw": "9500", |
| "baseline": { |
| "pred_answer": "70000", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-124", |
| "gold_raw": "160", |
| "baseline": { |
| "pred_answer": "80", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-125", |
| "gold_raw": "1050", |
| "baseline": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-126", |
| "gold_raw": "91", |
| "baseline": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-127", |
| "gold_raw": "21", |
| "baseline": { |
| "pred_answer": "24", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "36", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-128", |
| "gold_raw": "20", |
| "baseline": { |
| "pred_answer": "21", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-129", |
| "gold_raw": "36", |
| "baseline": { |
| "pred_answer": "42", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-130", |
| "gold_raw": "36", |
| "baseline": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 21 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-131", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-132", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "9", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-133", |
| "gold_raw": "32", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-134", |
| "gold_raw": "18", |
| "baseline": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-135", |
| "gold_raw": "4", |
| "baseline": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "17", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-136", |
| "gold_raw": "48", |
| "baseline": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-137", |
| "gold_raw": "8", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-138", |
| "gold_raw": "21", |
| "baseline": { |
| "pred_answer": "11", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 19 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-139", |
| "gold_raw": "25", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-140", |
| "gold_raw": "3000", |
| "baseline": { |
| "pred_answer": "000", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-141", |
| "gold_raw": "40", |
| "baseline": { |
| "pred_answer": "18", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-142", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "28", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-143", |
| "gold_raw": "90", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-144", |
| "gold_raw": "23", |
| "baseline": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 7 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-145", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "32", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "50", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-146", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "180", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-147", |
| "gold_raw": "122", |
| "baseline": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 19 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-148", |
| "gold_raw": "300", |
| "baseline": { |
| "pred_answer": "300", |
| "correct": true, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 7 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-149", |
| "gold_raw": "448", |
| "baseline": { |
| "pred_answer": "112", |
| "correct": false, |
| "n_gen_tokens": 7 |
| }, |
| "ablated": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 7 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-150", |
| "gold_raw": "2450", |
| "baseline": { |
| "pred_answer": "1500", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-151", |
| "gold_raw": "803", |
| "baseline": { |
| "pred_answer": "365", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "1", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-152", |
| "gold_raw": "16", |
| "baseline": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-153", |
| "gold_raw": "280", |
| "baseline": { |
| "pred_answer": "350", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-154", |
| "gold_raw": "13", |
| "baseline": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-155", |
| "gold_raw": "20", |
| "baseline": { |
| "pred_answer": "13.5", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "45", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-156", |
| "gold_raw": "14", |
| "baseline": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-157", |
| "gold_raw": "32", |
| "baseline": { |
| "pred_answer": "18", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-158", |
| "gold_raw": "105", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-159", |
| "gold_raw": "71", |
| "baseline": { |
| "pred_answer": "121", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-160", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "8", |
| "correct": false, |
| "n_gen_tokens": 10 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-161", |
| "gold_raw": "30", |
| "baseline": { |
| "pred_answer": "24", |
| "correct": false, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-162", |
| "gold_raw": "95", |
| "baseline": { |
| "pred_answer": "90", |
| "correct": false, |
| "n_gen_tokens": 15 |
| }, |
| "ablated": { |
| "pred_answer": "60", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-163", |
| "gold_raw": "147", |
| "baseline": { |
| "pred_answer": "130", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 36 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-164", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "50", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-165", |
| "gold_raw": "40000", |
| "baseline": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-166", |
| "gold_raw": "12", |
| "baseline": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-167", |
| "gold_raw": "129200", |
| "baseline": { |
| "pred_answer": "144000", |
| "correct": false, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-168", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-169", |
| "gold_raw": "45", |
| "baseline": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-170", |
| "gold_raw": "20", |
| "baseline": { |
| "pred_answer": "25", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-171", |
| "gold_raw": "1170", |
| "baseline": { |
| "pred_answer": "1500", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "300", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-172", |
| "gold_raw": "192", |
| "baseline": { |
| "pred_answer": "32", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 8 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-173", |
| "gold_raw": "14", |
| "baseline": { |
| "pred_answer": "22", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "7", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-174", |
| "gold_raw": "144", |
| "baseline": { |
| "pred_answer": "48", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "6", |
| "correct": false, |
| "n_gen_tokens": 46 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-175", |
| "gold_raw": "350", |
| "baseline": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-176", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "42", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-177", |
| "gold_raw": "7", |
| "baseline": { |
| "pred_answer": "3.75", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-178", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "50", |
| "correct": true, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-179", |
| "gold_raw": "8", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "8", |
| "correct": true, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-180", |
| "gold_raw": "3160", |
| "baseline": { |
| "pred_answer": "4800", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-181", |
| "gold_raw": "80", |
| "baseline": { |
| "pred_answer": "82.5", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "80", |
| "correct": true, |
| "n_gen_tokens": 7 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-182", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-183", |
| "gold_raw": "40", |
| "baseline": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-184", |
| "gold_raw": "78", |
| "baseline": { |
| "pred_answer": "45", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-185", |
| "gold_raw": "273", |
| "baseline": { |
| "pred_answer": "220", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-186", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "18", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 26 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-187", |
| "gold_raw": "195", |
| "baseline": { |
| "pred_answer": "90", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-188", |
| "gold_raw": "1128", |
| "baseline": { |
| "pred_answer": "168", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-189", |
| "gold_raw": "172", |
| "baseline": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-190", |
| "gold_raw": "30", |
| "baseline": { |
| "pred_answer": "53", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "50", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-191", |
| "gold_raw": "30", |
| "baseline": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-192", |
| "gold_raw": "92", |
| "baseline": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-193", |
| "gold_raw": "20", |
| "baseline": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-194", |
| "gold_raw": "540", |
| "baseline": { |
| "pred_answer": "91", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-195", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "32", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-196", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-197", |
| "gold_raw": "38", |
| "baseline": { |
| "pred_answer": "18", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "14", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-198", |
| "gold_raw": "4000", |
| "baseline": { |
| "pred_answer": "300", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-199", |
| "gold_raw": "594", |
| "baseline": { |
| "pred_answer": "200", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-200", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "6", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-201", |
| "gold_raw": "142", |
| "baseline": { |
| "pred_answer": "104", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-202", |
| "gold_raw": "9", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-203", |
| "gold_raw": "6", |
| "baseline": { |
| "pred_answer": "18", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "6", |
| "correct": true, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-204", |
| "gold_raw": "100", |
| "baseline": { |
| "pred_answer": "260", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "40", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-205", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-206", |
| "gold_raw": "15", |
| "baseline": { |
| "pred_answer": "155", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "35", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-207", |
| "gold_raw": "22", |
| "baseline": { |
| "pred_answer": "21", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-208", |
| "gold_raw": "16", |
| "baseline": { |
| "pred_answer": "96", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-209", |
| "gold_raw": "16", |
| "baseline": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-210", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "2.5", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "3.5", |
| "correct": false, |
| "n_gen_tokens": 56 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-211", |
| "gold_raw": "23", |
| "baseline": { |
| "pred_answer": "10", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-212", |
| "gold_raw": "30", |
| "baseline": { |
| "pred_answer": "25", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-213", |
| "gold_raw": "14000", |
| "baseline": { |
| "pred_answer": "000", |
| "correct": false, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 33 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-214", |
| "gold_raw": "60", |
| "baseline": { |
| "pred_answer": "8", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "16", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-215", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-216", |
| "gold_raw": "3", |
| "baseline": { |
| "pred_answer": "6.25", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 19 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-217", |
| "gold_raw": "30", |
| "baseline": { |
| "pred_answer": "34", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "24", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-218", |
| "gold_raw": "1920", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-219", |
| "gold_raw": "84", |
| "baseline": { |
| "pred_answer": "140", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-220", |
| "gold_raw": "8", |
| "baseline": { |
| "pred_answer": "24", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-221", |
| "gold_raw": "12", |
| "baseline": { |
| "pred_answer": "1200", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-222", |
| "gold_raw": "260", |
| "baseline": { |
| "pred_answer": "150", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-223", |
| "gold_raw": "288", |
| "baseline": { |
| "pred_answer": "330", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-224", |
| "gold_raw": "3", |
| "baseline": { |
| "pred_answer": "25", |
| "correct": false, |
| "n_gen_tokens": 12 |
| }, |
| "ablated": { |
| "pred_answer": "$", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-225", |
| "gold_raw": "1596", |
| "baseline": { |
| "pred_answer": "1620", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "400", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-226", |
| "gold_raw": "81", |
| "baseline": { |
| "pred_answer": "2700", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-227", |
| "gold_raw": "56", |
| "baseline": { |
| "pred_answer": "108", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "60", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-228", |
| "gold_raw": "1490", |
| "baseline": { |
| "pred_answer": "200", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "5", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-229", |
| "gold_raw": "2", |
| "baseline": { |
| "pred_answer": "1", |
| "correct": false, |
| "n_gen_tokens": 15 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-230", |
| "gold_raw": "20", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "55", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-231", |
| "gold_raw": "11", |
| "baseline": { |
| "pred_answer": "10.5", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-232", |
| "gold_raw": "120", |
| "baseline": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "105", |
| "correct": false, |
| "n_gen_tokens": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-233", |
| "gold_raw": "45", |
| "baseline": { |
| "pred_answer": "100", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-234", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "8", |
| "correct": false, |
| "n_gen_tokens": 3 |
| }, |
| "ablated": { |
| "pred_answer": "30", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-235", |
| "gold_raw": "9", |
| "baseline": { |
| "pred_answer": "6.75", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "6.5", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-236", |
| "gold_raw": "33", |
| "baseline": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "13", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-237", |
| "gold_raw": "150", |
| "baseline": { |
| "pred_answer": "150", |
| "correct": true, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "25", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-238", |
| "gold_raw": "60", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-239", |
| "gold_raw": "4", |
| "baseline": { |
| "pred_answer": "9", |
| "correct": false, |
| "n_gen_tokens": 11 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-240", |
| "gold_raw": "7", |
| "baseline": { |
| "pred_answer": "24", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-241", |
| "gold_raw": "3140", |
| "baseline": { |
| "pred_answer": "1200", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "20000", |
| "correct": false, |
| "n_gen_tokens": 17 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-242", |
| "gold_raw": "19", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 1 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-243", |
| "gold_raw": "6", |
| "baseline": { |
| "pred_answer": "3.75", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "4", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-244", |
| "gold_raw": "90", |
| "baseline": { |
| "pred_answer": "120", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "180", |
| "correct": false, |
| "n_gen_tokens": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-245", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-246", |
| "gold_raw": "130000", |
| "baseline": { |
| "pred_answer": "000", |
| "correct": false, |
| "n_gen_tokens": 10 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-247", |
| "gold_raw": "10", |
| "baseline": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "70", |
| "correct": false, |
| "n_gen_tokens": 19 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-248", |
| "gold_raw": "525", |
| "baseline": { |
| "pred_answer": "105", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "18", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-249", |
| "gold_raw": "180", |
| "baseline": { |
| "pred_answer": "130", |
| "correct": false, |
| "n_gen_tokens": 5 |
| }, |
| "ablated": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-250", |
| "gold_raw": "1200", |
| "baseline": { |
| "pred_answer": "1200", |
| "correct": true, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-251", |
| "gold_raw": "25", |
| "baseline": { |
| "pred_answer": "20", |
| "correct": false, |
| "n_gen_tokens": 4 |
| }, |
| "ablated": { |
| "pred_answer": "45", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-252", |
| "gold_raw": "21", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": false, |
| "n_gen_tokens": 64 |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false, |
| "n_gen_tokens": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-253", |
| "gold_raw": "2304", |
| "baseline": { |
| "pred_answer": "192", |
| "correct": false, |
| "n_gen_tokens": 16 |
| }, |
| "ablated": { |
| "pred_answer": "1", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-254", |
| "gold_raw": "2325", |
| "baseline": { |
| "pred_answer": "1250", |
| "correct": false, |
| "n_gen_tokens": 6 |
| }, |
| "ablated": { |
| "pred_answer": "<number>", |
| "correct": false, |
| "n_gen_tokens": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-255", |
| "gold_raw": "15", |
| "baseline": { |
| "pred_answer": "12", |
| "correct": false, |
| "n_gen_tokens": 8 |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false, |
| "n_gen_tokens": 64 |
| } |
| } |
| ], |
| "flip_rows": [ |
| { |
| "ex_id": "gsm8k-test-1", |
| "gold_raw": "80", |
| "baseline": { |
| "pred_answer": "80", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "80", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "80", |
| "correct": true |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "200", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-12", |
| "gold_raw": "100", |
| "baseline": { |
| "pred_answer": "100", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "120", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "100", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "100", |
| "correct": true |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "60", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "2", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "120", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-18", |
| "gold_raw": "3", |
| "baseline": { |
| "pred_answer": "3", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "2", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "3", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "3", |
| "correct": true |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "4", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "2", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "2", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-60", |
| "gold_raw": "5", |
| "baseline": { |
| "pred_answer": "5", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "65", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "5", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "58", |
| "correct": false |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "<number>", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "10", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "65", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-69", |
| "gold_raw": "15", |
| "baseline": { |
| "pred_answer": "15", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "15", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "15", |
| "correct": true |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-148", |
| "gold_raw": "300", |
| "baseline": { |
| "pred_answer": "300", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "100", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "300", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "300", |
| "correct": true |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "100", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "100", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "100", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-178", |
| "gold_raw": "50", |
| "baseline": { |
| "pred_answer": "50", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "35", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "50", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "50", |
| "correct": true |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "35", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "35", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "35", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-237", |
| "gold_raw": "150", |
| "baseline": { |
| "pred_answer": "150", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "25", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "150", |
| "correct": true |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "150", |
| "correct": true |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "2", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "25", |
| "correct": false |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-250", |
| "gold_raw": "1200", |
| "baseline": { |
| "pred_answer": "1200", |
| "correct": true |
| }, |
| "ablated": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "patched_self": { |
| "pred_answer": "40", |
| "correct": false |
| }, |
| "control_time_shuffled": { |
| "pred_answer": "40", |
| "correct": false |
| }, |
| "control_shared_randvec": { |
| "pred_answer": "", |
| "correct": false |
| }, |
| "control_rand_subspace": { |
| "pred_answer": "400", |
| "correct": false |
| }, |
| "control_patch_nonshared": { |
| "pred_answer": "", |
| "correct": false |
| } |
| } |
| ] |
| } |