{ "meta": { "model": "meta-llama/Llama-2-7b-chat-hf", "device": "cuda", "dtype": "fp32", "layer": 10, "layers_path": "model.layers", "seed": 123, "task": "gsm8k", "eval_mode": "gen_math", "eval_meta": { "subspace_split": null, "eval_split": "test", "available_splits": [ "train", "test" ], "hf_id": "gsm8k/main" }, "n_eval_loaded": 256, "n_scanned": 256, "base_acc_scan": 0.0390625, "ablt_acc_scan": 0.03125, "flips_total": 9, "flips_used": 9, "patch_steps": [ 0, 1, 2, 3 ], "patch_n_steps": 4, "Qs_path": "Q_shared_layer10.npy", "Qs_shape": [ 4096, 97 ], "gold_text_prefix": " ", "dist_text_prefix": " ", "gold_max_tokens": 0, "distractor_mode": "next_gold", "answer_prefix_effective": "\nLet's think step by step.\nFinal answer (number only):", "max_new_tokens_effective": 64, "run_coeff_controls": false, "use_benchmark_loader": true, "hf_id": "", "hf_split": "test" }, "summary_on_flips": { "patched_self": { "n": 9, "rescued": 8, "rescued_pct": 88.88888888888889 }, "control_time_shuffled": { "n": 9, "rescued": 7, "rescued_pct": 77.77777777777777 }, "control_shared_randvec": { "n": 9, "rescued": 0, "rescued_pct": 0.0 }, "control_rand_subspace": { "n": 9, "rescued": 0, "rescued_pct": 0.0 }, "control_patch_nonshared": { "n": 9, "rescued": 0, "rescued_pct": 0.0 } }, "scan_rows": [ { "ex_id": "gsm8k-test-0", "gold_raw": "50", "baseline": { "pred_answer": "55", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 40 } }, { "ex_id": "gsm8k-test-1", "gold_raw": "80", "baseline": { "pred_answer": "80", "correct": true, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-2", "gold_raw": "12", "baseline": { "pred_answer": "23", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-3", "gold_raw": "140", "baseline": { "pred_answer": "420", "correct": false, "n_gen_tokens": 15 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-4", "gold_raw": "36", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "6", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-5", "gold_raw": "3200", "baseline": { "pred_answer": "1400", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-6", "gold_raw": "38", "baseline": { "pred_answer": "23", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "8", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-7", "gold_raw": "32", "baseline": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-8", "gold_raw": "92", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "20", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-9", "gold_raw": "16", "baseline": { "pred_answer": "900", "correct": false, "n_gen_tokens": 21 }, "ablated": { "pred_answer": "15", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-10", "gold_raw": "45", "baseline": { "pred_answer": "30", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "35", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-11", "gold_raw": "270", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 17 } }, { "ex_id": "gsm8k-test-12", "gold_raw": "100", "baseline": { "pred_answer": "100", "correct": true, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "120", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-13", "gold_raw": "25", "baseline": { "pred_answer": "25", "correct": true, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "25", "correct": true, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-14", "gold_raw": "800", "baseline": { "pred_answer": "350", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-15", "gold_raw": "2", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-16", "gold_raw": "7000", "baseline": { "pred_answer": "450", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "400", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-17", "gold_raw": "25", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-18", "gold_raw": "3", "baseline": { "pred_answer": "3", "correct": true, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-19", "gold_raw": "3430", "baseline": { "pred_answer": "300", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "1", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-20", "gold_raw": "106", "baseline": { "pred_answer": "102", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-21", "gold_raw": "80", "baseline": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-22", "gold_raw": "26", "baseline": { "pred_answer": "7", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "14", "correct": false, "n_gen_tokens": 25 } }, { "ex_id": "gsm8k-test-23", "gold_raw": "750", "baseline": { "pred_answer": "75", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "15", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-24", "gold_raw": "9", "baseline": { "pred_answer": "30", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-25", "gold_raw": "40", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "12", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-26", "gold_raw": "14", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-27", "gold_raw": "160", "baseline": { "pred_answer": "128", "correct": false, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "120", "correct": false, "n_gen_tokens": 8 } }, { "ex_id": "gsm8k-test-28", "gold_raw": "6", "baseline": { "pred_answer": "100", "correct": false, "n_gen_tokens": 38 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-29", "gold_raw": "132", "baseline": { "pred_answer": "80", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "16", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-30", "gold_raw": "8", "baseline": { "pred_answer": "5", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "30", "correct": false, "n_gen_tokens": 12 } }, { "ex_id": "gsm8k-test-31", "gold_raw": "68", "baseline": { "pred_answer": "50", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-32", "gold_raw": "31", "baseline": { "pred_answer": "320", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "$", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-33", "gold_raw": "100", "baseline": { "pred_answer": "500", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-34", "gold_raw": "1509", "baseline": { "pred_answer": "781", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-35", "gold_raw": "480", "baseline": { "pred_answer": "150", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-36", "gold_raw": "520", "baseline": { "pred_answer": "42.00", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "100.00", "correct": false, "n_gen_tokens": 9 } }, { "ex_id": "gsm8k-test-37", "gold_raw": "3", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "30", "correct": false, "n_gen_tokens": 11 } }, { "ex_id": "gsm8k-test-38", "gold_raw": "33", "baseline": { "pred_answer": "25", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-39", "gold_raw": "120", "baseline": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "12", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-40", "gold_raw": "14", "baseline": { "pred_answer": "20", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-41", "gold_raw": "20", "baseline": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 7 } }, { "ex_id": "gsm8k-test-42", "gold_raw": "95200", "baseline": { "pred_answer": "80000", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "40000", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-43", "gold_raw": "77", "baseline": { "pred_answer": "19", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-44", "gold_raw": "81", "baseline": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-45", "gold_raw": "310", "baseline": { "pred_answer": "210", "correct": false, "n_gen_tokens": 9 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-46", "gold_raw": "100", "baseline": { "pred_answer": "80", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "70", "correct": false, "n_gen_tokens": 9 } }, { "ex_id": "gsm8k-test-47", "gold_raw": "160", "baseline": { "pred_answer": "240", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "8", "correct": false, "n_gen_tokens": 27 } }, { "ex_id": "gsm8k-test-48", "gold_raw": "25", "baseline": { "pred_answer": "160", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "0.75", "correct": false, "n_gen_tokens": 6 } }, { "ex_id": "gsm8k-test-49", "gold_raw": "1400", "baseline": { "pred_answer": "800", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "200", "correct": false, "n_gen_tokens": 28 } }, { "ex_id": "gsm8k-test-50", "gold_raw": "120", "baseline": { "pred_answer": "60", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-51", "gold_raw": "48", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-52", "gold_raw": "50", "baseline": { "pred_answer": "21", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-53", "gold_raw": "15400", "baseline": { "pred_answer": "000", "correct": false, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "50", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-54", "gold_raw": "80", "baseline": { "pred_answer": "140", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-55", "gold_raw": "5", "baseline": { "pred_answer": "14", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "10", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-56", "gold_raw": "14", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "20", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-57", "gold_raw": "31", "baseline": { "pred_answer": "16", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "14", "correct": false, "n_gen_tokens": 8 } }, { "ex_id": "gsm8k-test-58", "gold_raw": "36", "baseline": { "pred_answer": "150", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-59", "gold_raw": "144", "baseline": { "pred_answer": "160", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "12", "correct": false, "n_gen_tokens": 27 } }, { "ex_id": "gsm8k-test-60", "gold_raw": "5", "baseline": { "pred_answer": "5", "correct": true, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "65", "correct": false, "n_gen_tokens": 6 } }, { "ex_id": "gsm8k-test-61", "gold_raw": "750", "baseline": { "pred_answer": "250", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "120", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-62", "gold_raw": "38", "baseline": { "pred_answer": "48", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "60", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-63", "gold_raw": "48", "baseline": { "pred_answer": "56", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-64", "gold_raw": "655", "baseline": { "pred_answer": "1625", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-65", "gold_raw": "800", "baseline": { "pred_answer": "0.5", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-66", "gold_raw": "7300", "baseline": { "pred_answer": "10400", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "2300", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-67", "gold_raw": "48", "baseline": { "pred_answer": "16", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-68", "gold_raw": "4", "baseline": { "pred_answer": "7", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "4", "correct": true, "n_gen_tokens": 9 } }, { "ex_id": "gsm8k-test-69", "gold_raw": "15", "baseline": { "pred_answer": "15", "correct": true, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-70", "gold_raw": "23", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-71", "gold_raw": "225", "baseline": { "pred_answer": "150", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-72", "gold_raw": "15", "baseline": { "pred_answer": "13", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "14", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-73", "gold_raw": "82", "baseline": { "pred_answer": "32", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-74", "gold_raw": "1218", "baseline": { "pred_answer": "952", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-75", "gold_raw": "2", "baseline": { "pred_answer": "10.00", "correct": false, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "2", "correct": true, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-76", "gold_raw": "36", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-77", "gold_raw": "13", "baseline": { "pred_answer": "12.50", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "20.50", "correct": false, "n_gen_tokens": 8 } }, { "ex_id": "gsm8k-test-78", "gold_raw": "11", "baseline": { "pred_answer": "2", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-79", "gold_raw": "8", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-80", "gold_raw": "440", "baseline": { "pred_answer": "400", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "1", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-81", "gold_raw": "2", "baseline": { "pred_answer": "4", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-82", "gold_raw": "45", "baseline": { "pred_answer": "48", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "7", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-83", "gold_raw": "54", "baseline": { "pred_answer": "109", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-84", "gold_raw": "6", "baseline": { "pred_answer": "9", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "18", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-85", "gold_raw": "240", "baseline": { "pred_answer": "1200", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-86", "gold_raw": "428", "baseline": { "pred_answer": "400", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "278", "correct": false, "n_gen_tokens": 59 } }, { "ex_id": "gsm8k-test-87", "gold_raw": "5", "baseline": { "pred_answer": "3.5", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "5", "correct": true, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-88", "gold_raw": "255", "baseline": { "pred_answer": "305", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "120", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-89", "gold_raw": "10", "baseline": { "pred_answer": "4", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "7", "correct": false, "n_gen_tokens": 9 } }, { "ex_id": "gsm8k-test-90", "gold_raw": "9", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "35", "correct": false, "n_gen_tokens": 23 } }, { "ex_id": "gsm8k-test-91", "gold_raw": "157", "baseline": { "pred_answer": "175", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "1", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-92", "gold_raw": "56", "baseline": { "pred_answer": "80", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-93", "gold_raw": "5", "baseline": { "pred_answer": "35", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "10", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-94", "gold_raw": "144", "baseline": { "pred_answer": "36", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-95", "gold_raw": "50", "baseline": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-96", "gold_raw": "4", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "24", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-97", "gold_raw": "50", "baseline": { "pred_answer": "25", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-98", "gold_raw": "42", "baseline": { "pred_answer": "48", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-99", "gold_raw": "7", "baseline": { "pred_answer": "50", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-100", "gold_raw": "250", "baseline": { "pred_answer": "350", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "150", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-101", "gold_raw": "12", "baseline": { "pred_answer": "11", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-102", "gold_raw": "7", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-103", "gold_raw": "8", "baseline": { "pred_answer": "3", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-104", "gold_raw": "26", "baseline": { "pred_answer": "25.00", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "19.5", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-105", "gold_raw": "42", "baseline": { "pred_answer": "20", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "26", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-106", "gold_raw": "5", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "300", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-107", "gold_raw": "14400", "baseline": { "pred_answer": "12000", "correct": false, "n_gen_tokens": 24 }, "ablated": { "pred_answer": "5000000000000000000000000000000000000000000000000000000000000", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-108", "gold_raw": "400", "baseline": { "pred_answer": "450", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-109", "gold_raw": "40", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 60 } }, { "ex_id": "gsm8k-test-110", "gold_raw": "83", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-111", "gold_raw": "10", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "19", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-112", "gold_raw": "80", "baseline": { "pred_answer": "400", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "6", "correct": false, "n_gen_tokens": 57 } }, { "ex_id": "gsm8k-test-113", "gold_raw": "180", "baseline": { "pred_answer": "72", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "6.0", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-114", "gold_raw": "1450000", "baseline": { "pred_answer": "5", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "000", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-115", "gold_raw": "15", "baseline": { "pred_answer": "5", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-116", "gold_raw": "1000", "baseline": { "pred_answer": "10000", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-117", "gold_raw": "2", "baseline": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3.00", "correct": false, "n_gen_tokens": 6 } }, { "ex_id": "gsm8k-test-118", "gold_raw": "15", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "15", "correct": true, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-119", "gold_raw": "100", "baseline": { "pred_answer": "50", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "500000000000000000000000000000000000000000000000000000000000000", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-120", "gold_raw": "335", "baseline": { "pred_answer": "250", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-121", "gold_raw": "60", "baseline": { "pred_answer": "34", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "5", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-122", "gold_raw": "5", "baseline": { "pred_answer": "3", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-123", "gold_raw": "9500", "baseline": { "pred_answer": "70000", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-124", "gold_raw": "160", "baseline": { "pred_answer": "80", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-125", "gold_raw": "1050", "baseline": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-126", "gold_raw": "91", "baseline": { "pred_answer": "35", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-127", "gold_raw": "21", "baseline": { "pred_answer": "24", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "36", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-128", "gold_raw": "20", "baseline": { "pred_answer": "21", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-129", "gold_raw": "36", "baseline": { "pred_answer": "42", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-130", "gold_raw": "36", "baseline": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 21 } }, { "ex_id": "gsm8k-test-131", "gold_raw": "10", "baseline": { "pred_answer": "5", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-132", "gold_raw": "5", "baseline": { "pred_answer": "9", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-133", "gold_raw": "32", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-134", "gold_raw": "18", "baseline": { "pred_answer": "14", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-135", "gold_raw": "4", "baseline": { "pred_answer": "7", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "17", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-136", "gold_raw": "48", "baseline": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-137", "gold_raw": "8", "baseline": { "pred_answer": "3", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-138", "gold_raw": "21", "baseline": { "pred_answer": "11", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 19 } }, { "ex_id": "gsm8k-test-139", "gold_raw": "25", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-140", "gold_raw": "3000", "baseline": { "pred_answer": "000", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-141", "gold_raw": "40", "baseline": { "pred_answer": "18", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-142", "gold_raw": "50", "baseline": { "pred_answer": "28", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "14", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-143", "gold_raw": "90", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "40", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-144", "gold_raw": "23", "baseline": { "pred_answer": "35", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "70", "correct": false, "n_gen_tokens": 7 } }, { "ex_id": "gsm8k-test-145", "gold_raw": "2", "baseline": { "pred_answer": "32", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "50", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-146", "gold_raw": "50", "baseline": { "pred_answer": "180", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "30", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-147", "gold_raw": "122", "baseline": { "pred_answer": "70", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 19 } }, { "ex_id": "gsm8k-test-148", "gold_raw": "300", "baseline": { "pred_answer": "300", "correct": true, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 7 } }, { "ex_id": "gsm8k-test-149", "gold_raw": "448", "baseline": { "pred_answer": "112", "correct": false, "n_gen_tokens": 7 }, "ablated": { "pred_answer": "14", "correct": false, "n_gen_tokens": 7 } }, { "ex_id": "gsm8k-test-150", "gold_raw": "2450", "baseline": { "pred_answer": "1500", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-151", "gold_raw": "803", "baseline": { "pred_answer": "365", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "1", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-152", "gold_raw": "16", "baseline": { "pred_answer": "4", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-153", "gold_raw": "280", "baseline": { "pred_answer": "350", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "7", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-154", "gold_raw": "13", "baseline": { "pred_answer": "5", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "5", "correct": false, "n_gen_tokens": 6 } }, { "ex_id": "gsm8k-test-155", "gold_raw": "20", "baseline": { "pred_answer": "13.5", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "45", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-156", "gold_raw": "14", "baseline": { "pred_answer": "20", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-157", "gold_raw": "32", "baseline": { "pred_answer": "18", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "7", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-158", "gold_raw": "105", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-159", "gold_raw": "71", "baseline": { "pred_answer": "121", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-160", "gold_raw": "5", "baseline": { "pred_answer": "8", "correct": false, "n_gen_tokens": 10 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-161", "gold_raw": "30", "baseline": { "pred_answer": "24", "correct": false, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-162", "gold_raw": "95", "baseline": { "pred_answer": "90", "correct": false, "n_gen_tokens": 15 }, "ablated": { "pred_answer": "60", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-163", "gold_raw": "147", "baseline": { "pred_answer": "130", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 36 } }, { "ex_id": "gsm8k-test-164", "gold_raw": "10", "baseline": { "pred_answer": "50", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-165", "gold_raw": "40000", "baseline": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-166", "gold_raw": "12", "baseline": { "pred_answer": "20", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-167", "gold_raw": "129200", "baseline": { "pred_answer": "144000", "correct": false, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-168", "gold_raw": "5", "baseline": { "pred_answer": "3", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-169", "gold_raw": "45", "baseline": { "pred_answer": "35", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-170", "gold_raw": "20", "baseline": { "pred_answer": "25", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-171", "gold_raw": "1170", "baseline": { "pred_answer": "1500", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "300", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-172", "gold_raw": "192", "baseline": { "pred_answer": "32", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "40", "correct": false, "n_gen_tokens": 8 } }, { "ex_id": "gsm8k-test-173", "gold_raw": "14", "baseline": { "pred_answer": "22", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "7", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-174", "gold_raw": "144", "baseline": { "pred_answer": "48", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "6", "correct": false, "n_gen_tokens": 46 } }, { "ex_id": "gsm8k-test-175", "gold_raw": "350", "baseline": { "pred_answer": "70", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-176", "gold_raw": "50", "baseline": { "pred_answer": "42", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-177", "gold_raw": "7", "baseline": { "pred_answer": "3.75", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-178", "gold_raw": "50", "baseline": { "pred_answer": "50", "correct": true, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "35", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-179", "gold_raw": "8", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "8", "correct": true, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-180", "gold_raw": "3160", "baseline": { "pred_answer": "4800", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-181", "gold_raw": "80", "baseline": { "pred_answer": "82.5", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "80", "correct": true, "n_gen_tokens": 7 } }, { "ex_id": "gsm8k-test-182", "gold_raw": "50", "baseline": { "pred_answer": "20", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-183", "gold_raw": "40", "baseline": { "pred_answer": "70", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "5", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-184", "gold_raw": "78", "baseline": { "pred_answer": "45", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-185", "gold_raw": "273", "baseline": { "pred_answer": "220", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-186", "gold_raw": "2", "baseline": { "pred_answer": "18", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "15", "correct": false, "n_gen_tokens": 26 } }, { "ex_id": "gsm8k-test-187", "gold_raw": "195", "baseline": { "pred_answer": "90", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-188", "gold_raw": "1128", "baseline": { "pred_answer": "168", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-189", "gold_raw": "172", "baseline": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-190", "gold_raw": "30", "baseline": { "pred_answer": "53", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "50", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-191", "gold_raw": "30", "baseline": { "pred_answer": "40", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "20", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-192", "gold_raw": "92", "baseline": { "pred_answer": "40", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-193", "gold_raw": "20", "baseline": { "pred_answer": "14", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-194", "gold_raw": "540", "baseline": { "pred_answer": "91", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-195", "gold_raw": "10", "baseline": { "pred_answer": "32", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-196", "gold_raw": "10", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-197", "gold_raw": "38", "baseline": { "pred_answer": "18", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "14", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-198", "gold_raw": "4000", "baseline": { "pred_answer": "300", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-199", "gold_raw": "594", "baseline": { "pred_answer": "200", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-200", "gold_raw": "2", "baseline": { "pred_answer": "6", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-201", "gold_raw": "142", "baseline": { "pred_answer": "104", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "5", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-202", "gold_raw": "9", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-203", "gold_raw": "6", "baseline": { "pred_answer": "18", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "6", "correct": true, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-204", "gold_raw": "100", "baseline": { "pred_answer": "260", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "40", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-205", "gold_raw": "10", "baseline": { "pred_answer": "35", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-206", "gold_raw": "15", "baseline": { "pred_answer": "155", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "35", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-207", "gold_raw": "22", "baseline": { "pred_answer": "21", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-208", "gold_raw": "16", "baseline": { "pred_answer": "96", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "100", "correct": false, "n_gen_tokens": 6 } }, { "ex_id": "gsm8k-test-209", "gold_raw": "16", "baseline": { "pred_answer": "30", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-210", "gold_raw": "5", "baseline": { "pred_answer": "2.5", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "3.5", "correct": false, "n_gen_tokens": 56 } }, { "ex_id": "gsm8k-test-211", "gold_raw": "23", "baseline": { "pred_answer": "10", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "5", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-212", "gold_raw": "30", "baseline": { "pred_answer": "25", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "5", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-213", "gold_raw": "14000", "baseline": { "pred_answer": "000", "correct": false, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 33 } }, { "ex_id": "gsm8k-test-214", "gold_raw": "60", "baseline": { "pred_answer": "8", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "16", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-215", "gold_raw": "2", "baseline": { "pred_answer": "30", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-216", "gold_raw": "3", "baseline": { "pred_answer": "6.25", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 19 } }, { "ex_id": "gsm8k-test-217", "gold_raw": "30", "baseline": { "pred_answer": "34", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "24", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-218", "gold_raw": "1920", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-219", "gold_raw": "84", "baseline": { "pred_answer": "140", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-220", "gold_raw": "8", "baseline": { "pred_answer": "24", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-221", "gold_raw": "12", "baseline": { "pred_answer": "1200", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-222", "gold_raw": "260", "baseline": { "pred_answer": "150", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-223", "gold_raw": "288", "baseline": { "pred_answer": "330", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-224", "gold_raw": "3", "baseline": { "pred_answer": "25", "correct": false, "n_gen_tokens": 12 }, "ablated": { "pred_answer": "$", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-225", "gold_raw": "1596", "baseline": { "pred_answer": "1620", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "400", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-226", "gold_raw": "81", "baseline": { "pred_answer": "2700", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-227", "gold_raw": "56", "baseline": { "pred_answer": "108", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "60", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-228", "gold_raw": "1490", "baseline": { "pred_answer": "200", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "5", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-229", "gold_raw": "2", "baseline": { "pred_answer": "1", "correct": false, "n_gen_tokens": 15 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-230", "gold_raw": "20", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "55", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-231", "gold_raw": "11", "baseline": { "pred_answer": "10.5", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-232", "gold_raw": "120", "baseline": { "pred_answer": "70", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "105", "correct": false, "n_gen_tokens": 6 } }, { "ex_id": "gsm8k-test-233", "gold_raw": "45", "baseline": { "pred_answer": "100", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-234", "gold_raw": "10", "baseline": { "pred_answer": "8", "correct": false, "n_gen_tokens": 3 }, "ablated": { "pred_answer": "30", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-235", "gold_raw": "9", "baseline": { "pred_answer": "6.75", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "6.5", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-236", "gold_raw": "33", "baseline": { "pred_answer": "20", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "13", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-237", "gold_raw": "150", "baseline": { "pred_answer": "150", "correct": true, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "25", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-238", "gold_raw": "60", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-239", "gold_raw": "4", "baseline": { "pred_answer": "9", "correct": false, "n_gen_tokens": 11 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-240", "gold_raw": "7", "baseline": { "pred_answer": "24", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-241", "gold_raw": "3140", "baseline": { "pred_answer": "1200", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "20000", "correct": false, "n_gen_tokens": 17 } }, { "ex_id": "gsm8k-test-242", "gold_raw": "19", "baseline": { "pred_answer": "15", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 1 } }, { "ex_id": "gsm8k-test-243", "gold_raw": "6", "baseline": { "pred_answer": "3.75", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "4", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-244", "gold_raw": "90", "baseline": { "pred_answer": "120", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "180", "correct": false, "n_gen_tokens": 5 } }, { "ex_id": "gsm8k-test-245", "gold_raw": "10", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-246", "gold_raw": "130000", "baseline": { "pred_answer": "000", "correct": false, "n_gen_tokens": 10 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-247", "gold_raw": "10", "baseline": { "pred_answer": "70", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "70", "correct": false, "n_gen_tokens": 19 } }, { "ex_id": "gsm8k-test-248", "gold_raw": "525", "baseline": { "pred_answer": "105", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "18", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-249", "gold_raw": "180", "baseline": { "pred_answer": "130", "correct": false, "n_gen_tokens": 5 }, "ablated": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-250", "gold_raw": "1200", "baseline": { "pred_answer": "1200", "correct": true, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 3 } }, { "ex_id": "gsm8k-test-251", "gold_raw": "25", "baseline": { "pred_answer": "20", "correct": false, "n_gen_tokens": 4 }, "ablated": { "pred_answer": "45", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-252", "gold_raw": "21", "baseline": { "pred_answer": "3", "correct": false, "n_gen_tokens": 64 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 2 } }, { "ex_id": "gsm8k-test-253", "gold_raw": "2304", "baseline": { "pred_answer": "192", "correct": false, "n_gen_tokens": 16 }, "ablated": { "pred_answer": "1", "correct": false, "n_gen_tokens": 64 } }, { "ex_id": "gsm8k-test-254", "gold_raw": "2325", "baseline": { "pred_answer": "1250", "correct": false, "n_gen_tokens": 6 }, "ablated": { "pred_answer": "", "correct": false, "n_gen_tokens": 4 } }, { "ex_id": "gsm8k-test-255", "gold_raw": "15", "baseline": { "pred_answer": "12", "correct": false, "n_gen_tokens": 8 }, "ablated": { "pred_answer": "2", "correct": false, "n_gen_tokens": 64 } } ], "flip_rows": [ { "ex_id": "gsm8k-test-1", "gold_raw": "80", "baseline": { "pred_answer": "80", "correct": true }, "ablated": { "pred_answer": "", "correct": false }, "patched_self": { "pred_answer": "80", "correct": true }, "control_time_shuffled": { "pred_answer": "80", "correct": true }, "control_shared_randvec": { "pred_answer": "200", "correct": false }, "control_rand_subspace": { "pred_answer": "", "correct": false }, "control_patch_nonshared": { "pred_answer": "", "correct": false } }, { "ex_id": "gsm8k-test-12", "gold_raw": "100", "baseline": { "pred_answer": "100", "correct": true }, "ablated": { "pred_answer": "120", "correct": false }, "patched_self": { "pred_answer": "100", "correct": true }, "control_time_shuffled": { "pred_answer": "100", "correct": true }, "control_shared_randvec": { "pred_answer": "60", "correct": false }, "control_rand_subspace": { "pred_answer": "2", "correct": false }, "control_patch_nonshared": { "pred_answer": "120", "correct": false } }, { "ex_id": "gsm8k-test-18", "gold_raw": "3", "baseline": { "pred_answer": "3", "correct": true }, "ablated": { "pred_answer": "2", "correct": false }, "patched_self": { "pred_answer": "3", "correct": true }, "control_time_shuffled": { "pred_answer": "3", "correct": true }, "control_shared_randvec": { "pred_answer": "4", "correct": false }, "control_rand_subspace": { "pred_answer": "2", "correct": false }, "control_patch_nonshared": { "pred_answer": "2", "correct": false } }, { "ex_id": "gsm8k-test-60", "gold_raw": "5", "baseline": { "pred_answer": "5", "correct": true }, "ablated": { "pred_answer": "65", "correct": false }, "patched_self": { "pred_answer": "5", "correct": true }, "control_time_shuffled": { "pred_answer": "58", "correct": false }, "control_shared_randvec": { "pred_answer": "", "correct": false }, "control_rand_subspace": { "pred_answer": "10", "correct": false }, "control_patch_nonshared": { "pred_answer": "65", "correct": false } }, { "ex_id": "gsm8k-test-69", "gold_raw": "15", "baseline": { "pred_answer": "15", "correct": true }, "ablated": { "pred_answer": "", "correct": false }, "patched_self": { "pred_answer": "15", "correct": true }, "control_time_shuffled": { "pred_answer": "15", "correct": true }, "control_shared_randvec": { "pred_answer": "", "correct": false }, "control_rand_subspace": { "pred_answer": "", "correct": false }, "control_patch_nonshared": { "pred_answer": "", "correct": false } }, { "ex_id": "gsm8k-test-148", "gold_raw": "300", "baseline": { "pred_answer": "300", "correct": true }, "ablated": { "pred_answer": "100", "correct": false }, "patched_self": { "pred_answer": "300", "correct": true }, "control_time_shuffled": { "pred_answer": "300", "correct": true }, "control_shared_randvec": { "pred_answer": "100", "correct": false }, "control_rand_subspace": { "pred_answer": "100", "correct": false }, "control_patch_nonshared": { "pred_answer": "100", "correct": false } }, { "ex_id": "gsm8k-test-178", "gold_raw": "50", "baseline": { "pred_answer": "50", "correct": true }, "ablated": { "pred_answer": "35", "correct": false }, "patched_self": { "pred_answer": "50", "correct": true }, "control_time_shuffled": { "pred_answer": "50", "correct": true }, "control_shared_randvec": { "pred_answer": "35", "correct": false }, "control_rand_subspace": { "pred_answer": "35", "correct": false }, "control_patch_nonshared": { "pred_answer": "35", "correct": false } }, { "ex_id": "gsm8k-test-237", "gold_raw": "150", "baseline": { "pred_answer": "150", "correct": true }, "ablated": { "pred_answer": "25", "correct": false }, "patched_self": { "pred_answer": "150", "correct": true }, "control_time_shuffled": { "pred_answer": "150", "correct": true }, "control_shared_randvec": { "pred_answer": "", "correct": false }, "control_rand_subspace": { "pred_answer": "2", "correct": false }, "control_patch_nonshared": { "pred_answer": "25", "correct": false } }, { "ex_id": "gsm8k-test-250", "gold_raw": "1200", "baseline": { "pred_answer": "1200", "correct": true }, "ablated": { "pred_answer": "", "correct": false }, "patched_self": { "pred_answer": "40", "correct": false }, "control_time_shuffled": { "pred_answer": "40", "correct": false }, "control_shared_randvec": { "pred_answer": "", "correct": false }, "control_rand_subspace": { "pred_answer": "400", "correct": false }, "control_patch_nonshared": { "pred_answer": "", "correct": false } } ] }