Zishan-Shao's picture
Upload folder using huggingface_hub
aa0e435 verified
{
"meta": {
"model": "meta-llama/Llama-2-7b-chat-hf",
"device": "cuda",
"dtype": "fp32",
"layer": 10,
"layers_path": "model.layers",
"seed": 123,
"task": "gsm8k",
"eval_mode": "gen_math",
"eval_meta": {
"subspace_split": null,
"eval_split": "test",
"available_splits": [
"train",
"test"
],
"hf_id": "gsm8k/main"
},
"n_eval_loaded": 256,
"n_scanned": 256,
"base_acc_scan": 0.0390625,
"ablt_acc_scan": 0.03125,
"flips_total": 9,
"flips_used": 9,
"patch_steps": [
0,
1,
2,
3
],
"patch_n_steps": 4,
"Qs_path": "Q_shared_layer10.npy",
"Qs_shape": [
4096,
97
],
"gold_text_prefix": " ",
"dist_text_prefix": " ",
"gold_max_tokens": 0,
"distractor_mode": "next_gold",
"answer_prefix_effective": "\nLet's think step by step.\nFinal answer (number only):",
"max_new_tokens_effective": 64,
"run_coeff_controls": false,
"use_benchmark_loader": true,
"hf_id": "",
"hf_split": "test"
},
"summary_on_flips": {
"patched_self": {
"n": 9,
"rescued": 8,
"rescued_pct": 88.88888888888889
},
"control_time_shuffled": {
"n": 9,
"rescued": 7,
"rescued_pct": 77.77777777777777
},
"control_shared_randvec": {
"n": 9,
"rescued": 0,
"rescued_pct": 0.0
},
"control_rand_subspace": {
"n": 9,
"rescued": 0,
"rescued_pct": 0.0
},
"control_patch_nonshared": {
"n": 9,
"rescued": 0,
"rescued_pct": 0.0
}
},
"scan_rows": [
{
"ex_id": "gsm8k-test-0",
"gold_raw": "50",
"baseline": {
"pred_answer": "55",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 40
}
},
{
"ex_id": "gsm8k-test-1",
"gold_raw": "80",
"baseline": {
"pred_answer": "80",
"correct": true,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-2",
"gold_raw": "12",
"baseline": {
"pred_answer": "23",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-3",
"gold_raw": "140",
"baseline": {
"pred_answer": "420",
"correct": false,
"n_gen_tokens": 15
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-4",
"gold_raw": "36",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "6",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-5",
"gold_raw": "3200",
"baseline": {
"pred_answer": "1400",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-6",
"gold_raw": "38",
"baseline": {
"pred_answer": "23",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "8",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-7",
"gold_raw": "32",
"baseline": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-8",
"gold_raw": "92",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-9",
"gold_raw": "16",
"baseline": {
"pred_answer": "900",
"correct": false,
"n_gen_tokens": 21
},
"ablated": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-10",
"gold_raw": "45",
"baseline": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-11",
"gold_raw": "270",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 17
}
},
{
"ex_id": "gsm8k-test-12",
"gold_raw": "100",
"baseline": {
"pred_answer": "100",
"correct": true,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-13",
"gold_raw": "25",
"baseline": {
"pred_answer": "25",
"correct": true,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "25",
"correct": true,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-14",
"gold_raw": "800",
"baseline": {
"pred_answer": "350",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-15",
"gold_raw": "2",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-16",
"gold_raw": "7000",
"baseline": {
"pred_answer": "450",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "400",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-17",
"gold_raw": "25",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-18",
"gold_raw": "3",
"baseline": {
"pred_answer": "3",
"correct": true,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-19",
"gold_raw": "3430",
"baseline": {
"pred_answer": "300",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "1",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-20",
"gold_raw": "106",
"baseline": {
"pred_answer": "102",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-21",
"gold_raw": "80",
"baseline": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-22",
"gold_raw": "26",
"baseline": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 25
}
},
{
"ex_id": "gsm8k-test-23",
"gold_raw": "750",
"baseline": {
"pred_answer": "75",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-24",
"gold_raw": "9",
"baseline": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-25",
"gold_raw": "40",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-26",
"gold_raw": "14",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-27",
"gold_raw": "160",
"baseline": {
"pred_answer": "128",
"correct": false,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 8
}
},
{
"ex_id": "gsm8k-test-28",
"gold_raw": "6",
"baseline": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 38
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-29",
"gold_raw": "132",
"baseline": {
"pred_answer": "80",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "16",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-30",
"gold_raw": "8",
"baseline": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 12
}
},
{
"ex_id": "gsm8k-test-31",
"gold_raw": "68",
"baseline": {
"pred_answer": "50",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-32",
"gold_raw": "31",
"baseline": {
"pred_answer": "320",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "$",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-33",
"gold_raw": "100",
"baseline": {
"pred_answer": "500",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-34",
"gold_raw": "1509",
"baseline": {
"pred_answer": "781",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-35",
"gold_raw": "480",
"baseline": {
"pred_answer": "150",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-36",
"gold_raw": "520",
"baseline": {
"pred_answer": "42.00",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "100.00",
"correct": false,
"n_gen_tokens": 9
}
},
{
"ex_id": "gsm8k-test-37",
"gold_raw": "3",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 11
}
},
{
"ex_id": "gsm8k-test-38",
"gold_raw": "33",
"baseline": {
"pred_answer": "25",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-39",
"gold_raw": "120",
"baseline": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-40",
"gold_raw": "14",
"baseline": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-41",
"gold_raw": "20",
"baseline": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 7
}
},
{
"ex_id": "gsm8k-test-42",
"gold_raw": "95200",
"baseline": {
"pred_answer": "80000",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "40000",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-43",
"gold_raw": "77",
"baseline": {
"pred_answer": "19",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-44",
"gold_raw": "81",
"baseline": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-45",
"gold_raw": "310",
"baseline": {
"pred_answer": "210",
"correct": false,
"n_gen_tokens": 9
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-46",
"gold_raw": "100",
"baseline": {
"pred_answer": "80",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 9
}
},
{
"ex_id": "gsm8k-test-47",
"gold_raw": "160",
"baseline": {
"pred_answer": "240",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "8",
"correct": false,
"n_gen_tokens": 27
}
},
{
"ex_id": "gsm8k-test-48",
"gold_raw": "25",
"baseline": {
"pred_answer": "160",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "0.75",
"correct": false,
"n_gen_tokens": 6
}
},
{
"ex_id": "gsm8k-test-49",
"gold_raw": "1400",
"baseline": {
"pred_answer": "800",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "200",
"correct": false,
"n_gen_tokens": 28
}
},
{
"ex_id": "gsm8k-test-50",
"gold_raw": "120",
"baseline": {
"pred_answer": "60",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-51",
"gold_raw": "48",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-52",
"gold_raw": "50",
"baseline": {
"pred_answer": "21",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-53",
"gold_raw": "15400",
"baseline": {
"pred_answer": "000",
"correct": false,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "50",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-54",
"gold_raw": "80",
"baseline": {
"pred_answer": "140",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-55",
"gold_raw": "5",
"baseline": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-56",
"gold_raw": "14",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-57",
"gold_raw": "31",
"baseline": {
"pred_answer": "16",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 8
}
},
{
"ex_id": "gsm8k-test-58",
"gold_raw": "36",
"baseline": {
"pred_answer": "150",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-59",
"gold_raw": "144",
"baseline": {
"pred_answer": "160",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 27
}
},
{
"ex_id": "gsm8k-test-60",
"gold_raw": "5",
"baseline": {
"pred_answer": "5",
"correct": true,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "65",
"correct": false,
"n_gen_tokens": 6
}
},
{
"ex_id": "gsm8k-test-61",
"gold_raw": "750",
"baseline": {
"pred_answer": "250",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-62",
"gold_raw": "38",
"baseline": {
"pred_answer": "48",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "60",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-63",
"gold_raw": "48",
"baseline": {
"pred_answer": "56",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-64",
"gold_raw": "655",
"baseline": {
"pred_answer": "1625",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-65",
"gold_raw": "800",
"baseline": {
"pred_answer": "0.5",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-66",
"gold_raw": "7300",
"baseline": {
"pred_answer": "10400",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "2300",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-67",
"gold_raw": "48",
"baseline": {
"pred_answer": "16",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-68",
"gold_raw": "4",
"baseline": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "4",
"correct": true,
"n_gen_tokens": 9
}
},
{
"ex_id": "gsm8k-test-69",
"gold_raw": "15",
"baseline": {
"pred_answer": "15",
"correct": true,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-70",
"gold_raw": "23",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-71",
"gold_raw": "225",
"baseline": {
"pred_answer": "150",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-72",
"gold_raw": "15",
"baseline": {
"pred_answer": "13",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-73",
"gold_raw": "82",
"baseline": {
"pred_answer": "32",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-74",
"gold_raw": "1218",
"baseline": {
"pred_answer": "952",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-75",
"gold_raw": "2",
"baseline": {
"pred_answer": "10.00",
"correct": false,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "2",
"correct": true,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-76",
"gold_raw": "36",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-77",
"gold_raw": "13",
"baseline": {
"pred_answer": "12.50",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "20.50",
"correct": false,
"n_gen_tokens": 8
}
},
{
"ex_id": "gsm8k-test-78",
"gold_raw": "11",
"baseline": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-79",
"gold_raw": "8",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-80",
"gold_raw": "440",
"baseline": {
"pred_answer": "400",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "1",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-81",
"gold_raw": "2",
"baseline": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-82",
"gold_raw": "45",
"baseline": {
"pred_answer": "48",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-83",
"gold_raw": "54",
"baseline": {
"pred_answer": "109",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-84",
"gold_raw": "6",
"baseline": {
"pred_answer": "9",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "18",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-85",
"gold_raw": "240",
"baseline": {
"pred_answer": "1200",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-86",
"gold_raw": "428",
"baseline": {
"pred_answer": "400",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "278",
"correct": false,
"n_gen_tokens": 59
}
},
{
"ex_id": "gsm8k-test-87",
"gold_raw": "5",
"baseline": {
"pred_answer": "3.5",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "5",
"correct": true,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-88",
"gold_raw": "255",
"baseline": {
"pred_answer": "305",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-89",
"gold_raw": "10",
"baseline": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 9
}
},
{
"ex_id": "gsm8k-test-90",
"gold_raw": "9",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 23
}
},
{
"ex_id": "gsm8k-test-91",
"gold_raw": "157",
"baseline": {
"pred_answer": "175",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "1",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-92",
"gold_raw": "56",
"baseline": {
"pred_answer": "80",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-93",
"gold_raw": "5",
"baseline": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-94",
"gold_raw": "144",
"baseline": {
"pred_answer": "36",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-95",
"gold_raw": "50",
"baseline": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-96",
"gold_raw": "4",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "24",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-97",
"gold_raw": "50",
"baseline": {
"pred_answer": "25",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-98",
"gold_raw": "42",
"baseline": {
"pred_answer": "48",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-99",
"gold_raw": "7",
"baseline": {
"pred_answer": "50",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-100",
"gold_raw": "250",
"baseline": {
"pred_answer": "350",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "150",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-101",
"gold_raw": "12",
"baseline": {
"pred_answer": "11",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-102",
"gold_raw": "7",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-103",
"gold_raw": "8",
"baseline": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-104",
"gold_raw": "26",
"baseline": {
"pred_answer": "25.00",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "19.5",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-105",
"gold_raw": "42",
"baseline": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "26",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-106",
"gold_raw": "5",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "300",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-107",
"gold_raw": "14400",
"baseline": {
"pred_answer": "12000",
"correct": false,
"n_gen_tokens": 24
},
"ablated": {
"pred_answer": "5000000000000000000000000000000000000000000000000000000000000",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-108",
"gold_raw": "400",
"baseline": {
"pred_answer": "450",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-109",
"gold_raw": "40",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 60
}
},
{
"ex_id": "gsm8k-test-110",
"gold_raw": "83",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-111",
"gold_raw": "10",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "19",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-112",
"gold_raw": "80",
"baseline": {
"pred_answer": "400",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "6",
"correct": false,
"n_gen_tokens": 57
}
},
{
"ex_id": "gsm8k-test-113",
"gold_raw": "180",
"baseline": {
"pred_answer": "72",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "6.0",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-114",
"gold_raw": "1450000",
"baseline": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "000",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-115",
"gold_raw": "15",
"baseline": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-116",
"gold_raw": "1000",
"baseline": {
"pred_answer": "10000",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-117",
"gold_raw": "2",
"baseline": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3.00",
"correct": false,
"n_gen_tokens": 6
}
},
{
"ex_id": "gsm8k-test-118",
"gold_raw": "15",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "15",
"correct": true,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-119",
"gold_raw": "100",
"baseline": {
"pred_answer": "50",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "500000000000000000000000000000000000000000000000000000000000000",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-120",
"gold_raw": "335",
"baseline": {
"pred_answer": "250",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-121",
"gold_raw": "60",
"baseline": {
"pred_answer": "34",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-122",
"gold_raw": "5",
"baseline": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-123",
"gold_raw": "9500",
"baseline": {
"pred_answer": "70000",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-124",
"gold_raw": "160",
"baseline": {
"pred_answer": "80",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-125",
"gold_raw": "1050",
"baseline": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-126",
"gold_raw": "91",
"baseline": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-127",
"gold_raw": "21",
"baseline": {
"pred_answer": "24",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "36",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-128",
"gold_raw": "20",
"baseline": {
"pred_answer": "21",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-129",
"gold_raw": "36",
"baseline": {
"pred_answer": "42",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-130",
"gold_raw": "36",
"baseline": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 21
}
},
{
"ex_id": "gsm8k-test-131",
"gold_raw": "10",
"baseline": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-132",
"gold_raw": "5",
"baseline": {
"pred_answer": "9",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-133",
"gold_raw": "32",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-134",
"gold_raw": "18",
"baseline": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-135",
"gold_raw": "4",
"baseline": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "17",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-136",
"gold_raw": "48",
"baseline": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-137",
"gold_raw": "8",
"baseline": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-138",
"gold_raw": "21",
"baseline": {
"pred_answer": "11",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 19
}
},
{
"ex_id": "gsm8k-test-139",
"gold_raw": "25",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-140",
"gold_raw": "3000",
"baseline": {
"pred_answer": "000",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-141",
"gold_raw": "40",
"baseline": {
"pred_answer": "18",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-142",
"gold_raw": "50",
"baseline": {
"pred_answer": "28",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-143",
"gold_raw": "90",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-144",
"gold_raw": "23",
"baseline": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 7
}
},
{
"ex_id": "gsm8k-test-145",
"gold_raw": "2",
"baseline": {
"pred_answer": "32",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "50",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-146",
"gold_raw": "50",
"baseline": {
"pred_answer": "180",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-147",
"gold_raw": "122",
"baseline": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 19
}
},
{
"ex_id": "gsm8k-test-148",
"gold_raw": "300",
"baseline": {
"pred_answer": "300",
"correct": true,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 7
}
},
{
"ex_id": "gsm8k-test-149",
"gold_raw": "448",
"baseline": {
"pred_answer": "112",
"correct": false,
"n_gen_tokens": 7
},
"ablated": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 7
}
},
{
"ex_id": "gsm8k-test-150",
"gold_raw": "2450",
"baseline": {
"pred_answer": "1500",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-151",
"gold_raw": "803",
"baseline": {
"pred_answer": "365",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "1",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-152",
"gold_raw": "16",
"baseline": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-153",
"gold_raw": "280",
"baseline": {
"pred_answer": "350",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-154",
"gold_raw": "13",
"baseline": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 6
}
},
{
"ex_id": "gsm8k-test-155",
"gold_raw": "20",
"baseline": {
"pred_answer": "13.5",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "45",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-156",
"gold_raw": "14",
"baseline": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-157",
"gold_raw": "32",
"baseline": {
"pred_answer": "18",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-158",
"gold_raw": "105",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-159",
"gold_raw": "71",
"baseline": {
"pred_answer": "121",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-160",
"gold_raw": "5",
"baseline": {
"pred_answer": "8",
"correct": false,
"n_gen_tokens": 10
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-161",
"gold_raw": "30",
"baseline": {
"pred_answer": "24",
"correct": false,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-162",
"gold_raw": "95",
"baseline": {
"pred_answer": "90",
"correct": false,
"n_gen_tokens": 15
},
"ablated": {
"pred_answer": "60",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-163",
"gold_raw": "147",
"baseline": {
"pred_answer": "130",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 36
}
},
{
"ex_id": "gsm8k-test-164",
"gold_raw": "10",
"baseline": {
"pred_answer": "50",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-165",
"gold_raw": "40000",
"baseline": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-166",
"gold_raw": "12",
"baseline": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-167",
"gold_raw": "129200",
"baseline": {
"pred_answer": "144000",
"correct": false,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-168",
"gold_raw": "5",
"baseline": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-169",
"gold_raw": "45",
"baseline": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-170",
"gold_raw": "20",
"baseline": {
"pred_answer": "25",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-171",
"gold_raw": "1170",
"baseline": {
"pred_answer": "1500",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "300",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-172",
"gold_raw": "192",
"baseline": {
"pred_answer": "32",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 8
}
},
{
"ex_id": "gsm8k-test-173",
"gold_raw": "14",
"baseline": {
"pred_answer": "22",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "7",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-174",
"gold_raw": "144",
"baseline": {
"pred_answer": "48",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "6",
"correct": false,
"n_gen_tokens": 46
}
},
{
"ex_id": "gsm8k-test-175",
"gold_raw": "350",
"baseline": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-176",
"gold_raw": "50",
"baseline": {
"pred_answer": "42",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-177",
"gold_raw": "7",
"baseline": {
"pred_answer": "3.75",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-178",
"gold_raw": "50",
"baseline": {
"pred_answer": "50",
"correct": true,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-179",
"gold_raw": "8",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "8",
"correct": true,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-180",
"gold_raw": "3160",
"baseline": {
"pred_answer": "4800",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-181",
"gold_raw": "80",
"baseline": {
"pred_answer": "82.5",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "80",
"correct": true,
"n_gen_tokens": 7
}
},
{
"ex_id": "gsm8k-test-182",
"gold_raw": "50",
"baseline": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-183",
"gold_raw": "40",
"baseline": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-184",
"gold_raw": "78",
"baseline": {
"pred_answer": "45",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-185",
"gold_raw": "273",
"baseline": {
"pred_answer": "220",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-186",
"gold_raw": "2",
"baseline": {
"pred_answer": "18",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 26
}
},
{
"ex_id": "gsm8k-test-187",
"gold_raw": "195",
"baseline": {
"pred_answer": "90",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-188",
"gold_raw": "1128",
"baseline": {
"pred_answer": "168",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-189",
"gold_raw": "172",
"baseline": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-190",
"gold_raw": "30",
"baseline": {
"pred_answer": "53",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "50",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-191",
"gold_raw": "30",
"baseline": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-192",
"gold_raw": "92",
"baseline": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-193",
"gold_raw": "20",
"baseline": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-194",
"gold_raw": "540",
"baseline": {
"pred_answer": "91",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-195",
"gold_raw": "10",
"baseline": {
"pred_answer": "32",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-196",
"gold_raw": "10",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-197",
"gold_raw": "38",
"baseline": {
"pred_answer": "18",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "14",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-198",
"gold_raw": "4000",
"baseline": {
"pred_answer": "300",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-199",
"gold_raw": "594",
"baseline": {
"pred_answer": "200",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-200",
"gold_raw": "2",
"baseline": {
"pred_answer": "6",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-201",
"gold_raw": "142",
"baseline": {
"pred_answer": "104",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-202",
"gold_raw": "9",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-203",
"gold_raw": "6",
"baseline": {
"pred_answer": "18",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "6",
"correct": true,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-204",
"gold_raw": "100",
"baseline": {
"pred_answer": "260",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "40",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-205",
"gold_raw": "10",
"baseline": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-206",
"gold_raw": "15",
"baseline": {
"pred_answer": "155",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "35",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-207",
"gold_raw": "22",
"baseline": {
"pred_answer": "21",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-208",
"gold_raw": "16",
"baseline": {
"pred_answer": "96",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 6
}
},
{
"ex_id": "gsm8k-test-209",
"gold_raw": "16",
"baseline": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-210",
"gold_raw": "5",
"baseline": {
"pred_answer": "2.5",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "3.5",
"correct": false,
"n_gen_tokens": 56
}
},
{
"ex_id": "gsm8k-test-211",
"gold_raw": "23",
"baseline": {
"pred_answer": "10",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-212",
"gold_raw": "30",
"baseline": {
"pred_answer": "25",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-213",
"gold_raw": "14000",
"baseline": {
"pred_answer": "000",
"correct": false,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 33
}
},
{
"ex_id": "gsm8k-test-214",
"gold_raw": "60",
"baseline": {
"pred_answer": "8",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "16",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-215",
"gold_raw": "2",
"baseline": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-216",
"gold_raw": "3",
"baseline": {
"pred_answer": "6.25",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 19
}
},
{
"ex_id": "gsm8k-test-217",
"gold_raw": "30",
"baseline": {
"pred_answer": "34",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "24",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-218",
"gold_raw": "1920",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-219",
"gold_raw": "84",
"baseline": {
"pred_answer": "140",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-220",
"gold_raw": "8",
"baseline": {
"pred_answer": "24",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-221",
"gold_raw": "12",
"baseline": {
"pred_answer": "1200",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-222",
"gold_raw": "260",
"baseline": {
"pred_answer": "150",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-223",
"gold_raw": "288",
"baseline": {
"pred_answer": "330",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-224",
"gold_raw": "3",
"baseline": {
"pred_answer": "25",
"correct": false,
"n_gen_tokens": 12
},
"ablated": {
"pred_answer": "$",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-225",
"gold_raw": "1596",
"baseline": {
"pred_answer": "1620",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "400",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-226",
"gold_raw": "81",
"baseline": {
"pred_answer": "2700",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-227",
"gold_raw": "56",
"baseline": {
"pred_answer": "108",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "60",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-228",
"gold_raw": "1490",
"baseline": {
"pred_answer": "200",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "5",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-229",
"gold_raw": "2",
"baseline": {
"pred_answer": "1",
"correct": false,
"n_gen_tokens": 15
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-230",
"gold_raw": "20",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "55",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-231",
"gold_raw": "11",
"baseline": {
"pred_answer": "10.5",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-232",
"gold_raw": "120",
"baseline": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "105",
"correct": false,
"n_gen_tokens": 6
}
},
{
"ex_id": "gsm8k-test-233",
"gold_raw": "45",
"baseline": {
"pred_answer": "100",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-234",
"gold_raw": "10",
"baseline": {
"pred_answer": "8",
"correct": false,
"n_gen_tokens": 3
},
"ablated": {
"pred_answer": "30",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-235",
"gold_raw": "9",
"baseline": {
"pred_answer": "6.75",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "6.5",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-236",
"gold_raw": "33",
"baseline": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "13",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-237",
"gold_raw": "150",
"baseline": {
"pred_answer": "150",
"correct": true,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "25",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-238",
"gold_raw": "60",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-239",
"gold_raw": "4",
"baseline": {
"pred_answer": "9",
"correct": false,
"n_gen_tokens": 11
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-240",
"gold_raw": "7",
"baseline": {
"pred_answer": "24",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-241",
"gold_raw": "3140",
"baseline": {
"pred_answer": "1200",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "20000",
"correct": false,
"n_gen_tokens": 17
}
},
{
"ex_id": "gsm8k-test-242",
"gold_raw": "19",
"baseline": {
"pred_answer": "15",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 1
}
},
{
"ex_id": "gsm8k-test-243",
"gold_raw": "6",
"baseline": {
"pred_answer": "3.75",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "4",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-244",
"gold_raw": "90",
"baseline": {
"pred_answer": "120",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "180",
"correct": false,
"n_gen_tokens": 5
}
},
{
"ex_id": "gsm8k-test-245",
"gold_raw": "10",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-246",
"gold_raw": "130000",
"baseline": {
"pred_answer": "000",
"correct": false,
"n_gen_tokens": 10
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-247",
"gold_raw": "10",
"baseline": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "70",
"correct": false,
"n_gen_tokens": 19
}
},
{
"ex_id": "gsm8k-test-248",
"gold_raw": "525",
"baseline": {
"pred_answer": "105",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "18",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-249",
"gold_raw": "180",
"baseline": {
"pred_answer": "130",
"correct": false,
"n_gen_tokens": 5
},
"ablated": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-250",
"gold_raw": "1200",
"baseline": {
"pred_answer": "1200",
"correct": true,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 3
}
},
{
"ex_id": "gsm8k-test-251",
"gold_raw": "25",
"baseline": {
"pred_answer": "20",
"correct": false,
"n_gen_tokens": 4
},
"ablated": {
"pred_answer": "45",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-252",
"gold_raw": "21",
"baseline": {
"pred_answer": "3",
"correct": false,
"n_gen_tokens": 64
},
"ablated": {
"pred_answer": "",
"correct": false,
"n_gen_tokens": 2
}
},
{
"ex_id": "gsm8k-test-253",
"gold_raw": "2304",
"baseline": {
"pred_answer": "192",
"correct": false,
"n_gen_tokens": 16
},
"ablated": {
"pred_answer": "1",
"correct": false,
"n_gen_tokens": 64
}
},
{
"ex_id": "gsm8k-test-254",
"gold_raw": "2325",
"baseline": {
"pred_answer": "1250",
"correct": false,
"n_gen_tokens": 6
},
"ablated": {
"pred_answer": "<number>",
"correct": false,
"n_gen_tokens": 4
}
},
{
"ex_id": "gsm8k-test-255",
"gold_raw": "15",
"baseline": {
"pred_answer": "12",
"correct": false,
"n_gen_tokens": 8
},
"ablated": {
"pred_answer": "2",
"correct": false,
"n_gen_tokens": 64
}
}
],
"flip_rows": [
{
"ex_id": "gsm8k-test-1",
"gold_raw": "80",
"baseline": {
"pred_answer": "80",
"correct": true
},
"ablated": {
"pred_answer": "",
"correct": false
},
"patched_self": {
"pred_answer": "80",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "80",
"correct": true
},
"control_shared_randvec": {
"pred_answer": "200",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "",
"correct": false
}
},
{
"ex_id": "gsm8k-test-12",
"gold_raw": "100",
"baseline": {
"pred_answer": "100",
"correct": true
},
"ablated": {
"pred_answer": "120",
"correct": false
},
"patched_self": {
"pred_answer": "100",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "100",
"correct": true
},
"control_shared_randvec": {
"pred_answer": "60",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "2",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "120",
"correct": false
}
},
{
"ex_id": "gsm8k-test-18",
"gold_raw": "3",
"baseline": {
"pred_answer": "3",
"correct": true
},
"ablated": {
"pred_answer": "2",
"correct": false
},
"patched_self": {
"pred_answer": "3",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "3",
"correct": true
},
"control_shared_randvec": {
"pred_answer": "4",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "2",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "2",
"correct": false
}
},
{
"ex_id": "gsm8k-test-60",
"gold_raw": "5",
"baseline": {
"pred_answer": "5",
"correct": true
},
"ablated": {
"pred_answer": "65",
"correct": false
},
"patched_self": {
"pred_answer": "5",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "58",
"correct": false
},
"control_shared_randvec": {
"pred_answer": "<number>",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "10",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "65",
"correct": false
}
},
{
"ex_id": "gsm8k-test-69",
"gold_raw": "15",
"baseline": {
"pred_answer": "15",
"correct": true
},
"ablated": {
"pred_answer": "",
"correct": false
},
"patched_self": {
"pred_answer": "15",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "15",
"correct": true
},
"control_shared_randvec": {
"pred_answer": "",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "",
"correct": false
}
},
{
"ex_id": "gsm8k-test-148",
"gold_raw": "300",
"baseline": {
"pred_answer": "300",
"correct": true
},
"ablated": {
"pred_answer": "100",
"correct": false
},
"patched_self": {
"pred_answer": "300",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "300",
"correct": true
},
"control_shared_randvec": {
"pred_answer": "100",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "100",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "100",
"correct": false
}
},
{
"ex_id": "gsm8k-test-178",
"gold_raw": "50",
"baseline": {
"pred_answer": "50",
"correct": true
},
"ablated": {
"pred_answer": "35",
"correct": false
},
"patched_self": {
"pred_answer": "50",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "50",
"correct": true
},
"control_shared_randvec": {
"pred_answer": "35",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "35",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "35",
"correct": false
}
},
{
"ex_id": "gsm8k-test-237",
"gold_raw": "150",
"baseline": {
"pred_answer": "150",
"correct": true
},
"ablated": {
"pred_answer": "25",
"correct": false
},
"patched_self": {
"pred_answer": "150",
"correct": true
},
"control_time_shuffled": {
"pred_answer": "150",
"correct": true
},
"control_shared_randvec": {
"pred_answer": "",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "2",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "25",
"correct": false
}
},
{
"ex_id": "gsm8k-test-250",
"gold_raw": "1200",
"baseline": {
"pred_answer": "1200",
"correct": true
},
"ablated": {
"pred_answer": "",
"correct": false
},
"patched_self": {
"pred_answer": "40",
"correct": false
},
"control_time_shuffled": {
"pred_answer": "40",
"correct": false
},
"control_shared_randvec": {
"pred_answer": "",
"correct": false
},
"control_rand_subspace": {
"pred_answer": "400",
"correct": false
},
"control_patch_nonshared": {
"pred_answer": "",
"correct": false
}
}
]
}