decodeshare / artifacts /patch_back /results /openanswer /gsm8k_pairlogprob.json
Zishan-Shao's picture
Upload folder using huggingface_hub
aa0e435 verified
{
"meta": {
"model": "meta-llama/Llama-2-7b-chat-hf",
"device": "cuda",
"dtype": "fp32",
"layer": 10,
"layers_path": "model.layers",
"seed": 123,
"task": "gsm8k",
"eval_mode": "pair_logprob",
"eval_meta": {
"subspace_split": null,
"eval_split": "test",
"available_splits": [
"train",
"test"
],
"hf_id": "gsm8k/main"
},
"n_eval_loaded": 256,
"n_scanned": 256,
"base_acc_scan": 0.625,
"ablt_acc_scan": 0.59375,
"flips_total": 31,
"flips_used": 31,
"patch_steps": [
0
],
"patch_n_steps": 1,
"Qs_path": "Q_shared_layer10.npy",
"Qs_shape": [
4096,
97
],
"gold_text_prefix": " ",
"dist_text_prefix": " ",
"gold_max_tokens": 0,
"distractor_mode": "next_gold",
"answer_prefix_effective": "\nFinal answer:",
"max_new_tokens_effective": 64,
"run_coeff_controls": false,
"use_benchmark_loader": true,
"hf_id": "",
"hf_split": "test"
},
"summary_on_flips": {
"patched_self": {
"n": 31,
"rescued": 11,
"rescued_pct": 35.483870967741936,
"mean_delta_margin_vs_ablated": 1.5506091117858887,
"median_delta_margin_vs_ablated": 1.8901087045669556
},
"control_time_shuffled": {
"n": 31,
"rescued": 11,
"rescued_pct": 35.483870967741936,
"mean_delta_margin_vs_ablated": 1.536723256111145,
"median_delta_margin_vs_ablated": 1.8725682497024536
},
"control_shared_randvec": {
"n": 31,
"rescued": 1,
"rescued_pct": 3.225806451612903,
"mean_delta_margin_vs_ablated": -0.34435272216796875,
"median_delta_margin_vs_ablated": -0.03446388244628906
},
"control_rand_subspace": {
"n": 31,
"rescued": 0,
"rescued_pct": 0.0,
"mean_delta_margin_vs_ablated": -0.26384562253952026,
"median_delta_margin_vs_ablated": -0.18717603385448456
},
"control_patch_nonshared": {
"n": 31,
"rescued": 0,
"rescued_pct": 0.0,
"mean_delta_margin_vs_ablated": 8.61082810388325e-07,
"median_delta_margin_vs_ablated": 1.0728836059570312e-06
}
},
"scan_rows": [
{
"ex_id": "gsm8k-test-0",
"gold_norm": "50",
"dist_norm": "80",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.0485178977251053,
"lp_gold": -13.375523149967194,
"lp_dist": -15.424041047692299,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.2319180071353912,
"lp_gold": -6.67494124174118,
"lp_dist": -8.906859248876572,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-1",
"gold_norm": "80",
"dist_norm": "12",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.218225084245205,
"lp_gold": -16.316218174993992,
"lp_dist": -17.534443259239197,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.6342043727636337,
"lp_gold": -18.493512138724327,
"lp_dist": -17.859307765960693,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-2",
"gold_norm": "12",
"dist_norm": "140",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.582239151000977,
"lp_gold": -19.479307651519775,
"lp_dist": -29.061546802520752,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.06528377532959,
"lp_gold": -16.74149775505066,
"lp_dist": -20.80678153038025,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-3",
"gold_norm": "140",
"dist_norm": "36",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.5653446912765503,
"lp_gold": -19.955466985702515,
"lp_dist": -17.390122294425964,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.37339717149734497,
"lp_gold": -14.124846756458282,
"lp_dist": -13.751449584960938,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-4",
"gold_norm": "36",
"dist_norm": "3200",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 17.496737867593765,
"lp_gold": -13.73099598288536,
"lp_dist": -31.227733850479126,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 9.695431232452393,
"lp_gold": -7.723996877670288,
"lp_dist": -17.41942811012268,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-5",
"gold_norm": "3200",
"dist_norm": "38",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.075981711270288,
"lp_gold": -15.808944131014869,
"lp_dist": -19.884925842285156,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.112908275797963,
"lp_gold": -17.281133087351918,
"lp_dist": -15.168224811553955,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-6",
"gold_norm": "38",
"dist_norm": "32",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.0733052492141724,
"lp_gold": -17.57793438434601,
"lp_dist": -15.504629135131836,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.1486949920654297,
"lp_gold": -20.525099754333496,
"lp_dist": -20.376404762268066,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-7",
"gold_norm": "32",
"dist_norm": "92",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.4189205169677734,
"lp_gold": -16.66067409515381,
"lp_dist": -20.079594612121582,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.4474713802337646,
"lp_gold": -15.954271793365479,
"lp_dist": -18.401743173599243,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-8",
"gold_norm": "92",
"dist_norm": "16",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.1885854713618755,
"lp_gold": -20.10318946838379,
"lp_dist": -16.914603997021914,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.5822286009788513,
"lp_gold": -15.157714128494263,
"lp_dist": -12.575485527515411,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-9",
"gold_norm": "16",
"dist_norm": "45",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.206469178199768,
"lp_gold": -20.85190773010254,
"lp_dist": -17.64543855190277,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.272299289703369,
"lp_gold": -11.194756746292114,
"lp_dist": -13.467056035995483,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-10",
"gold_norm": "45",
"dist_norm": "270",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 11.154298067092896,
"lp_gold": -17.49683403968811,
"lp_dist": -28.651132106781006,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.5754551887512207,
"lp_gold": -13.103037357330322,
"lp_dist": -16.678492546081543,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-11",
"gold_norm": "270",
"dist_norm": "100",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.8893778324127197,
"lp_gold": -21.884052515029907,
"lp_dist": -17.994674682617188,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.458778917789459,
"lp_gold": -14.477847814559937,
"lp_dist": -10.019068896770477,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-12",
"gold_norm": "100",
"dist_norm": "25",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.500066578388214,
"lp_gold": -11.581663310527802,
"lp_dist": -17.081729888916016,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.39761683344841,
"lp_gold": -9.308580189943314,
"lp_dist": -11.706197023391724,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-13",
"gold_norm": "25",
"dist_norm": "800",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 20.992703570984304,
"lp_gold": -13.314849936403334,
"lp_dist": -34.30755350738764,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.717362227849662,
"lp_gold": -11.016974148340523,
"lp_dist": -18.734336376190186,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-14",
"gold_norm": "800",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.7909989710897207,
"lp_gold": -10.428668463602662,
"lp_dist": -13.219667434692383,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.28849396109581,
"lp_gold": -15.488610118627548,
"lp_dist": -9.200116157531738,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-15",
"gold_norm": "2",
"dist_norm": "7000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 14.264829635620117,
"lp_gold": -12.60490345954895,
"lp_dist": -26.869733095169067,
"n_tokens_gold": 2,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 10.93172574043274,
"lp_gold": -10.274073839187622,
"lp_dist": -21.20579957962036,
"n_tokens_gold": 2,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-16",
"gold_norm": "7000",
"dist_norm": "25",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.534815393853933,
"lp_gold": -21.196847282815725,
"lp_dist": -16.662031888961792,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.092950224876404,
"lp_gold": -21.74782168865204,
"lp_dist": -14.654871463775635,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-17",
"gold_norm": "25",
"dist_norm": "3",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.14834189414978,
"lp_gold": -15.3827223777771,
"lp_dist": -11.23438048362732,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.8233906030654907,
"lp_gold": -9.256547331809998,
"lp_dist": -8.433156728744507,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-18",
"gold_norm": "3",
"dist_norm": "3430",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 31.8187518119812,
"lp_gold": -10.239798672497272,
"lp_dist": -42.058550484478474,
"n_tokens_gold": 2,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 16.154653549194336,
"lp_gold": -7.938319206237793,
"lp_dist": -24.09297275543213,
"n_tokens_gold": 2,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-19",
"gold_norm": "3430",
"dist_norm": "106",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.379680693149567,
"lp_gold": -23.033769607543945,
"lp_dist": -18.65408891439438,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -9.992487907409668,
"lp_gold": -21.179072380065918,
"lp_dist": -11.18658447265625,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-20",
"gold_norm": "106",
"dist_norm": "80",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.92336449585855,
"lp_gold": -15.76830449141562,
"lp_dist": -20.69166898727417,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.6747859213501215,
"lp_gold": -17.988985607400537,
"lp_dist": -17.314199686050415,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-21",
"gold_norm": "80",
"dist_norm": "26",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.6582831740379333,
"lp_gold": -10.863374054431915,
"lp_dist": -13.521657228469849,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.6402748823165894,
"lp_gold": -11.24216091632843,
"lp_dist": -12.88243579864502,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-22",
"gold_norm": "26",
"dist_norm": "750",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.5539721846580505,
"lp_gold": -21.11834144592285,
"lp_dist": -25.672313630580902,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.063919067382812,
"lp_gold": -12.24570107460022,
"lp_dist": -20.309620141983032,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-23",
"gold_norm": "750",
"dist_norm": "9",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.4596693105995655,
"lp_gold": -14.162512499839067,
"lp_dist": -13.702843189239502,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.3531132936477661,
"lp_gold": -11.305097699165344,
"lp_dist": -10.951984405517578,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-24",
"gold_norm": "9",
"dist_norm": "40",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.006132304668426514,
"lp_gold": -15.839151382446289,
"lp_dist": -15.845283687114716,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.110446274280548,
"lp_gold": -12.25863265991211,
"lp_dist": -10.148186385631561,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-25",
"gold_norm": "40",
"dist_norm": "14",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.5176091194152832,
"lp_gold": -16.009315252304077,
"lp_dist": -15.491706132888794,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.6056453585624695,
"lp_gold": -14.140560686588287,
"lp_dist": -15.746206045150757,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-26",
"gold_norm": "14",
"dist_norm": "160",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 13.092049598693848,
"lp_gold": -12.284036666154861,
"lp_dist": -25.37608626484871,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.250454902648926,
"lp_gold": -11.187321424484253,
"lp_dist": -22.43777632713318,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-27",
"gold_norm": "160",
"dist_norm": "6",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.597537249326706,
"lp_gold": -12.841732293367386,
"lp_dist": -14.439269542694092,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.338132083415985,
"lp_gold": -17.455387771129608,
"lp_dist": -10.117255687713623,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-28",
"gold_norm": "6",
"dist_norm": "132",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.4012770652771,
"lp_gold": -12.933898210525513,
"lp_dist": -18.335175275802612,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.1544095277786255,
"lp_gold": -9.050714015960693,
"lp_dist": -16.20512354373932,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-29",
"gold_norm": "132",
"dist_norm": "8",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -8.112765461206436,
"lp_gold": -18.76314067840576,
"lp_dist": -10.650375217199326,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.735072135925293,
"lp_gold": -16.513195633888245,
"lp_dist": -7.778123497962952,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-30",
"gold_norm": "8",
"dist_norm": "68",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.2700021266937256,
"lp_gold": -11.72844409942627,
"lp_dist": -13.998446226119995,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 12.666181564331055,
"lp_gold": -7.928534984588623,
"lp_dist": -20.594716548919678,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-31",
"gold_norm": "68",
"dist_norm": "31",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.820281505584717,
"lp_gold": -15.837103843688965,
"lp_dist": -19.65738534927368,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.25214481353759766,
"lp_gold": -12.841001033782959,
"lp_dist": -12.588856220245361,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-32",
"gold_norm": "31",
"dist_norm": "100",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.4860659539699554,
"lp_gold": -13.60796919465065,
"lp_dist": -20.094035148620605,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.465068936347961,
"lp_gold": -14.221534967422485,
"lp_dist": -18.686603903770447,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-33",
"gold_norm": "100",
"dist_norm": "1509",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 13.144955581985414,
"lp_gold": -13.722247913479805,
"lp_dist": -26.86720349546522,
"n_tokens_gold": 4,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.218970347195864,
"lp_gold": -9.529480028897524,
"lp_dist": -22.748450376093388,
"n_tokens_gold": 4,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-34",
"gold_norm": "1509",
"dist_norm": "480",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.574819326400757,
"lp_gold": -23.19943141937256,
"lp_dist": -16.6246120929718,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.709552764892578,
"lp_gold": -18.728264808654785,
"lp_dist": -11.018712043762207,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-35",
"gold_norm": "480",
"dist_norm": "520",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.5111888945102692,
"lp_gold": -12.033819317817688,
"lp_dist": -13.545008212327957,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.8960548639297485,
"lp_gold": -19.025392055511475,
"lp_dist": -16.129337191581726,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-36",
"gold_norm": "520",
"dist_norm": "3",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.531530350446701,
"lp_gold": -14.413595885038376,
"lp_dist": -11.882065534591675,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.166505575180054,
"lp_gold": -12.240307569503784,
"lp_dist": -8.07380199432373,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-37",
"gold_norm": "3",
"dist_norm": "33",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.900035858154297,
"lp_gold": -12.652887344360352,
"lp_dist": -20.55292320251465,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.076467990875244,
"lp_gold": -7.961295485496521,
"lp_dist": -14.037763476371765,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-38",
"gold_norm": "33",
"dist_norm": "120",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 15.355147242546082,
"lp_gold": -12.304473280906677,
"lp_dist": -27.65962052345276,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 10.39077889919281,
"lp_gold": -11.524258255958557,
"lp_dist": -21.915037155151367,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-39",
"gold_norm": "120",
"dist_norm": "14",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.22332683950662613,
"lp_gold": -14.252640329301357,
"lp_dist": -14.475967168807983,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.3626268804073334,
"lp_gold": -10.953217655420303,
"lp_dist": -13.315844535827637,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-40",
"gold_norm": "14",
"dist_norm": "20",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -7.754596941173077,
"lp_gold": -17.770805835723877,
"lp_dist": -10.0162088945508,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.5667039155960083,
"lp_gold": -8.350147247314453,
"lp_dist": -7.783443331718445,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-41",
"gold_norm": "20",
"dist_norm": "95200",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 33.85109522007406,
"lp_gold": -16.739925840869546,
"lp_dist": -50.5910210609436,
"n_tokens_gold": 3,
"n_tokens_dist": 6
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 23.679821968078613,
"lp_gold": -9.72859787940979,
"lp_dist": -33.4084198474884,
"n_tokens_gold": 3,
"n_tokens_dist": 6
}
},
{
"ex_id": "gsm8k-test-42",
"gold_norm": "95200",
"dist_norm": "77",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.7502023852430284,
"lp_gold": -19.08837911253795,
"lp_dist": -16.338176727294922,
"n_tokens_gold": 6,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.2610103897750378,
"lp_gold": -21.381718140095472,
"lp_dist": -18.120707750320435,
"n_tokens_gold": 6,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-43",
"gold_norm": "77",
"dist_norm": "81",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.041521549224853516,
"lp_gold": -20.968489170074463,
"lp_dist": -20.92696762084961,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.2143611907958984,
"lp_gold": -14.122482776641846,
"lp_dist": -16.336843967437744,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-44",
"gold_norm": "81",
"dist_norm": "310",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 8.942630738019943,
"lp_gold": -12.305748492479324,
"lp_dist": -21.248379230499268,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.2553623914718628,
"lp_gold": -13.983943223953247,
"lp_dist": -14.23930561542511,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-45",
"gold_norm": "310",
"dist_norm": "100",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9456039071083069,
"lp_gold": -16.172270894050598,
"lp_dist": -17.117874801158905,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.856696009635925,
"lp_gold": -16.36608850955963,
"lp_dist": -10.509392499923706,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-46",
"gold_norm": "100",
"dist_norm": "160",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.3894251135061495,
"lp_gold": -12.40682859485969,
"lp_dist": -15.796253708365839,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.0833441019058228,
"lp_gold": -16.92129546403885,
"lp_dist": -19.00463956594467,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-47",
"gold_norm": "160",
"dist_norm": "25",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.824264804366976,
"lp_gold": -14.298029144760221,
"lp_dist": -15.122293949127197,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.2988634258508682,
"lp_gold": -13.753293856978416,
"lp_dist": -14.052157282829285,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-48",
"gold_norm": "25",
"dist_norm": "1400",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.125034153461456,
"lp_gold": -16.447975158691406,
"lp_dist": -23.573009312152863,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.08089584112167358,
"lp_gold": -13.430449962615967,
"lp_dist": -13.349554121494293,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-49",
"gold_norm": "1400",
"dist_norm": "120",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.9659185571945272,
"lp_gold": -15.535773673269432,
"lp_dist": -12.569855116074905,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.700849339365959,
"lp_gold": -12.968689993023872,
"lp_dist": -9.267840653657913,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-50",
"gold_norm": "120",
"dist_norm": "48",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.5417392253875732,
"lp_gold": -22.00163245201111,
"lp_dist": -25.54337167739868,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.251087546348572,
"lp_gold": -15.896643280982971,
"lp_dist": -21.147730827331543,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-51",
"gold_norm": "48",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9340271949768066,
"lp_gold": -11.738685846328735,
"lp_dist": -12.672713041305542,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.7094523906707764,
"lp_gold": -6.564473628997803,
"lp_dist": -8.273926019668579,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-52",
"gold_norm": "50",
"dist_norm": "15400",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 19.066895127296448,
"lp_gold": -13.457320094108582,
"lp_dist": -32.52421522140503,
"n_tokens_gold": 3,
"n_tokens_dist": 6
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.942362904548645,
"lp_gold": -15.37247109413147,
"lp_dist": -22.314833998680115,
"n_tokens_gold": 3,
"n_tokens_dist": 6
}
},
{
"ex_id": "gsm8k-test-53",
"gold_norm": "15400",
"dist_norm": "80",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.8634248977759853,
"lp_gold": -20.78267443238292,
"lp_dist": -18.919249534606934,
"n_tokens_gold": 6,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.060255475342274,
"lp_gold": -20.97295517474413,
"lp_dist": -16.912699699401855,
"n_tokens_gold": 6,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-54",
"gold_norm": "80",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.1981298923492432,
"lp_gold": -18.350556135177612,
"lp_dist": -17.15242624282837,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.7748947478830814,
"lp_gold": -7.663371529430151,
"lp_dist": -11.438266277313232,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-55",
"gold_norm": "5",
"dist_norm": "14",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.197003062348813,
"lp_gold": -18.496329307556152,
"lp_dist": -12.29932624520734,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.144104599952698,
"lp_gold": -7.03524386882782,
"lp_dist": -14.179348468780518,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-56",
"gold_norm": "14",
"dist_norm": "31",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 8.266514074697625,
"lp_gold": -18.428703057870734,
"lp_dist": -26.69521713256836,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.1581010818481445,
"lp_gold": -12.810563087463379,
"lp_dist": -16.968664169311523,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-57",
"gold_norm": "31",
"dist_norm": "36",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.7361334562301636,
"lp_gold": -14.841211199760437,
"lp_dist": -13.105077743530273,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.6261711120605469,
"lp_gold": -13.683454990386963,
"lp_dist": -13.057283878326416,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-58",
"gold_norm": "36",
"dist_norm": "144",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.37734442949295044,
"lp_gold": -15.673691511154175,
"lp_dist": -16.051035940647125,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.521233081817627,
"lp_gold": -17.215554237365723,
"lp_dist": -20.73678731918335,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-59",
"gold_norm": "144",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.0488511323928833,
"lp_gold": -15.628765225410461,
"lp_dist": -13.579914093017578,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.597236156463623,
"lp_gold": -16.582991123199463,
"lp_dist": -13.98575496673584,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-60",
"gold_norm": "5",
"dist_norm": "750",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 11.9340181350708,
"lp_gold": -15.854983806610107,
"lp_dist": -27.789001941680908,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.577134609222412,
"lp_gold": -16.538414001464844,
"lp_dist": -24.115548610687256,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-61",
"gold_norm": "750",
"dist_norm": "38",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.7815818190574646,
"lp_gold": -18.73184484243393,
"lp_dist": -17.950263023376465,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.5384882092475891,
"lp_gold": -11.338704288005829,
"lp_dist": -11.877192497253418,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-62",
"gold_norm": "38",
"dist_norm": "48",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.48445200920105,
"lp_gold": -17.717634916305542,
"lp_dist": -11.233182907104492,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.8313806354999542,
"lp_gold": -12.223527193069458,
"lp_dist": -8.392146557569504,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-63",
"gold_norm": "48",
"dist_norm": "655",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 19.08035659790039,
"lp_gold": -14.92322301864624,
"lp_dist": -34.00357961654663,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.06407618522644,
"lp_gold": -10.739889144897461,
"lp_dist": -23.8039653301239,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-64",
"gold_norm": "655",
"dist_norm": "800",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.8698419332504272,
"lp_gold": -17.930187582969666,
"lp_dist": -19.800029516220093,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4062272310256958,
"lp_gold": -14.47088611125946,
"lp_dist": -13.064658880233765,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-65",
"gold_norm": "800",
"dist_norm": "7300",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 11.858935464173555,
"lp_gold": -13.705154906958342,
"lp_dist": -25.564090371131897,
"n_tokens_gold": 4,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.260221555829048,
"lp_gold": -14.01822917163372,
"lp_dist": -25.27845072746277,
"n_tokens_gold": 4,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-66",
"gold_norm": "7300",
"dist_norm": "48",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.354404352605343,
"lp_gold": -20.02805521339178,
"lp_dist": -17.673650860786438,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.008578598499298096,
"lp_gold": -13.163566768169403,
"lp_dist": -13.172145366668701,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-67",
"gold_norm": "48",
"dist_norm": "4",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.25957900285720825,
"lp_gold": -17.536937534809113,
"lp_dist": -17.277358531951904,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.7365117073059082,
"lp_gold": -10.189218521118164,
"lp_dist": -8.452706813812256,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-68",
"gold_norm": "4",
"dist_norm": "15",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.0612575560808182,
"lp_gold": -15.63377046585083,
"lp_dist": -14.572512909770012,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.9296765327453613,
"lp_gold": -9.090213418006897,
"lp_dist": -12.019889950752258,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-69",
"gold_norm": "15",
"dist_norm": "23",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 8.69707328081131,
"lp_gold": -14.628733813762665,
"lp_dist": -23.325807094573975,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.665110111236572,
"lp_gold": -12.951319694519043,
"lp_dist": -17.616429805755615,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-70",
"gold_norm": "23",
"dist_norm": "225",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 11.066251754760742,
"lp_gold": -16.185874462127686,
"lp_dist": -27.252126216888428,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.176498889923096,
"lp_gold": -14.897132635116577,
"lp_dist": -20.073631525039673,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-71",
"gold_norm": "225",
"dist_norm": "15",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.3921156525611877,
"lp_gold": -16.729829609394073,
"lp_dist": -13.337713956832886,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.107092618942261,
"lp_gold": -17.290175914764404,
"lp_dist": -11.183083295822144,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-72",
"gold_norm": "15",
"dist_norm": "82",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.4555931091308594,
"lp_gold": -13.475011110305786,
"lp_dist": -16.930604219436646,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 9.554435849189758,
"lp_gold": -11.591153025627136,
"lp_dist": -21.145588874816895,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-73",
"gold_norm": "82",
"dist_norm": "1218",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 10.144330263137817,
"lp_gold": -14.399481773376465,
"lp_dist": -24.543812036514282,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.632067203521729,
"lp_gold": -11.898912191390991,
"lp_dist": -25.53097939491272,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-74",
"gold_norm": "1218",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -14.016261577606201,
"lp_gold": -28.183964252471924,
"lp_dist": -14.167702674865723,
"n_tokens_gold": 5,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -9.392110228538513,
"lp_gold": -21.020013689994812,
"lp_dist": -11.627903461456299,
"n_tokens_gold": 5,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-75",
"gold_norm": "2",
"dist_norm": "36",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.396540880203247,
"lp_gold": -13.256061553955078,
"lp_dist": -15.652602434158325,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.963439464569092,
"lp_gold": -9.20676326751709,
"lp_dist": -16.17020273208618,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-76",
"gold_norm": "36",
"dist_norm": "13",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.540708303451538,
"lp_gold": -18.379968881607056,
"lp_dist": -14.839260578155518,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.6864776611328125,
"lp_gold": -14.157576084136963,
"lp_dist": -13.47109842300415,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-77",
"gold_norm": "13",
"dist_norm": "11",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.3661365509033203,
"lp_gold": -15.502496480941772,
"lp_dist": -15.868633031845093,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.4615020751953125,
"lp_gold": -14.546976089477539,
"lp_dist": -15.008478164672852,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-78",
"gold_norm": "11",
"dist_norm": "8",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.7840418815612793,
"lp_gold": -21.872905254364014,
"lp_dist": -18.088863372802734,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.002429008483887,
"lp_gold": -13.64483380317688,
"lp_dist": -9.642404794692993,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-79",
"gold_norm": "8",
"dist_norm": "440",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.767229557037354,
"lp_gold": -17.545647621154785,
"lp_dist": -34.31287717819214,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.454898834228516,
"lp_gold": -12.977333545684814,
"lp_dist": -24.43223237991333,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-80",
"gold_norm": "440",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.289784381631762,
"lp_gold": -15.706766793970019,
"lp_dist": -13.416982412338257,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.212664246559143,
"lp_gold": -17.012100338935852,
"lp_dist": -13.799436092376709,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-81",
"gold_norm": "2",
"dist_norm": "45",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.003484487533569,
"lp_gold": -13.458641052246094,
"lp_dist": -20.462125539779663,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.808353066444397,
"lp_gold": -8.184542536735535,
"lp_dist": -13.992895603179932,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-82",
"gold_norm": "45",
"dist_norm": "54",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.24556124210357666,
"lp_gold": -15.131654500961304,
"lp_dist": -14.886093258857727,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.5171573162078857,
"lp_gold": -10.597485780715942,
"lp_dist": -12.114643096923828,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-83",
"gold_norm": "54",
"dist_norm": "6",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.36360502243042,
"lp_gold": -13.119836330413818,
"lp_dist": -10.756231307983398,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.9401865005493164,
"lp_gold": -13.909927368164062,
"lp_dist": -10.969740867614746,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-84",
"gold_norm": "6",
"dist_norm": "240",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.525604009628296,
"lp_gold": -14.586916446685791,
"lp_dist": -31.112520456314087,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 12.351407527923584,
"lp_gold": -8.294451236724854,
"lp_dist": -20.645858764648438,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-85",
"gold_norm": "240",
"dist_norm": "428",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.200618744827807,
"lp_gold": -10.596241324208677,
"lp_dist": -15.796860069036484,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.575641840696335,
"lp_gold": -10.782865315675735,
"lp_dist": -16.35850715637207,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-86",
"gold_norm": "428",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -11.445145592093468,
"lp_gold": -21.372050523757935,
"lp_dist": -9.926904931664467,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.8857234716415405,
"lp_gold": -13.641488909721375,
"lp_dist": -10.755765438079834,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-87",
"gold_norm": "5",
"dist_norm": "255",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 13.83140754699707,
"lp_gold": -11.654325008392334,
"lp_dist": -25.485732555389404,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 15.030074834823608,
"lp_gold": -10.351794719696045,
"lp_dist": -25.381869554519653,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-88",
"gold_norm": "255",
"dist_norm": "10",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.704339981079102,
"lp_gold": -22.688746690750122,
"lp_dist": -17.98440670967102,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.790473118424416,
"lp_gold": -23.528611078858376,
"lp_dist": -18.73813796043396,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-89",
"gold_norm": "10",
"dist_norm": "9",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.6458263397216797,
"lp_gold": -14.022311687469482,
"lp_dist": -12.376485347747803,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.548365592956543,
"lp_gold": -11.53894329071045,
"lp_dist": -9.990577697753906,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-90",
"gold_norm": "9",
"dist_norm": "157",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 11.838926374912262,
"lp_gold": -14.729028940200806,
"lp_dist": -26.567955315113068,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.449068486690521,
"lp_gold": -13.700207710266113,
"lp_dist": -18.149276196956635,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-91",
"gold_norm": "157",
"dist_norm": "56",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.06636106967926025,
"lp_gold": -13.999522089958191,
"lp_dist": -13.93316102027893,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.18358194828033447,
"lp_gold": -12.361805081367493,
"lp_dist": -12.178223133087158,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-92",
"gold_norm": "56",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.014044851064682007,
"lp_gold": -13.622624963521957,
"lp_dist": -13.608580112457275,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.375391006469727,
"lp_gold": -15.001872539520264,
"lp_dist": -9.626481533050537,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-93",
"gold_norm": "5",
"dist_norm": "144",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 10.107487440109253,
"lp_gold": -21.01281452178955,
"lp_dist": -31.120301961898804,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.960463047027588,
"lp_gold": -8.431816339492798,
"lp_dist": -20.392279386520386,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-94",
"gold_norm": "144",
"dist_norm": "50",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.8007860428187996,
"lp_gold": -16.124470019945875,
"lp_dist": -15.323683977127075,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.586140275001526,
"lp_gold": -13.455833077430725,
"lp_dist": -10.8696928024292,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-95",
"gold_norm": "50",
"dist_norm": "4",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.431166723370552,
"lp_gold": -14.887599676847458,
"lp_dist": -10.456432953476906,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.6977656185626984,
"lp_gold": -7.156943529844284,
"lp_dist": -8.854709148406982,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-96",
"gold_norm": "4",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.549594163894653,
"lp_gold": -12.547298669815063,
"lp_dist": -19.096892833709717,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.145283490419388,
"lp_gold": -8.248348951339722,
"lp_dist": -10.39363244175911,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-97",
"gold_norm": "50",
"dist_norm": "42",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.1262907013297081,
"lp_gold": -12.730935551226139,
"lp_dist": -12.857226252555847,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.2466068267822266,
"lp_gold": -9.356587171554565,
"lp_dist": -12.603193998336792,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-98",
"gold_norm": "42",
"dist_norm": "7",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -5.333731412887573,
"lp_gold": -18.121748208999634,
"lp_dist": -12.78801679611206,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.454429030418396,
"lp_gold": -11.41723620891571,
"lp_dist": -10.962807178497314,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-99",
"gold_norm": "7",
"dist_norm": "250",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 10.109447717666626,
"lp_gold": -17.65127396583557,
"lp_dist": -27.760721683502197,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.160389855504036,
"lp_gold": -12.602290153503418,
"lp_dist": -15.762680009007454,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-100",
"gold_norm": "250",
"dist_norm": "12",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.937235951423645,
"lp_gold": -16.930358290672302,
"lp_dist": -18.867594242095947,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.8317363262176514,
"lp_gold": -17.415368795394897,
"lp_dist": -15.583632469177246,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-101",
"gold_norm": "12",
"dist_norm": "7",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.4091547727584839,
"lp_gold": -11.118706822395325,
"lp_dist": -12.527861595153809,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.3088250160217285,
"lp_gold": -9.039770126342773,
"lp_dist": -11.348595142364502,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-102",
"gold_norm": "7",
"dist_norm": "8",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.3529720306396484,
"lp_gold": -18.84420108795166,
"lp_dist": -17.49122905731201,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.1846144199371338,
"lp_gold": -10.204635620117188,
"lp_dist": -9.020021200180054,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-103",
"gold_norm": "8",
"dist_norm": "26",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 10.383567810058594,
"lp_gold": -15.868620872497559,
"lp_dist": -26.252188682556152,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.597115993499756,
"lp_gold": -10.359461784362793,
"lp_dist": -18.95657777786255,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-104",
"gold_norm": "26",
"dist_norm": "42",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.550231754779816,
"lp_gold": -16.079154193401337,
"lp_dist": -20.629385948181152,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.743180751800537,
"lp_gold": -20.936619758605957,
"lp_dist": -17.19343900680542,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-105",
"gold_norm": "42",
"dist_norm": "5",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.2448320388793945,
"lp_gold": -17.29369354248047,
"lp_dist": -19.538525581359863,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.48531031608581543,
"lp_gold": -14.891574144363403,
"lp_dist": -14.406263828277588,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-106",
"gold_norm": "5",
"dist_norm": "14400",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 17.918405055999756,
"lp_gold": -16.223863124847412,
"lp_dist": -34.14226818084717,
"n_tokens_gold": 2,
"n_tokens_dist": 6
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 18.689422607421875,
"lp_gold": -8.31445324420929,
"lp_dist": -27.003875851631165,
"n_tokens_gold": 2,
"n_tokens_dist": 6
}
},
{
"ex_id": "gsm8k-test-107",
"gold_norm": "14400",
"dist_norm": "400",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.8998411595821381,
"lp_gold": -19.079706698656082,
"lp_dist": -19.97954785823822,
"n_tokens_gold": 6,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.72913409024477,
"lp_gold": -22.703229255974293,
"lp_dist": -20.974095165729523,
"n_tokens_gold": 6,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-108",
"gold_norm": "400",
"dist_norm": "40",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.00030357998912222683,
"lp_gold": -12.683453394594835,
"lp_dist": -12.683149814605713,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.014066597446799278,
"lp_gold": -7.615564605221152,
"lp_dist": -7.601498007774353,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-109",
"gold_norm": "40",
"dist_norm": "83",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.819165468215942,
"lp_gold": -15.469541311264038,
"lp_dist": -21.28870677947998,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.161367028951645,
"lp_gold": -13.052747160196304,
"lp_dist": -21.21411418914795,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-110",
"gold_norm": "83",
"dist_norm": "10",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9284783601760864,
"lp_gold": -14.152065396308899,
"lp_dist": -15.080543756484985,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.6964447498321533,
"lp_gold": -11.795601606369019,
"lp_dist": -8.099156856536865,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-111",
"gold_norm": "10",
"dist_norm": "80",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.51949143409729,
"lp_gold": -16.031707048416138,
"lp_dist": -22.551198482513428,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.987100124359131,
"lp_gold": -14.24100637435913,
"lp_dist": -20.22810649871826,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-112",
"gold_norm": "80",
"dist_norm": "180",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.0028108435217291,
"lp_gold": -14.749124482274055,
"lp_dist": -15.751935325795785,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.4700966998934746,
"lp_gold": -9.443349197506905,
"lp_dist": -11.91344589740038,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-113",
"gold_norm": "180",
"dist_norm": "1450000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 21.25386095046997,
"lp_gold": -27.976417541503906,
"lp_dist": -49.23027849197388,
"n_tokens_gold": 4,
"n_tokens_dist": 8
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.907071352005005,
"lp_gold": -15.60246878862381,
"lp_dist": -29.509540140628815,
"n_tokens_gold": 4,
"n_tokens_dist": 8
}
},
{
"ex_id": "gsm8k-test-114",
"gold_norm": "1450000",
"dist_norm": "15",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -5.66898221289739,
"lp_gold": -18.9738236120902,
"lp_dist": -13.30484139919281,
"n_tokens_gold": 8,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.122156334575266,
"lp_gold": -18.38626338308677,
"lp_dist": -10.264107048511505,
"n_tokens_gold": 8,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-115",
"gold_norm": "15",
"dist_norm": "1000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 8.356005743145943,
"lp_gold": -21.951110124588013,
"lp_dist": -30.307115867733955,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.199884116649628,
"lp_gold": -11.521326780319214,
"lp_dist": -19.72121089696884,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-116",
"gold_norm": "1000",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.19174530085001606,
"lp_gold": -13.59133626993571,
"lp_dist": -13.399590969085693,
"n_tokens_gold": 5,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.19964181631803513,
"lp_gold": -10.637855164706707,
"lp_dist": -10.438213348388672,
"n_tokens_gold": 5,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-117",
"gold_norm": "2",
"dist_norm": "15",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.722217559814453,
"lp_gold": -18.274237632751465,
"lp_dist": -24.996455192565918,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.5563411712646484,
"lp_gold": -12.917238712310791,
"lp_dist": -14.47357988357544,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-118",
"gold_norm": "15",
"dist_norm": "100",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.2469024658203125,
"lp_gold": -14.20986533164978,
"lp_dist": -17.456767797470093,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.755454957485199,
"lp_gold": -7.566788613796234,
"lp_dist": -16.322243571281433,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-119",
"gold_norm": "100",
"dist_norm": "335",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.725587379769422,
"lp_gold": -11.31663225905504,
"lp_dist": -28.042219638824463,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.39561602845788,
"lp_gold": -11.48399594053626,
"lp_dist": -19.87961196899414,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-120",
"gold_norm": "335",
"dist_norm": "60",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.718258023262024,
"lp_gold": -16.84885323047638,
"lp_dist": -18.567111253738403,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.495100736618042,
"lp_gold": -16.78837823867798,
"lp_dist": -13.293277502059937,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-121",
"gold_norm": "60",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.7624173164367676,
"lp_gold": -16.792863368988037,
"lp_dist": -14.03044605255127,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.886390924453735,
"lp_gold": -13.833235025405884,
"lp_dist": -7.946844100952148,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-122",
"gold_norm": "5",
"dist_norm": "9500",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.83846201375127,
"lp_gold": -13.696074485778809,
"lp_dist": -30.534536499530077,
"n_tokens_gold": 2,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 15.658583138138056,
"lp_gold": -8.453100323677063,
"lp_dist": -24.11168346181512,
"n_tokens_gold": 2,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-123",
"gold_norm": "9500",
"dist_norm": "160",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.3972220839932561,
"lp_gold": -20.279614341445267,
"lp_dist": -19.88239225745201,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.62424199283123,
"lp_gold": -20.69040386378765,
"lp_dist": -18.06616187095642,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-124",
"gold_norm": "160",
"dist_norm": "1050",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.60233561617497,
"lp_gold": -18.274476603444782,
"lp_dist": -34.87681221961975,
"n_tokens_gold": 4,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.274514463730156,
"lp_gold": -13.8317128745839,
"lp_dist": -22.106227338314056,
"n_tokens_gold": 4,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-125",
"gold_norm": "1050",
"dist_norm": "91",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.0837584948167205,
"lp_gold": -23.45734657999128,
"lp_dist": -21.37358808517456,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.869788646697998,
"lp_gold": -20.34303617477417,
"lp_dist": -17.473247528076172,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-126",
"gold_norm": "91",
"dist_norm": "21",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.658190071582794,
"lp_gold": -20.631014347076416,
"lp_dist": -13.972824275493622,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.4234659671783447,
"lp_gold": -17.770225048065186,
"lp_dist": -15.34675908088684,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-127",
"gold_norm": "21",
"dist_norm": "20",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.925290822982788,
"lp_gold": -16.08545808121562,
"lp_dist": -14.160167258232832,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.1263790130615234,
"lp_gold": -13.073553562164307,
"lp_dist": -10.947174549102783,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-128",
"gold_norm": "20",
"dist_norm": "36",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.3922419548034668,
"lp_gold": -19.574571132659912,
"lp_dist": -19.96681308746338,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.179409921169281,
"lp_gold": -8.625289022922516,
"lp_dist": -10.804698944091797,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-129",
"gold_norm": "36",
"dist_norm": "36",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -12.731476545333862,
"lp_dist": -12.731476545333862,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -11.376878261566162,
"lp_dist": -11.376878261566162,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-130",
"gold_norm": "36",
"dist_norm": "10",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.552075147628784,
"lp_gold": -14.53696346282959,
"lp_dist": -11.984888315200806,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -11.998991012573242,
"lp_gold": -20.990919589996338,
"lp_dist": -8.991928577423096,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-131",
"gold_norm": "10",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -8.991529874503613,
"lp_gold": -16.921989023685455,
"lp_dist": -7.930459149181843,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.6315575018525124,
"lp_gold": -6.705006085336208,
"lp_dist": -8.33656358718872,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-132",
"gold_norm": "5",
"dist_norm": "32",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.572678565979004,
"lp_gold": -12.411277294158936,
"lp_dist": -18.98395586013794,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.04911994934082,
"lp_gold": -8.343387603759766,
"lp_dist": -12.392507553100586,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-133",
"gold_norm": "32",
"dist_norm": "18",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.6729400157928467,
"lp_gold": -15.0261971950531,
"lp_dist": -14.353257179260254,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.8697174787521362,
"lp_gold": -10.817826390266418,
"lp_dist": -11.687543869018555,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-134",
"gold_norm": "18",
"dist_norm": "4",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.9239641074091196,
"lp_gold": -15.171829616650939,
"lp_dist": -19.09579372406006,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.7061721086502075,
"lp_gold": -9.731460690498352,
"lp_dist": -9.025288581848145,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-135",
"gold_norm": "4",
"dist_norm": "48",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.912032127380371,
"lp_gold": -11.661430835723877,
"lp_dist": -21.573462963104248,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 10.949398040771484,
"lp_gold": -13.34396743774414,
"lp_dist": -24.293365478515625,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-136",
"gold_norm": "48",
"dist_norm": "8",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.2137904167175293,
"lp_gold": -13.246366620063782,
"lp_dist": -12.032576203346252,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.785146713256836,
"lp_gold": -15.54097604751587,
"lp_dist": -10.755829334259033,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-137",
"gold_norm": "8",
"dist_norm": "21",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.050054788589478,
"lp_gold": -14.328342199325562,
"lp_dist": -23.37839698791504,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.801905870437622,
"lp_gold": -9.647645235061646,
"lp_dist": -13.449551105499268,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-138",
"gold_norm": "21",
"dist_norm": "25",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.6713391542434692,
"lp_gold": -17.788984179496765,
"lp_dist": -20.460323333740234,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.8663175106048584,
"lp_gold": -14.334570407867432,
"lp_dist": -13.468252897262573,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-139",
"gold_norm": "25",
"dist_norm": "3000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 10.303999066352844,
"lp_gold": -13.958717346191406,
"lp_dist": -24.26271641254425,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.2307329177856445,
"lp_gold": -7.678596615791321,
"lp_dist": -13.909329533576965,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-140",
"gold_norm": "3000",
"dist_norm": "40",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.8300985433161259,
"lp_gold": -17.700348053127527,
"lp_dist": -15.870249509811401,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.4543883726000786,
"lp_gold": -14.831104047596455,
"lp_dist": -15.285492420196533,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-141",
"gold_norm": "40",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.7375423088669777,
"lp_gold": -13.080543011426926,
"lp_dist": -14.818085320293903,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.48419055342674255,
"lp_gold": -11.210194662213326,
"lp_dist": -11.694385215640068,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-142",
"gold_norm": "50",
"dist_norm": "90",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.239281177520752,
"lp_gold": -13.6494460105896,
"lp_dist": -12.410164833068848,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.5999003648757935,
"lp_gold": -9.939468264579773,
"lp_dist": -11.539368629455566,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-143",
"gold_norm": "90",
"dist_norm": "23",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.3710329532623291,
"lp_gold": -20.11902666091919,
"lp_dist": -19.74799370765686,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.965712070465088,
"lp_gold": -15.742670059204102,
"lp_dist": -18.70838212966919,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-144",
"gold_norm": "23",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.9211039543151855,
"lp_gold": -17.69726538658142,
"lp_dist": -12.776161432266235,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.012661933898926,
"lp_gold": -12.401761054992676,
"lp_dist": -8.38909912109375,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-145",
"gold_norm": "2",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.2492438331246376,
"lp_gold": -13.602060556411743,
"lp_dist": -14.85130438953638,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.6407327204942703,
"lp_gold": -12.194403648376465,
"lp_dist": -9.553670927882195,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-146",
"gold_norm": "50",
"dist_norm": "122",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.681281805038452,
"lp_gold": -16.079622983932495,
"lp_dist": -20.760904788970947,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.314260721206665,
"lp_gold": -11.10973858833313,
"lp_dist": -18.423999309539795,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-147",
"gold_norm": "122",
"dist_norm": "300",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.33387154340744,
"lp_gold": -18.671439349651337,
"lp_dist": -25.005310893058777,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.5978607535362244,
"lp_gold": -15.271158814430237,
"lp_dist": -15.869019567966461,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-148",
"gold_norm": "300",
"dist_norm": "448",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 13.451393851355533,
"lp_gold": -17.04763673870184,
"lp_dist": -30.499030590057373,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.197498982015532,
"lp_gold": -14.669522101816256,
"lp_dist": -27.867021083831787,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-149",
"gold_norm": "448",
"dist_norm": "2450",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.949738264083862,
"lp_gold": -25.570088386535645,
"lp_dist": -31.519826650619507,
"n_tokens_gold": 4,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.753702878952026,
"lp_gold": -20.34923005104065,
"lp_dist": -26.102932929992676,
"n_tokens_gold": 4,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-150",
"gold_norm": "2450",
"dist_norm": "803",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.90772854257375,
"lp_gold": -16.1460417015478,
"lp_dist": -22.05377024412155,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.6608445048332214,
"lp_gold": -14.72765988111496,
"lp_dist": -15.388504385948181,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-151",
"gold_norm": "803",
"dist_norm": "16",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.22600507736206055,
"lp_gold": -20.624857425689697,
"lp_dist": -20.850862503051758,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.131483793258667,
"lp_gold": -19.228359699249268,
"lp_dist": -14.0968759059906,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-152",
"gold_norm": "16",
"dist_norm": "280",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.489606261253357,
"lp_gold": -13.729230046272278,
"lp_dist": -23.218836307525635,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.312919616699219,
"lp_gold": -9.783522605895996,
"lp_dist": -23.096442222595215,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-153",
"gold_norm": "280",
"dist_norm": "13",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.694299184717238,
"lp_gold": -15.244264638982713,
"lp_dist": -18.93856382369995,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.8231047093868256,
"lp_gold": -12.64960965514183,
"lp_dist": -11.826504945755005,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-154",
"gold_norm": "13",
"dist_norm": "20",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.0602927803993225,
"lp_gold": -15.215918719768524,
"lp_dist": -17.276211500167847,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.633309006690979,
"lp_gold": -12.038846015930176,
"lp_dist": -11.405537009239197,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-155",
"gold_norm": "20",
"dist_norm": "14",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.590712457895279,
"lp_gold": -18.284266233444214,
"lp_dist": -15.693553775548935,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.9870389699935913,
"lp_gold": -10.093660473823547,
"lp_dist": -12.080699443817139,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-156",
"gold_norm": "14",
"dist_norm": "32",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.239875316619873,
"lp_gold": -20.827781200408936,
"lp_dist": -25.06765651702881,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.260110855102539,
"lp_gold": -13.14831280708313,
"lp_dist": -15.408423662185669,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-157",
"gold_norm": "32",
"dist_norm": "105",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 12.336161613464355,
"lp_gold": -20.509262084960938,
"lp_dist": -32.84542369842529,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.297417879104614,
"lp_gold": -9.766870260238647,
"lp_dist": -16.06428813934326,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-158",
"gold_norm": "105",
"dist_norm": "71",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.14784783124923706,
"lp_gold": -21.096359431743622,
"lp_dist": -20.948511600494385,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4936403930187225,
"lp_gold": -13.522361606359482,
"lp_dist": -12.02872121334076,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-159",
"gold_norm": "71",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.2031426429748535,
"lp_gold": -17.497971057891846,
"lp_dist": -16.294828414916992,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.283137083053589,
"lp_gold": -15.376394033432007,
"lp_dist": -12.093256950378418,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-160",
"gold_norm": "5",
"dist_norm": "30",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.20781135559082,
"lp_gold": -9.617193222045898,
"lp_dist": -14.825004577636719,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.284235119819641,
"lp_gold": -6.052669286727905,
"lp_dist": -9.336904406547546,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-161",
"gold_norm": "30",
"dist_norm": "95",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.308123826980591,
"lp_gold": -12.008728742599487,
"lp_dist": -18.316852569580078,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.064014196395874,
"lp_gold": -11.074568510055542,
"lp_dist": -16.138582706451416,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-162",
"gold_norm": "95",
"dist_norm": "147",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 14.487586110830307,
"lp_gold": -13.062050491571426,
"lp_dist": -27.549636602401733,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.7768335342407227,
"lp_gold": -18.569175243377686,
"lp_dist": -22.346008777618408,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-163",
"gold_norm": "147",
"dist_norm": "10",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -5.9637865126132965,
"lp_gold": -13.891173975542188,
"lp_dist": -7.927387462928891,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.2743232250213623,
"lp_gold": -11.51265001296997,
"lp_dist": -11.238326787948608,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-164",
"gold_norm": "10",
"dist_norm": "40000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.0207504555583,
"lp_gold": -12.818441897630692,
"lp_dist": -28.83919235318899,
"n_tokens_gold": 3,
"n_tokens_dist": 6
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 15.683642621152103,
"lp_gold": -6.558068131096661,
"lp_dist": -22.241710752248764,
"n_tokens_gold": 3,
"n_tokens_dist": 6
}
},
{
"ex_id": "gsm8k-test-165",
"gold_norm": "40000",
"dist_norm": "12",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.0927228182554245,
"lp_gold": -21.754829093813896,
"lp_dist": -15.662106275558472,
"n_tokens_gold": 6,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.87526479922235,
"lp_gold": -16.18092787824571,
"lp_dist": -9.305663079023361,
"n_tokens_gold": 6,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-166",
"gold_norm": "12",
"dist_norm": "129200",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 17.634711742401123,
"lp_gold": -20.60856056213379,
"lp_dist": -38.24327230453491,
"n_tokens_gold": 3,
"n_tokens_dist": 7
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 16.27542757987976,
"lp_gold": -20.810616493225098,
"lp_dist": -37.08604407310486,
"n_tokens_gold": 3,
"n_tokens_dist": 7
}
},
{
"ex_id": "gsm8k-test-167",
"gold_norm": "129200",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -8.816860965336673,
"lp_gold": -24.884800246800296,
"lp_dist": -16.067939281463623,
"n_tokens_gold": 7,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -12.447648953646421,
"lp_gold": -26.88768095895648,
"lp_dist": -14.440032005310059,
"n_tokens_gold": 7,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-168",
"gold_norm": "5",
"dist_norm": "45",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.262725353240967,
"lp_gold": -11.162125587463379,
"lp_dist": -18.424850940704346,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 12.129469871520996,
"lp_gold": -6.1789350509643555,
"lp_dist": -18.30840492248535,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-169",
"gold_norm": "45",
"dist_norm": "20",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.7432926744222641,
"lp_gold": -13.45964826643467,
"lp_dist": -14.202940940856934,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.6613135635852814,
"lp_gold": -9.49636921286583,
"lp_dist": -12.15768277645111,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-170",
"gold_norm": "20",
"dist_norm": "1170",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 28.943727374076843,
"lp_gold": -13.921928644180298,
"lp_dist": -42.86565601825714,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 12.730559349060059,
"lp_gold": -15.503687143325806,
"lp_dist": -28.234246492385864,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-171",
"gold_norm": "1170",
"dist_norm": "192",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.4793071039021015,
"lp_gold": -25.866613794118166,
"lp_dist": -23.387306690216064,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.7092228829860687,
"lp_gold": -17.528388172388077,
"lp_dist": -20.237611055374146,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-172",
"gold_norm": "192",
"dist_norm": "14",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.4512736797332764,
"lp_gold": -20.52132660150528,
"lp_dist": -18.070052921772003,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.270252227783203,
"lp_gold": -18.500007390975952,
"lp_dist": -10.229755163192749,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-173",
"gold_norm": "14",
"dist_norm": "144",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.012117385864258,
"lp_gold": -19.01213574409485,
"lp_dist": -25.024253129959106,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.6395363807678223,
"lp_gold": -17.207820653915405,
"lp_dist": -20.847357034683228,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-174",
"gold_norm": "144",
"dist_norm": "350",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.220242428360507,
"lp_gold": -20.075438094558194,
"lp_dist": -29.2956805229187,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.203243670635857,
"lp_gold": -15.825250687426887,
"lp_dist": -24.028494358062744,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-175",
"gold_norm": "350",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9250896275043488,
"lp_gold": -22.99281856417656,
"lp_dist": -23.917908191680908,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.236621737480164,
"lp_gold": -17.58333122730255,
"lp_dist": -10.346709489822388,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-176",
"gold_norm": "50",
"dist_norm": "7",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.0019221305847167969,
"lp_gold": -14.026922941207886,
"lp_dist": -14.025000810623169,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.7933419942855835,
"lp_gold": -10.8789883852005,
"lp_dist": -10.085646390914917,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-177",
"gold_norm": "7",
"dist_norm": "50",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.36652660369873047,
"lp_gold": -14.853872776031494,
"lp_dist": -14.487346172332764,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.34556770324707,
"lp_gold": -7.91382908821106,
"lp_dist": -13.25939679145813,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-178",
"gold_norm": "50",
"dist_norm": "8",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.387412190437317,
"lp_gold": -12.106852412223816,
"lp_dist": -15.494264602661133,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.1638234257698059,
"lp_gold": -11.002189338207245,
"lp_dist": -11.16601276397705,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-179",
"gold_norm": "8",
"dist_norm": "3160",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 27.79468995332718,
"lp_gold": -10.52354496717453,
"lp_dist": -38.31823492050171,
"n_tokens_gold": 2,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 26.891671419143677,
"lp_gold": -6.7627270221710205,
"lp_dist": -33.6543984413147,
"n_tokens_gold": 2,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-180",
"gold_norm": "3160",
"dist_norm": "80",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -14.208902917802334,
"lp_gold": -23.272248081862926,
"lp_dist": -9.063345164060593,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.284509412944317,
"lp_gold": -18.512411706149578,
"lp_dist": -10.227902293205261,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-181",
"gold_norm": "80",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.666141152381897,
"lp_gold": -11.426292300224304,
"lp_dist": -18.0924334526062,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.4361144304275513,
"lp_gold": -8.623531460762024,
"lp_dist": -12.059645891189575,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-182",
"gold_norm": "50",
"dist_norm": "40",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.3921025022864342,
"lp_gold": -16.256853722035885,
"lp_dist": -15.86475121974945,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.5148345530033112,
"lp_gold": -11.546109974384308,
"lp_dist": -11.031275421380997,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-183",
"gold_norm": "40",
"dist_norm": "78",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 10.310973192565143,
"lp_gold": -11.783679460175335,
"lp_dist": -22.09465265274048,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.1819764897227287,
"lp_gold": -8.697102136909962,
"lp_dist": -11.87907862663269,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-184",
"gold_norm": "78",
"dist_norm": "273",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.903703212738037,
"lp_gold": -21.98734474182129,
"lp_dist": -31.891047954559326,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.26316213607788,
"lp_gold": -13.972403526306152,
"lp_dist": -25.235565662384033,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-185",
"gold_norm": "273",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.715305805206299,
"lp_gold": -18.186378479003906,
"lp_dist": -13.471072673797607,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.78067421913147,
"lp_gold": -15.708798170089722,
"lp_dist": -9.928123950958252,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-186",
"gold_norm": "2",
"dist_norm": "195",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 12.730733886361122,
"lp_gold": -12.38149118423462,
"lp_dist": -25.11222507059574,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 12.67141306400299,
"lp_gold": -8.928462505340576,
"lp_dist": -21.599875569343567,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-187",
"gold_norm": "195",
"dist_norm": "1128",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 15.125296980142593,
"lp_gold": -20.79087921977043,
"lp_dist": -35.916176199913025,
"n_tokens_gold": 4,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 12.027642607688904,
"lp_gold": -17.632879853248596,
"lp_dist": -29.6605224609375,
"n_tokens_gold": 4,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-188",
"gold_norm": "1128",
"dist_norm": "172",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.5262241810560226,
"lp_gold": -21.482525154948235,
"lp_dist": -18.956300973892212,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.822476863861084,
"lp_gold": -22.20119798183441,
"lp_dist": -14.378721117973328,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-189",
"gold_norm": "172",
"dist_norm": "30",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -10.2972651720047,
"lp_gold": -25.187148094177246,
"lp_dist": -14.889882922172546,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.863584116101265,
"lp_gold": -19.34168529510498,
"lp_dist": -11.478101179003716,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-190",
"gold_norm": "30",
"dist_norm": "30",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.69141697883606,
"lp_dist": -20.69141697883606,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -14.99114179611206,
"lp_dist": -14.99114179611206,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-191",
"gold_norm": "30",
"dist_norm": "92",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 14.041085667908192,
"lp_gold": -11.936107210814953,
"lp_dist": -25.977192878723145,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 10.562521874904633,
"lp_gold": -8.070083677768707,
"lp_dist": -18.63260555267334,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-192",
"gold_norm": "92",
"dist_norm": "20",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.352982759475708,
"lp_gold": -19.694137811660767,
"lp_dist": -19.34115505218506,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.613435983657837,
"lp_gold": -16.650076866149902,
"lp_dist": -9.036640882492065,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-193",
"gold_norm": "20",
"dist_norm": "540",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.857290744781494,
"lp_gold": -12.000582933425903,
"lp_dist": -17.857873678207397,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.146737933158875,
"lp_gold": -8.929409623146057,
"lp_dist": -20.07614755630493,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-194",
"gold_norm": "540",
"dist_norm": "10",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.4153643026947975,
"lp_gold": -17.50879267603159,
"lp_dist": -17.093428373336792,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.476008802652359,
"lp_gold": -18.04153409600258,
"lp_dist": -11.56552529335022,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-195",
"gold_norm": "10",
"dist_norm": "10",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -13.743456363677979,
"lp_dist": -13.743456363677979,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -11.619784355163574,
"lp_dist": -11.619784355163574,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-196",
"gold_norm": "10",
"dist_norm": "38",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.615695416927338,
"lp_gold": -13.448404610157013,
"lp_dist": -21.06410002708435,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.2176668643951416,
"lp_gold": -12.037200689315796,
"lp_dist": -12.254867553710938,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-197",
"gold_norm": "38",
"dist_norm": "4000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 19.051328860223293,
"lp_gold": -18.077466011047363,
"lp_dist": -37.12879487127066,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.882668375968933,
"lp_gold": -18.516667366027832,
"lp_dist": -27.399335741996765,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-198",
"gold_norm": "4000",
"dist_norm": "594",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 10.496505833114497,
"lp_gold": -20.328653597389348,
"lp_dist": -30.825159430503845,
"n_tokens_gold": 5,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.5506546348333359,
"lp_gold": -16.97495509684086,
"lp_dist": -18.525609731674194,
"n_tokens_gold": 5,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-199",
"gold_norm": "594",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -13.668475985527039,
"lp_gold": -24.507625102996826,
"lp_dist": -10.839149117469788,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -12.574656009674072,
"lp_gold": -20.77769374847412,
"lp_dist": -8.203037738800049,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-200",
"gold_norm": "2",
"dist_norm": "142",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 20.30094861984253,
"lp_gold": -13.631542205810547,
"lp_dist": -33.932490825653076,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.00571084022522,
"lp_gold": -11.859813690185547,
"lp_dist": -24.865524530410767,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-201",
"gold_norm": "142",
"dist_norm": "9",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.474197149276733,
"lp_gold": -17.457672357559204,
"lp_dist": -12.98347520828247,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.4977235794067383,
"lp_gold": -15.609914779663086,
"lp_dist": -13.112191200256348,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-202",
"gold_norm": "9",
"dist_norm": "6",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.3756675720214844,
"lp_gold": -12.738621711730957,
"lp_dist": -11.362954139709473,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.2889900207519531,
"lp_gold": -12.575027465820312,
"lp_dist": -11.28603744506836,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-203",
"gold_norm": "6",
"dist_norm": "100",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.780631840229034,
"lp_gold": -15.693442344665527,
"lp_dist": -22.47407418489456,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.30402946472168,
"lp_gold": -9.650990724563599,
"lp_dist": -16.95502018928528,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-204",
"gold_norm": "100",
"dist_norm": "10",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.23692995309829712,
"lp_gold": -17.014364540576935,
"lp_dist": -16.777434587478638,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.6824193000793457,
"lp_gold": -11.578391790390015,
"lp_dist": -10.895972490310669,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-205",
"gold_norm": "10",
"dist_norm": "15",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.3394050598144531,
"lp_gold": -18.575839042663574,
"lp_dist": -18.23643398284912,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.5095596313476562,
"lp_gold": -18.252729892730713,
"lp_dist": -18.76228952407837,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-206",
"gold_norm": "15",
"dist_norm": "22",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.2520769834518433,
"lp_gold": -16.89211142063141,
"lp_dist": -18.144188404083252,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.4231153726577759,
"lp_gold": -14.139848232269287,
"lp_dist": -15.562963604927063,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-207",
"gold_norm": "22",
"dist_norm": "16",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.621975004673004,
"lp_gold": -11.219844043254852,
"lp_dist": -13.841819047927856,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.833777904510498,
"lp_gold": -10.41726541519165,
"lp_dist": -9.583487510681152,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-208",
"gold_norm": "16",
"dist_norm": "16",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -11.36221162811853,
"lp_dist": -11.36221162811853,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -13.241074323654175,
"lp_dist": -13.241074323654175,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-209",
"gold_norm": "16",
"dist_norm": "5",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -7.475744724273682,
"lp_gold": -19.41911506652832,
"lp_dist": -11.943370342254639,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.438729763031006,
"lp_gold": -15.213366270065308,
"lp_dist": -10.774636507034302,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-210",
"gold_norm": "5",
"dist_norm": "23",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.465651273727417,
"lp_gold": -15.02237606048584,
"lp_dist": -21.488027334213257,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.424657344818115,
"lp_gold": -8.115961074829102,
"lp_dist": -15.540618419647217,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-211",
"gold_norm": "23",
"dist_norm": "30",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.4010072350502014,
"lp_gold": -15.865891933441162,
"lp_dist": -15.46488469839096,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.2698206305503845,
"lp_gold": -18.3642840385437,
"lp_dist": -13.094463407993317,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-212",
"gold_norm": "30",
"dist_norm": "14000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 23.520719528198242,
"lp_gold": -19.58930778503418,
"lp_dist": -43.11002731323242,
"n_tokens_gold": 3,
"n_tokens_dist": 6
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 18.009515285491943,
"lp_gold": -13.2671217918396,
"lp_dist": -31.276637077331543,
"n_tokens_gold": 3,
"n_tokens_dist": 6
}
},
{
"ex_id": "gsm8k-test-213",
"gold_norm": "14000",
"dist_norm": "60",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.45991032384336,
"lp_gold": -21.998028149828315,
"lp_dist": -15.538117825984955,
"n_tokens_gold": 6,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.547004419262521,
"lp_gold": -14.710521432454698,
"lp_dist": -13.163517013192177,
"n_tokens_gold": 6,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-214",
"gold_norm": "60",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -8.948975563049316,
"lp_gold": -23.77088451385498,
"lp_dist": -14.821908950805664,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.54961109161377,
"lp_gold": -19.638930320739746,
"lp_dist": -11.089319229125977,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-215",
"gold_norm": "2",
"dist_norm": "3",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.387197434902191,
"lp_gold": -9.424871981143951,
"lp_dist": -12.812069416046143,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.5975170135498047,
"lp_gold": -9.744040250778198,
"lp_dist": -8.146523237228394,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-216",
"gold_norm": "3",
"dist_norm": "30",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.872649192810059,
"lp_gold": -14.82950735092163,
"lp_dist": -20.70215654373169,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.080531597137451,
"lp_gold": -11.414220809936523,
"lp_dist": -13.494752407073975,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-217",
"gold_norm": "30",
"dist_norm": "1920",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 21.697412703186274,
"lp_gold": -13.986085917800665,
"lp_dist": -35.68349862098694,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 12.313387870788574,
"lp_gold": -12.878417491912842,
"lp_dist": -25.191805362701416,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-218",
"gold_norm": "1920",
"dist_norm": "84",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.6947197169065475,
"lp_gold": -25.65590851008892,
"lp_dist": -23.961188793182373,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.878614127635956,
"lp_gold": -21.65609782934189,
"lp_dist": -14.777483701705933,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-219",
"gold_norm": "84",
"dist_norm": "8",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.1246260404586792,
"lp_gold": -13.33847463130951,
"lp_dist": -12.21384859085083,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.7234134674072266,
"lp_gold": -12.977782487869263,
"lp_dist": -10.254369020462036,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-220",
"gold_norm": "8",
"dist_norm": "12",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.304245948791504,
"lp_gold": -15.825139999389648,
"lp_dist": -14.520894050598145,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.1817718744277954,
"lp_gold": -12.667408466339111,
"lp_dist": -12.849180340766907,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-221",
"gold_norm": "12",
"dist_norm": "260",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 8.42927235364914,
"lp_gold": -9.642007768154144,
"lp_dist": -18.071280121803284,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.9336384534835815,
"lp_gold": -7.166749358177185,
"lp_dist": -14.100387811660767,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-222",
"gold_norm": "260",
"dist_norm": "288",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.7796518057584763,
"lp_gold": -18.89673836529255,
"lp_dist": -22.676390171051025,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.0456210374832153,
"lp_gold": -16.525109887123108,
"lp_dist": -18.570730924606323,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-223",
"gold_norm": "288",
"dist_norm": "3",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -19.030277393758297,
"lp_gold": -26.646236896514893,
"lp_dist": -7.615959502756596,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -15.758692368865013,
"lp_gold": -20.088956594467163,
"lp_dist": -4.33026422560215,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-224",
"gold_norm": "3",
"dist_norm": "1596",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.595462799072266,
"lp_gold": -14.905784606933594,
"lp_dist": -31.50124740600586,
"n_tokens_gold": 2,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 16.13599407672882,
"lp_gold": -13.950559616088867,
"lp_dist": -30.086553692817688,
"n_tokens_gold": 2,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-225",
"gold_norm": "1596",
"dist_norm": "81",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.9876238331198692,
"lp_gold": -17.49847326427698,
"lp_dist": -19.48609709739685,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.1633647084236145,
"lp_gold": -24.1348779797554,
"lp_dist": -17.971513271331787,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-226",
"gold_norm": "81",
"dist_norm": "56",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.605985850095749,
"lp_gold": -15.771342545747757,
"lp_dist": -19.377328395843506,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.6331486701965332,
"lp_gold": -16.669665813446045,
"lp_dist": -15.036517143249512,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-227",
"gold_norm": "56",
"dist_norm": "1490",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.239468812942505,
"lp_gold": -14.987546801567078,
"lp_dist": -31.227015614509583,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.551798105239868,
"lp_gold": -13.551477909088135,
"lp_dist": -25.103276014328003,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-228",
"gold_norm": "1490",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -19.20861628651619,
"lp_gold": -30.783629894256592,
"lp_dist": -11.575013607740402,
"n_tokens_gold": 5,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -13.095399379730225,
"lp_gold": -24.58754062652588,
"lp_dist": -11.492141246795654,
"n_tokens_gold": 5,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-229",
"gold_norm": "2",
"dist_norm": "20",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.965806007385254,
"lp_gold": -11.508173823356628,
"lp_dist": -17.473979830741882,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.32473722100257874,
"lp_gold": -11.093923568725586,
"lp_dist": -11.418660789728165,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-230",
"gold_norm": "20",
"dist_norm": "11",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.1644073724746704,
"lp_gold": -14.290618896484375,
"lp_dist": -13.126211524009705,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.0560493469238281,
"lp_gold": -12.166522026062012,
"lp_dist": -13.22257137298584,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-231",
"gold_norm": "11",
"dist_norm": "120",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.219505786895752,
"lp_gold": -14.914972305297852,
"lp_dist": -13.6954665184021,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.7893390655517578,
"lp_gold": -11.986905813217163,
"lp_dist": -13.776244878768921,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-232",
"gold_norm": "120",
"dist_norm": "45",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.419019672088325,
"lp_gold": -15.604393211193383,
"lp_dist": -20.023412883281708,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.708264172077179,
"lp_gold": -12.799233138561249,
"lp_dist": -13.507497310638428,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-233",
"gold_norm": "45",
"dist_norm": "10",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.5151035785675049,
"lp_gold": -11.359116911888123,
"lp_dist": -9.844013333320618,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.095129489898682,
"lp_gold": -14.39400601387024,
"lp_dist": -6.298876523971558,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-234",
"gold_norm": "10",
"dist_norm": "9",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.05903661251068115,
"lp_gold": -13.167555451393127,
"lp_dist": -13.226592063903809,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.7660583406686783,
"lp_gold": -6.627322778105736,
"lp_dist": -10.393381118774414,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-235",
"gold_norm": "9",
"dist_norm": "33",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.191148281097412,
"lp_gold": -14.321090459823608,
"lp_dist": -21.51223874092102,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.510578155517578,
"lp_gold": -13.556029319763184,
"lp_dist": -16.06660747528076,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-236",
"gold_norm": "33",
"dist_norm": "150",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.496721982955933,
"lp_gold": -19.231878995895386,
"lp_dist": -28.72860097885132,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.684651017189026,
"lp_gold": -13.231264114379883,
"lp_dist": -18.91591513156891,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-237",
"gold_norm": "150",
"dist_norm": "60",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.496449222322553,
"lp_gold": -12.51727462792769,
"lp_dist": -20.013723850250244,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.5342855900526047,
"lp_gold": -11.47604425251484,
"lp_dist": -12.010329842567444,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-238",
"gold_norm": "60",
"dist_norm": "4",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.38430750370025635,
"lp_gold": -13.078525424003601,
"lp_dist": -13.462832927703857,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.5117335319519043,
"lp_gold": -10.930900573730469,
"lp_dist": -9.419167041778564,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-239",
"gold_norm": "4",
"dist_norm": "7",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.3680305480957031,
"lp_gold": -12.839935302734375,
"lp_dist": -13.207965850830078,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.0732803344726562,
"lp_gold": -11.00877571105957,
"lp_dist": -12.082056045532227,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-240",
"gold_norm": "7",
"dist_norm": "3140",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 22.65280568599701,
"lp_gold": -12.292606830596924,
"lp_dist": -34.94541251659393,
"n_tokens_gold": 2,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 18.48963165283203,
"lp_gold": -9.809606552124023,
"lp_dist": -28.299238204956055,
"n_tokens_gold": 2,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-241",
"gold_norm": "3140",
"dist_norm": "19",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -8.30290687084198,
"lp_gold": -22.36732530593872,
"lp_dist": -14.06441843509674,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -13.02580738067627,
"lp_gold": -25.325818061828613,
"lp_dist": -12.300010681152344,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-242",
"gold_norm": "19",
"dist_norm": "6",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.181618273258209,
"lp_gold": -12.157159745693207,
"lp_dist": -17.338778018951416,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4070416688919067,
"lp_gold": -10.997576355934143,
"lp_dist": -9.590534687042236,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-243",
"gold_norm": "6",
"dist_norm": "90",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.3495291471481323,
"lp_gold": -19.138280868530273,
"lp_dist": -22.487810015678406,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.812922954559326,
"lp_gold": -11.337668418884277,
"lp_dist": -17.150591373443604,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-244",
"gold_norm": "90",
"dist_norm": "10",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.6097938957027509,
"lp_gold": -13.304834717731865,
"lp_dist": -11.695040822029114,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.5438682280946523,
"lp_gold": -7.949862555367872,
"lp_dist": -10.493730783462524,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-245",
"gold_norm": "10",
"dist_norm": "130000",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 26.08018460869789,
"lp_gold": -8.254620164632797,
"lp_dist": -34.33480477333069,
"n_tokens_gold": 3,
"n_tokens_dist": 7
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.28964650630951,
"lp_gold": -7.219025731086731,
"lp_dist": -20.50867223739624,
"n_tokens_gold": 3,
"n_tokens_dist": 7
}
},
{
"ex_id": "gsm8k-test-246",
"gold_norm": "130000",
"dist_norm": "10",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.715070237376494,
"lp_gold": -12.472108629561262,
"lp_dist": -17.187178866937757,
"n_tokens_gold": 7,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.156454911455512,
"lp_gold": -13.647521084174514,
"lp_dist": -6.491066172719002,
"n_tokens_gold": 7,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-247",
"gold_norm": "10",
"dist_norm": "525",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 17.677427016198635,
"lp_gold": -15.68656424432993,
"lp_dist": -33.363991260528564,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.708948612213135,
"lp_gold": -11.453617930412292,
"lp_dist": -23.162566542625427,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-248",
"gold_norm": "525",
"dist_norm": "180",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.852982550859451,
"lp_gold": -11.824598759412766,
"lp_dist": -18.677581310272217,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.6725820302963257,
"lp_gold": -11.64935302734375,
"lp_dist": -12.321935057640076,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-249",
"gold_norm": "180",
"dist_norm": "1200",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 13.544742852449417,
"lp_gold": -8.964889764785767,
"lp_dist": -22.509632617235184,
"n_tokens_gold": 4,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 15.23933944106102,
"lp_gold": -7.498180732131004,
"lp_dist": -22.737520173192024,
"n_tokens_gold": 4,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-250",
"gold_norm": "1200",
"dist_norm": "25",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.737206868827343,
"lp_gold": -9.13471419364214,
"lp_dist": -18.871921062469482,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.1686492152512074,
"lp_gold": -8.043186407536268,
"lp_dist": -10.211835622787476,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-251",
"gold_norm": "25",
"dist_norm": "21",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.8187389373779297,
"lp_gold": -12.140745043754578,
"lp_dist": -12.959483981132507,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.511936068534851,
"lp_gold": -6.530247092247009,
"lp_dist": -10.04218316078186,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-252",
"gold_norm": "21",
"dist_norm": "2304",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 16.503239154815674,
"lp_gold": -17.229759454727173,
"lp_dist": -33.73299860954285,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 14.275152683258057,
"lp_gold": -13.177631378173828,
"lp_dist": -27.452784061431885,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-253",
"gold_norm": "2304",
"dist_norm": "2325",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.856137990951538,
"lp_gold": -25.891463041305542,
"lp_dist": -30.74760103225708,
"n_tokens_gold": 5,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.9878709018230438,
"lp_gold": -25.637850552797318,
"lp_dist": -26.62572145462036,
"n_tokens_gold": 5,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-254",
"gold_norm": "2325",
"dist_norm": "15",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.660225659608841,
"lp_gold": -13.980357348918915,
"lp_dist": -11.320131689310074,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.668605744838715,
"lp_gold": -16.375044524669647,
"lp_dist": -11.706438779830933,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-255",
"gold_norm": "15",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.416126787662506,
"lp_gold": -14.69802612066269,
"lp_dist": -21.114152908325195,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.287871062755585,
"lp_gold": -10.296239674091339,
"lp_dist": -17.584110736846924,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
}
],
"flip_rows": [
{
"ex_id": "gsm8k-test-1",
"gold_norm": "80",
"dist_norm": "12",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.218225084245205,
"lp_gold": -16.316218174993992,
"lp_dist": -17.534443259239197,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.6342043727636337,
"lp_gold": -18.493512138724327,
"lp_dist": -17.859307765960693,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -2.4292978644371033,
"lp_gold": -17.579256772994995,
"lp_dist": -15.149958908557892,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -2.5465927198529243,
"lp_gold": -17.71218091994524,
"lp_dist": -15.165588200092316,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "gold",
"correct": true,
"margin": 0.1809745579957962,
"lp_gold": -14.873322412371635,
"lp_dist": -15.054296970367432,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -0.3117252141237259,
"lp_gold": -17.119899585843086,
"lp_dist": -16.80817437171936,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.6342088878154755,
"lp_gold": -18.49351069331169,
"lp_dist": -17.859301805496216,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-5",
"gold_norm": "3200",
"dist_norm": "38",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.075981711270288,
"lp_gold": -15.808944131014869,
"lp_dist": -19.884925842285156,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.112908275797963,
"lp_gold": -17.281133087351918,
"lp_dist": -15.168224811553955,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.29585185274481773,
"lp_gold": -18.88566479459405,
"lp_dist": -19.181516647338867,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.4162398586049676,
"lp_gold": -18.945031284354627,
"lp_dist": -19.361271142959595,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -0.9830470234155655,
"lp_gold": -17.90198041498661,
"lp_dist": -16.918933391571045,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -2.479109164327383,
"lp_gold": -18.568949338048697,
"lp_dist": -16.089840173721313,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -2.112905891612172,
"lp_gold": -17.28112688846886,
"lp_dist": -15.16822099685669,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-14",
"gold_norm": "800",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.7909989710897207,
"lp_gold": -10.428668463602662,
"lp_dist": -13.219667434692383,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.28849396109581,
"lp_gold": -15.488610118627548,
"lp_dist": -9.200116157531738,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -3.221886307001114,
"lp_gold": -17.41419091820717,
"lp_dist": -14.192304611206055,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -3.1399324536323547,
"lp_gold": -16.989762604236603,
"lp_dist": -13.849830150604248,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -5.316341996192932,
"lp_gold": -16.00817358493805,
"lp_dist": -10.691831588745117,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -6.094054877758026,
"lp_gold": -17.299417197704315,
"lp_dist": -11.205362319946289,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -6.288493096828461,
"lp_gold": -15.488613307476044,
"lp_dist": -9.200120210647583,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-20",
"gold_norm": "106",
"dist_norm": "80",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.92336449585855,
"lp_gold": -15.76830449141562,
"lp_dist": -20.69166898727417,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.6747859213501215,
"lp_gold": -17.988985607400537,
"lp_dist": -17.314199686050415,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -3.9746298789978027,
"lp_gold": -22.760347604751587,
"lp_dist": -18.785717725753784,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -3.769364595413208,
"lp_gold": -22.492818355560303,
"lp_dist": -18.723453760147095,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -2.8601390519179404,
"lp_gold": -22.44402221823111,
"lp_dist": -19.58388316631317,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -2.127611498348415,
"lp_gold": -16.912333111278713,
"lp_dist": -14.784721612930298,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.6747779380530119,
"lp_gold": -17.988986684009433,
"lp_dist": -17.31420874595642,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-24",
"gold_norm": "9",
"dist_norm": "40",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.006132304668426514,
"lp_gold": -15.839151382446289,
"lp_dist": -15.845283687114716,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.110446274280548,
"lp_gold": -12.25863265991211,
"lp_dist": -10.148186385631561,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.04043316841125488,
"lp_gold": -14.208327531814575,
"lp_dist": -14.24876070022583,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.17713472247123718,
"lp_gold": -14.100894212722778,
"lp_dist": -14.278028935194016,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.736970603466034,
"lp_gold": -13.546704292297363,
"lp_dist": -11.80973368883133,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.5443955063819885,
"lp_gold": -14.465348243713379,
"lp_dist": -12.92095273733139,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -2.110443741083145,
"lp_gold": -12.258633613586426,
"lp_dist": -10.14818987250328,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-27",
"gold_norm": "160",
"dist_norm": "6",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.597537249326706,
"lp_gold": -12.841732293367386,
"lp_dist": -14.439269542694092,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.338132083415985,
"lp_gold": -17.455387771129608,
"lp_dist": -10.117255687713623,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -2.827625960111618,
"lp_gold": -15.895989626646042,
"lp_dist": -13.068363666534424,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -2.9387161433696747,
"lp_gold": -16.01883837580681,
"lp_dist": -13.080122232437134,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -8.034337878227234,
"lp_gold": -23.63157594203949,
"lp_dist": -15.597238063812256,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -7.914303302764893,
"lp_gold": -19.091363430023193,
"lp_dist": -11.1770601272583,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -7.338137567043304,
"lp_gold": -17.455391585826874,
"lp_dist": -10.11725401878357,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-31",
"gold_norm": "68",
"dist_norm": "31",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.820281505584717,
"lp_gold": -15.837103843688965,
"lp_dist": -19.65738534927368,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.25214481353759766,
"lp_gold": -12.841001033782959,
"lp_dist": -12.588856220245361,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 4.767996072769165,
"lp_gold": -13.063536882400513,
"lp_dist": -17.831532955169678,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 4.778961181640625,
"lp_gold": -12.704540252685547,
"lp_dist": -17.483501434326172,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -0.2866086959838867,
"lp_gold": -13.741567134857178,
"lp_dist": -13.454958438873291,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -0.2309262752532959,
"lp_gold": -13.31624436378479,
"lp_dist": -13.085318088531494,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.2521398067474365,
"lp_gold": -12.840993165969849,
"lp_dist": -12.588853359222412,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-35",
"gold_norm": "480",
"dist_norm": "520",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.5111888945102692,
"lp_gold": -12.033819317817688,
"lp_dist": -13.545008212327957,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.8960548639297485,
"lp_gold": -19.025392055511475,
"lp_dist": -16.129337191581726,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.173134446144104,
"lp_gold": -16.326287806034088,
"lp_dist": -16.499422252178192,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.15610426664352417,
"lp_gold": -16.418395936489105,
"lp_dist": -16.57450020313263,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -2.612240791320801,
"lp_gold": -17.892987489700317,
"lp_dist": -15.280746698379517,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -3.1868066787719727,
"lp_gold": -17.928332090377808,
"lp_dist": -14.741525411605835,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -2.8960519433021545,
"lp_gold": -19.0253963470459,
"lp_dist": -16.129344403743744,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-45",
"gold_norm": "310",
"dist_norm": "100",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9456039071083069,
"lp_gold": -16.172270894050598,
"lp_dist": -17.117874801158905,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.856696009635925,
"lp_gold": -16.36608850955963,
"lp_dist": -10.509392499923706,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.7440242022275925,
"lp_gold": -17.919243693351746,
"lp_dist": -16.175219491124153,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -1.6509404331445694,
"lp_gold": -17.9195556640625,
"lp_dist": -16.26861523091793,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -3.5654631853103638,
"lp_gold": -15.380214095115662,
"lp_dist": -11.814750909805298,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -6.163789927959442,
"lp_gold": -17.801445245742798,
"lp_dist": -11.637655317783356,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -5.856698274612427,
"lp_gold": -16.366087794303894,
"lp_dist": -10.509389519691467,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-48",
"gold_norm": "25",
"dist_norm": "1400",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.125034153461456,
"lp_gold": -16.447975158691406,
"lp_dist": -23.573009312152863,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.08089584112167358,
"lp_gold": -13.430449962615967,
"lp_dist": -13.349554121494293,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 4.025165379047394,
"lp_gold": -15.498599290847778,
"lp_dist": -19.523764669895172,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 3.622505933046341,
"lp_gold": -15.329206466674805,
"lp_dist": -18.951712399721146,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.6573466360569,
"lp_gold": -16.551159858703613,
"lp_dist": -14.893813222646713,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.2268932834267616,
"lp_gold": -15.390444993972778,
"lp_dist": -14.163551710546017,
"n_tokens_gold": 3,
"n_tokens_dist": 5
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.08089244365692139,
"lp_gold": -13.43044662475586,
"lp_dist": -13.349554181098938,
"n_tokens_gold": 3,
"n_tokens_dist": 5
}
},
{
"ex_id": "gsm8k-test-64",
"gold_norm": "655",
"dist_norm": "800",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.8698419332504272,
"lp_gold": -17.930187582969666,
"lp_dist": -19.800029516220093,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4062272310256958,
"lp_gold": -14.47088611125946,
"lp_dist": -13.064658880233765,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -0.005745887756347656,
"lp_gold": -18.56607985496521,
"lp_dist": -18.560333967208862,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -0.33770978450775146,
"lp_gold": -18.740556836128235,
"lp_dist": -18.402847051620483,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.1954535841941833,
"lp_gold": -16.162434339523315,
"lp_dist": -14.966980755329132,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.086472988128662,
"lp_gold": -14.544449806213379,
"lp_dist": -13.457976818084717,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.4062250852584839,
"lp_gold": -14.470888018608093,
"lp_dist": -13.06466293334961,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-100",
"gold_norm": "250",
"dist_norm": "12",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.937235951423645,
"lp_gold": -16.930358290672302,
"lp_dist": -18.867594242095947,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.8317363262176514,
"lp_gold": -17.415368795394897,
"lp_dist": -15.583632469177246,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.7292950004339218,
"lp_gold": -17.88250456750393,
"lp_dist": -16.153209567070007,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -1.9416460394859314,
"lp_gold": -17.420925438404083,
"lp_dist": -15.479279398918152,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.7523078918457031,
"lp_gold": -15.994086980819702,
"lp_dist": -14.241779088973999,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.070462703704834,
"lp_gold": -15.625900983810425,
"lp_dist": -14.55543828010559,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.8317327499389648,
"lp_gold": -17.415366888046265,
"lp_dist": -15.5836341381073,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-104",
"gold_norm": "26",
"dist_norm": "42",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.550231754779816,
"lp_gold": -16.079154193401337,
"lp_dist": -20.629385948181152,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.743180751800537,
"lp_gold": -20.936619758605957,
"lp_dist": -17.19343900680542,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.501150131225586,
"lp_gold": -20.479767084121704,
"lp_dist": -18.978616952896118,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -1.586869239807129,
"lp_gold": -20.261481761932373,
"lp_dist": -18.674612522125244,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -2.488091826438904,
"lp_gold": -20.40666627883911,
"lp_dist": -17.918574452400208,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -2.9733150005340576,
"lp_gold": -20.461706161499023,
"lp_dist": -17.488391160964966,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -3.7431836128234863,
"lp_gold": -20.936622619628906,
"lp_dist": -17.19343900680542,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-105",
"gold_norm": "42",
"dist_norm": "5",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.2448320388793945,
"lp_gold": -17.29369354248047,
"lp_dist": -19.538525581359863,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.48531031608581543,
"lp_gold": -14.891574144363403,
"lp_dist": -14.406263828277588,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -0.6142416000366211,
"lp_gold": -20.099190711975098,
"lp_dist": -19.484949111938477,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -0.2368631362915039,
"lp_gold": -19.605250358581543,
"lp_dist": -19.36838722229004,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.025758981704712,
"lp_gold": -16.692798852920532,
"lp_dist": -15.66703987121582,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -0.37449169158935547,
"lp_gold": -14.766992568969727,
"lp_dist": -14.392500877380371,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.48531413078308105,
"lp_gold": -14.891570806503296,
"lp_dist": -14.406256675720215,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-107",
"gold_norm": "14400",
"dist_norm": "400",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.8998411595821381,
"lp_gold": -19.079706698656082,
"lp_dist": -19.97954785823822,
"n_tokens_gold": 6,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.72913409024477,
"lp_gold": -22.703229255974293,
"lp_dist": -20.974095165729523,
"n_tokens_gold": 6,
"n_tokens_dist": 4
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.19682685285806656,
"lp_gold": -22.464857898652554,
"lp_dist": -22.66168475151062,
"n_tokens_gold": 6,
"n_tokens_dist": 4
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.14343415200710297,
"lp_gold": -22.713250055909157,
"lp_dist": -22.85668420791626,
"n_tokens_gold": 6,
"n_tokens_dist": 4
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -3.5759153068065643,
"lp_gold": -27.662475764751434,
"lp_dist": -24.08656045794487,
"n_tokens_gold": 6,
"n_tokens_dist": 4
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -2.206382006406784,
"lp_gold": -25.311469167470932,
"lp_dist": -23.105087161064148,
"n_tokens_gold": 6,
"n_tokens_dist": 4
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.7291278392076492,
"lp_gold": -22.70323248207569,
"lp_dist": -20.974104642868042,
"n_tokens_gold": 6,
"n_tokens_dist": 4
}
},
{
"ex_id": "gsm8k-test-110",
"gold_norm": "83",
"dist_norm": "10",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9284783601760864,
"lp_gold": -14.152065396308899,
"lp_dist": -15.080543756484985,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.6964447498321533,
"lp_gold": -11.795601606369019,
"lp_dist": -8.099156856536865,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.5404622554779053,
"lp_gold": -14.069255948066711,
"lp_dist": -12.528793692588806,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -1.4889593124389648,
"lp_gold": -14.175909280776978,
"lp_dist": -12.686949968338013,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -2.8712440729141235,
"lp_gold": -14.255087852478027,
"lp_dist": -11.383843779563904,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -3.085860013961792,
"lp_gold": -12.02087950706482,
"lp_dist": -8.935019493103027,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -3.696447730064392,
"lp_gold": -11.795600891113281,
"lp_dist": -8.09915316104889,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-120",
"gold_norm": "335",
"dist_norm": "60",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.718258023262024,
"lp_gold": -16.84885323047638,
"lp_dist": -18.567111253738403,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.495100736618042,
"lp_gold": -16.78837823867798,
"lp_dist": -13.293277502059937,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -6.745102643966675,
"lp_gold": -21.85818600654602,
"lp_dist": -15.113083362579346,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -6.640980243682861,
"lp_gold": -21.43053102493286,
"lp_dist": -14.78955078125,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -4.621345937252045,
"lp_gold": -18.077104091644287,
"lp_dist": -13.455758154392242,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -4.602914571762085,
"lp_gold": -19.734163284301758,
"lp_dist": -15.131248712539673,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -3.4950921535491943,
"lp_gold": -16.78837251663208,
"lp_dist": -13.293280363082886,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-134",
"gold_norm": "18",
"dist_norm": "4",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.9239641074091196,
"lp_gold": -15.171829616650939,
"lp_dist": -19.09579372406006,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.7061721086502075,
"lp_gold": -9.731460690498352,
"lp_dist": -9.025288581848145,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.6682674884796143,
"lp_gold": -13.212559461593628,
"lp_dist": -13.880826950073242,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.691381573677063,
"lp_gold": -13.084003806114197,
"lp_dist": -13.77538537979126,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -3.0653414726257324,
"lp_gold": -11.70475959777832,
"lp_dist": -8.639418125152588,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.6070647239685059,
"lp_gold": -11.957983255386353,
"lp_dist": -10.350918531417847,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.7061715722084045,
"lp_gold": -9.731466829776764,
"lp_dist": -9.02529525756836,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-138",
"gold_norm": "21",
"dist_norm": "25",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.6713391542434692,
"lp_gold": -17.788984179496765,
"lp_dist": -20.460323333740234,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.8663175106048584,
"lp_gold": -14.334570407867432,
"lp_dist": -13.468252897262573,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.3505210876464844,
"lp_gold": -21.58456540107727,
"lp_dist": -21.935086488723755,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.2903881072998047,
"lp_gold": -22.286190509796143,
"lp_dist": -22.576578617095947,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -0.9333076477050781,
"lp_gold": -15.113465785980225,
"lp_dist": -14.180158138275146,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -0.7268631458282471,
"lp_gold": -13.463084697723389,
"lp_dist": -12.736221551895142,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.8663196563720703,
"lp_gold": -14.334563970565796,
"lp_dist": -13.468244314193726,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-145",
"gold_norm": "2",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.2492438331246376,
"lp_gold": -13.602060556411743,
"lp_dist": -14.85130438953638,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.6407327204942703,
"lp_gold": -12.194403648376465,
"lp_dist": -9.553670927882195,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.6734669059514999,
"lp_gold": -13.396984100341797,
"lp_dist": -14.070451006293297,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.8539667278528214,
"lp_gold": -13.266191005706787,
"lp_dist": -14.120157733559608,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.3427896350622177,
"lp_gold": -10.751940488815308,
"lp_dist": -9.40915085375309,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -2.827908754348755,
"lp_gold": -14.305924892425537,
"lp_dist": -11.478016138076782,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -2.640733018517494,
"lp_gold": -12.194400548934937,
"lp_dist": -9.553667530417442,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-151",
"gold_norm": "803",
"dist_norm": "16",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.22600507736206055,
"lp_gold": -20.624857425689697,
"lp_dist": -20.850862503051758,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.131483793258667,
"lp_gold": -19.228359699249268,
"lp_dist": -14.0968759059906,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -3.2413750886917114,
"lp_gold": -19.84936547279358,
"lp_dist": -16.607990384101868,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -3.131054997444153,
"lp_gold": -19.767980694770813,
"lp_dist": -16.63692569732666,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -4.6819539070129395,
"lp_gold": -18.495857000350952,
"lp_dist": -13.813903093338013,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -4.957081317901611,
"lp_gold": -20.852898836135864,
"lp_dist": -15.895817518234253,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -5.131482243537903,
"lp_gold": -19.228361129760742,
"lp_dist": -14.09687888622284,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-153",
"gold_norm": "280",
"dist_norm": "13",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.694299184717238,
"lp_gold": -15.244264638982713,
"lp_dist": -18.93856382369995,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.8231047093868256,
"lp_gold": -12.64960965514183,
"lp_dist": -11.826504945755005,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.8308207541704178,
"lp_gold": -15.758303448557854,
"lp_dist": -16.58912420272827,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.7265715599060059,
"lp_gold": -16.042329788208008,
"lp_dist": -16.768901348114014,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.1412931680679321,
"lp_gold": -14.319903492927551,
"lp_dist": -13.17861032485962,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.4246297478675842,
"lp_gold": -15.16783195734024,
"lp_dist": -13.743202209472656,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.8231084495782852,
"lp_gold": -12.649619355797768,
"lp_dist": -11.826510906219482,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-154",
"gold_norm": "13",
"dist_norm": "20",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.0602927803993225,
"lp_gold": -15.215918719768524,
"lp_dist": -17.276211500167847,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.633309006690979,
"lp_gold": -12.038846015930176,
"lp_dist": -11.405537009239197,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -0.6033051013946533,
"lp_gold": -17.432909965515137,
"lp_dist": -16.829604864120483,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -0.6372992992401123,
"lp_gold": -17.39775514602661,
"lp_dist": -16.7604558467865,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.48968505859375,
"lp_gold": -13.174037456512451,
"lp_dist": -11.684352397918701,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.2143868207931519,
"lp_gold": -12.643470287322998,
"lp_dist": -11.429083466529846,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.6333088874816895,
"lp_gold": -12.038848400115967,
"lp_dist": -11.405539512634277,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-175",
"gold_norm": "350",
"dist_norm": "50",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9250896275043488,
"lp_gold": -22.99281856417656,
"lp_dist": -23.917908191680908,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.236621737480164,
"lp_gold": -17.58333122730255,
"lp_dist": -10.346709489822388,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -2.966416835784912,
"lp_gold": -20.354339838027954,
"lp_dist": -17.387923002243042,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -3.5315762758255005,
"lp_gold": -21.129161953926086,
"lp_dist": -17.597585678100586,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -7.064225733280182,
"lp_gold": -18.191003382205963,
"lp_dist": -11.126777648925781,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -7.836197316646576,
"lp_gold": -21.282770097255707,
"lp_dist": -13.44657278060913,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -7.236626446247101,
"lp_gold": -17.583332121372223,
"lp_dist": -10.346705675125122,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-207",
"gold_norm": "22",
"dist_norm": "16",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.621975004673004,
"lp_gold": -11.219844043254852,
"lp_dist": -13.841819047927856,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.833777904510498,
"lp_gold": -10.41726541519165,
"lp_dist": -9.583487510681152,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.4578208923339844,
"lp_gold": -15.501156091690063,
"lp_dist": -14.043335199356079,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -1.725459337234497,
"lp_gold": -15.87945008277893,
"lp_dist": -14.153990745544434,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.1624937057495117,
"lp_gold": -15.793409585952759,
"lp_dist": -14.630915880203247,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -0.6316690444946289,
"lp_gold": -10.619104146957397,
"lp_dist": -9.987435102462769,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.8337790966033936,
"lp_gold": -10.417269945144653,
"lp_dist": -9.58349084854126,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-215",
"gold_norm": "2",
"dist_norm": "3",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.387197434902191,
"lp_gold": -9.424871981143951,
"lp_dist": -12.812069416046143,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.5975170135498047,
"lp_gold": -9.744040250778198,
"lp_dist": -8.146523237228394,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -3.2604313492774963,
"lp_gold": -12.714868068695068,
"lp_dist": -9.454436719417572,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -3.084445893764496,
"lp_gold": -12.945753574371338,
"lp_dist": -9.861307680606842,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.5732927322387695,
"lp_gold": -12.206480503082275,
"lp_dist": -10.633187770843506,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.1921443939208984,
"lp_gold": -10.054778575897217,
"lp_dist": -8.862634181976318,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.5975122451782227,
"lp_gold": -9.744039058685303,
"lp_dist": -8.14652681350708,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-225",
"gold_norm": "1596",
"dist_norm": "81",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.9876238331198692,
"lp_gold": -17.49847326427698,
"lp_dist": -19.48609709739685,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.1633647084236145,
"lp_gold": -24.1348779797554,
"lp_dist": -17.971513271331787,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.7931787371635437,
"lp_gold": -18.82551997900009,
"lp_dist": -17.032341241836548,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -2.224741965532303,
"lp_gold": -19.164868861436844,
"lp_dist": -16.94012689590454,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -10.08569085597992,
"lp_gold": -30.44549548625946,
"lp_dist": -20.35980463027954,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -9.3379967212677,
"lp_gold": -30.265968084335327,
"lp_dist": -20.927971363067627,
"n_tokens_gold": 5,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -6.163362383842468,
"lp_gold": -24.134878516197205,
"lp_dist": -17.971516132354736,
"n_tokens_gold": 5,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-226",
"gold_norm": "81",
"dist_norm": "56",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.605985850095749,
"lp_gold": -15.771342545747757,
"lp_dist": -19.377328395843506,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.6331486701965332,
"lp_gold": -16.669665813446045,
"lp_dist": -15.036517143249512,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.214464545249939,
"lp_gold": -16.49272656440735,
"lp_dist": -15.27826201915741,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -0.9139895439147949,
"lp_gold": -16.460952043533325,
"lp_dist": -15.54696249961853,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -0.7367334365844727,
"lp_gold": -19.27399492263794,
"lp_dist": -18.537261486053467,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.8308906555175781,
"lp_gold": -16.610596179962158,
"lp_dist": -14.77970552444458,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.6331486701965332,
"lp_gold": -16.66966152191162,
"lp_dist": -15.036512851715088,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "gsm8k-test-238",
"gold_norm": "60",
"dist_norm": "4",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.38430750370025635,
"lp_gold": -13.078525424003601,
"lp_dist": -13.462832927703857,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.5117335319519043,
"lp_gold": -10.930900573730469,
"lp_dist": -9.419167041778564,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -1.7834737300872803,
"lp_gold": -15.81191873550415,
"lp_dist": -14.02844500541687,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -1.6578322649002075,
"lp_gold": -15.757366299629211,
"lp_dist": -14.099534034729004,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -5.469817161560059,
"lp_gold": -15.78016185760498,
"lp_dist": -10.310344696044922,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.3997215032577515,
"lp_gold": -11.266274809837341,
"lp_dist": -9.86655330657959,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.5117324590682983,
"lp_gold": -10.930898785591125,
"lp_dist": -9.419166326522827,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-242",
"gold_norm": "19",
"dist_norm": "6",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.181618273258209,
"lp_gold": -12.157159745693207,
"lp_dist": -17.338778018951416,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4070416688919067,
"lp_gold": -10.997576355934143,
"lp_dist": -9.590534687042236,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 1.4950295239686966,
"lp_gold": -14.810317918658257,
"lp_dist": -16.305347442626953,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 1.47235550545156,
"lp_gold": -14.627300599589944,
"lp_dist": -16.099656105041504,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -2.7640156745910645,
"lp_gold": -15.07509469985962,
"lp_dist": -12.311079025268555,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -2.5993056297302246,
"lp_gold": -12.642723798751831,
"lp_dist": -10.043418169021606,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.4070385694503784,
"lp_gold": -10.997580409049988,
"lp_dist": -9.59054183959961,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "gsm8k-test-246",
"gold_norm": "130000",
"dist_norm": "10",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.715070237376494,
"lp_gold": -12.472108629561262,
"lp_dist": -17.187178866937757,
"n_tokens_gold": 7,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -7.156454911455512,
"lp_gold": -13.647521084174514,
"lp_dist": -6.491066172719002,
"n_tokens_gold": 7,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -5.767381154000759,
"lp_gold": -21.409174405038357,
"lp_dist": -15.641793251037598,
"n_tokens_gold": 7,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -5.478326896904036,
"lp_gold": -22.25712094712071,
"lp_dist": -16.778794050216675,
"n_tokens_gold": 7,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -7.735332287847996,
"lp_gold": -17.688330195844173,
"lp_dist": -9.952997907996178,
"n_tokens_gold": 7,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -6.886516407132149,
"lp_gold": -16.19584783911705,
"lp_dist": -9.309331431984901,
"n_tokens_gold": 7,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -7.15645333006978,
"lp_gold": -13.64751996472478,
"lp_dist": -6.491066634654999,
"n_tokens_gold": 7,
"n_tokens_dist": 3
}
}
]
}