{ "meta": { "model": "meta-llama/Llama-2-7b-chat-hf", "device": "cuda", "dtype": "fp32", "layer": 10, "layers_path": "model.layers", "seed": 123, "task": "gsm8k", "eval_mode": "pair_logprob", "eval_meta": { "subspace_split": null, "eval_split": "test", "available_splits": [ "train", "test" ], "hf_id": "gsm8k/main" }, "n_eval_loaded": 256, "n_scanned": 256, "base_acc_scan": 0.625, "ablt_acc_scan": 0.59375, "flips_total": 31, "flips_used": 31, "patch_steps": [ 0 ], "patch_n_steps": 1, "Qs_path": "Q_shared_layer10.npy", "Qs_shape": [ 4096, 97 ], "gold_text_prefix": " ", "dist_text_prefix": " ", "gold_max_tokens": 0, "distractor_mode": "next_gold", "answer_prefix_effective": "\nFinal answer:", "max_new_tokens_effective": 64, "run_coeff_controls": false, "use_benchmark_loader": true, "hf_id": "", "hf_split": "test" }, "summary_on_flips": { "patched_self": { "n": 31, "rescued": 11, "rescued_pct": 35.483870967741936, "mean_delta_margin_vs_ablated": 1.5506091117858887, "median_delta_margin_vs_ablated": 1.8901087045669556 }, "control_time_shuffled": { "n": 31, "rescued": 11, "rescued_pct": 35.483870967741936, "mean_delta_margin_vs_ablated": 1.536723256111145, "median_delta_margin_vs_ablated": 1.8725682497024536 }, "control_shared_randvec": { "n": 31, "rescued": 1, "rescued_pct": 3.225806451612903, "mean_delta_margin_vs_ablated": -0.34435272216796875, "median_delta_margin_vs_ablated": -0.03446388244628906 }, "control_rand_subspace": { "n": 31, "rescued": 0, "rescued_pct": 0.0, "mean_delta_margin_vs_ablated": -0.26384562253952026, "median_delta_margin_vs_ablated": -0.18717603385448456 }, "control_patch_nonshared": { "n": 31, "rescued": 0, "rescued_pct": 0.0, "mean_delta_margin_vs_ablated": 8.61082810388325e-07, "median_delta_margin_vs_ablated": 1.0728836059570312e-06 } }, "scan_rows": [ { "ex_id": "gsm8k-test-0", "gold_norm": "50", "dist_norm": "80", "baseline": { "pred": "gold", "correct": true, "margin": 2.0485178977251053, "lp_gold": -13.375523149967194, "lp_dist": -15.424041047692299, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.2319180071353912, "lp_gold": -6.67494124174118, "lp_dist": -8.906859248876572, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-1", "gold_norm": "80", "dist_norm": "12", "baseline": { "pred": "gold", "correct": true, "margin": 1.218225084245205, "lp_gold": -16.316218174993992, "lp_dist": -17.534443259239197, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.6342043727636337, "lp_gold": -18.493512138724327, "lp_dist": -17.859307765960693, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-2", "gold_norm": "12", "dist_norm": "140", "baseline": { "pred": "gold", "correct": true, "margin": 9.582239151000977, "lp_gold": -19.479307651519775, "lp_dist": -29.061546802520752, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.06528377532959, "lp_gold": -16.74149775505066, "lp_dist": -20.80678153038025, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-3", "gold_norm": "140", "dist_norm": "36", "baseline": { "pred": "dist", "correct": false, "margin": -2.5653446912765503, "lp_gold": -19.955466985702515, "lp_dist": -17.390122294425964, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.37339717149734497, "lp_gold": -14.124846756458282, "lp_dist": -13.751449584960938, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-4", "gold_norm": "36", "dist_norm": "3200", "baseline": { "pred": "gold", "correct": true, "margin": 17.496737867593765, "lp_gold": -13.73099598288536, "lp_dist": -31.227733850479126, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 9.695431232452393, "lp_gold": -7.723996877670288, "lp_dist": -17.41942811012268, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-5", "gold_norm": "3200", "dist_norm": "38", "baseline": { "pred": "gold", "correct": true, "margin": 4.075981711270288, "lp_gold": -15.808944131014869, "lp_dist": -19.884925842285156, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.112908275797963, "lp_gold": -17.281133087351918, "lp_dist": -15.168224811553955, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-6", "gold_norm": "38", "dist_norm": "32", "baseline": { "pred": "dist", "correct": false, "margin": -2.0733052492141724, "lp_gold": -17.57793438434601, "lp_dist": -15.504629135131836, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.1486949920654297, "lp_gold": -20.525099754333496, "lp_dist": -20.376404762268066, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-7", "gold_norm": "32", "dist_norm": "92", "baseline": { "pred": "gold", "correct": true, "margin": 3.4189205169677734, "lp_gold": -16.66067409515381, "lp_dist": -20.079594612121582, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.4474713802337646, "lp_gold": -15.954271793365479, "lp_dist": -18.401743173599243, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-8", "gold_norm": "92", "dist_norm": "16", "baseline": { "pred": "dist", "correct": false, "margin": -3.1885854713618755, "lp_gold": -20.10318946838379, "lp_dist": -16.914603997021914, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.5822286009788513, "lp_gold": -15.157714128494263, "lp_dist": -12.575485527515411, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-9", "gold_norm": "16", "dist_norm": "45", "baseline": { "pred": "dist", "correct": false, "margin": -3.206469178199768, "lp_gold": -20.85190773010254, "lp_dist": -17.64543855190277, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.272299289703369, "lp_gold": -11.194756746292114, "lp_dist": -13.467056035995483, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-10", "gold_norm": "45", "dist_norm": "270", "baseline": { "pred": "gold", "correct": true, "margin": 11.154298067092896, "lp_gold": -17.49683403968811, "lp_dist": -28.651132106781006, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.5754551887512207, "lp_gold": -13.103037357330322, "lp_dist": -16.678492546081543, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-11", "gold_norm": "270", "dist_norm": "100", "baseline": { "pred": "dist", "correct": false, "margin": -3.8893778324127197, "lp_gold": -21.884052515029907, "lp_dist": -17.994674682617188, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.458778917789459, "lp_gold": -14.477847814559937, "lp_dist": -10.019068896770477, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-12", "gold_norm": "100", "dist_norm": "25", "baseline": { "pred": "gold", "correct": true, "margin": 5.500066578388214, "lp_gold": -11.581663310527802, "lp_dist": -17.081729888916016, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.39761683344841, "lp_gold": -9.308580189943314, "lp_dist": -11.706197023391724, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-13", "gold_norm": "25", "dist_norm": "800", "baseline": { "pred": "gold", "correct": true, "margin": 20.992703570984304, "lp_gold": -13.314849936403334, "lp_dist": -34.30755350738764, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.717362227849662, "lp_gold": -11.016974148340523, "lp_dist": -18.734336376190186, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-14", "gold_norm": "800", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 2.7909989710897207, "lp_gold": -10.428668463602662, "lp_dist": -13.219667434692383, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.28849396109581, "lp_gold": -15.488610118627548, "lp_dist": -9.200116157531738, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-15", "gold_norm": "2", "dist_norm": "7000", "baseline": { "pred": "gold", "correct": true, "margin": 14.264829635620117, "lp_gold": -12.60490345954895, "lp_dist": -26.869733095169067, "n_tokens_gold": 2, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 10.93172574043274, "lp_gold": -10.274073839187622, "lp_dist": -21.20579957962036, "n_tokens_gold": 2, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-16", "gold_norm": "7000", "dist_norm": "25", "baseline": { "pred": "dist", "correct": false, "margin": -4.534815393853933, "lp_gold": -21.196847282815725, "lp_dist": -16.662031888961792, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.092950224876404, "lp_gold": -21.74782168865204, "lp_dist": -14.654871463775635, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-17", "gold_norm": "25", "dist_norm": "3", "baseline": { "pred": "dist", "correct": false, "margin": -4.14834189414978, "lp_gold": -15.3827223777771, "lp_dist": -11.23438048362732, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.8233906030654907, "lp_gold": -9.256547331809998, "lp_dist": -8.433156728744507, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-18", "gold_norm": "3", "dist_norm": "3430", "baseline": { "pred": "gold", "correct": true, "margin": 31.8187518119812, "lp_gold": -10.239798672497272, "lp_dist": -42.058550484478474, "n_tokens_gold": 2, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 16.154653549194336, "lp_gold": -7.938319206237793, "lp_dist": -24.09297275543213, "n_tokens_gold": 2, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-19", "gold_norm": "3430", "dist_norm": "106", "baseline": { "pred": "dist", "correct": false, "margin": -4.379680693149567, "lp_gold": -23.033769607543945, "lp_dist": -18.65408891439438, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -9.992487907409668, "lp_gold": -21.179072380065918, "lp_dist": -11.18658447265625, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-20", "gold_norm": "106", "dist_norm": "80", "baseline": { "pred": "gold", "correct": true, "margin": 4.92336449585855, "lp_gold": -15.76830449141562, "lp_dist": -20.69166898727417, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.6747859213501215, "lp_gold": -17.988985607400537, "lp_dist": -17.314199686050415, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-21", "gold_norm": "80", "dist_norm": "26", "baseline": { "pred": "gold", "correct": true, "margin": 2.6582831740379333, "lp_gold": -10.863374054431915, "lp_dist": -13.521657228469849, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.6402748823165894, "lp_gold": -11.24216091632843, "lp_dist": -12.88243579864502, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-22", "gold_norm": "26", "dist_norm": "750", "baseline": { "pred": "gold", "correct": true, "margin": 4.5539721846580505, "lp_gold": -21.11834144592285, "lp_dist": -25.672313630580902, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.063919067382812, "lp_gold": -12.24570107460022, "lp_dist": -20.309620141983032, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-23", "gold_norm": "750", "dist_norm": "9", "baseline": { "pred": "dist", "correct": false, "margin": -0.4596693105995655, "lp_gold": -14.162512499839067, "lp_dist": -13.702843189239502, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.3531132936477661, "lp_gold": -11.305097699165344, "lp_dist": -10.951984405517578, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-24", "gold_norm": "9", "dist_norm": "40", "baseline": { "pred": "gold", "correct": true, "margin": 0.006132304668426514, "lp_gold": -15.839151382446289, "lp_dist": -15.845283687114716, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.110446274280548, "lp_gold": -12.25863265991211, "lp_dist": -10.148186385631561, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-25", "gold_norm": "40", "dist_norm": "14", "baseline": { "pred": "dist", "correct": false, "margin": -0.5176091194152832, "lp_gold": -16.009315252304077, "lp_dist": -15.491706132888794, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.6056453585624695, "lp_gold": -14.140560686588287, "lp_dist": -15.746206045150757, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-26", "gold_norm": "14", "dist_norm": "160", "baseline": { "pred": "gold", "correct": true, "margin": 13.092049598693848, "lp_gold": -12.284036666154861, "lp_dist": -25.37608626484871, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.250454902648926, "lp_gold": -11.187321424484253, "lp_dist": -22.43777632713318, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-27", "gold_norm": "160", "dist_norm": "6", "baseline": { "pred": "gold", "correct": true, "margin": 1.597537249326706, "lp_gold": -12.841732293367386, "lp_dist": -14.439269542694092, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.338132083415985, "lp_gold": -17.455387771129608, "lp_dist": -10.117255687713623, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-28", "gold_norm": "6", "dist_norm": "132", "baseline": { "pred": "gold", "correct": true, "margin": 5.4012770652771, "lp_gold": -12.933898210525513, "lp_dist": -18.335175275802612, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.1544095277786255, "lp_gold": -9.050714015960693, "lp_dist": -16.20512354373932, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-29", "gold_norm": "132", "dist_norm": "8", "baseline": { "pred": "dist", "correct": false, "margin": -8.112765461206436, "lp_gold": -18.76314067840576, "lp_dist": -10.650375217199326, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.735072135925293, "lp_gold": -16.513195633888245, "lp_dist": -7.778123497962952, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-30", "gold_norm": "8", "dist_norm": "68", "baseline": { "pred": "gold", "correct": true, "margin": 2.2700021266937256, "lp_gold": -11.72844409942627, "lp_dist": -13.998446226119995, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 12.666181564331055, "lp_gold": -7.928534984588623, "lp_dist": -20.594716548919678, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-31", "gold_norm": "68", "dist_norm": "31", "baseline": { "pred": "gold", "correct": true, "margin": 3.820281505584717, "lp_gold": -15.837103843688965, "lp_dist": -19.65738534927368, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.25214481353759766, "lp_gold": -12.841001033782959, "lp_dist": -12.588856220245361, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-32", "gold_norm": "31", "dist_norm": "100", "baseline": { "pred": "gold", "correct": true, "margin": 6.4860659539699554, "lp_gold": -13.60796919465065, "lp_dist": -20.094035148620605, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.465068936347961, "lp_gold": -14.221534967422485, "lp_dist": -18.686603903770447, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-33", "gold_norm": "100", "dist_norm": "1509", "baseline": { "pred": "gold", "correct": true, "margin": 13.144955581985414, "lp_gold": -13.722247913479805, "lp_dist": -26.86720349546522, "n_tokens_gold": 4, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.218970347195864, "lp_gold": -9.529480028897524, "lp_dist": -22.748450376093388, "n_tokens_gold": 4, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-34", "gold_norm": "1509", "dist_norm": "480", "baseline": { "pred": "dist", "correct": false, "margin": -6.574819326400757, "lp_gold": -23.19943141937256, "lp_dist": -16.6246120929718, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.709552764892578, "lp_gold": -18.728264808654785, "lp_dist": -11.018712043762207, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-35", "gold_norm": "480", "dist_norm": "520", "baseline": { "pred": "gold", "correct": true, "margin": 1.5111888945102692, "lp_gold": -12.033819317817688, "lp_dist": -13.545008212327957, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.8960548639297485, "lp_gold": -19.025392055511475, "lp_dist": -16.129337191581726, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-36", "gold_norm": "520", "dist_norm": "3", "baseline": { "pred": "dist", "correct": false, "margin": -2.531530350446701, "lp_gold": -14.413595885038376, "lp_dist": -11.882065534591675, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.166505575180054, "lp_gold": -12.240307569503784, "lp_dist": -8.07380199432373, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-37", "gold_norm": "3", "dist_norm": "33", "baseline": { "pred": "gold", "correct": true, "margin": 7.900035858154297, "lp_gold": -12.652887344360352, "lp_dist": -20.55292320251465, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.076467990875244, "lp_gold": -7.961295485496521, "lp_dist": -14.037763476371765, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-38", "gold_norm": "33", "dist_norm": "120", "baseline": { "pred": "gold", "correct": true, "margin": 15.355147242546082, "lp_gold": -12.304473280906677, "lp_dist": -27.65962052345276, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 10.39077889919281, "lp_gold": -11.524258255958557, "lp_dist": -21.915037155151367, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-39", "gold_norm": "120", "dist_norm": "14", "baseline": { "pred": "gold", "correct": true, "margin": 0.22332683950662613, "lp_gold": -14.252640329301357, "lp_dist": -14.475967168807983, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.3626268804073334, "lp_gold": -10.953217655420303, "lp_dist": -13.315844535827637, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-40", "gold_norm": "14", "dist_norm": "20", "baseline": { "pred": "dist", "correct": false, "margin": -7.754596941173077, "lp_gold": -17.770805835723877, "lp_dist": -10.0162088945508, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.5667039155960083, "lp_gold": -8.350147247314453, "lp_dist": -7.783443331718445, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-41", "gold_norm": "20", "dist_norm": "95200", "baseline": { "pred": "gold", "correct": true, "margin": 33.85109522007406, "lp_gold": -16.739925840869546, "lp_dist": -50.5910210609436, "n_tokens_gold": 3, "n_tokens_dist": 6 }, "ablated": { "pred": "gold", "correct": true, "margin": 23.679821968078613, "lp_gold": -9.72859787940979, "lp_dist": -33.4084198474884, "n_tokens_gold": 3, "n_tokens_dist": 6 } }, { "ex_id": "gsm8k-test-42", "gold_norm": "95200", "dist_norm": "77", "baseline": { "pred": "dist", "correct": false, "margin": -2.7502023852430284, "lp_gold": -19.08837911253795, "lp_dist": -16.338176727294922, "n_tokens_gold": 6, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.2610103897750378, "lp_gold": -21.381718140095472, "lp_dist": -18.120707750320435, "n_tokens_gold": 6, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-43", "gold_norm": "77", "dist_norm": "81", "baseline": { "pred": "dist", "correct": false, "margin": -0.041521549224853516, "lp_gold": -20.968489170074463, "lp_dist": -20.92696762084961, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.2143611907958984, "lp_gold": -14.122482776641846, "lp_dist": -16.336843967437744, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-44", "gold_norm": "81", "dist_norm": "310", "baseline": { "pred": "gold", "correct": true, "margin": 8.942630738019943, "lp_gold": -12.305748492479324, "lp_dist": -21.248379230499268, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.2553623914718628, "lp_gold": -13.983943223953247, "lp_dist": -14.23930561542511, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-45", "gold_norm": "310", "dist_norm": "100", "baseline": { "pred": "gold", "correct": true, "margin": 0.9456039071083069, "lp_gold": -16.172270894050598, "lp_dist": -17.117874801158905, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.856696009635925, "lp_gold": -16.36608850955963, "lp_dist": -10.509392499923706, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-46", "gold_norm": "100", "dist_norm": "160", "baseline": { "pred": "gold", "correct": true, "margin": 3.3894251135061495, "lp_gold": -12.40682859485969, "lp_dist": -15.796253708365839, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.0833441019058228, "lp_gold": -16.92129546403885, "lp_dist": -19.00463956594467, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-47", "gold_norm": "160", "dist_norm": "25", "baseline": { "pred": "gold", "correct": true, "margin": 0.824264804366976, "lp_gold": -14.298029144760221, "lp_dist": -15.122293949127197, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.2988634258508682, "lp_gold": -13.753293856978416, "lp_dist": -14.052157282829285, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-48", "gold_norm": "25", "dist_norm": "1400", "baseline": { "pred": "gold", "correct": true, "margin": 7.125034153461456, "lp_gold": -16.447975158691406, "lp_dist": -23.573009312152863, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.08089584112167358, "lp_gold": -13.430449962615967, "lp_dist": -13.349554121494293, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-49", "gold_norm": "1400", "dist_norm": "120", "baseline": { "pred": "dist", "correct": false, "margin": -2.9659185571945272, "lp_gold": -15.535773673269432, "lp_dist": -12.569855116074905, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.700849339365959, "lp_gold": -12.968689993023872, "lp_dist": -9.267840653657913, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-50", "gold_norm": "120", "dist_norm": "48", "baseline": { "pred": "gold", "correct": true, "margin": 3.5417392253875732, "lp_gold": -22.00163245201111, "lp_dist": -25.54337167739868, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.251087546348572, "lp_gold": -15.896643280982971, "lp_dist": -21.147730827331543, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-51", "gold_norm": "48", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 0.9340271949768066, "lp_gold": -11.738685846328735, "lp_dist": -12.672713041305542, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.7094523906707764, "lp_gold": -6.564473628997803, "lp_dist": -8.273926019668579, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-52", "gold_norm": "50", "dist_norm": "15400", "baseline": { "pred": "gold", "correct": true, "margin": 19.066895127296448, "lp_gold": -13.457320094108582, "lp_dist": -32.52421522140503, "n_tokens_gold": 3, "n_tokens_dist": 6 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.942362904548645, "lp_gold": -15.37247109413147, "lp_dist": -22.314833998680115, "n_tokens_gold": 3, "n_tokens_dist": 6 } }, { "ex_id": "gsm8k-test-53", "gold_norm": "15400", "dist_norm": "80", "baseline": { "pred": "dist", "correct": false, "margin": -1.8634248977759853, "lp_gold": -20.78267443238292, "lp_dist": -18.919249534606934, "n_tokens_gold": 6, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.060255475342274, "lp_gold": -20.97295517474413, "lp_dist": -16.912699699401855, "n_tokens_gold": 6, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-54", "gold_norm": "80", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -1.1981298923492432, "lp_gold": -18.350556135177612, "lp_dist": -17.15242624282837, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.7748947478830814, "lp_gold": -7.663371529430151, "lp_dist": -11.438266277313232, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-55", "gold_norm": "5", "dist_norm": "14", "baseline": { "pred": "dist", "correct": false, "margin": -6.197003062348813, "lp_gold": -18.496329307556152, "lp_dist": -12.29932624520734, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.144104599952698, "lp_gold": -7.03524386882782, "lp_dist": -14.179348468780518, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-56", "gold_norm": "14", "dist_norm": "31", "baseline": { "pred": "gold", "correct": true, "margin": 8.266514074697625, "lp_gold": -18.428703057870734, "lp_dist": -26.69521713256836, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.1581010818481445, "lp_gold": -12.810563087463379, "lp_dist": -16.968664169311523, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-57", "gold_norm": "31", "dist_norm": "36", "baseline": { "pred": "dist", "correct": false, "margin": -1.7361334562301636, "lp_gold": -14.841211199760437, "lp_dist": -13.105077743530273, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.6261711120605469, "lp_gold": -13.683454990386963, "lp_dist": -13.057283878326416, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-58", "gold_norm": "36", "dist_norm": "144", "baseline": { "pred": "gold", "correct": true, "margin": 0.37734442949295044, "lp_gold": -15.673691511154175, "lp_dist": -16.051035940647125, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.521233081817627, "lp_gold": -17.215554237365723, "lp_dist": -20.73678731918335, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-59", "gold_norm": "144", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -2.0488511323928833, "lp_gold": -15.628765225410461, "lp_dist": -13.579914093017578, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.597236156463623, "lp_gold": -16.582991123199463, "lp_dist": -13.98575496673584, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-60", "gold_norm": "5", "dist_norm": "750", "baseline": { "pred": "gold", "correct": true, "margin": 11.9340181350708, "lp_gold": -15.854983806610107, "lp_dist": -27.789001941680908, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.577134609222412, "lp_gold": -16.538414001464844, "lp_dist": -24.115548610687256, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-61", "gold_norm": "750", "dist_norm": "38", "baseline": { "pred": "dist", "correct": false, "margin": -0.7815818190574646, "lp_gold": -18.73184484243393, "lp_dist": -17.950263023376465, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.5384882092475891, "lp_gold": -11.338704288005829, "lp_dist": -11.877192497253418, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-62", "gold_norm": "38", "dist_norm": "48", "baseline": { "pred": "dist", "correct": false, "margin": -6.48445200920105, "lp_gold": -17.717634916305542, "lp_dist": -11.233182907104492, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.8313806354999542, "lp_gold": -12.223527193069458, "lp_dist": -8.392146557569504, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-63", "gold_norm": "48", "dist_norm": "655", "baseline": { "pred": "gold", "correct": true, "margin": 19.08035659790039, "lp_gold": -14.92322301864624, "lp_dist": -34.00357961654663, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.06407618522644, "lp_gold": -10.739889144897461, "lp_dist": -23.8039653301239, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-64", "gold_norm": "655", "dist_norm": "800", "baseline": { "pred": "gold", "correct": true, "margin": 1.8698419332504272, "lp_gold": -17.930187582969666, "lp_dist": -19.800029516220093, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4062272310256958, "lp_gold": -14.47088611125946, "lp_dist": -13.064658880233765, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-65", "gold_norm": "800", "dist_norm": "7300", "baseline": { "pred": "gold", "correct": true, "margin": 11.858935464173555, "lp_gold": -13.705154906958342, "lp_dist": -25.564090371131897, "n_tokens_gold": 4, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.260221555829048, "lp_gold": -14.01822917163372, "lp_dist": -25.27845072746277, "n_tokens_gold": 4, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-66", "gold_norm": "7300", "dist_norm": "48", "baseline": { "pred": "dist", "correct": false, "margin": -2.354404352605343, "lp_gold": -20.02805521339178, "lp_dist": -17.673650860786438, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.008578598499298096, "lp_gold": -13.163566768169403, "lp_dist": -13.172145366668701, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-67", "gold_norm": "48", "dist_norm": "4", "baseline": { "pred": "dist", "correct": false, "margin": -0.25957900285720825, "lp_gold": -17.536937534809113, "lp_dist": -17.277358531951904, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.7365117073059082, "lp_gold": -10.189218521118164, "lp_dist": -8.452706813812256, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-68", "gold_norm": "4", "dist_norm": "15", "baseline": { "pred": "dist", "correct": false, "margin": -1.0612575560808182, "lp_gold": -15.63377046585083, "lp_dist": -14.572512909770012, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.9296765327453613, "lp_gold": -9.090213418006897, "lp_dist": -12.019889950752258, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-69", "gold_norm": "15", "dist_norm": "23", "baseline": { "pred": "gold", "correct": true, "margin": 8.69707328081131, "lp_gold": -14.628733813762665, "lp_dist": -23.325807094573975, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.665110111236572, "lp_gold": -12.951319694519043, "lp_dist": -17.616429805755615, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-70", "gold_norm": "23", "dist_norm": "225", "baseline": { "pred": "gold", "correct": true, "margin": 11.066251754760742, "lp_gold": -16.185874462127686, "lp_dist": -27.252126216888428, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.176498889923096, "lp_gold": -14.897132635116577, "lp_dist": -20.073631525039673, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-71", "gold_norm": "225", "dist_norm": "15", "baseline": { "pred": "dist", "correct": false, "margin": -3.3921156525611877, "lp_gold": -16.729829609394073, "lp_dist": -13.337713956832886, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.107092618942261, "lp_gold": -17.290175914764404, "lp_dist": -11.183083295822144, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-72", "gold_norm": "15", "dist_norm": "82", "baseline": { "pred": "gold", "correct": true, "margin": 3.4555931091308594, "lp_gold": -13.475011110305786, "lp_dist": -16.930604219436646, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 9.554435849189758, "lp_gold": -11.591153025627136, "lp_dist": -21.145588874816895, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-73", "gold_norm": "82", "dist_norm": "1218", "baseline": { "pred": "gold", "correct": true, "margin": 10.144330263137817, "lp_gold": -14.399481773376465, "lp_dist": -24.543812036514282, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.632067203521729, "lp_gold": -11.898912191390991, "lp_dist": -25.53097939491272, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-74", "gold_norm": "1218", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -14.016261577606201, "lp_gold": -28.183964252471924, "lp_dist": -14.167702674865723, "n_tokens_gold": 5, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -9.392110228538513, "lp_gold": -21.020013689994812, "lp_dist": -11.627903461456299, "n_tokens_gold": 5, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-75", "gold_norm": "2", "dist_norm": "36", "baseline": { "pred": "gold", "correct": true, "margin": 2.396540880203247, "lp_gold": -13.256061553955078, "lp_dist": -15.652602434158325, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.963439464569092, "lp_gold": -9.20676326751709, "lp_dist": -16.17020273208618, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-76", "gold_norm": "36", "dist_norm": "13", "baseline": { "pred": "dist", "correct": false, "margin": -3.540708303451538, "lp_gold": -18.379968881607056, "lp_dist": -14.839260578155518, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.6864776611328125, "lp_gold": -14.157576084136963, "lp_dist": -13.47109842300415, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-77", "gold_norm": "13", "dist_norm": "11", "baseline": { "pred": "gold", "correct": true, "margin": 0.3661365509033203, "lp_gold": -15.502496480941772, "lp_dist": -15.868633031845093, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.4615020751953125, "lp_gold": -14.546976089477539, "lp_dist": -15.008478164672852, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-78", "gold_norm": "11", "dist_norm": "8", "baseline": { "pred": "dist", "correct": false, "margin": -3.7840418815612793, "lp_gold": -21.872905254364014, "lp_dist": -18.088863372802734, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.002429008483887, "lp_gold": -13.64483380317688, "lp_dist": -9.642404794692993, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-79", "gold_norm": "8", "dist_norm": "440", "baseline": { "pred": "gold", "correct": true, "margin": 16.767229557037354, "lp_gold": -17.545647621154785, "lp_dist": -34.31287717819214, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.454898834228516, "lp_gold": -12.977333545684814, "lp_dist": -24.43223237991333, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-80", "gold_norm": "440", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -2.289784381631762, "lp_gold": -15.706766793970019, "lp_dist": -13.416982412338257, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.212664246559143, "lp_gold": -17.012100338935852, "lp_dist": -13.799436092376709, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-81", "gold_norm": "2", "dist_norm": "45", "baseline": { "pred": "gold", "correct": true, "margin": 7.003484487533569, "lp_gold": -13.458641052246094, "lp_dist": -20.462125539779663, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.808353066444397, "lp_gold": -8.184542536735535, "lp_dist": -13.992895603179932, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-82", "gold_norm": "45", "dist_norm": "54", "baseline": { "pred": "dist", "correct": false, "margin": -0.24556124210357666, "lp_gold": -15.131654500961304, "lp_dist": -14.886093258857727, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.5171573162078857, "lp_gold": -10.597485780715942, "lp_dist": -12.114643096923828, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-83", "gold_norm": "54", "dist_norm": "6", "baseline": { "pred": "dist", "correct": false, "margin": -2.36360502243042, "lp_gold": -13.119836330413818, "lp_dist": -10.756231307983398, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.9401865005493164, "lp_gold": -13.909927368164062, "lp_dist": -10.969740867614746, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-84", "gold_norm": "6", "dist_norm": "240", "baseline": { "pred": "gold", "correct": true, "margin": 16.525604009628296, "lp_gold": -14.586916446685791, "lp_dist": -31.112520456314087, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 12.351407527923584, "lp_gold": -8.294451236724854, "lp_dist": -20.645858764648438, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-85", "gold_norm": "240", "dist_norm": "428", "baseline": { "pred": "gold", "correct": true, "margin": 5.200618744827807, "lp_gold": -10.596241324208677, "lp_dist": -15.796860069036484, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.575641840696335, "lp_gold": -10.782865315675735, "lp_dist": -16.35850715637207, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-86", "gold_norm": "428", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -11.445145592093468, "lp_gold": -21.372050523757935, "lp_dist": -9.926904931664467, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.8857234716415405, "lp_gold": -13.641488909721375, "lp_dist": -10.755765438079834, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-87", "gold_norm": "5", "dist_norm": "255", "baseline": { "pred": "gold", "correct": true, "margin": 13.83140754699707, "lp_gold": -11.654325008392334, "lp_dist": -25.485732555389404, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 15.030074834823608, "lp_gold": -10.351794719696045, "lp_dist": -25.381869554519653, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-88", "gold_norm": "255", "dist_norm": "10", "baseline": { "pred": "dist", "correct": false, "margin": -4.704339981079102, "lp_gold": -22.688746690750122, "lp_dist": -17.98440670967102, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.790473118424416, "lp_gold": -23.528611078858376, "lp_dist": -18.73813796043396, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-89", "gold_norm": "10", "dist_norm": "9", "baseline": { "pred": "dist", "correct": false, "margin": -1.6458263397216797, "lp_gold": -14.022311687469482, "lp_dist": -12.376485347747803, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.548365592956543, "lp_gold": -11.53894329071045, "lp_dist": -9.990577697753906, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-90", "gold_norm": "9", "dist_norm": "157", "baseline": { "pred": "gold", "correct": true, "margin": 11.838926374912262, "lp_gold": -14.729028940200806, "lp_dist": -26.567955315113068, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.449068486690521, "lp_gold": -13.700207710266113, "lp_dist": -18.149276196956635, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-91", "gold_norm": "157", "dist_norm": "56", "baseline": { "pred": "dist", "correct": false, "margin": -0.06636106967926025, "lp_gold": -13.999522089958191, "lp_dist": -13.93316102027893, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.18358194828033447, "lp_gold": -12.361805081367493, "lp_dist": -12.178223133087158, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-92", "gold_norm": "56", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -0.014044851064682007, "lp_gold": -13.622624963521957, "lp_dist": -13.608580112457275, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.375391006469727, "lp_gold": -15.001872539520264, "lp_dist": -9.626481533050537, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-93", "gold_norm": "5", "dist_norm": "144", "baseline": { "pred": "gold", "correct": true, "margin": 10.107487440109253, "lp_gold": -21.01281452178955, "lp_dist": -31.120301961898804, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.960463047027588, "lp_gold": -8.431816339492798, "lp_dist": -20.392279386520386, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-94", "gold_norm": "144", "dist_norm": "50", "baseline": { "pred": "dist", "correct": false, "margin": -0.8007860428187996, "lp_gold": -16.124470019945875, "lp_dist": -15.323683977127075, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.586140275001526, "lp_gold": -13.455833077430725, "lp_dist": -10.8696928024292, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-95", "gold_norm": "50", "dist_norm": "4", "baseline": { "pred": "dist", "correct": false, "margin": -4.431166723370552, "lp_gold": -14.887599676847458, "lp_dist": -10.456432953476906, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.6977656185626984, "lp_gold": -7.156943529844284, "lp_dist": -8.854709148406982, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-96", "gold_norm": "4", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 6.549594163894653, "lp_gold": -12.547298669815063, "lp_dist": -19.096892833709717, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.145283490419388, "lp_gold": -8.248348951339722, "lp_dist": -10.39363244175911, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-97", "gold_norm": "50", "dist_norm": "42", "baseline": { "pred": "gold", "correct": true, "margin": 0.1262907013297081, "lp_gold": -12.730935551226139, "lp_dist": -12.857226252555847, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.2466068267822266, "lp_gold": -9.356587171554565, "lp_dist": -12.603193998336792, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-98", "gold_norm": "42", "dist_norm": "7", "baseline": { "pred": "dist", "correct": false, "margin": -5.333731412887573, "lp_gold": -18.121748208999634, "lp_dist": -12.78801679611206, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.454429030418396, "lp_gold": -11.41723620891571, "lp_dist": -10.962807178497314, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-99", "gold_norm": "7", "dist_norm": "250", "baseline": { "pred": "gold", "correct": true, "margin": 10.109447717666626, "lp_gold": -17.65127396583557, "lp_dist": -27.760721683502197, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.160389855504036, "lp_gold": -12.602290153503418, "lp_dist": -15.762680009007454, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-100", "gold_norm": "250", "dist_norm": "12", "baseline": { "pred": "gold", "correct": true, "margin": 1.937235951423645, "lp_gold": -16.930358290672302, "lp_dist": -18.867594242095947, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.8317363262176514, "lp_gold": -17.415368795394897, "lp_dist": -15.583632469177246, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-101", "gold_norm": "12", "dist_norm": "7", "baseline": { "pred": "gold", "correct": true, "margin": 1.4091547727584839, "lp_gold": -11.118706822395325, "lp_dist": -12.527861595153809, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.3088250160217285, "lp_gold": -9.039770126342773, "lp_dist": -11.348595142364502, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-102", "gold_norm": "7", "dist_norm": "8", "baseline": { "pred": "dist", "correct": false, "margin": -1.3529720306396484, "lp_gold": -18.84420108795166, "lp_dist": -17.49122905731201, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.1846144199371338, "lp_gold": -10.204635620117188, "lp_dist": -9.020021200180054, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-103", "gold_norm": "8", "dist_norm": "26", "baseline": { "pred": "gold", "correct": true, "margin": 10.383567810058594, "lp_gold": -15.868620872497559, "lp_dist": -26.252188682556152, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.597115993499756, "lp_gold": -10.359461784362793, "lp_dist": -18.95657777786255, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-104", "gold_norm": "26", "dist_norm": "42", "baseline": { "pred": "gold", "correct": true, "margin": 4.550231754779816, "lp_gold": -16.079154193401337, "lp_dist": -20.629385948181152, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.743180751800537, "lp_gold": -20.936619758605957, "lp_dist": -17.19343900680542, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-105", "gold_norm": "42", "dist_norm": "5", "baseline": { "pred": "gold", "correct": true, "margin": 2.2448320388793945, "lp_gold": -17.29369354248047, "lp_dist": -19.538525581359863, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.48531031608581543, "lp_gold": -14.891574144363403, "lp_dist": -14.406263828277588, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-106", "gold_norm": "5", "dist_norm": "14400", "baseline": { "pred": "gold", "correct": true, "margin": 17.918405055999756, "lp_gold": -16.223863124847412, "lp_dist": -34.14226818084717, "n_tokens_gold": 2, "n_tokens_dist": 6 }, "ablated": { "pred": "gold", "correct": true, "margin": 18.689422607421875, "lp_gold": -8.31445324420929, "lp_dist": -27.003875851631165, "n_tokens_gold": 2, "n_tokens_dist": 6 } }, { "ex_id": "gsm8k-test-107", "gold_norm": "14400", "dist_norm": "400", "baseline": { "pred": "gold", "correct": true, "margin": 0.8998411595821381, "lp_gold": -19.079706698656082, "lp_dist": -19.97954785823822, "n_tokens_gold": 6, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.72913409024477, "lp_gold": -22.703229255974293, "lp_dist": -20.974095165729523, "n_tokens_gold": 6, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-108", "gold_norm": "400", "dist_norm": "40", "baseline": { "pred": "dist", "correct": false, "margin": -0.00030357998912222683, "lp_gold": -12.683453394594835, "lp_dist": -12.683149814605713, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.014066597446799278, "lp_gold": -7.615564605221152, "lp_dist": -7.601498007774353, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-109", "gold_norm": "40", "dist_norm": "83", "baseline": { "pred": "gold", "correct": true, "margin": 5.819165468215942, "lp_gold": -15.469541311264038, "lp_dist": -21.28870677947998, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.161367028951645, "lp_gold": -13.052747160196304, "lp_dist": -21.21411418914795, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-110", "gold_norm": "83", "dist_norm": "10", "baseline": { "pred": "gold", "correct": true, "margin": 0.9284783601760864, "lp_gold": -14.152065396308899, "lp_dist": -15.080543756484985, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.6964447498321533, "lp_gold": -11.795601606369019, "lp_dist": -8.099156856536865, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-111", "gold_norm": "10", "dist_norm": "80", "baseline": { "pred": "gold", "correct": true, "margin": 6.51949143409729, "lp_gold": -16.031707048416138, "lp_dist": -22.551198482513428, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.987100124359131, "lp_gold": -14.24100637435913, "lp_dist": -20.22810649871826, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-112", "gold_norm": "80", "dist_norm": "180", "baseline": { "pred": "gold", "correct": true, "margin": 1.0028108435217291, "lp_gold": -14.749124482274055, "lp_dist": -15.751935325795785, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.4700966998934746, "lp_gold": -9.443349197506905, "lp_dist": -11.91344589740038, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-113", "gold_norm": "180", "dist_norm": "1450000", "baseline": { "pred": "gold", "correct": true, "margin": 21.25386095046997, "lp_gold": -27.976417541503906, "lp_dist": -49.23027849197388, "n_tokens_gold": 4, "n_tokens_dist": 8 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.907071352005005, "lp_gold": -15.60246878862381, "lp_dist": -29.509540140628815, "n_tokens_gold": 4, "n_tokens_dist": 8 } }, { "ex_id": "gsm8k-test-114", "gold_norm": "1450000", "dist_norm": "15", "baseline": { "pred": "dist", "correct": false, "margin": -5.66898221289739, "lp_gold": -18.9738236120902, "lp_dist": -13.30484139919281, "n_tokens_gold": 8, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.122156334575266, "lp_gold": -18.38626338308677, "lp_dist": -10.264107048511505, "n_tokens_gold": 8, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-115", "gold_norm": "15", "dist_norm": "1000", "baseline": { "pred": "gold", "correct": true, "margin": 8.356005743145943, "lp_gold": -21.951110124588013, "lp_dist": -30.307115867733955, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.199884116649628, "lp_gold": -11.521326780319214, "lp_dist": -19.72121089696884, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-116", "gold_norm": "1000", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -0.19174530085001606, "lp_gold": -13.59133626993571, "lp_dist": -13.399590969085693, "n_tokens_gold": 5, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.19964181631803513, "lp_gold": -10.637855164706707, "lp_dist": -10.438213348388672, "n_tokens_gold": 5, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-117", "gold_norm": "2", "dist_norm": "15", "baseline": { "pred": "gold", "correct": true, "margin": 6.722217559814453, "lp_gold": -18.274237632751465, "lp_dist": -24.996455192565918, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.5563411712646484, "lp_gold": -12.917238712310791, "lp_dist": -14.47357988357544, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-118", "gold_norm": "15", "dist_norm": "100", "baseline": { "pred": "gold", "correct": true, "margin": 3.2469024658203125, "lp_gold": -14.20986533164978, "lp_dist": -17.456767797470093, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.755454957485199, "lp_gold": -7.566788613796234, "lp_dist": -16.322243571281433, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-119", "gold_norm": "100", "dist_norm": "335", "baseline": { "pred": "gold", "correct": true, "margin": 16.725587379769422, "lp_gold": -11.31663225905504, "lp_dist": -28.042219638824463, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.39561602845788, "lp_gold": -11.48399594053626, "lp_dist": -19.87961196899414, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-120", "gold_norm": "335", "dist_norm": "60", "baseline": { "pred": "gold", "correct": true, "margin": 1.718258023262024, "lp_gold": -16.84885323047638, "lp_dist": -18.567111253738403, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.495100736618042, "lp_gold": -16.78837823867798, "lp_dist": -13.293277502059937, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-121", "gold_norm": "60", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -2.7624173164367676, "lp_gold": -16.792863368988037, "lp_dist": -14.03044605255127, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.886390924453735, "lp_gold": -13.833235025405884, "lp_dist": -7.946844100952148, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-122", "gold_norm": "5", "dist_norm": "9500", "baseline": { "pred": "gold", "correct": true, "margin": 16.83846201375127, "lp_gold": -13.696074485778809, "lp_dist": -30.534536499530077, "n_tokens_gold": 2, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 15.658583138138056, "lp_gold": -8.453100323677063, "lp_dist": -24.11168346181512, "n_tokens_gold": 2, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-123", "gold_norm": "9500", "dist_norm": "160", "baseline": { "pred": "dist", "correct": false, "margin": -0.3972220839932561, "lp_gold": -20.279614341445267, "lp_dist": -19.88239225745201, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.62424199283123, "lp_gold": -20.69040386378765, "lp_dist": -18.06616187095642, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-124", "gold_norm": "160", "dist_norm": "1050", "baseline": { "pred": "gold", "correct": true, "margin": 16.60233561617497, "lp_gold": -18.274476603444782, "lp_dist": -34.87681221961975, "n_tokens_gold": 4, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.274514463730156, "lp_gold": -13.8317128745839, "lp_dist": -22.106227338314056, "n_tokens_gold": 4, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-125", "gold_norm": "1050", "dist_norm": "91", "baseline": { "pred": "dist", "correct": false, "margin": -2.0837584948167205, "lp_gold": -23.45734657999128, "lp_dist": -21.37358808517456, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.869788646697998, "lp_gold": -20.34303617477417, "lp_dist": -17.473247528076172, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-126", "gold_norm": "91", "dist_norm": "21", "baseline": { "pred": "dist", "correct": false, "margin": -6.658190071582794, "lp_gold": -20.631014347076416, "lp_dist": -13.972824275493622, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.4234659671783447, "lp_gold": -17.770225048065186, "lp_dist": -15.34675908088684, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-127", "gold_norm": "21", "dist_norm": "20", "baseline": { "pred": "dist", "correct": false, "margin": -1.925290822982788, "lp_gold": -16.08545808121562, "lp_dist": -14.160167258232832, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.1263790130615234, "lp_gold": -13.073553562164307, "lp_dist": -10.947174549102783, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-128", "gold_norm": "20", "dist_norm": "36", "baseline": { "pred": "gold", "correct": true, "margin": 0.3922419548034668, "lp_gold": -19.574571132659912, "lp_dist": -19.96681308746338, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.179409921169281, "lp_gold": -8.625289022922516, "lp_dist": -10.804698944091797, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-129", "gold_norm": "36", "dist_norm": "36", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -12.731476545333862, "lp_dist": -12.731476545333862, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -11.376878261566162, "lp_dist": -11.376878261566162, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-130", "gold_norm": "36", "dist_norm": "10", "baseline": { "pred": "dist", "correct": false, "margin": -2.552075147628784, "lp_gold": -14.53696346282959, "lp_dist": -11.984888315200806, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -11.998991012573242, "lp_gold": -20.990919589996338, "lp_dist": -8.991928577423096, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-131", "gold_norm": "10", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -8.991529874503613, "lp_gold": -16.921989023685455, "lp_dist": -7.930459149181843, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.6315575018525124, "lp_gold": -6.705006085336208, "lp_dist": -8.33656358718872, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-132", "gold_norm": "5", "dist_norm": "32", "baseline": { "pred": "gold", "correct": true, "margin": 6.572678565979004, "lp_gold": -12.411277294158936, "lp_dist": -18.98395586013794, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.04911994934082, "lp_gold": -8.343387603759766, "lp_dist": -12.392507553100586, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-133", "gold_norm": "32", "dist_norm": "18", "baseline": { "pred": "dist", "correct": false, "margin": -0.6729400157928467, "lp_gold": -15.0261971950531, "lp_dist": -14.353257179260254, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.8697174787521362, "lp_gold": -10.817826390266418, "lp_dist": -11.687543869018555, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-134", "gold_norm": "18", "dist_norm": "4", "baseline": { "pred": "gold", "correct": true, "margin": 3.9239641074091196, "lp_gold": -15.171829616650939, "lp_dist": -19.09579372406006, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.7061721086502075, "lp_gold": -9.731460690498352, "lp_dist": -9.025288581848145, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-135", "gold_norm": "4", "dist_norm": "48", "baseline": { "pred": "gold", "correct": true, "margin": 9.912032127380371, "lp_gold": -11.661430835723877, "lp_dist": -21.573462963104248, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 10.949398040771484, "lp_gold": -13.34396743774414, "lp_dist": -24.293365478515625, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-136", "gold_norm": "48", "dist_norm": "8", "baseline": { "pred": "dist", "correct": false, "margin": -1.2137904167175293, "lp_gold": -13.246366620063782, "lp_dist": -12.032576203346252, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.785146713256836, "lp_gold": -15.54097604751587, "lp_dist": -10.755829334259033, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-137", "gold_norm": "8", "dist_norm": "21", "baseline": { "pred": "gold", "correct": true, "margin": 9.050054788589478, "lp_gold": -14.328342199325562, "lp_dist": -23.37839698791504, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.801905870437622, "lp_gold": -9.647645235061646, "lp_dist": -13.449551105499268, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-138", "gold_norm": "21", "dist_norm": "25", "baseline": { "pred": "gold", "correct": true, "margin": 2.6713391542434692, "lp_gold": -17.788984179496765, "lp_dist": -20.460323333740234, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.8663175106048584, "lp_gold": -14.334570407867432, "lp_dist": -13.468252897262573, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-139", "gold_norm": "25", "dist_norm": "3000", "baseline": { "pred": "gold", "correct": true, "margin": 10.303999066352844, "lp_gold": -13.958717346191406, "lp_dist": -24.26271641254425, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.2307329177856445, "lp_gold": -7.678596615791321, "lp_dist": -13.909329533576965, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-140", "gold_norm": "3000", "dist_norm": "40", "baseline": { "pred": "dist", "correct": false, "margin": -1.8300985433161259, "lp_gold": -17.700348053127527, "lp_dist": -15.870249509811401, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.4543883726000786, "lp_gold": -14.831104047596455, "lp_dist": -15.285492420196533, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-141", "gold_norm": "40", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 1.7375423088669777, "lp_gold": -13.080543011426926, "lp_dist": -14.818085320293903, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.48419055342674255, "lp_gold": -11.210194662213326, "lp_dist": -11.694385215640068, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-142", "gold_norm": "50", "dist_norm": "90", "baseline": { "pred": "dist", "correct": false, "margin": -1.239281177520752, "lp_gold": -13.6494460105896, "lp_dist": -12.410164833068848, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.5999003648757935, "lp_gold": -9.939468264579773, "lp_dist": -11.539368629455566, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-143", "gold_norm": "90", "dist_norm": "23", "baseline": { "pred": "dist", "correct": false, "margin": -0.3710329532623291, "lp_gold": -20.11902666091919, "lp_dist": -19.74799370765686, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.965712070465088, "lp_gold": -15.742670059204102, "lp_dist": -18.70838212966919, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-144", "gold_norm": "23", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -4.9211039543151855, "lp_gold": -17.69726538658142, "lp_dist": -12.776161432266235, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.012661933898926, "lp_gold": -12.401761054992676, "lp_dist": -8.38909912109375, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-145", "gold_norm": "2", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 1.2492438331246376, "lp_gold": -13.602060556411743, "lp_dist": -14.85130438953638, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.6407327204942703, "lp_gold": -12.194403648376465, "lp_dist": -9.553670927882195, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-146", "gold_norm": "50", "dist_norm": "122", "baseline": { "pred": "gold", "correct": true, "margin": 4.681281805038452, "lp_gold": -16.079622983932495, "lp_dist": -20.760904788970947, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.314260721206665, "lp_gold": -11.10973858833313, "lp_dist": -18.423999309539795, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-147", "gold_norm": "122", "dist_norm": "300", "baseline": { "pred": "gold", "correct": true, "margin": 6.33387154340744, "lp_gold": -18.671439349651337, "lp_dist": -25.005310893058777, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.5978607535362244, "lp_gold": -15.271158814430237, "lp_dist": -15.869019567966461, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-148", "gold_norm": "300", "dist_norm": "448", "baseline": { "pred": "gold", "correct": true, "margin": 13.451393851355533, "lp_gold": -17.04763673870184, "lp_dist": -30.499030590057373, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.197498982015532, "lp_gold": -14.669522101816256, "lp_dist": -27.867021083831787, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-149", "gold_norm": "448", "dist_norm": "2450", "baseline": { "pred": "gold", "correct": true, "margin": 5.949738264083862, "lp_gold": -25.570088386535645, "lp_dist": -31.519826650619507, "n_tokens_gold": 4, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.753702878952026, "lp_gold": -20.34923005104065, "lp_dist": -26.102932929992676, "n_tokens_gold": 4, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-150", "gold_norm": "2450", "dist_norm": "803", "baseline": { "pred": "gold", "correct": true, "margin": 5.90772854257375, "lp_gold": -16.1460417015478, "lp_dist": -22.05377024412155, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.6608445048332214, "lp_gold": -14.72765988111496, "lp_dist": -15.388504385948181, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-151", "gold_norm": "803", "dist_norm": "16", "baseline": { "pred": "gold", "correct": true, "margin": 0.22600507736206055, "lp_gold": -20.624857425689697, "lp_dist": -20.850862503051758, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.131483793258667, "lp_gold": -19.228359699249268, "lp_dist": -14.0968759059906, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-152", "gold_norm": "16", "dist_norm": "280", "baseline": { "pred": "gold", "correct": true, "margin": 9.489606261253357, "lp_gold": -13.729230046272278, "lp_dist": -23.218836307525635, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.312919616699219, "lp_gold": -9.783522605895996, "lp_dist": -23.096442222595215, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-153", "gold_norm": "280", "dist_norm": "13", "baseline": { "pred": "gold", "correct": true, "margin": 3.694299184717238, "lp_gold": -15.244264638982713, "lp_dist": -18.93856382369995, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.8231047093868256, "lp_gold": -12.64960965514183, "lp_dist": -11.826504945755005, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-154", "gold_norm": "13", "dist_norm": "20", "baseline": { "pred": "gold", "correct": true, "margin": 2.0602927803993225, "lp_gold": -15.215918719768524, "lp_dist": -17.276211500167847, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.633309006690979, "lp_gold": -12.038846015930176, "lp_dist": -11.405537009239197, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-155", "gold_norm": "20", "dist_norm": "14", "baseline": { "pred": "dist", "correct": false, "margin": -2.590712457895279, "lp_gold": -18.284266233444214, "lp_dist": -15.693553775548935, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.9870389699935913, "lp_gold": -10.093660473823547, "lp_dist": -12.080699443817139, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-156", "gold_norm": "14", "dist_norm": "32", "baseline": { "pred": "gold", "correct": true, "margin": 4.239875316619873, "lp_gold": -20.827781200408936, "lp_dist": -25.06765651702881, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.260110855102539, "lp_gold": -13.14831280708313, "lp_dist": -15.408423662185669, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-157", "gold_norm": "32", "dist_norm": "105", "baseline": { "pred": "gold", "correct": true, "margin": 12.336161613464355, "lp_gold": -20.509262084960938, "lp_dist": -32.84542369842529, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.297417879104614, "lp_gold": -9.766870260238647, "lp_dist": -16.06428813934326, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-158", "gold_norm": "105", "dist_norm": "71", "baseline": { "pred": "dist", "correct": false, "margin": -0.14784783124923706, "lp_gold": -21.096359431743622, "lp_dist": -20.948511600494385, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4936403930187225, "lp_gold": -13.522361606359482, "lp_dist": -12.02872121334076, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-159", "gold_norm": "71", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -1.2031426429748535, "lp_gold": -17.497971057891846, "lp_dist": -16.294828414916992, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.283137083053589, "lp_gold": -15.376394033432007, "lp_dist": -12.093256950378418, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-160", "gold_norm": "5", "dist_norm": "30", "baseline": { "pred": "gold", "correct": true, "margin": 5.20781135559082, "lp_gold": -9.617193222045898, "lp_dist": -14.825004577636719, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.284235119819641, "lp_gold": -6.052669286727905, "lp_dist": -9.336904406547546, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-161", "gold_norm": "30", "dist_norm": "95", "baseline": { "pred": "gold", "correct": true, "margin": 6.308123826980591, "lp_gold": -12.008728742599487, "lp_dist": -18.316852569580078, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.064014196395874, "lp_gold": -11.074568510055542, "lp_dist": -16.138582706451416, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-162", "gold_norm": "95", "dist_norm": "147", "baseline": { "pred": "gold", "correct": true, "margin": 14.487586110830307, "lp_gold": -13.062050491571426, "lp_dist": -27.549636602401733, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.7768335342407227, "lp_gold": -18.569175243377686, "lp_dist": -22.346008777618408, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-163", "gold_norm": "147", "dist_norm": "10", "baseline": { "pred": "dist", "correct": false, "margin": -5.9637865126132965, "lp_gold": -13.891173975542188, "lp_dist": -7.927387462928891, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.2743232250213623, "lp_gold": -11.51265001296997, "lp_dist": -11.238326787948608, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-164", "gold_norm": "10", "dist_norm": "40000", "baseline": { "pred": "gold", "correct": true, "margin": 16.0207504555583, "lp_gold": -12.818441897630692, "lp_dist": -28.83919235318899, "n_tokens_gold": 3, "n_tokens_dist": 6 }, "ablated": { "pred": "gold", "correct": true, "margin": 15.683642621152103, "lp_gold": -6.558068131096661, "lp_dist": -22.241710752248764, "n_tokens_gold": 3, "n_tokens_dist": 6 } }, { "ex_id": "gsm8k-test-165", "gold_norm": "40000", "dist_norm": "12", "baseline": { "pred": "dist", "correct": false, "margin": -6.0927228182554245, "lp_gold": -21.754829093813896, "lp_dist": -15.662106275558472, "n_tokens_gold": 6, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.87526479922235, "lp_gold": -16.18092787824571, "lp_dist": -9.305663079023361, "n_tokens_gold": 6, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-166", "gold_norm": "12", "dist_norm": "129200", "baseline": { "pred": "gold", "correct": true, "margin": 17.634711742401123, "lp_gold": -20.60856056213379, "lp_dist": -38.24327230453491, "n_tokens_gold": 3, "n_tokens_dist": 7 }, "ablated": { "pred": "gold", "correct": true, "margin": 16.27542757987976, "lp_gold": -20.810616493225098, "lp_dist": -37.08604407310486, "n_tokens_gold": 3, "n_tokens_dist": 7 } }, { "ex_id": "gsm8k-test-167", "gold_norm": "129200", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -8.816860965336673, "lp_gold": -24.884800246800296, "lp_dist": -16.067939281463623, "n_tokens_gold": 7, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -12.447648953646421, "lp_gold": -26.88768095895648, "lp_dist": -14.440032005310059, "n_tokens_gold": 7, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-168", "gold_norm": "5", "dist_norm": "45", "baseline": { "pred": "gold", "correct": true, "margin": 7.262725353240967, "lp_gold": -11.162125587463379, "lp_dist": -18.424850940704346, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 12.129469871520996, "lp_gold": -6.1789350509643555, "lp_dist": -18.30840492248535, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-169", "gold_norm": "45", "dist_norm": "20", "baseline": { "pred": "gold", "correct": true, "margin": 0.7432926744222641, "lp_gold": -13.45964826643467, "lp_dist": -14.202940940856934, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.6613135635852814, "lp_gold": -9.49636921286583, "lp_dist": -12.15768277645111, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-170", "gold_norm": "20", "dist_norm": "1170", "baseline": { "pred": "gold", "correct": true, "margin": 28.943727374076843, "lp_gold": -13.921928644180298, "lp_dist": -42.86565601825714, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 12.730559349060059, "lp_gold": -15.503687143325806, "lp_dist": -28.234246492385864, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-171", "gold_norm": "1170", "dist_norm": "192", "baseline": { "pred": "dist", "correct": false, "margin": -2.4793071039021015, "lp_gold": -25.866613794118166, "lp_dist": -23.387306690216064, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.7092228829860687, "lp_gold": -17.528388172388077, "lp_dist": -20.237611055374146, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-172", "gold_norm": "192", "dist_norm": "14", "baseline": { "pred": "dist", "correct": false, "margin": -2.4512736797332764, "lp_gold": -20.52132660150528, "lp_dist": -18.070052921772003, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.270252227783203, "lp_gold": -18.500007390975952, "lp_dist": -10.229755163192749, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-173", "gold_norm": "14", "dist_norm": "144", "baseline": { "pred": "gold", "correct": true, "margin": 6.012117385864258, "lp_gold": -19.01213574409485, "lp_dist": -25.024253129959106, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.6395363807678223, "lp_gold": -17.207820653915405, "lp_dist": -20.847357034683228, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-174", "gold_norm": "144", "dist_norm": "350", "baseline": { "pred": "gold", "correct": true, "margin": 9.220242428360507, "lp_gold": -20.075438094558194, "lp_dist": -29.2956805229187, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.203243670635857, "lp_gold": -15.825250687426887, "lp_dist": -24.028494358062744, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-175", "gold_norm": "350", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 0.9250896275043488, "lp_gold": -22.99281856417656, "lp_dist": -23.917908191680908, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.236621737480164, "lp_gold": -17.58333122730255, "lp_dist": -10.346709489822388, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-176", "gold_norm": "50", "dist_norm": "7", "baseline": { "pred": "dist", "correct": false, "margin": -0.0019221305847167969, "lp_gold": -14.026922941207886, "lp_dist": -14.025000810623169, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.7933419942855835, "lp_gold": -10.8789883852005, "lp_dist": -10.085646390914917, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-177", "gold_norm": "7", "dist_norm": "50", "baseline": { "pred": "dist", "correct": false, "margin": -0.36652660369873047, "lp_gold": -14.853872776031494, "lp_dist": -14.487346172332764, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.34556770324707, "lp_gold": -7.91382908821106, "lp_dist": -13.25939679145813, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-178", "gold_norm": "50", "dist_norm": "8", "baseline": { "pred": "gold", "correct": true, "margin": 3.387412190437317, "lp_gold": -12.106852412223816, "lp_dist": -15.494264602661133, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.1638234257698059, "lp_gold": -11.002189338207245, "lp_dist": -11.16601276397705, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-179", "gold_norm": "8", "dist_norm": "3160", "baseline": { "pred": "gold", "correct": true, "margin": 27.79468995332718, "lp_gold": -10.52354496717453, "lp_dist": -38.31823492050171, "n_tokens_gold": 2, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 26.891671419143677, "lp_gold": -6.7627270221710205, "lp_dist": -33.6543984413147, "n_tokens_gold": 2, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-180", "gold_norm": "3160", "dist_norm": "80", "baseline": { "pred": "dist", "correct": false, "margin": -14.208902917802334, "lp_gold": -23.272248081862926, "lp_dist": -9.063345164060593, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.284509412944317, "lp_gold": -18.512411706149578, "lp_dist": -10.227902293205261, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-181", "gold_norm": "80", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 6.666141152381897, "lp_gold": -11.426292300224304, "lp_dist": -18.0924334526062, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.4361144304275513, "lp_gold": -8.623531460762024, "lp_dist": -12.059645891189575, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-182", "gold_norm": "50", "dist_norm": "40", "baseline": { "pred": "dist", "correct": false, "margin": -0.3921025022864342, "lp_gold": -16.256853722035885, "lp_dist": -15.86475121974945, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.5148345530033112, "lp_gold": -11.546109974384308, "lp_dist": -11.031275421380997, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-183", "gold_norm": "40", "dist_norm": "78", "baseline": { "pred": "gold", "correct": true, "margin": 10.310973192565143, "lp_gold": -11.783679460175335, "lp_dist": -22.09465265274048, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.1819764897227287, "lp_gold": -8.697102136909962, "lp_dist": -11.87907862663269, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-184", "gold_norm": "78", "dist_norm": "273", "baseline": { "pred": "gold", "correct": true, "margin": 9.903703212738037, "lp_gold": -21.98734474182129, "lp_dist": -31.891047954559326, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.26316213607788, "lp_gold": -13.972403526306152, "lp_dist": -25.235565662384033, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-185", "gold_norm": "273", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -4.715305805206299, "lp_gold": -18.186378479003906, "lp_dist": -13.471072673797607, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.78067421913147, "lp_gold": -15.708798170089722, "lp_dist": -9.928123950958252, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-186", "gold_norm": "2", "dist_norm": "195", "baseline": { "pred": "gold", "correct": true, "margin": 12.730733886361122, "lp_gold": -12.38149118423462, "lp_dist": -25.11222507059574, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 12.67141306400299, "lp_gold": -8.928462505340576, "lp_dist": -21.599875569343567, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-187", "gold_norm": "195", "dist_norm": "1128", "baseline": { "pred": "gold", "correct": true, "margin": 15.125296980142593, "lp_gold": -20.79087921977043, "lp_dist": -35.916176199913025, "n_tokens_gold": 4, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 12.027642607688904, "lp_gold": -17.632879853248596, "lp_dist": -29.6605224609375, "n_tokens_gold": 4, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-188", "gold_norm": "1128", "dist_norm": "172", "baseline": { "pred": "dist", "correct": false, "margin": -2.5262241810560226, "lp_gold": -21.482525154948235, "lp_dist": -18.956300973892212, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.822476863861084, "lp_gold": -22.20119798183441, "lp_dist": -14.378721117973328, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-189", "gold_norm": "172", "dist_norm": "30", "baseline": { "pred": "dist", "correct": false, "margin": -10.2972651720047, "lp_gold": -25.187148094177246, "lp_dist": -14.889882922172546, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.863584116101265, "lp_gold": -19.34168529510498, "lp_dist": -11.478101179003716, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-190", "gold_norm": "30", "dist_norm": "30", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.69141697883606, "lp_dist": -20.69141697883606, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -14.99114179611206, "lp_dist": -14.99114179611206, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-191", "gold_norm": "30", "dist_norm": "92", "baseline": { "pred": "gold", "correct": true, "margin": 14.041085667908192, "lp_gold": -11.936107210814953, "lp_dist": -25.977192878723145, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 10.562521874904633, "lp_gold": -8.070083677768707, "lp_dist": -18.63260555267334, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-192", "gold_norm": "92", "dist_norm": "20", "baseline": { "pred": "dist", "correct": false, "margin": -0.352982759475708, "lp_gold": -19.694137811660767, "lp_dist": -19.34115505218506, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.613435983657837, "lp_gold": -16.650076866149902, "lp_dist": -9.036640882492065, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-193", "gold_norm": "20", "dist_norm": "540", "baseline": { "pred": "gold", "correct": true, "margin": 5.857290744781494, "lp_gold": -12.000582933425903, "lp_dist": -17.857873678207397, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.146737933158875, "lp_gold": -8.929409623146057, "lp_dist": -20.07614755630493, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-194", "gold_norm": "540", "dist_norm": "10", "baseline": { "pred": "dist", "correct": false, "margin": -0.4153643026947975, "lp_gold": -17.50879267603159, "lp_dist": -17.093428373336792, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.476008802652359, "lp_gold": -18.04153409600258, "lp_dist": -11.56552529335022, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-195", "gold_norm": "10", "dist_norm": "10", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -13.743456363677979, "lp_dist": -13.743456363677979, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -11.619784355163574, "lp_dist": -11.619784355163574, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-196", "gold_norm": "10", "dist_norm": "38", "baseline": { "pred": "gold", "correct": true, "margin": 7.615695416927338, "lp_gold": -13.448404610157013, "lp_dist": -21.06410002708435, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.2176668643951416, "lp_gold": -12.037200689315796, "lp_dist": -12.254867553710938, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-197", "gold_norm": "38", "dist_norm": "4000", "baseline": { "pred": "gold", "correct": true, "margin": 19.051328860223293, "lp_gold": -18.077466011047363, "lp_dist": -37.12879487127066, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.882668375968933, "lp_gold": -18.516667366027832, "lp_dist": -27.399335741996765, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-198", "gold_norm": "4000", "dist_norm": "594", "baseline": { "pred": "gold", "correct": true, "margin": 10.496505833114497, "lp_gold": -20.328653597389348, "lp_dist": -30.825159430503845, "n_tokens_gold": 5, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.5506546348333359, "lp_gold": -16.97495509684086, "lp_dist": -18.525609731674194, "n_tokens_gold": 5, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-199", "gold_norm": "594", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -13.668475985527039, "lp_gold": -24.507625102996826, "lp_dist": -10.839149117469788, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -12.574656009674072, "lp_gold": -20.77769374847412, "lp_dist": -8.203037738800049, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-200", "gold_norm": "2", "dist_norm": "142", "baseline": { "pred": "gold", "correct": true, "margin": 20.30094861984253, "lp_gold": -13.631542205810547, "lp_dist": -33.932490825653076, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.00571084022522, "lp_gold": -11.859813690185547, "lp_dist": -24.865524530410767, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-201", "gold_norm": "142", "dist_norm": "9", "baseline": { "pred": "dist", "correct": false, "margin": -4.474197149276733, "lp_gold": -17.457672357559204, "lp_dist": -12.98347520828247, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.4977235794067383, "lp_gold": -15.609914779663086, "lp_dist": -13.112191200256348, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-202", "gold_norm": "9", "dist_norm": "6", "baseline": { "pred": "dist", "correct": false, "margin": -1.3756675720214844, "lp_gold": -12.738621711730957, "lp_dist": -11.362954139709473, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.2889900207519531, "lp_gold": -12.575027465820312, "lp_dist": -11.28603744506836, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-203", "gold_norm": "6", "dist_norm": "100", "baseline": { "pred": "gold", "correct": true, "margin": 6.780631840229034, "lp_gold": -15.693442344665527, "lp_dist": -22.47407418489456, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.30402946472168, "lp_gold": -9.650990724563599, "lp_dist": -16.95502018928528, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-204", "gold_norm": "100", "dist_norm": "10", "baseline": { "pred": "dist", "correct": false, "margin": -0.23692995309829712, "lp_gold": -17.014364540576935, "lp_dist": -16.777434587478638, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.6824193000793457, "lp_gold": -11.578391790390015, "lp_dist": -10.895972490310669, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-205", "gold_norm": "10", "dist_norm": "15", "baseline": { "pred": "dist", "correct": false, "margin": -0.3394050598144531, "lp_gold": -18.575839042663574, "lp_dist": -18.23643398284912, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.5095596313476562, "lp_gold": -18.252729892730713, "lp_dist": -18.76228952407837, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-206", "gold_norm": "15", "dist_norm": "22", "baseline": { "pred": "gold", "correct": true, "margin": 1.2520769834518433, "lp_gold": -16.89211142063141, "lp_dist": -18.144188404083252, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.4231153726577759, "lp_gold": -14.139848232269287, "lp_dist": -15.562963604927063, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-207", "gold_norm": "22", "dist_norm": "16", "baseline": { "pred": "gold", "correct": true, "margin": 2.621975004673004, "lp_gold": -11.219844043254852, "lp_dist": -13.841819047927856, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.833777904510498, "lp_gold": -10.41726541519165, "lp_dist": -9.583487510681152, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-208", "gold_norm": "16", "dist_norm": "16", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -11.36221162811853, "lp_dist": -11.36221162811853, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -13.241074323654175, "lp_dist": -13.241074323654175, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-209", "gold_norm": "16", "dist_norm": "5", "baseline": { "pred": "dist", "correct": false, "margin": -7.475744724273682, "lp_gold": -19.41911506652832, "lp_dist": -11.943370342254639, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.438729763031006, "lp_gold": -15.213366270065308, "lp_dist": -10.774636507034302, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-210", "gold_norm": "5", "dist_norm": "23", "baseline": { "pred": "gold", "correct": true, "margin": 6.465651273727417, "lp_gold": -15.02237606048584, "lp_dist": -21.488027334213257, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.424657344818115, "lp_gold": -8.115961074829102, "lp_dist": -15.540618419647217, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-211", "gold_norm": "23", "dist_norm": "30", "baseline": { "pred": "dist", "correct": false, "margin": -0.4010072350502014, "lp_gold": -15.865891933441162, "lp_dist": -15.46488469839096, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.2698206305503845, "lp_gold": -18.3642840385437, "lp_dist": -13.094463407993317, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-212", "gold_norm": "30", "dist_norm": "14000", "baseline": { "pred": "gold", "correct": true, "margin": 23.520719528198242, "lp_gold": -19.58930778503418, "lp_dist": -43.11002731323242, "n_tokens_gold": 3, "n_tokens_dist": 6 }, "ablated": { "pred": "gold", "correct": true, "margin": 18.009515285491943, "lp_gold": -13.2671217918396, "lp_dist": -31.276637077331543, "n_tokens_gold": 3, "n_tokens_dist": 6 } }, { "ex_id": "gsm8k-test-213", "gold_norm": "14000", "dist_norm": "60", "baseline": { "pred": "dist", "correct": false, "margin": -6.45991032384336, "lp_gold": -21.998028149828315, "lp_dist": -15.538117825984955, "n_tokens_gold": 6, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.547004419262521, "lp_gold": -14.710521432454698, "lp_dist": -13.163517013192177, "n_tokens_gold": 6, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-214", "gold_norm": "60", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -8.948975563049316, "lp_gold": -23.77088451385498, "lp_dist": -14.821908950805664, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.54961109161377, "lp_gold": -19.638930320739746, "lp_dist": -11.089319229125977, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-215", "gold_norm": "2", "dist_norm": "3", "baseline": { "pred": "gold", "correct": true, "margin": 3.387197434902191, "lp_gold": -9.424871981143951, "lp_dist": -12.812069416046143, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.5975170135498047, "lp_gold": -9.744040250778198, "lp_dist": -8.146523237228394, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-216", "gold_norm": "3", "dist_norm": "30", "baseline": { "pred": "gold", "correct": true, "margin": 5.872649192810059, "lp_gold": -14.82950735092163, "lp_dist": -20.70215654373169, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.080531597137451, "lp_gold": -11.414220809936523, "lp_dist": -13.494752407073975, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-217", "gold_norm": "30", "dist_norm": "1920", "baseline": { "pred": "gold", "correct": true, "margin": 21.697412703186274, "lp_gold": -13.986085917800665, "lp_dist": -35.68349862098694, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 12.313387870788574, "lp_gold": -12.878417491912842, "lp_dist": -25.191805362701416, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-218", "gold_norm": "1920", "dist_norm": "84", "baseline": { "pred": "dist", "correct": false, "margin": -1.6947197169065475, "lp_gold": -25.65590851008892, "lp_dist": -23.961188793182373, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.878614127635956, "lp_gold": -21.65609782934189, "lp_dist": -14.777483701705933, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-219", "gold_norm": "84", "dist_norm": "8", "baseline": { "pred": "dist", "correct": false, "margin": -1.1246260404586792, "lp_gold": -13.33847463130951, "lp_dist": -12.21384859085083, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.7234134674072266, "lp_gold": -12.977782487869263, "lp_dist": -10.254369020462036, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-220", "gold_norm": "8", "dist_norm": "12", "baseline": { "pred": "dist", "correct": false, "margin": -1.304245948791504, "lp_gold": -15.825139999389648, "lp_dist": -14.520894050598145, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.1817718744277954, "lp_gold": -12.667408466339111, "lp_dist": -12.849180340766907, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-221", "gold_norm": "12", "dist_norm": "260", "baseline": { "pred": "gold", "correct": true, "margin": 8.42927235364914, "lp_gold": -9.642007768154144, "lp_dist": -18.071280121803284, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.9336384534835815, "lp_gold": -7.166749358177185, "lp_dist": -14.100387811660767, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-222", "gold_norm": "260", "dist_norm": "288", "baseline": { "pred": "gold", "correct": true, "margin": 3.7796518057584763, "lp_gold": -18.89673836529255, "lp_dist": -22.676390171051025, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.0456210374832153, "lp_gold": -16.525109887123108, "lp_dist": -18.570730924606323, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-223", "gold_norm": "288", "dist_norm": "3", "baseline": { "pred": "dist", "correct": false, "margin": -19.030277393758297, "lp_gold": -26.646236896514893, "lp_dist": -7.615959502756596, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -15.758692368865013, "lp_gold": -20.088956594467163, "lp_dist": -4.33026422560215, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-224", "gold_norm": "3", "dist_norm": "1596", "baseline": { "pred": "gold", "correct": true, "margin": 16.595462799072266, "lp_gold": -14.905784606933594, "lp_dist": -31.50124740600586, "n_tokens_gold": 2, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 16.13599407672882, "lp_gold": -13.950559616088867, "lp_dist": -30.086553692817688, "n_tokens_gold": 2, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-225", "gold_norm": "1596", "dist_norm": "81", "baseline": { "pred": "gold", "correct": true, "margin": 1.9876238331198692, "lp_gold": -17.49847326427698, "lp_dist": -19.48609709739685, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.1633647084236145, "lp_gold": -24.1348779797554, "lp_dist": -17.971513271331787, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-226", "gold_norm": "81", "dist_norm": "56", "baseline": { "pred": "gold", "correct": true, "margin": 3.605985850095749, "lp_gold": -15.771342545747757, "lp_dist": -19.377328395843506, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.6331486701965332, "lp_gold": -16.669665813446045, "lp_dist": -15.036517143249512, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-227", "gold_norm": "56", "dist_norm": "1490", "baseline": { "pred": "gold", "correct": true, "margin": 16.239468812942505, "lp_gold": -14.987546801567078, "lp_dist": -31.227015614509583, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.551798105239868, "lp_gold": -13.551477909088135, "lp_dist": -25.103276014328003, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-228", "gold_norm": "1490", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -19.20861628651619, "lp_gold": -30.783629894256592, "lp_dist": -11.575013607740402, "n_tokens_gold": 5, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -13.095399379730225, "lp_gold": -24.58754062652588, "lp_dist": -11.492141246795654, "n_tokens_gold": 5, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-229", "gold_norm": "2", "dist_norm": "20", "baseline": { "pred": "gold", "correct": true, "margin": 5.965806007385254, "lp_gold": -11.508173823356628, "lp_dist": -17.473979830741882, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.32473722100257874, "lp_gold": -11.093923568725586, "lp_dist": -11.418660789728165, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-230", "gold_norm": "20", "dist_norm": "11", "baseline": { "pred": "dist", "correct": false, "margin": -1.1644073724746704, "lp_gold": -14.290618896484375, "lp_dist": -13.126211524009705, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.0560493469238281, "lp_gold": -12.166522026062012, "lp_dist": -13.22257137298584, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-231", "gold_norm": "11", "dist_norm": "120", "baseline": { "pred": "dist", "correct": false, "margin": -1.219505786895752, "lp_gold": -14.914972305297852, "lp_dist": -13.6954665184021, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.7893390655517578, "lp_gold": -11.986905813217163, "lp_dist": -13.776244878768921, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-232", "gold_norm": "120", "dist_norm": "45", "baseline": { "pred": "gold", "correct": true, "margin": 4.419019672088325, "lp_gold": -15.604393211193383, "lp_dist": -20.023412883281708, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.708264172077179, "lp_gold": -12.799233138561249, "lp_dist": -13.507497310638428, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-233", "gold_norm": "45", "dist_norm": "10", "baseline": { "pred": "dist", "correct": false, "margin": -1.5151035785675049, "lp_gold": -11.359116911888123, "lp_dist": -9.844013333320618, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.095129489898682, "lp_gold": -14.39400601387024, "lp_dist": -6.298876523971558, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-234", "gold_norm": "10", "dist_norm": "9", "baseline": { "pred": "gold", "correct": true, "margin": 0.05903661251068115, "lp_gold": -13.167555451393127, "lp_dist": -13.226592063903809, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.7660583406686783, "lp_gold": -6.627322778105736, "lp_dist": -10.393381118774414, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-235", "gold_norm": "9", "dist_norm": "33", "baseline": { "pred": "gold", "correct": true, "margin": 7.191148281097412, "lp_gold": -14.321090459823608, "lp_dist": -21.51223874092102, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.510578155517578, "lp_gold": -13.556029319763184, "lp_dist": -16.06660747528076, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-236", "gold_norm": "33", "dist_norm": "150", "baseline": { "pred": "gold", "correct": true, "margin": 9.496721982955933, "lp_gold": -19.231878995895386, "lp_dist": -28.72860097885132, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.684651017189026, "lp_gold": -13.231264114379883, "lp_dist": -18.91591513156891, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-237", "gold_norm": "150", "dist_norm": "60", "baseline": { "pred": "gold", "correct": true, "margin": 7.496449222322553, "lp_gold": -12.51727462792769, "lp_dist": -20.013723850250244, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.5342855900526047, "lp_gold": -11.47604425251484, "lp_dist": -12.010329842567444, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-238", "gold_norm": "60", "dist_norm": "4", "baseline": { "pred": "gold", "correct": true, "margin": 0.38430750370025635, "lp_gold": -13.078525424003601, "lp_dist": -13.462832927703857, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.5117335319519043, "lp_gold": -10.930900573730469, "lp_dist": -9.419167041778564, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-239", "gold_norm": "4", "dist_norm": "7", "baseline": { "pred": "gold", "correct": true, "margin": 0.3680305480957031, "lp_gold": -12.839935302734375, "lp_dist": -13.207965850830078, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.0732803344726562, "lp_gold": -11.00877571105957, "lp_dist": -12.082056045532227, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-240", "gold_norm": "7", "dist_norm": "3140", "baseline": { "pred": "gold", "correct": true, "margin": 22.65280568599701, "lp_gold": -12.292606830596924, "lp_dist": -34.94541251659393, "n_tokens_gold": 2, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 18.48963165283203, "lp_gold": -9.809606552124023, "lp_dist": -28.299238204956055, "n_tokens_gold": 2, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-241", "gold_norm": "3140", "dist_norm": "19", "baseline": { "pred": "dist", "correct": false, "margin": -8.30290687084198, "lp_gold": -22.36732530593872, "lp_dist": -14.06441843509674, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -13.02580738067627, "lp_gold": -25.325818061828613, "lp_dist": -12.300010681152344, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-242", "gold_norm": "19", "dist_norm": "6", "baseline": { "pred": "gold", "correct": true, "margin": 5.181618273258209, "lp_gold": -12.157159745693207, "lp_dist": -17.338778018951416, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4070416688919067, "lp_gold": -10.997576355934143, "lp_dist": -9.590534687042236, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-243", "gold_norm": "6", "dist_norm": "90", "baseline": { "pred": "gold", "correct": true, "margin": 3.3495291471481323, "lp_gold": -19.138280868530273, "lp_dist": -22.487810015678406, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.812922954559326, "lp_gold": -11.337668418884277, "lp_dist": -17.150591373443604, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-244", "gold_norm": "90", "dist_norm": "10", "baseline": { "pred": "dist", "correct": false, "margin": -1.6097938957027509, "lp_gold": -13.304834717731865, "lp_dist": -11.695040822029114, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.5438682280946523, "lp_gold": -7.949862555367872, "lp_dist": -10.493730783462524, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-245", "gold_norm": "10", "dist_norm": "130000", "baseline": { "pred": "gold", "correct": true, "margin": 26.08018460869789, "lp_gold": -8.254620164632797, "lp_dist": -34.33480477333069, "n_tokens_gold": 3, "n_tokens_dist": 7 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.28964650630951, "lp_gold": -7.219025731086731, "lp_dist": -20.50867223739624, "n_tokens_gold": 3, "n_tokens_dist": 7 } }, { "ex_id": "gsm8k-test-246", "gold_norm": "130000", "dist_norm": "10", "baseline": { "pred": "gold", "correct": true, "margin": 4.715070237376494, "lp_gold": -12.472108629561262, "lp_dist": -17.187178866937757, "n_tokens_gold": 7, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.156454911455512, "lp_gold": -13.647521084174514, "lp_dist": -6.491066172719002, "n_tokens_gold": 7, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-247", "gold_norm": "10", "dist_norm": "525", "baseline": { "pred": "gold", "correct": true, "margin": 17.677427016198635, "lp_gold": -15.68656424432993, "lp_dist": -33.363991260528564, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.708948612213135, "lp_gold": -11.453617930412292, "lp_dist": -23.162566542625427, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-248", "gold_norm": "525", "dist_norm": "180", "baseline": { "pred": "gold", "correct": true, "margin": 6.852982550859451, "lp_gold": -11.824598759412766, "lp_dist": -18.677581310272217, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.6725820302963257, "lp_gold": -11.64935302734375, "lp_dist": -12.321935057640076, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-249", "gold_norm": "180", "dist_norm": "1200", "baseline": { "pred": "gold", "correct": true, "margin": 13.544742852449417, "lp_gold": -8.964889764785767, "lp_dist": -22.509632617235184, "n_tokens_gold": 4, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 15.23933944106102, "lp_gold": -7.498180732131004, "lp_dist": -22.737520173192024, "n_tokens_gold": 4, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-250", "gold_norm": "1200", "dist_norm": "25", "baseline": { "pred": "gold", "correct": true, "margin": 9.737206868827343, "lp_gold": -9.13471419364214, "lp_dist": -18.871921062469482, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.1686492152512074, "lp_gold": -8.043186407536268, "lp_dist": -10.211835622787476, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-251", "gold_norm": "25", "dist_norm": "21", "baseline": { "pred": "gold", "correct": true, "margin": 0.8187389373779297, "lp_gold": -12.140745043754578, "lp_dist": -12.959483981132507, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.511936068534851, "lp_gold": -6.530247092247009, "lp_dist": -10.04218316078186, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-252", "gold_norm": "21", "dist_norm": "2304", "baseline": { "pred": "gold", "correct": true, "margin": 16.503239154815674, "lp_gold": -17.229759454727173, "lp_dist": -33.73299860954285, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 14.275152683258057, "lp_gold": -13.177631378173828, "lp_dist": -27.452784061431885, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-253", "gold_norm": "2304", "dist_norm": "2325", "baseline": { "pred": "gold", "correct": true, "margin": 4.856137990951538, "lp_gold": -25.891463041305542, "lp_dist": -30.74760103225708, "n_tokens_gold": 5, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.9878709018230438, "lp_gold": -25.637850552797318, "lp_dist": -26.62572145462036, "n_tokens_gold": 5, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-254", "gold_norm": "2325", "dist_norm": "15", "baseline": { "pred": "dist", "correct": false, "margin": -2.660225659608841, "lp_gold": -13.980357348918915, "lp_dist": -11.320131689310074, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.668605744838715, "lp_gold": -16.375044524669647, "lp_dist": -11.706438779830933, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-255", "gold_norm": "15", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 6.416126787662506, "lp_gold": -14.69802612066269, "lp_dist": -21.114152908325195, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.287871062755585, "lp_gold": -10.296239674091339, "lp_dist": -17.584110736846924, "n_tokens_gold": 3, "n_tokens_dist": 3 } } ], "flip_rows": [ { "ex_id": "gsm8k-test-1", "gold_norm": "80", "dist_norm": "12", "baseline": { "pred": "gold", "correct": true, "margin": 1.218225084245205, "lp_gold": -16.316218174993992, "lp_dist": -17.534443259239197, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.6342043727636337, "lp_gold": -18.493512138724327, "lp_dist": -17.859307765960693, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -2.4292978644371033, "lp_gold": -17.579256772994995, "lp_dist": -15.149958908557892, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -2.5465927198529243, "lp_gold": -17.71218091994524, "lp_dist": -15.165588200092316, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "gold", "correct": true, "margin": 0.1809745579957962, "lp_gold": -14.873322412371635, "lp_dist": -15.054296970367432, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -0.3117252141237259, "lp_gold": -17.119899585843086, "lp_dist": -16.80817437171936, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.6342088878154755, "lp_gold": -18.49351069331169, "lp_dist": -17.859301805496216, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-5", "gold_norm": "3200", "dist_norm": "38", "baseline": { "pred": "gold", "correct": true, "margin": 4.075981711270288, "lp_gold": -15.808944131014869, "lp_dist": -19.884925842285156, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.112908275797963, "lp_gold": -17.281133087351918, "lp_dist": -15.168224811553955, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.29585185274481773, "lp_gold": -18.88566479459405, "lp_dist": -19.181516647338867, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.4162398586049676, "lp_gold": -18.945031284354627, "lp_dist": -19.361271142959595, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -0.9830470234155655, "lp_gold": -17.90198041498661, "lp_dist": -16.918933391571045, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -2.479109164327383, "lp_gold": -18.568949338048697, "lp_dist": -16.089840173721313, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -2.112905891612172, "lp_gold": -17.28112688846886, "lp_dist": -15.16822099685669, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-14", "gold_norm": "800", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 2.7909989710897207, "lp_gold": -10.428668463602662, "lp_dist": -13.219667434692383, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.28849396109581, "lp_gold": -15.488610118627548, "lp_dist": -9.200116157531738, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "patched_self": { "pred": "dist", "correct": false, "margin": -3.221886307001114, "lp_gold": -17.41419091820717, "lp_dist": -14.192304611206055, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -3.1399324536323547, "lp_gold": -16.989762604236603, "lp_dist": -13.849830150604248, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -5.316341996192932, "lp_gold": -16.00817358493805, "lp_dist": -10.691831588745117, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -6.094054877758026, "lp_gold": -17.299417197704315, "lp_dist": -11.205362319946289, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -6.288493096828461, "lp_gold": -15.488613307476044, "lp_dist": -9.200120210647583, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-20", "gold_norm": "106", "dist_norm": "80", "baseline": { "pred": "gold", "correct": true, "margin": 4.92336449585855, "lp_gold": -15.76830449141562, "lp_dist": -20.69166898727417, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.6747859213501215, "lp_gold": -17.988985607400537, "lp_dist": -17.314199686050415, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -3.9746298789978027, "lp_gold": -22.760347604751587, "lp_dist": -18.785717725753784, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -3.769364595413208, "lp_gold": -22.492818355560303, "lp_dist": -18.723453760147095, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -2.8601390519179404, "lp_gold": -22.44402221823111, "lp_dist": -19.58388316631317, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -2.127611498348415, "lp_gold": -16.912333111278713, "lp_dist": -14.784721612930298, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.6747779380530119, "lp_gold": -17.988986684009433, "lp_dist": -17.31420874595642, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-24", "gold_norm": "9", "dist_norm": "40", "baseline": { "pred": "gold", "correct": true, "margin": 0.006132304668426514, "lp_gold": -15.839151382446289, "lp_dist": -15.845283687114716, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.110446274280548, "lp_gold": -12.25863265991211, "lp_dist": -10.148186385631561, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.04043316841125488, "lp_gold": -14.208327531814575, "lp_dist": -14.24876070022583, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.17713472247123718, "lp_gold": -14.100894212722778, "lp_dist": -14.278028935194016, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.736970603466034, "lp_gold": -13.546704292297363, "lp_dist": -11.80973368883133, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.5443955063819885, "lp_gold": -14.465348243713379, "lp_dist": -12.92095273733139, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -2.110443741083145, "lp_gold": -12.258633613586426, "lp_dist": -10.14818987250328, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-27", "gold_norm": "160", "dist_norm": "6", "baseline": { "pred": "gold", "correct": true, "margin": 1.597537249326706, "lp_gold": -12.841732293367386, "lp_dist": -14.439269542694092, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.338132083415985, "lp_gold": -17.455387771129608, "lp_dist": -10.117255687713623, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "patched_self": { "pred": "dist", "correct": false, "margin": -2.827625960111618, "lp_gold": -15.895989626646042, "lp_dist": -13.068363666534424, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -2.9387161433696747, "lp_gold": -16.01883837580681, "lp_dist": -13.080122232437134, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -8.034337878227234, "lp_gold": -23.63157594203949, "lp_dist": -15.597238063812256, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -7.914303302764893, "lp_gold": -19.091363430023193, "lp_dist": -11.1770601272583, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -7.338137567043304, "lp_gold": -17.455391585826874, "lp_dist": -10.11725401878357, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-31", "gold_norm": "68", "dist_norm": "31", "baseline": { "pred": "gold", "correct": true, "margin": 3.820281505584717, "lp_gold": -15.837103843688965, "lp_dist": -19.65738534927368, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.25214481353759766, "lp_gold": -12.841001033782959, "lp_dist": -12.588856220245361, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "gold", "correct": true, "margin": 4.767996072769165, "lp_gold": -13.063536882400513, "lp_dist": -17.831532955169678, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 4.778961181640625, "lp_gold": -12.704540252685547, "lp_dist": -17.483501434326172, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -0.2866086959838867, "lp_gold": -13.741567134857178, "lp_dist": -13.454958438873291, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -0.2309262752532959, "lp_gold": -13.31624436378479, "lp_dist": -13.085318088531494, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.2521398067474365, "lp_gold": -12.840993165969849, "lp_dist": -12.588853359222412, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-35", "gold_norm": "480", "dist_norm": "520", "baseline": { "pred": "gold", "correct": true, "margin": 1.5111888945102692, "lp_gold": -12.033819317817688, "lp_dist": -13.545008212327957, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.8960548639297485, "lp_gold": -19.025392055511475, "lp_dist": -16.129337191581726, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.173134446144104, "lp_gold": -16.326287806034088, "lp_dist": -16.499422252178192, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.15610426664352417, "lp_gold": -16.418395936489105, "lp_dist": -16.57450020313263, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -2.612240791320801, "lp_gold": -17.892987489700317, "lp_dist": -15.280746698379517, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -3.1868066787719727, "lp_gold": -17.928332090377808, "lp_dist": -14.741525411605835, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -2.8960519433021545, "lp_gold": -19.0253963470459, "lp_dist": -16.129344403743744, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-45", "gold_norm": "310", "dist_norm": "100", "baseline": { "pred": "gold", "correct": true, "margin": 0.9456039071083069, "lp_gold": -16.172270894050598, "lp_dist": -17.117874801158905, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.856696009635925, "lp_gold": -16.36608850955963, "lp_dist": -10.509392499923706, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.7440242022275925, "lp_gold": -17.919243693351746, "lp_dist": -16.175219491124153, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -1.6509404331445694, "lp_gold": -17.9195556640625, "lp_dist": -16.26861523091793, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -3.5654631853103638, "lp_gold": -15.380214095115662, "lp_dist": -11.814750909805298, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -6.163789927959442, "lp_gold": -17.801445245742798, "lp_dist": -11.637655317783356, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -5.856698274612427, "lp_gold": -16.366087794303894, "lp_dist": -10.509389519691467, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-48", "gold_norm": "25", "dist_norm": "1400", "baseline": { "pred": "gold", "correct": true, "margin": 7.125034153461456, "lp_gold": -16.447975158691406, "lp_dist": -23.573009312152863, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.08089584112167358, "lp_gold": -13.430449962615967, "lp_dist": -13.349554121494293, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "patched_self": { "pred": "gold", "correct": true, "margin": 4.025165379047394, "lp_gold": -15.498599290847778, "lp_dist": -19.523764669895172, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 3.622505933046341, "lp_gold": -15.329206466674805, "lp_dist": -18.951712399721146, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.6573466360569, "lp_gold": -16.551159858703613, "lp_dist": -14.893813222646713, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.2268932834267616, "lp_gold": -15.390444993972778, "lp_dist": -14.163551710546017, "n_tokens_gold": 3, "n_tokens_dist": 5 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.08089244365692139, "lp_gold": -13.43044662475586, "lp_dist": -13.349554181098938, "n_tokens_gold": 3, "n_tokens_dist": 5 } }, { "ex_id": "gsm8k-test-64", "gold_norm": "655", "dist_norm": "800", "baseline": { "pred": "gold", "correct": true, "margin": 1.8698419332504272, "lp_gold": -17.930187582969666, "lp_dist": -19.800029516220093, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4062272310256958, "lp_gold": -14.47088611125946, "lp_dist": -13.064658880233765, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "patched_self": { "pred": "dist", "correct": false, "margin": -0.005745887756347656, "lp_gold": -18.56607985496521, "lp_dist": -18.560333967208862, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -0.33770978450775146, "lp_gold": -18.740556836128235, "lp_dist": -18.402847051620483, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.1954535841941833, "lp_gold": -16.162434339523315, "lp_dist": -14.966980755329132, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.086472988128662, "lp_gold": -14.544449806213379, "lp_dist": -13.457976818084717, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.4062250852584839, "lp_gold": -14.470888018608093, "lp_dist": -13.06466293334961, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-100", "gold_norm": "250", "dist_norm": "12", "baseline": { "pred": "gold", "correct": true, "margin": 1.937235951423645, "lp_gold": -16.930358290672302, "lp_dist": -18.867594242095947, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.8317363262176514, "lp_gold": -17.415368795394897, "lp_dist": -15.583632469177246, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.7292950004339218, "lp_gold": -17.88250456750393, "lp_dist": -16.153209567070007, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -1.9416460394859314, "lp_gold": -17.420925438404083, "lp_dist": -15.479279398918152, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.7523078918457031, "lp_gold": -15.994086980819702, "lp_dist": -14.241779088973999, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.070462703704834, "lp_gold": -15.625900983810425, "lp_dist": -14.55543828010559, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.8317327499389648, "lp_gold": -17.415366888046265, "lp_dist": -15.5836341381073, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-104", "gold_norm": "26", "dist_norm": "42", "baseline": { "pred": "gold", "correct": true, "margin": 4.550231754779816, "lp_gold": -16.079154193401337, "lp_dist": -20.629385948181152, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.743180751800537, "lp_gold": -20.936619758605957, "lp_dist": -17.19343900680542, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.501150131225586, "lp_gold": -20.479767084121704, "lp_dist": -18.978616952896118, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -1.586869239807129, "lp_gold": -20.261481761932373, "lp_dist": -18.674612522125244, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -2.488091826438904, "lp_gold": -20.40666627883911, "lp_dist": -17.918574452400208, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -2.9733150005340576, "lp_gold": -20.461706161499023, "lp_dist": -17.488391160964966, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -3.7431836128234863, "lp_gold": -20.936622619628906, "lp_dist": -17.19343900680542, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-105", "gold_norm": "42", "dist_norm": "5", "baseline": { "pred": "gold", "correct": true, "margin": 2.2448320388793945, "lp_gold": -17.29369354248047, "lp_dist": -19.538525581359863, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.48531031608581543, "lp_gold": -14.891574144363403, "lp_dist": -14.406263828277588, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "patched_self": { "pred": "dist", "correct": false, "margin": -0.6142416000366211, "lp_gold": -20.099190711975098, "lp_dist": -19.484949111938477, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -0.2368631362915039, "lp_gold": -19.605250358581543, "lp_dist": -19.36838722229004, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.025758981704712, "lp_gold": -16.692798852920532, "lp_dist": -15.66703987121582, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -0.37449169158935547, "lp_gold": -14.766992568969727, "lp_dist": -14.392500877380371, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.48531413078308105, "lp_gold": -14.891570806503296, "lp_dist": -14.406256675720215, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-107", "gold_norm": "14400", "dist_norm": "400", "baseline": { "pred": "gold", "correct": true, "margin": 0.8998411595821381, "lp_gold": -19.079706698656082, "lp_dist": -19.97954785823822, "n_tokens_gold": 6, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.72913409024477, "lp_gold": -22.703229255974293, "lp_dist": -20.974095165729523, "n_tokens_gold": 6, "n_tokens_dist": 4 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.19682685285806656, "lp_gold": -22.464857898652554, "lp_dist": -22.66168475151062, "n_tokens_gold": 6, "n_tokens_dist": 4 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.14343415200710297, "lp_gold": -22.713250055909157, "lp_dist": -22.85668420791626, "n_tokens_gold": 6, "n_tokens_dist": 4 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -3.5759153068065643, "lp_gold": -27.662475764751434, "lp_dist": -24.08656045794487, "n_tokens_gold": 6, "n_tokens_dist": 4 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -2.206382006406784, "lp_gold": -25.311469167470932, "lp_dist": -23.105087161064148, "n_tokens_gold": 6, "n_tokens_dist": 4 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.7291278392076492, "lp_gold": -22.70323248207569, "lp_dist": -20.974104642868042, "n_tokens_gold": 6, "n_tokens_dist": 4 } }, { "ex_id": "gsm8k-test-110", "gold_norm": "83", "dist_norm": "10", "baseline": { "pred": "gold", "correct": true, "margin": 0.9284783601760864, "lp_gold": -14.152065396308899, "lp_dist": -15.080543756484985, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.6964447498321533, "lp_gold": -11.795601606369019, "lp_dist": -8.099156856536865, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.5404622554779053, "lp_gold": -14.069255948066711, "lp_dist": -12.528793692588806, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -1.4889593124389648, "lp_gold": -14.175909280776978, "lp_dist": -12.686949968338013, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -2.8712440729141235, "lp_gold": -14.255087852478027, "lp_dist": -11.383843779563904, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -3.085860013961792, "lp_gold": -12.02087950706482, "lp_dist": -8.935019493103027, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -3.696447730064392, "lp_gold": -11.795600891113281, "lp_dist": -8.09915316104889, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-120", "gold_norm": "335", "dist_norm": "60", "baseline": { "pred": "gold", "correct": true, "margin": 1.718258023262024, "lp_gold": -16.84885323047638, "lp_dist": -18.567111253738403, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.495100736618042, "lp_gold": -16.78837823867798, "lp_dist": -13.293277502059937, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -6.745102643966675, "lp_gold": -21.85818600654602, "lp_dist": -15.113083362579346, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -6.640980243682861, "lp_gold": -21.43053102493286, "lp_dist": -14.78955078125, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -4.621345937252045, "lp_gold": -18.077104091644287, "lp_dist": -13.455758154392242, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -4.602914571762085, "lp_gold": -19.734163284301758, "lp_dist": -15.131248712539673, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -3.4950921535491943, "lp_gold": -16.78837251663208, "lp_dist": -13.293280363082886, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-134", "gold_norm": "18", "dist_norm": "4", "baseline": { "pred": "gold", "correct": true, "margin": 3.9239641074091196, "lp_gold": -15.171829616650939, "lp_dist": -19.09579372406006, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.7061721086502075, "lp_gold": -9.731460690498352, "lp_dist": -9.025288581848145, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.6682674884796143, "lp_gold": -13.212559461593628, "lp_dist": -13.880826950073242, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.691381573677063, "lp_gold": -13.084003806114197, "lp_dist": -13.77538537979126, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -3.0653414726257324, "lp_gold": -11.70475959777832, "lp_dist": -8.639418125152588, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.6070647239685059, "lp_gold": -11.957983255386353, "lp_dist": -10.350918531417847, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.7061715722084045, "lp_gold": -9.731466829776764, "lp_dist": -9.02529525756836, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-138", "gold_norm": "21", "dist_norm": "25", "baseline": { "pred": "gold", "correct": true, "margin": 2.6713391542434692, "lp_gold": -17.788984179496765, "lp_dist": -20.460323333740234, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.8663175106048584, "lp_gold": -14.334570407867432, "lp_dist": -13.468252897262573, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.3505210876464844, "lp_gold": -21.58456540107727, "lp_dist": -21.935086488723755, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.2903881072998047, "lp_gold": -22.286190509796143, "lp_dist": -22.576578617095947, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -0.9333076477050781, "lp_gold": -15.113465785980225, "lp_dist": -14.180158138275146, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -0.7268631458282471, "lp_gold": -13.463084697723389, "lp_dist": -12.736221551895142, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.8663196563720703, "lp_gold": -14.334563970565796, "lp_dist": -13.468244314193726, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-145", "gold_norm": "2", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 1.2492438331246376, "lp_gold": -13.602060556411743, "lp_dist": -14.85130438953638, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.6407327204942703, "lp_gold": -12.194403648376465, "lp_dist": -9.553670927882195, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.6734669059514999, "lp_gold": -13.396984100341797, "lp_dist": -14.070451006293297, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.8539667278528214, "lp_gold": -13.266191005706787, "lp_dist": -14.120157733559608, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.3427896350622177, "lp_gold": -10.751940488815308, "lp_dist": -9.40915085375309, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -2.827908754348755, "lp_gold": -14.305924892425537, "lp_dist": -11.478016138076782, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -2.640733018517494, "lp_gold": -12.194400548934937, "lp_dist": -9.553667530417442, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-151", "gold_norm": "803", "dist_norm": "16", "baseline": { "pred": "gold", "correct": true, "margin": 0.22600507736206055, "lp_gold": -20.624857425689697, "lp_dist": -20.850862503051758, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.131483793258667, "lp_gold": -19.228359699249268, "lp_dist": -14.0968759059906, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -3.2413750886917114, "lp_gold": -19.84936547279358, "lp_dist": -16.607990384101868, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -3.131054997444153, "lp_gold": -19.767980694770813, "lp_dist": -16.63692569732666, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -4.6819539070129395, "lp_gold": -18.495857000350952, "lp_dist": -13.813903093338013, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -4.957081317901611, "lp_gold": -20.852898836135864, "lp_dist": -15.895817518234253, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -5.131482243537903, "lp_gold": -19.228361129760742, "lp_dist": -14.09687888622284, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-153", "gold_norm": "280", "dist_norm": "13", "baseline": { "pred": "gold", "correct": true, "margin": 3.694299184717238, "lp_gold": -15.244264638982713, "lp_dist": -18.93856382369995, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.8231047093868256, "lp_gold": -12.64960965514183, "lp_dist": -11.826504945755005, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.8308207541704178, "lp_gold": -15.758303448557854, "lp_dist": -16.58912420272827, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.7265715599060059, "lp_gold": -16.042329788208008, "lp_dist": -16.768901348114014, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.1412931680679321, "lp_gold": -14.319903492927551, "lp_dist": -13.17861032485962, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.4246297478675842, "lp_gold": -15.16783195734024, "lp_dist": -13.743202209472656, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.8231084495782852, "lp_gold": -12.649619355797768, "lp_dist": -11.826510906219482, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-154", "gold_norm": "13", "dist_norm": "20", "baseline": { "pred": "gold", "correct": true, "margin": 2.0602927803993225, "lp_gold": -15.215918719768524, "lp_dist": -17.276211500167847, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.633309006690979, "lp_gold": -12.038846015930176, "lp_dist": -11.405537009239197, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -0.6033051013946533, "lp_gold": -17.432909965515137, "lp_dist": -16.829604864120483, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -0.6372992992401123, "lp_gold": -17.39775514602661, "lp_dist": -16.7604558467865, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.48968505859375, "lp_gold": -13.174037456512451, "lp_dist": -11.684352397918701, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.2143868207931519, "lp_gold": -12.643470287322998, "lp_dist": -11.429083466529846, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.6333088874816895, "lp_gold": -12.038848400115967, "lp_dist": -11.405539512634277, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-175", "gold_norm": "350", "dist_norm": "50", "baseline": { "pred": "gold", "correct": true, "margin": 0.9250896275043488, "lp_gold": -22.99281856417656, "lp_dist": -23.917908191680908, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.236621737480164, "lp_gold": -17.58333122730255, "lp_dist": -10.346709489822388, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -2.966416835784912, "lp_gold": -20.354339838027954, "lp_dist": -17.387923002243042, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -3.5315762758255005, "lp_gold": -21.129161953926086, "lp_dist": -17.597585678100586, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -7.064225733280182, "lp_gold": -18.191003382205963, "lp_dist": -11.126777648925781, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -7.836197316646576, "lp_gold": -21.282770097255707, "lp_dist": -13.44657278060913, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -7.236626446247101, "lp_gold": -17.583332121372223, "lp_dist": -10.346705675125122, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-207", "gold_norm": "22", "dist_norm": "16", "baseline": { "pred": "gold", "correct": true, "margin": 2.621975004673004, "lp_gold": -11.219844043254852, "lp_dist": -13.841819047927856, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.833777904510498, "lp_gold": -10.41726541519165, "lp_dist": -9.583487510681152, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.4578208923339844, "lp_gold": -15.501156091690063, "lp_dist": -14.043335199356079, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -1.725459337234497, "lp_gold": -15.87945008277893, "lp_dist": -14.153990745544434, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.1624937057495117, "lp_gold": -15.793409585952759, "lp_dist": -14.630915880203247, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -0.6316690444946289, "lp_gold": -10.619104146957397, "lp_dist": -9.987435102462769, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.8337790966033936, "lp_gold": -10.417269945144653, "lp_dist": -9.58349084854126, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-215", "gold_norm": "2", "dist_norm": "3", "baseline": { "pred": "gold", "correct": true, "margin": 3.387197434902191, "lp_gold": -9.424871981143951, "lp_dist": -12.812069416046143, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.5975170135498047, "lp_gold": -9.744040250778198, "lp_dist": -8.146523237228394, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "patched_self": { "pred": "dist", "correct": false, "margin": -3.2604313492774963, "lp_gold": -12.714868068695068, "lp_dist": -9.454436719417572, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -3.084445893764496, "lp_gold": -12.945753574371338, "lp_dist": -9.861307680606842, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.5732927322387695, "lp_gold": -12.206480503082275, "lp_dist": -10.633187770843506, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.1921443939208984, "lp_gold": -10.054778575897217, "lp_dist": -8.862634181976318, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.5975122451782227, "lp_gold": -9.744039058685303, "lp_dist": -8.14652681350708, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-225", "gold_norm": "1596", "dist_norm": "81", "baseline": { "pred": "gold", "correct": true, "margin": 1.9876238331198692, "lp_gold": -17.49847326427698, "lp_dist": -19.48609709739685, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.1633647084236145, "lp_gold": -24.1348779797554, "lp_dist": -17.971513271331787, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.7931787371635437, "lp_gold": -18.82551997900009, "lp_dist": -17.032341241836548, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -2.224741965532303, "lp_gold": -19.164868861436844, "lp_dist": -16.94012689590454, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -10.08569085597992, "lp_gold": -30.44549548625946, "lp_dist": -20.35980463027954, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -9.3379967212677, "lp_gold": -30.265968084335327, "lp_dist": -20.927971363067627, "n_tokens_gold": 5, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -6.163362383842468, "lp_gold": -24.134878516197205, "lp_dist": -17.971516132354736, "n_tokens_gold": 5, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-226", "gold_norm": "81", "dist_norm": "56", "baseline": { "pred": "gold", "correct": true, "margin": 3.605985850095749, "lp_gold": -15.771342545747757, "lp_dist": -19.377328395843506, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.6331486701965332, "lp_gold": -16.669665813446045, "lp_dist": -15.036517143249512, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.214464545249939, "lp_gold": -16.49272656440735, "lp_dist": -15.27826201915741, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -0.9139895439147949, "lp_gold": -16.460952043533325, "lp_dist": -15.54696249961853, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -0.7367334365844727, "lp_gold": -19.27399492263794, "lp_dist": -18.537261486053467, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.8308906555175781, "lp_gold": -16.610596179962158, "lp_dist": -14.77970552444458, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.6331486701965332, "lp_gold": -16.66966152191162, "lp_dist": -15.036512851715088, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "gsm8k-test-238", "gold_norm": "60", "dist_norm": "4", "baseline": { "pred": "gold", "correct": true, "margin": 0.38430750370025635, "lp_gold": -13.078525424003601, "lp_dist": -13.462832927703857, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.5117335319519043, "lp_gold": -10.930900573730469, "lp_dist": -9.419167041778564, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "patched_self": { "pred": "dist", "correct": false, "margin": -1.7834737300872803, "lp_gold": -15.81191873550415, "lp_dist": -14.02844500541687, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -1.6578322649002075, "lp_gold": -15.757366299629211, "lp_dist": -14.099534034729004, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -5.469817161560059, "lp_gold": -15.78016185760498, "lp_dist": -10.310344696044922, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.3997215032577515, "lp_gold": -11.266274809837341, "lp_dist": -9.86655330657959, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.5117324590682983, "lp_gold": -10.930898785591125, "lp_dist": -9.419166326522827, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-242", "gold_norm": "19", "dist_norm": "6", "baseline": { "pred": "gold", "correct": true, "margin": 5.181618273258209, "lp_gold": -12.157159745693207, "lp_dist": -17.338778018951416, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4070416688919067, "lp_gold": -10.997576355934143, "lp_dist": -9.590534687042236, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "patched_self": { "pred": "gold", "correct": true, "margin": 1.4950295239686966, "lp_gold": -14.810317918658257, "lp_dist": -16.305347442626953, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 1.47235550545156, "lp_gold": -14.627300599589944, "lp_dist": -16.099656105041504, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -2.7640156745910645, "lp_gold": -15.07509469985962, "lp_dist": -12.311079025268555, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -2.5993056297302246, "lp_gold": -12.642723798751831, "lp_dist": -10.043418169021606, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.4070385694503784, "lp_gold": -10.997580409049988, "lp_dist": -9.59054183959961, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "gsm8k-test-246", "gold_norm": "130000", "dist_norm": "10", "baseline": { "pred": "gold", "correct": true, "margin": 4.715070237376494, "lp_gold": -12.472108629561262, "lp_dist": -17.187178866937757, "n_tokens_gold": 7, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -7.156454911455512, "lp_gold": -13.647521084174514, "lp_dist": -6.491066172719002, "n_tokens_gold": 7, "n_tokens_dist": 3 }, "patched_self": { "pred": "dist", "correct": false, "margin": -5.767381154000759, "lp_gold": -21.409174405038357, "lp_dist": -15.641793251037598, "n_tokens_gold": 7, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -5.478326896904036, "lp_gold": -22.25712094712071, "lp_dist": -16.778794050216675, "n_tokens_gold": 7, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -7.735332287847996, "lp_gold": -17.688330195844173, "lp_dist": -9.952997907996178, "n_tokens_gold": 7, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -6.886516407132149, "lp_gold": -16.19584783911705, "lp_dist": -9.309331431984901, "n_tokens_gold": 7, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -7.15645333006978, "lp_gold": -13.64751996472478, "lp_dist": -6.491066634654999, "n_tokens_gold": 7, "n_tokens_dist": 3 } } ] }