| { |
| "meta": { |
| "model": "meta-llama/Llama-2-7b-chat-hf", |
| "device": "cuda", |
| "dtype": "fp32", |
| "layer": 10, |
| "layers_path": "model.layers", |
| "seed": 123, |
| "task": "gsm8k", |
| "eval_mode": "pair_logprob", |
| "eval_meta": { |
| "subspace_split": null, |
| "eval_split": "test", |
| "available_splits": [ |
| "train", |
| "test" |
| ], |
| "hf_id": "gsm8k/main" |
| }, |
| "n_eval_loaded": 256, |
| "n_scanned": 256, |
| "base_acc_scan": 0.625, |
| "ablt_acc_scan": 0.59375, |
| "flips_total": 31, |
| "flips_used": 31, |
| "patch_steps": [ |
| 0 |
| ], |
| "patch_n_steps": 1, |
| "Qs_path": "Q_shared_layer10.npy", |
| "Qs_shape": [ |
| 4096, |
| 97 |
| ], |
| "gold_text_prefix": " ", |
| "dist_text_prefix": " ", |
| "gold_max_tokens": 0, |
| "distractor_mode": "next_gold", |
| "answer_prefix_effective": "\nFinal answer:", |
| "max_new_tokens_effective": 64, |
| "run_coeff_controls": false, |
| "use_benchmark_loader": true, |
| "hf_id": "", |
| "hf_split": "test" |
| }, |
| "summary_on_flips": { |
| "patched_self": { |
| "n": 31, |
| "rescued": 11, |
| "rescued_pct": 35.483870967741936, |
| "mean_delta_margin_vs_ablated": 1.5506091117858887, |
| "median_delta_margin_vs_ablated": 1.8901087045669556 |
| }, |
| "control_time_shuffled": { |
| "n": 31, |
| "rescued": 11, |
| "rescued_pct": 35.483870967741936, |
| "mean_delta_margin_vs_ablated": 1.536723256111145, |
| "median_delta_margin_vs_ablated": 1.8725682497024536 |
| }, |
| "control_shared_randvec": { |
| "n": 31, |
| "rescued": 1, |
| "rescued_pct": 3.225806451612903, |
| "mean_delta_margin_vs_ablated": -0.34435272216796875, |
| "median_delta_margin_vs_ablated": -0.03446388244628906 |
| }, |
| "control_rand_subspace": { |
| "n": 31, |
| "rescued": 0, |
| "rescued_pct": 0.0, |
| "mean_delta_margin_vs_ablated": -0.26384562253952026, |
| "median_delta_margin_vs_ablated": -0.18717603385448456 |
| }, |
| "control_patch_nonshared": { |
| "n": 31, |
| "rescued": 0, |
| "rescued_pct": 0.0, |
| "mean_delta_margin_vs_ablated": 8.61082810388325e-07, |
| "median_delta_margin_vs_ablated": 1.0728836059570312e-06 |
| } |
| }, |
| "scan_rows": [ |
| { |
| "ex_id": "gsm8k-test-0", |
| "gold_norm": "50", |
| "dist_norm": "80", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.0485178977251053, |
| "lp_gold": -13.375523149967194, |
| "lp_dist": -15.424041047692299, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2319180071353912, |
| "lp_gold": -6.67494124174118, |
| "lp_dist": -8.906859248876572, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-1", |
| "gold_norm": "80", |
| "dist_norm": "12", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.218225084245205, |
| "lp_gold": -16.316218174993992, |
| "lp_dist": -17.534443259239197, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6342043727636337, |
| "lp_gold": -18.493512138724327, |
| "lp_dist": -17.859307765960693, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-2", |
| "gold_norm": "12", |
| "dist_norm": "140", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.582239151000977, |
| "lp_gold": -19.479307651519775, |
| "lp_dist": -29.061546802520752, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.06528377532959, |
| "lp_gold": -16.74149775505066, |
| "lp_dist": -20.80678153038025, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-3", |
| "gold_norm": "140", |
| "dist_norm": "36", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.5653446912765503, |
| "lp_gold": -19.955466985702515, |
| "lp_dist": -17.390122294425964, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.37339717149734497, |
| "lp_gold": -14.124846756458282, |
| "lp_dist": -13.751449584960938, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-4", |
| "gold_norm": "36", |
| "dist_norm": "3200", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 17.496737867593765, |
| "lp_gold": -13.73099598288536, |
| "lp_dist": -31.227733850479126, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.695431232452393, |
| "lp_gold": -7.723996877670288, |
| "lp_dist": -17.41942811012268, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-5", |
| "gold_norm": "3200", |
| "dist_norm": "38", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.075981711270288, |
| "lp_gold": -15.808944131014869, |
| "lp_dist": -19.884925842285156, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.112908275797963, |
| "lp_gold": -17.281133087351918, |
| "lp_dist": -15.168224811553955, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-6", |
| "gold_norm": "38", |
| "dist_norm": "32", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.0733052492141724, |
| "lp_gold": -17.57793438434601, |
| "lp_dist": -15.504629135131836, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.1486949920654297, |
| "lp_gold": -20.525099754333496, |
| "lp_dist": -20.376404762268066, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-7", |
| "gold_norm": "32", |
| "dist_norm": "92", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.4189205169677734, |
| "lp_gold": -16.66067409515381, |
| "lp_dist": -20.079594612121582, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.4474713802337646, |
| "lp_gold": -15.954271793365479, |
| "lp_dist": -18.401743173599243, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-8", |
| "gold_norm": "92", |
| "dist_norm": "16", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.1885854713618755, |
| "lp_gold": -20.10318946838379, |
| "lp_dist": -16.914603997021914, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.5822286009788513, |
| "lp_gold": -15.157714128494263, |
| "lp_dist": -12.575485527515411, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-9", |
| "gold_norm": "16", |
| "dist_norm": "45", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.206469178199768, |
| "lp_gold": -20.85190773010254, |
| "lp_dist": -17.64543855190277, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.272299289703369, |
| "lp_gold": -11.194756746292114, |
| "lp_dist": -13.467056035995483, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-10", |
| "gold_norm": "45", |
| "dist_norm": "270", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.154298067092896, |
| "lp_gold": -17.49683403968811, |
| "lp_dist": -28.651132106781006, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.5754551887512207, |
| "lp_gold": -13.103037357330322, |
| "lp_dist": -16.678492546081543, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-11", |
| "gold_norm": "270", |
| "dist_norm": "100", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.8893778324127197, |
| "lp_gold": -21.884052515029907, |
| "lp_dist": -17.994674682617188, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.458778917789459, |
| "lp_gold": -14.477847814559937, |
| "lp_dist": -10.019068896770477, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-12", |
| "gold_norm": "100", |
| "dist_norm": "25", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.500066578388214, |
| "lp_gold": -11.581663310527802, |
| "lp_dist": -17.081729888916016, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.39761683344841, |
| "lp_gold": -9.308580189943314, |
| "lp_dist": -11.706197023391724, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-13", |
| "gold_norm": "25", |
| "dist_norm": "800", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 20.992703570984304, |
| "lp_gold": -13.314849936403334, |
| "lp_dist": -34.30755350738764, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.717362227849662, |
| "lp_gold": -11.016974148340523, |
| "lp_dist": -18.734336376190186, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-14", |
| "gold_norm": "800", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.7909989710897207, |
| "lp_gold": -10.428668463602662, |
| "lp_dist": -13.219667434692383, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.28849396109581, |
| "lp_gold": -15.488610118627548, |
| "lp_dist": -9.200116157531738, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-15", |
| "gold_norm": "2", |
| "dist_norm": "7000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 14.264829635620117, |
| "lp_gold": -12.60490345954895, |
| "lp_dist": -26.869733095169067, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.93172574043274, |
| "lp_gold": -10.274073839187622, |
| "lp_dist": -21.20579957962036, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-16", |
| "gold_norm": "7000", |
| "dist_norm": "25", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.534815393853933, |
| "lp_gold": -21.196847282815725, |
| "lp_dist": -16.662031888961792, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.092950224876404, |
| "lp_gold": -21.74782168865204, |
| "lp_dist": -14.654871463775635, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-17", |
| "gold_norm": "25", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.14834189414978, |
| "lp_gold": -15.3827223777771, |
| "lp_dist": -11.23438048362732, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8233906030654907, |
| "lp_gold": -9.256547331809998, |
| "lp_dist": -8.433156728744507, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-18", |
| "gold_norm": "3", |
| "dist_norm": "3430", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 31.8187518119812, |
| "lp_gold": -10.239798672497272, |
| "lp_dist": -42.058550484478474, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.154653549194336, |
| "lp_gold": -7.938319206237793, |
| "lp_dist": -24.09297275543213, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-19", |
| "gold_norm": "3430", |
| "dist_norm": "106", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.379680693149567, |
| "lp_gold": -23.033769607543945, |
| "lp_dist": -18.65408891439438, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -9.992487907409668, |
| "lp_gold": -21.179072380065918, |
| "lp_dist": -11.18658447265625, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-20", |
| "gold_norm": "106", |
| "dist_norm": "80", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.92336449585855, |
| "lp_gold": -15.76830449141562, |
| "lp_dist": -20.69166898727417, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6747859213501215, |
| "lp_gold": -17.988985607400537, |
| "lp_dist": -17.314199686050415, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-21", |
| "gold_norm": "80", |
| "dist_norm": "26", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.6582831740379333, |
| "lp_gold": -10.863374054431915, |
| "lp_dist": -13.521657228469849, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.6402748823165894, |
| "lp_gold": -11.24216091632843, |
| "lp_dist": -12.88243579864502, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-22", |
| "gold_norm": "26", |
| "dist_norm": "750", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.5539721846580505, |
| "lp_gold": -21.11834144592285, |
| "lp_dist": -25.672313630580902, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.063919067382812, |
| "lp_gold": -12.24570107460022, |
| "lp_dist": -20.309620141983032, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-23", |
| "gold_norm": "750", |
| "dist_norm": "9", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.4596693105995655, |
| "lp_gold": -14.162512499839067, |
| "lp_dist": -13.702843189239502, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.3531132936477661, |
| "lp_gold": -11.305097699165344, |
| "lp_dist": -10.951984405517578, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-24", |
| "gold_norm": "9", |
| "dist_norm": "40", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.006132304668426514, |
| "lp_gold": -15.839151382446289, |
| "lp_dist": -15.845283687114716, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.110446274280548, |
| "lp_gold": -12.25863265991211, |
| "lp_dist": -10.148186385631561, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-25", |
| "gold_norm": "40", |
| "dist_norm": "14", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.5176091194152832, |
| "lp_gold": -16.009315252304077, |
| "lp_dist": -15.491706132888794, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.6056453585624695, |
| "lp_gold": -14.140560686588287, |
| "lp_dist": -15.746206045150757, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-26", |
| "gold_norm": "14", |
| "dist_norm": "160", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.092049598693848, |
| "lp_gold": -12.284036666154861, |
| "lp_dist": -25.37608626484871, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.250454902648926, |
| "lp_gold": -11.187321424484253, |
| "lp_dist": -22.43777632713318, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-27", |
| "gold_norm": "160", |
| "dist_norm": "6", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.597537249326706, |
| "lp_gold": -12.841732293367386, |
| "lp_dist": -14.439269542694092, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.338132083415985, |
| "lp_gold": -17.455387771129608, |
| "lp_dist": -10.117255687713623, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-28", |
| "gold_norm": "6", |
| "dist_norm": "132", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.4012770652771, |
| "lp_gold": -12.933898210525513, |
| "lp_dist": -18.335175275802612, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.1544095277786255, |
| "lp_gold": -9.050714015960693, |
| "lp_dist": -16.20512354373932, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-29", |
| "gold_norm": "132", |
| "dist_norm": "8", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.112765461206436, |
| "lp_gold": -18.76314067840576, |
| "lp_dist": -10.650375217199326, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.735072135925293, |
| "lp_gold": -16.513195633888245, |
| "lp_dist": -7.778123497962952, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-30", |
| "gold_norm": "8", |
| "dist_norm": "68", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2700021266937256, |
| "lp_gold": -11.72844409942627, |
| "lp_dist": -13.998446226119995, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.666181564331055, |
| "lp_gold": -7.928534984588623, |
| "lp_dist": -20.594716548919678, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-31", |
| "gold_norm": "68", |
| "dist_norm": "31", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.820281505584717, |
| "lp_gold": -15.837103843688965, |
| "lp_dist": -19.65738534927368, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.25214481353759766, |
| "lp_gold": -12.841001033782959, |
| "lp_dist": -12.588856220245361, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-32", |
| "gold_norm": "31", |
| "dist_norm": "100", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.4860659539699554, |
| "lp_gold": -13.60796919465065, |
| "lp_dist": -20.094035148620605, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.465068936347961, |
| "lp_gold": -14.221534967422485, |
| "lp_dist": -18.686603903770447, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-33", |
| "gold_norm": "100", |
| "dist_norm": "1509", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.144955581985414, |
| "lp_gold": -13.722247913479805, |
| "lp_dist": -26.86720349546522, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.218970347195864, |
| "lp_gold": -9.529480028897524, |
| "lp_dist": -22.748450376093388, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-34", |
| "gold_norm": "1509", |
| "dist_norm": "480", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.574819326400757, |
| "lp_gold": -23.19943141937256, |
| "lp_dist": -16.6246120929718, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.709552764892578, |
| "lp_gold": -18.728264808654785, |
| "lp_dist": -11.018712043762207, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-35", |
| "gold_norm": "480", |
| "dist_norm": "520", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.5111888945102692, |
| "lp_gold": -12.033819317817688, |
| "lp_dist": -13.545008212327957, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.8960548639297485, |
| "lp_gold": -19.025392055511475, |
| "lp_dist": -16.129337191581726, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-36", |
| "gold_norm": "520", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.531530350446701, |
| "lp_gold": -14.413595885038376, |
| "lp_dist": -11.882065534591675, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.166505575180054, |
| "lp_gold": -12.240307569503784, |
| "lp_dist": -8.07380199432373, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-37", |
| "gold_norm": "3", |
| "dist_norm": "33", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.900035858154297, |
| "lp_gold": -12.652887344360352, |
| "lp_dist": -20.55292320251465, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.076467990875244, |
| "lp_gold": -7.961295485496521, |
| "lp_dist": -14.037763476371765, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-38", |
| "gold_norm": "33", |
| "dist_norm": "120", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 15.355147242546082, |
| "lp_gold": -12.304473280906677, |
| "lp_dist": -27.65962052345276, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.39077889919281, |
| "lp_gold": -11.524258255958557, |
| "lp_dist": -21.915037155151367, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-39", |
| "gold_norm": "120", |
| "dist_norm": "14", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.22332683950662613, |
| "lp_gold": -14.252640329301357, |
| "lp_dist": -14.475967168807983, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.3626268804073334, |
| "lp_gold": -10.953217655420303, |
| "lp_dist": -13.315844535827637, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-40", |
| "gold_norm": "14", |
| "dist_norm": "20", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.754596941173077, |
| "lp_gold": -17.770805835723877, |
| "lp_dist": -10.0162088945508, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.5667039155960083, |
| "lp_gold": -8.350147247314453, |
| "lp_dist": -7.783443331718445, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-41", |
| "gold_norm": "20", |
| "dist_norm": "95200", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 33.85109522007406, |
| "lp_gold": -16.739925840869546, |
| "lp_dist": -50.5910210609436, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 23.679821968078613, |
| "lp_gold": -9.72859787940979, |
| "lp_dist": -33.4084198474884, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-42", |
| "gold_norm": "95200", |
| "dist_norm": "77", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.7502023852430284, |
| "lp_gold": -19.08837911253795, |
| "lp_dist": -16.338176727294922, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.2610103897750378, |
| "lp_gold": -21.381718140095472, |
| "lp_dist": -18.120707750320435, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-43", |
| "gold_norm": "77", |
| "dist_norm": "81", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.041521549224853516, |
| "lp_gold": -20.968489170074463, |
| "lp_dist": -20.92696762084961, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2143611907958984, |
| "lp_gold": -14.122482776641846, |
| "lp_dist": -16.336843967437744, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-44", |
| "gold_norm": "81", |
| "dist_norm": "310", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.942630738019943, |
| "lp_gold": -12.305748492479324, |
| "lp_dist": -21.248379230499268, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.2553623914718628, |
| "lp_gold": -13.983943223953247, |
| "lp_dist": -14.23930561542511, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-45", |
| "gold_norm": "310", |
| "dist_norm": "100", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9456039071083069, |
| "lp_gold": -16.172270894050598, |
| "lp_dist": -17.117874801158905, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.856696009635925, |
| "lp_gold": -16.36608850955963, |
| "lp_dist": -10.509392499923706, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-46", |
| "gold_norm": "100", |
| "dist_norm": "160", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.3894251135061495, |
| "lp_gold": -12.40682859485969, |
| "lp_dist": -15.796253708365839, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.0833441019058228, |
| "lp_gold": -16.92129546403885, |
| "lp_dist": -19.00463956594467, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-47", |
| "gold_norm": "160", |
| "dist_norm": "25", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.824264804366976, |
| "lp_gold": -14.298029144760221, |
| "lp_dist": -15.122293949127197, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.2988634258508682, |
| "lp_gold": -13.753293856978416, |
| "lp_dist": -14.052157282829285, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-48", |
| "gold_norm": "25", |
| "dist_norm": "1400", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.125034153461456, |
| "lp_gold": -16.447975158691406, |
| "lp_dist": -23.573009312152863, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.08089584112167358, |
| "lp_gold": -13.430449962615967, |
| "lp_dist": -13.349554121494293, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-49", |
| "gold_norm": "1400", |
| "dist_norm": "120", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.9659185571945272, |
| "lp_gold": -15.535773673269432, |
| "lp_dist": -12.569855116074905, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.700849339365959, |
| "lp_gold": -12.968689993023872, |
| "lp_dist": -9.267840653657913, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-50", |
| "gold_norm": "120", |
| "dist_norm": "48", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.5417392253875732, |
| "lp_gold": -22.00163245201111, |
| "lp_dist": -25.54337167739868, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.251087546348572, |
| "lp_gold": -15.896643280982971, |
| "lp_dist": -21.147730827331543, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-51", |
| "gold_norm": "48", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9340271949768066, |
| "lp_gold": -11.738685846328735, |
| "lp_dist": -12.672713041305542, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.7094523906707764, |
| "lp_gold": -6.564473628997803, |
| "lp_dist": -8.273926019668579, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-52", |
| "gold_norm": "50", |
| "dist_norm": "15400", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 19.066895127296448, |
| "lp_gold": -13.457320094108582, |
| "lp_dist": -32.52421522140503, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.942362904548645, |
| "lp_gold": -15.37247109413147, |
| "lp_dist": -22.314833998680115, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-53", |
| "gold_norm": "15400", |
| "dist_norm": "80", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8634248977759853, |
| "lp_gold": -20.78267443238292, |
| "lp_dist": -18.919249534606934, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.060255475342274, |
| "lp_gold": -20.97295517474413, |
| "lp_dist": -16.912699699401855, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-54", |
| "gold_norm": "80", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1981298923492432, |
| "lp_gold": -18.350556135177612, |
| "lp_dist": -17.15242624282837, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.7748947478830814, |
| "lp_gold": -7.663371529430151, |
| "lp_dist": -11.438266277313232, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-55", |
| "gold_norm": "5", |
| "dist_norm": "14", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.197003062348813, |
| "lp_gold": -18.496329307556152, |
| "lp_dist": -12.29932624520734, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.144104599952698, |
| "lp_gold": -7.03524386882782, |
| "lp_dist": -14.179348468780518, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-56", |
| "gold_norm": "14", |
| "dist_norm": "31", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.266514074697625, |
| "lp_gold": -18.428703057870734, |
| "lp_dist": -26.69521713256836, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.1581010818481445, |
| "lp_gold": -12.810563087463379, |
| "lp_dist": -16.968664169311523, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-57", |
| "gold_norm": "31", |
| "dist_norm": "36", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7361334562301636, |
| "lp_gold": -14.841211199760437, |
| "lp_dist": -13.105077743530273, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6261711120605469, |
| "lp_gold": -13.683454990386963, |
| "lp_dist": -13.057283878326416, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-58", |
| "gold_norm": "36", |
| "dist_norm": "144", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.37734442949295044, |
| "lp_gold": -15.673691511154175, |
| "lp_dist": -16.051035940647125, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.521233081817627, |
| "lp_gold": -17.215554237365723, |
| "lp_dist": -20.73678731918335, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-59", |
| "gold_norm": "144", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.0488511323928833, |
| "lp_gold": -15.628765225410461, |
| "lp_dist": -13.579914093017578, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.597236156463623, |
| "lp_gold": -16.582991123199463, |
| "lp_dist": -13.98575496673584, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-60", |
| "gold_norm": "5", |
| "dist_norm": "750", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.9340181350708, |
| "lp_gold": -15.854983806610107, |
| "lp_dist": -27.789001941680908, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.577134609222412, |
| "lp_gold": -16.538414001464844, |
| "lp_dist": -24.115548610687256, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-61", |
| "gold_norm": "750", |
| "dist_norm": "38", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7815818190574646, |
| "lp_gold": -18.73184484243393, |
| "lp_dist": -17.950263023376465, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.5384882092475891, |
| "lp_gold": -11.338704288005829, |
| "lp_dist": -11.877192497253418, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-62", |
| "gold_norm": "38", |
| "dist_norm": "48", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.48445200920105, |
| "lp_gold": -17.717634916305542, |
| "lp_dist": -11.233182907104492, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.8313806354999542, |
| "lp_gold": -12.223527193069458, |
| "lp_dist": -8.392146557569504, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-63", |
| "gold_norm": "48", |
| "dist_norm": "655", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 19.08035659790039, |
| "lp_gold": -14.92322301864624, |
| "lp_dist": -34.00357961654663, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.06407618522644, |
| "lp_gold": -10.739889144897461, |
| "lp_dist": -23.8039653301239, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-64", |
| "gold_norm": "655", |
| "dist_norm": "800", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.8698419332504272, |
| "lp_gold": -17.930187582969666, |
| "lp_dist": -19.800029516220093, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4062272310256958, |
| "lp_gold": -14.47088611125946, |
| "lp_dist": -13.064658880233765, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-65", |
| "gold_norm": "800", |
| "dist_norm": "7300", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.858935464173555, |
| "lp_gold": -13.705154906958342, |
| "lp_dist": -25.564090371131897, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.260221555829048, |
| "lp_gold": -14.01822917163372, |
| "lp_dist": -25.27845072746277, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-66", |
| "gold_norm": "7300", |
| "dist_norm": "48", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.354404352605343, |
| "lp_gold": -20.02805521339178, |
| "lp_dist": -17.673650860786438, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.008578598499298096, |
| "lp_gold": -13.163566768169403, |
| "lp_dist": -13.172145366668701, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-67", |
| "gold_norm": "48", |
| "dist_norm": "4", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.25957900285720825, |
| "lp_gold": -17.536937534809113, |
| "lp_dist": -17.277358531951904, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7365117073059082, |
| "lp_gold": -10.189218521118164, |
| "lp_dist": -8.452706813812256, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-68", |
| "gold_norm": "4", |
| "dist_norm": "15", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.0612575560808182, |
| "lp_gold": -15.63377046585083, |
| "lp_dist": -14.572512909770012, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.9296765327453613, |
| "lp_gold": -9.090213418006897, |
| "lp_dist": -12.019889950752258, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-69", |
| "gold_norm": "15", |
| "dist_norm": "23", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.69707328081131, |
| "lp_gold": -14.628733813762665, |
| "lp_dist": -23.325807094573975, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.665110111236572, |
| "lp_gold": -12.951319694519043, |
| "lp_dist": -17.616429805755615, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-70", |
| "gold_norm": "23", |
| "dist_norm": "225", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.066251754760742, |
| "lp_gold": -16.185874462127686, |
| "lp_dist": -27.252126216888428, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.176498889923096, |
| "lp_gold": -14.897132635116577, |
| "lp_dist": -20.073631525039673, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-71", |
| "gold_norm": "225", |
| "dist_norm": "15", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.3921156525611877, |
| "lp_gold": -16.729829609394073, |
| "lp_dist": -13.337713956832886, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.107092618942261, |
| "lp_gold": -17.290175914764404, |
| "lp_dist": -11.183083295822144, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-72", |
| "gold_norm": "15", |
| "dist_norm": "82", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.4555931091308594, |
| "lp_gold": -13.475011110305786, |
| "lp_dist": -16.930604219436646, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.554435849189758, |
| "lp_gold": -11.591153025627136, |
| "lp_dist": -21.145588874816895, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-73", |
| "gold_norm": "82", |
| "dist_norm": "1218", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.144330263137817, |
| "lp_gold": -14.399481773376465, |
| "lp_dist": -24.543812036514282, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.632067203521729, |
| "lp_gold": -11.898912191390991, |
| "lp_dist": -25.53097939491272, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-74", |
| "gold_norm": "1218", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -14.016261577606201, |
| "lp_gold": -28.183964252471924, |
| "lp_dist": -14.167702674865723, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -9.392110228538513, |
| "lp_gold": -21.020013689994812, |
| "lp_dist": -11.627903461456299, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-75", |
| "gold_norm": "2", |
| "dist_norm": "36", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.396540880203247, |
| "lp_gold": -13.256061553955078, |
| "lp_dist": -15.652602434158325, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.963439464569092, |
| "lp_gold": -9.20676326751709, |
| "lp_dist": -16.17020273208618, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-76", |
| "gold_norm": "36", |
| "dist_norm": "13", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.540708303451538, |
| "lp_gold": -18.379968881607056, |
| "lp_dist": -14.839260578155518, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6864776611328125, |
| "lp_gold": -14.157576084136963, |
| "lp_dist": -13.47109842300415, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-77", |
| "gold_norm": "13", |
| "dist_norm": "11", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.3661365509033203, |
| "lp_gold": -15.502496480941772, |
| "lp_dist": -15.868633031845093, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.4615020751953125, |
| "lp_gold": -14.546976089477539, |
| "lp_dist": -15.008478164672852, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-78", |
| "gold_norm": "11", |
| "dist_norm": "8", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.7840418815612793, |
| "lp_gold": -21.872905254364014, |
| "lp_dist": -18.088863372802734, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.002429008483887, |
| "lp_gold": -13.64483380317688, |
| "lp_dist": -9.642404794692993, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-79", |
| "gold_norm": "8", |
| "dist_norm": "440", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.767229557037354, |
| "lp_gold": -17.545647621154785, |
| "lp_dist": -34.31287717819214, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.454898834228516, |
| "lp_gold": -12.977333545684814, |
| "lp_dist": -24.43223237991333, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-80", |
| "gold_norm": "440", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.289784381631762, |
| "lp_gold": -15.706766793970019, |
| "lp_dist": -13.416982412338257, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.212664246559143, |
| "lp_gold": -17.012100338935852, |
| "lp_dist": -13.799436092376709, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-81", |
| "gold_norm": "2", |
| "dist_norm": "45", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.003484487533569, |
| "lp_gold": -13.458641052246094, |
| "lp_dist": -20.462125539779663, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.808353066444397, |
| "lp_gold": -8.184542536735535, |
| "lp_dist": -13.992895603179932, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-82", |
| "gold_norm": "45", |
| "dist_norm": "54", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.24556124210357666, |
| "lp_gold": -15.131654500961304, |
| "lp_dist": -14.886093258857727, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.5171573162078857, |
| "lp_gold": -10.597485780715942, |
| "lp_dist": -12.114643096923828, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-83", |
| "gold_norm": "54", |
| "dist_norm": "6", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.36360502243042, |
| "lp_gold": -13.119836330413818, |
| "lp_dist": -10.756231307983398, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.9401865005493164, |
| "lp_gold": -13.909927368164062, |
| "lp_dist": -10.969740867614746, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-84", |
| "gold_norm": "6", |
| "dist_norm": "240", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.525604009628296, |
| "lp_gold": -14.586916446685791, |
| "lp_dist": -31.112520456314087, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.351407527923584, |
| "lp_gold": -8.294451236724854, |
| "lp_dist": -20.645858764648438, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-85", |
| "gold_norm": "240", |
| "dist_norm": "428", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.200618744827807, |
| "lp_gold": -10.596241324208677, |
| "lp_dist": -15.796860069036484, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.575641840696335, |
| "lp_gold": -10.782865315675735, |
| "lp_dist": -16.35850715637207, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-86", |
| "gold_norm": "428", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -11.445145592093468, |
| "lp_gold": -21.372050523757935, |
| "lp_dist": -9.926904931664467, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.8857234716415405, |
| "lp_gold": -13.641488909721375, |
| "lp_dist": -10.755765438079834, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-87", |
| "gold_norm": "5", |
| "dist_norm": "255", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.83140754699707, |
| "lp_gold": -11.654325008392334, |
| "lp_dist": -25.485732555389404, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 15.030074834823608, |
| "lp_gold": -10.351794719696045, |
| "lp_dist": -25.381869554519653, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-88", |
| "gold_norm": "255", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.704339981079102, |
| "lp_gold": -22.688746690750122, |
| "lp_dist": -17.98440670967102, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.790473118424416, |
| "lp_gold": -23.528611078858376, |
| "lp_dist": -18.73813796043396, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-89", |
| "gold_norm": "10", |
| "dist_norm": "9", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6458263397216797, |
| "lp_gold": -14.022311687469482, |
| "lp_dist": -12.376485347747803, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.548365592956543, |
| "lp_gold": -11.53894329071045, |
| "lp_dist": -9.990577697753906, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-90", |
| "gold_norm": "9", |
| "dist_norm": "157", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.838926374912262, |
| "lp_gold": -14.729028940200806, |
| "lp_dist": -26.567955315113068, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.449068486690521, |
| "lp_gold": -13.700207710266113, |
| "lp_dist": -18.149276196956635, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-91", |
| "gold_norm": "157", |
| "dist_norm": "56", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.06636106967926025, |
| "lp_gold": -13.999522089958191, |
| "lp_dist": -13.93316102027893, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.18358194828033447, |
| "lp_gold": -12.361805081367493, |
| "lp_dist": -12.178223133087158, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-92", |
| "gold_norm": "56", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.014044851064682007, |
| "lp_gold": -13.622624963521957, |
| "lp_dist": -13.608580112457275, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.375391006469727, |
| "lp_gold": -15.001872539520264, |
| "lp_dist": -9.626481533050537, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-93", |
| "gold_norm": "5", |
| "dist_norm": "144", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.107487440109253, |
| "lp_gold": -21.01281452178955, |
| "lp_dist": -31.120301961898804, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.960463047027588, |
| "lp_gold": -8.431816339492798, |
| "lp_dist": -20.392279386520386, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-94", |
| "gold_norm": "144", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8007860428187996, |
| "lp_gold": -16.124470019945875, |
| "lp_dist": -15.323683977127075, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.586140275001526, |
| "lp_gold": -13.455833077430725, |
| "lp_dist": -10.8696928024292, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-95", |
| "gold_norm": "50", |
| "dist_norm": "4", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.431166723370552, |
| "lp_gold": -14.887599676847458, |
| "lp_dist": -10.456432953476906, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.6977656185626984, |
| "lp_gold": -7.156943529844284, |
| "lp_dist": -8.854709148406982, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-96", |
| "gold_norm": "4", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.549594163894653, |
| "lp_gold": -12.547298669815063, |
| "lp_dist": -19.096892833709717, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.145283490419388, |
| "lp_gold": -8.248348951339722, |
| "lp_dist": -10.39363244175911, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-97", |
| "gold_norm": "50", |
| "dist_norm": "42", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.1262907013297081, |
| "lp_gold": -12.730935551226139, |
| "lp_dist": -12.857226252555847, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.2466068267822266, |
| "lp_gold": -9.356587171554565, |
| "lp_dist": -12.603193998336792, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-98", |
| "gold_norm": "42", |
| "dist_norm": "7", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.333731412887573, |
| "lp_gold": -18.121748208999634, |
| "lp_dist": -12.78801679611206, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.454429030418396, |
| "lp_gold": -11.41723620891571, |
| "lp_dist": -10.962807178497314, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-99", |
| "gold_norm": "7", |
| "dist_norm": "250", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.109447717666626, |
| "lp_gold": -17.65127396583557, |
| "lp_dist": -27.760721683502197, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.160389855504036, |
| "lp_gold": -12.602290153503418, |
| "lp_dist": -15.762680009007454, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-100", |
| "gold_norm": "250", |
| "dist_norm": "12", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.937235951423645, |
| "lp_gold": -16.930358290672302, |
| "lp_dist": -18.867594242095947, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8317363262176514, |
| "lp_gold": -17.415368795394897, |
| "lp_dist": -15.583632469177246, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-101", |
| "gold_norm": "12", |
| "dist_norm": "7", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.4091547727584839, |
| "lp_gold": -11.118706822395325, |
| "lp_dist": -12.527861595153809, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.3088250160217285, |
| "lp_gold": -9.039770126342773, |
| "lp_dist": -11.348595142364502, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-102", |
| "gold_norm": "7", |
| "dist_norm": "8", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.3529720306396484, |
| "lp_gold": -18.84420108795166, |
| "lp_dist": -17.49122905731201, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1846144199371338, |
| "lp_gold": -10.204635620117188, |
| "lp_dist": -9.020021200180054, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-103", |
| "gold_norm": "8", |
| "dist_norm": "26", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.383567810058594, |
| "lp_gold": -15.868620872497559, |
| "lp_dist": -26.252188682556152, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.597115993499756, |
| "lp_gold": -10.359461784362793, |
| "lp_dist": -18.95657777786255, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-104", |
| "gold_norm": "26", |
| "dist_norm": "42", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.550231754779816, |
| "lp_gold": -16.079154193401337, |
| "lp_dist": -20.629385948181152, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.743180751800537, |
| "lp_gold": -20.936619758605957, |
| "lp_dist": -17.19343900680542, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-105", |
| "gold_norm": "42", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2448320388793945, |
| "lp_gold": -17.29369354248047, |
| "lp_dist": -19.538525581359863, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.48531031608581543, |
| "lp_gold": -14.891574144363403, |
| "lp_dist": -14.406263828277588, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-106", |
| "gold_norm": "5", |
| "dist_norm": "14400", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 17.918405055999756, |
| "lp_gold": -16.223863124847412, |
| "lp_dist": -34.14226818084717, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 6 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 18.689422607421875, |
| "lp_gold": -8.31445324420929, |
| "lp_dist": -27.003875851631165, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-107", |
| "gold_norm": "14400", |
| "dist_norm": "400", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8998411595821381, |
| "lp_gold": -19.079706698656082, |
| "lp_dist": -19.97954785823822, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.72913409024477, |
| "lp_gold": -22.703229255974293, |
| "lp_dist": -20.974095165729523, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-108", |
| "gold_norm": "400", |
| "dist_norm": "40", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.00030357998912222683, |
| "lp_gold": -12.683453394594835, |
| "lp_dist": -12.683149814605713, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.014066597446799278, |
| "lp_gold": -7.615564605221152, |
| "lp_dist": -7.601498007774353, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-109", |
| "gold_norm": "40", |
| "dist_norm": "83", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.819165468215942, |
| "lp_gold": -15.469541311264038, |
| "lp_dist": -21.28870677947998, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.161367028951645, |
| "lp_gold": -13.052747160196304, |
| "lp_dist": -21.21411418914795, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-110", |
| "gold_norm": "83", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9284783601760864, |
| "lp_gold": -14.152065396308899, |
| "lp_dist": -15.080543756484985, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.6964447498321533, |
| "lp_gold": -11.795601606369019, |
| "lp_dist": -8.099156856536865, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-111", |
| "gold_norm": "10", |
| "dist_norm": "80", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.51949143409729, |
| "lp_gold": -16.031707048416138, |
| "lp_dist": -22.551198482513428, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.987100124359131, |
| "lp_gold": -14.24100637435913, |
| "lp_dist": -20.22810649871826, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-112", |
| "gold_norm": "80", |
| "dist_norm": "180", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.0028108435217291, |
| "lp_gold": -14.749124482274055, |
| "lp_dist": -15.751935325795785, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.4700966998934746, |
| "lp_gold": -9.443349197506905, |
| "lp_dist": -11.91344589740038, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-113", |
| "gold_norm": "180", |
| "dist_norm": "1450000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 21.25386095046997, |
| "lp_gold": -27.976417541503906, |
| "lp_dist": -49.23027849197388, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 8 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.907071352005005, |
| "lp_gold": -15.60246878862381, |
| "lp_dist": -29.509540140628815, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 8 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-114", |
| "gold_norm": "1450000", |
| "dist_norm": "15", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.66898221289739, |
| "lp_gold": -18.9738236120902, |
| "lp_dist": -13.30484139919281, |
| "n_tokens_gold": 8, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.122156334575266, |
| "lp_gold": -18.38626338308677, |
| "lp_dist": -10.264107048511505, |
| "n_tokens_gold": 8, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-115", |
| "gold_norm": "15", |
| "dist_norm": "1000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.356005743145943, |
| "lp_gold": -21.951110124588013, |
| "lp_dist": -30.307115867733955, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.199884116649628, |
| "lp_gold": -11.521326780319214, |
| "lp_dist": -19.72121089696884, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-116", |
| "gold_norm": "1000", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.19174530085001606, |
| "lp_gold": -13.59133626993571, |
| "lp_dist": -13.399590969085693, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.19964181631803513, |
| "lp_gold": -10.637855164706707, |
| "lp_dist": -10.438213348388672, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-117", |
| "gold_norm": "2", |
| "dist_norm": "15", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.722217559814453, |
| "lp_gold": -18.274237632751465, |
| "lp_dist": -24.996455192565918, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.5563411712646484, |
| "lp_gold": -12.917238712310791, |
| "lp_dist": -14.47357988357544, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-118", |
| "gold_norm": "15", |
| "dist_norm": "100", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.2469024658203125, |
| "lp_gold": -14.20986533164978, |
| "lp_dist": -17.456767797470093, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.755454957485199, |
| "lp_gold": -7.566788613796234, |
| "lp_dist": -16.322243571281433, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-119", |
| "gold_norm": "100", |
| "dist_norm": "335", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.725587379769422, |
| "lp_gold": -11.31663225905504, |
| "lp_dist": -28.042219638824463, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.39561602845788, |
| "lp_gold": -11.48399594053626, |
| "lp_dist": -19.87961196899414, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-120", |
| "gold_norm": "335", |
| "dist_norm": "60", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.718258023262024, |
| "lp_gold": -16.84885323047638, |
| "lp_dist": -18.567111253738403, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.495100736618042, |
| "lp_gold": -16.78837823867798, |
| "lp_dist": -13.293277502059937, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-121", |
| "gold_norm": "60", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.7624173164367676, |
| "lp_gold": -16.792863368988037, |
| "lp_dist": -14.03044605255127, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.886390924453735, |
| "lp_gold": -13.833235025405884, |
| "lp_dist": -7.946844100952148, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-122", |
| "gold_norm": "5", |
| "dist_norm": "9500", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.83846201375127, |
| "lp_gold": -13.696074485778809, |
| "lp_dist": -30.534536499530077, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 15.658583138138056, |
| "lp_gold": -8.453100323677063, |
| "lp_dist": -24.11168346181512, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-123", |
| "gold_norm": "9500", |
| "dist_norm": "160", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.3972220839932561, |
| "lp_gold": -20.279614341445267, |
| "lp_dist": -19.88239225745201, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.62424199283123, |
| "lp_gold": -20.69040386378765, |
| "lp_dist": -18.06616187095642, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-124", |
| "gold_norm": "160", |
| "dist_norm": "1050", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.60233561617497, |
| "lp_gold": -18.274476603444782, |
| "lp_dist": -34.87681221961975, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.274514463730156, |
| "lp_gold": -13.8317128745839, |
| "lp_dist": -22.106227338314056, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-125", |
| "gold_norm": "1050", |
| "dist_norm": "91", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.0837584948167205, |
| "lp_gold": -23.45734657999128, |
| "lp_dist": -21.37358808517456, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.869788646697998, |
| "lp_gold": -20.34303617477417, |
| "lp_dist": -17.473247528076172, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-126", |
| "gold_norm": "91", |
| "dist_norm": "21", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.658190071582794, |
| "lp_gold": -20.631014347076416, |
| "lp_dist": -13.972824275493622, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.4234659671783447, |
| "lp_gold": -17.770225048065186, |
| "lp_dist": -15.34675908088684, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-127", |
| "gold_norm": "21", |
| "dist_norm": "20", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.925290822982788, |
| "lp_gold": -16.08545808121562, |
| "lp_dist": -14.160167258232832, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.1263790130615234, |
| "lp_gold": -13.073553562164307, |
| "lp_dist": -10.947174549102783, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-128", |
| "gold_norm": "20", |
| "dist_norm": "36", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.3922419548034668, |
| "lp_gold": -19.574571132659912, |
| "lp_dist": -19.96681308746338, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.179409921169281, |
| "lp_gold": -8.625289022922516, |
| "lp_dist": -10.804698944091797, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-129", |
| "gold_norm": "36", |
| "dist_norm": "36", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -12.731476545333862, |
| "lp_dist": -12.731476545333862, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -11.376878261566162, |
| "lp_dist": -11.376878261566162, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-130", |
| "gold_norm": "36", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.552075147628784, |
| "lp_gold": -14.53696346282959, |
| "lp_dist": -11.984888315200806, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -11.998991012573242, |
| "lp_gold": -20.990919589996338, |
| "lp_dist": -8.991928577423096, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-131", |
| "gold_norm": "10", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.991529874503613, |
| "lp_gold": -16.921989023685455, |
| "lp_dist": -7.930459149181843, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.6315575018525124, |
| "lp_gold": -6.705006085336208, |
| "lp_dist": -8.33656358718872, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-132", |
| "gold_norm": "5", |
| "dist_norm": "32", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.572678565979004, |
| "lp_gold": -12.411277294158936, |
| "lp_dist": -18.98395586013794, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.04911994934082, |
| "lp_gold": -8.343387603759766, |
| "lp_dist": -12.392507553100586, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-133", |
| "gold_norm": "32", |
| "dist_norm": "18", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6729400157928467, |
| "lp_gold": -15.0261971950531, |
| "lp_dist": -14.353257179260254, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8697174787521362, |
| "lp_gold": -10.817826390266418, |
| "lp_dist": -11.687543869018555, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-134", |
| "gold_norm": "18", |
| "dist_norm": "4", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.9239641074091196, |
| "lp_gold": -15.171829616650939, |
| "lp_dist": -19.09579372406006, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7061721086502075, |
| "lp_gold": -9.731460690498352, |
| "lp_dist": -9.025288581848145, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-135", |
| "gold_norm": "4", |
| "dist_norm": "48", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.912032127380371, |
| "lp_gold": -11.661430835723877, |
| "lp_dist": -21.573462963104248, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.949398040771484, |
| "lp_gold": -13.34396743774414, |
| "lp_dist": -24.293365478515625, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-136", |
| "gold_norm": "48", |
| "dist_norm": "8", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2137904167175293, |
| "lp_gold": -13.246366620063782, |
| "lp_dist": -12.032576203346252, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.785146713256836, |
| "lp_gold": -15.54097604751587, |
| "lp_dist": -10.755829334259033, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-137", |
| "gold_norm": "8", |
| "dist_norm": "21", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.050054788589478, |
| "lp_gold": -14.328342199325562, |
| "lp_dist": -23.37839698791504, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.801905870437622, |
| "lp_gold": -9.647645235061646, |
| "lp_dist": -13.449551105499268, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-138", |
| "gold_norm": "21", |
| "dist_norm": "25", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.6713391542434692, |
| "lp_gold": -17.788984179496765, |
| "lp_dist": -20.460323333740234, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8663175106048584, |
| "lp_gold": -14.334570407867432, |
| "lp_dist": -13.468252897262573, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-139", |
| "gold_norm": "25", |
| "dist_norm": "3000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.303999066352844, |
| "lp_gold": -13.958717346191406, |
| "lp_dist": -24.26271641254425, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.2307329177856445, |
| "lp_gold": -7.678596615791321, |
| "lp_dist": -13.909329533576965, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-140", |
| "gold_norm": "3000", |
| "dist_norm": "40", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8300985433161259, |
| "lp_gold": -17.700348053127527, |
| "lp_dist": -15.870249509811401, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.4543883726000786, |
| "lp_gold": -14.831104047596455, |
| "lp_dist": -15.285492420196533, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-141", |
| "gold_norm": "40", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.7375423088669777, |
| "lp_gold": -13.080543011426926, |
| "lp_dist": -14.818085320293903, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.48419055342674255, |
| "lp_gold": -11.210194662213326, |
| "lp_dist": -11.694385215640068, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-142", |
| "gold_norm": "50", |
| "dist_norm": "90", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.239281177520752, |
| "lp_gold": -13.6494460105896, |
| "lp_dist": -12.410164833068848, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.5999003648757935, |
| "lp_gold": -9.939468264579773, |
| "lp_dist": -11.539368629455566, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-143", |
| "gold_norm": "90", |
| "dist_norm": "23", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.3710329532623291, |
| "lp_gold": -20.11902666091919, |
| "lp_dist": -19.74799370765686, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.965712070465088, |
| "lp_gold": -15.742670059204102, |
| "lp_dist": -18.70838212966919, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-144", |
| "gold_norm": "23", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.9211039543151855, |
| "lp_gold": -17.69726538658142, |
| "lp_dist": -12.776161432266235, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.012661933898926, |
| "lp_gold": -12.401761054992676, |
| "lp_dist": -8.38909912109375, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-145", |
| "gold_norm": "2", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.2492438331246376, |
| "lp_gold": -13.602060556411743, |
| "lp_dist": -14.85130438953638, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.6407327204942703, |
| "lp_gold": -12.194403648376465, |
| "lp_dist": -9.553670927882195, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-146", |
| "gold_norm": "50", |
| "dist_norm": "122", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.681281805038452, |
| "lp_gold": -16.079622983932495, |
| "lp_dist": -20.760904788970947, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.314260721206665, |
| "lp_gold": -11.10973858833313, |
| "lp_dist": -18.423999309539795, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-147", |
| "gold_norm": "122", |
| "dist_norm": "300", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.33387154340744, |
| "lp_gold": -18.671439349651337, |
| "lp_dist": -25.005310893058777, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.5978607535362244, |
| "lp_gold": -15.271158814430237, |
| "lp_dist": -15.869019567966461, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-148", |
| "gold_norm": "300", |
| "dist_norm": "448", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.451393851355533, |
| "lp_gold": -17.04763673870184, |
| "lp_dist": -30.499030590057373, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.197498982015532, |
| "lp_gold": -14.669522101816256, |
| "lp_dist": -27.867021083831787, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-149", |
| "gold_norm": "448", |
| "dist_norm": "2450", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.949738264083862, |
| "lp_gold": -25.570088386535645, |
| "lp_dist": -31.519826650619507, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.753702878952026, |
| "lp_gold": -20.34923005104065, |
| "lp_dist": -26.102932929992676, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-150", |
| "gold_norm": "2450", |
| "dist_norm": "803", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.90772854257375, |
| "lp_gold": -16.1460417015478, |
| "lp_dist": -22.05377024412155, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.6608445048332214, |
| "lp_gold": -14.72765988111496, |
| "lp_dist": -15.388504385948181, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-151", |
| "gold_norm": "803", |
| "dist_norm": "16", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.22600507736206055, |
| "lp_gold": -20.624857425689697, |
| "lp_dist": -20.850862503051758, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.131483793258667, |
| "lp_gold": -19.228359699249268, |
| "lp_dist": -14.0968759059906, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-152", |
| "gold_norm": "16", |
| "dist_norm": "280", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.489606261253357, |
| "lp_gold": -13.729230046272278, |
| "lp_dist": -23.218836307525635, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.312919616699219, |
| "lp_gold": -9.783522605895996, |
| "lp_dist": -23.096442222595215, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-153", |
| "gold_norm": "280", |
| "dist_norm": "13", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.694299184717238, |
| "lp_gold": -15.244264638982713, |
| "lp_dist": -18.93856382369995, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8231047093868256, |
| "lp_gold": -12.64960965514183, |
| "lp_dist": -11.826504945755005, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-154", |
| "gold_norm": "13", |
| "dist_norm": "20", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.0602927803993225, |
| "lp_gold": -15.215918719768524, |
| "lp_dist": -17.276211500167847, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.633309006690979, |
| "lp_gold": -12.038846015930176, |
| "lp_dist": -11.405537009239197, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-155", |
| "gold_norm": "20", |
| "dist_norm": "14", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.590712457895279, |
| "lp_gold": -18.284266233444214, |
| "lp_dist": -15.693553775548935, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.9870389699935913, |
| "lp_gold": -10.093660473823547, |
| "lp_dist": -12.080699443817139, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-156", |
| "gold_norm": "14", |
| "dist_norm": "32", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.239875316619873, |
| "lp_gold": -20.827781200408936, |
| "lp_dist": -25.06765651702881, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.260110855102539, |
| "lp_gold": -13.14831280708313, |
| "lp_dist": -15.408423662185669, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-157", |
| "gold_norm": "32", |
| "dist_norm": "105", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.336161613464355, |
| "lp_gold": -20.509262084960938, |
| "lp_dist": -32.84542369842529, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.297417879104614, |
| "lp_gold": -9.766870260238647, |
| "lp_dist": -16.06428813934326, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-158", |
| "gold_norm": "105", |
| "dist_norm": "71", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.14784783124923706, |
| "lp_gold": -21.096359431743622, |
| "lp_dist": -20.948511600494385, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4936403930187225, |
| "lp_gold": -13.522361606359482, |
| "lp_dist": -12.02872121334076, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-159", |
| "gold_norm": "71", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2031426429748535, |
| "lp_gold": -17.497971057891846, |
| "lp_dist": -16.294828414916992, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.283137083053589, |
| "lp_gold": -15.376394033432007, |
| "lp_dist": -12.093256950378418, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-160", |
| "gold_norm": "5", |
| "dist_norm": "30", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.20781135559082, |
| "lp_gold": -9.617193222045898, |
| "lp_dist": -14.825004577636719, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.284235119819641, |
| "lp_gold": -6.052669286727905, |
| "lp_dist": -9.336904406547546, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-161", |
| "gold_norm": "30", |
| "dist_norm": "95", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.308123826980591, |
| "lp_gold": -12.008728742599487, |
| "lp_dist": -18.316852569580078, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.064014196395874, |
| "lp_gold": -11.074568510055542, |
| "lp_dist": -16.138582706451416, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-162", |
| "gold_norm": "95", |
| "dist_norm": "147", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 14.487586110830307, |
| "lp_gold": -13.062050491571426, |
| "lp_dist": -27.549636602401733, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.7768335342407227, |
| "lp_gold": -18.569175243377686, |
| "lp_dist": -22.346008777618408, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-163", |
| "gold_norm": "147", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.9637865126132965, |
| "lp_gold": -13.891173975542188, |
| "lp_dist": -7.927387462928891, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.2743232250213623, |
| "lp_gold": -11.51265001296997, |
| "lp_dist": -11.238326787948608, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-164", |
| "gold_norm": "10", |
| "dist_norm": "40000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.0207504555583, |
| "lp_gold": -12.818441897630692, |
| "lp_dist": -28.83919235318899, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 15.683642621152103, |
| "lp_gold": -6.558068131096661, |
| "lp_dist": -22.241710752248764, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-165", |
| "gold_norm": "40000", |
| "dist_norm": "12", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.0927228182554245, |
| "lp_gold": -21.754829093813896, |
| "lp_dist": -15.662106275558472, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.87526479922235, |
| "lp_gold": -16.18092787824571, |
| "lp_dist": -9.305663079023361, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-166", |
| "gold_norm": "12", |
| "dist_norm": "129200", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 17.634711742401123, |
| "lp_gold": -20.60856056213379, |
| "lp_dist": -38.24327230453491, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 7 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.27542757987976, |
| "lp_gold": -20.810616493225098, |
| "lp_dist": -37.08604407310486, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 7 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-167", |
| "gold_norm": "129200", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.816860965336673, |
| "lp_gold": -24.884800246800296, |
| "lp_dist": -16.067939281463623, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -12.447648953646421, |
| "lp_gold": -26.88768095895648, |
| "lp_dist": -14.440032005310059, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-168", |
| "gold_norm": "5", |
| "dist_norm": "45", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.262725353240967, |
| "lp_gold": -11.162125587463379, |
| "lp_dist": -18.424850940704346, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.129469871520996, |
| "lp_gold": -6.1789350509643555, |
| "lp_dist": -18.30840492248535, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-169", |
| "gold_norm": "45", |
| "dist_norm": "20", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.7432926744222641, |
| "lp_gold": -13.45964826643467, |
| "lp_dist": -14.202940940856934, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.6613135635852814, |
| "lp_gold": -9.49636921286583, |
| "lp_dist": -12.15768277645111, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-170", |
| "gold_norm": "20", |
| "dist_norm": "1170", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 28.943727374076843, |
| "lp_gold": -13.921928644180298, |
| "lp_dist": -42.86565601825714, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.730559349060059, |
| "lp_gold": -15.503687143325806, |
| "lp_dist": -28.234246492385864, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-171", |
| "gold_norm": "1170", |
| "dist_norm": "192", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.4793071039021015, |
| "lp_gold": -25.866613794118166, |
| "lp_dist": -23.387306690216064, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.7092228829860687, |
| "lp_gold": -17.528388172388077, |
| "lp_dist": -20.237611055374146, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-172", |
| "gold_norm": "192", |
| "dist_norm": "14", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.4512736797332764, |
| "lp_gold": -20.52132660150528, |
| "lp_dist": -18.070052921772003, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.270252227783203, |
| "lp_gold": -18.500007390975952, |
| "lp_dist": -10.229755163192749, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-173", |
| "gold_norm": "14", |
| "dist_norm": "144", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.012117385864258, |
| "lp_gold": -19.01213574409485, |
| "lp_dist": -25.024253129959106, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.6395363807678223, |
| "lp_gold": -17.207820653915405, |
| "lp_dist": -20.847357034683228, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-174", |
| "gold_norm": "144", |
| "dist_norm": "350", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.220242428360507, |
| "lp_gold": -20.075438094558194, |
| "lp_dist": -29.2956805229187, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.203243670635857, |
| "lp_gold": -15.825250687426887, |
| "lp_dist": -24.028494358062744, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-175", |
| "gold_norm": "350", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9250896275043488, |
| "lp_gold": -22.99281856417656, |
| "lp_dist": -23.917908191680908, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.236621737480164, |
| "lp_gold": -17.58333122730255, |
| "lp_dist": -10.346709489822388, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-176", |
| "gold_norm": "50", |
| "dist_norm": "7", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.0019221305847167969, |
| "lp_gold": -14.026922941207886, |
| "lp_dist": -14.025000810623169, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7933419942855835, |
| "lp_gold": -10.8789883852005, |
| "lp_dist": -10.085646390914917, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-177", |
| "gold_norm": "7", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.36652660369873047, |
| "lp_gold": -14.853872776031494, |
| "lp_dist": -14.487346172332764, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.34556770324707, |
| "lp_gold": -7.91382908821106, |
| "lp_dist": -13.25939679145813, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-178", |
| "gold_norm": "50", |
| "dist_norm": "8", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.387412190437317, |
| "lp_gold": -12.106852412223816, |
| "lp_dist": -15.494264602661133, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.1638234257698059, |
| "lp_gold": -11.002189338207245, |
| "lp_dist": -11.16601276397705, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-179", |
| "gold_norm": "8", |
| "dist_norm": "3160", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 27.79468995332718, |
| "lp_gold": -10.52354496717453, |
| "lp_dist": -38.31823492050171, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 26.891671419143677, |
| "lp_gold": -6.7627270221710205, |
| "lp_dist": -33.6543984413147, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-180", |
| "gold_norm": "3160", |
| "dist_norm": "80", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -14.208902917802334, |
| "lp_gold": -23.272248081862926, |
| "lp_dist": -9.063345164060593, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.284509412944317, |
| "lp_gold": -18.512411706149578, |
| "lp_dist": -10.227902293205261, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-181", |
| "gold_norm": "80", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.666141152381897, |
| "lp_gold": -11.426292300224304, |
| "lp_dist": -18.0924334526062, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.4361144304275513, |
| "lp_gold": -8.623531460762024, |
| "lp_dist": -12.059645891189575, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-182", |
| "gold_norm": "50", |
| "dist_norm": "40", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.3921025022864342, |
| "lp_gold": -16.256853722035885, |
| "lp_dist": -15.86475121974945, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.5148345530033112, |
| "lp_gold": -11.546109974384308, |
| "lp_dist": -11.031275421380997, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-183", |
| "gold_norm": "40", |
| "dist_norm": "78", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.310973192565143, |
| "lp_gold": -11.783679460175335, |
| "lp_dist": -22.09465265274048, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.1819764897227287, |
| "lp_gold": -8.697102136909962, |
| "lp_dist": -11.87907862663269, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-184", |
| "gold_norm": "78", |
| "dist_norm": "273", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.903703212738037, |
| "lp_gold": -21.98734474182129, |
| "lp_dist": -31.891047954559326, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.26316213607788, |
| "lp_gold": -13.972403526306152, |
| "lp_dist": -25.235565662384033, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-185", |
| "gold_norm": "273", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.715305805206299, |
| "lp_gold": -18.186378479003906, |
| "lp_dist": -13.471072673797607, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.78067421913147, |
| "lp_gold": -15.708798170089722, |
| "lp_dist": -9.928123950958252, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-186", |
| "gold_norm": "2", |
| "dist_norm": "195", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.730733886361122, |
| "lp_gold": -12.38149118423462, |
| "lp_dist": -25.11222507059574, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.67141306400299, |
| "lp_gold": -8.928462505340576, |
| "lp_dist": -21.599875569343567, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-187", |
| "gold_norm": "195", |
| "dist_norm": "1128", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 15.125296980142593, |
| "lp_gold": -20.79087921977043, |
| "lp_dist": -35.916176199913025, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.027642607688904, |
| "lp_gold": -17.632879853248596, |
| "lp_dist": -29.6605224609375, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-188", |
| "gold_norm": "1128", |
| "dist_norm": "172", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.5262241810560226, |
| "lp_gold": -21.482525154948235, |
| "lp_dist": -18.956300973892212, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.822476863861084, |
| "lp_gold": -22.20119798183441, |
| "lp_dist": -14.378721117973328, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-189", |
| "gold_norm": "172", |
| "dist_norm": "30", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -10.2972651720047, |
| "lp_gold": -25.187148094177246, |
| "lp_dist": -14.889882922172546, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.863584116101265, |
| "lp_gold": -19.34168529510498, |
| "lp_dist": -11.478101179003716, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-190", |
| "gold_norm": "30", |
| "dist_norm": "30", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.69141697883606, |
| "lp_dist": -20.69141697883606, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -14.99114179611206, |
| "lp_dist": -14.99114179611206, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-191", |
| "gold_norm": "30", |
| "dist_norm": "92", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 14.041085667908192, |
| "lp_gold": -11.936107210814953, |
| "lp_dist": -25.977192878723145, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.562521874904633, |
| "lp_gold": -8.070083677768707, |
| "lp_dist": -18.63260555267334, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-192", |
| "gold_norm": "92", |
| "dist_norm": "20", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.352982759475708, |
| "lp_gold": -19.694137811660767, |
| "lp_dist": -19.34115505218506, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.613435983657837, |
| "lp_gold": -16.650076866149902, |
| "lp_dist": -9.036640882492065, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-193", |
| "gold_norm": "20", |
| "dist_norm": "540", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.857290744781494, |
| "lp_gold": -12.000582933425903, |
| "lp_dist": -17.857873678207397, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.146737933158875, |
| "lp_gold": -8.929409623146057, |
| "lp_dist": -20.07614755630493, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-194", |
| "gold_norm": "540", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.4153643026947975, |
| "lp_gold": -17.50879267603159, |
| "lp_dist": -17.093428373336792, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.476008802652359, |
| "lp_gold": -18.04153409600258, |
| "lp_dist": -11.56552529335022, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-195", |
| "gold_norm": "10", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -13.743456363677979, |
| "lp_dist": -13.743456363677979, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -11.619784355163574, |
| "lp_dist": -11.619784355163574, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-196", |
| "gold_norm": "10", |
| "dist_norm": "38", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.615695416927338, |
| "lp_gold": -13.448404610157013, |
| "lp_dist": -21.06410002708435, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.2176668643951416, |
| "lp_gold": -12.037200689315796, |
| "lp_dist": -12.254867553710938, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-197", |
| "gold_norm": "38", |
| "dist_norm": "4000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 19.051328860223293, |
| "lp_gold": -18.077466011047363, |
| "lp_dist": -37.12879487127066, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.882668375968933, |
| "lp_gold": -18.516667366027832, |
| "lp_dist": -27.399335741996765, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-198", |
| "gold_norm": "4000", |
| "dist_norm": "594", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 10.496505833114497, |
| "lp_gold": -20.328653597389348, |
| "lp_dist": -30.825159430503845, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.5506546348333359, |
| "lp_gold": -16.97495509684086, |
| "lp_dist": -18.525609731674194, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-199", |
| "gold_norm": "594", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -13.668475985527039, |
| "lp_gold": -24.507625102996826, |
| "lp_dist": -10.839149117469788, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -12.574656009674072, |
| "lp_gold": -20.77769374847412, |
| "lp_dist": -8.203037738800049, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-200", |
| "gold_norm": "2", |
| "dist_norm": "142", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 20.30094861984253, |
| "lp_gold": -13.631542205810547, |
| "lp_dist": -33.932490825653076, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.00571084022522, |
| "lp_gold": -11.859813690185547, |
| "lp_dist": -24.865524530410767, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-201", |
| "gold_norm": "142", |
| "dist_norm": "9", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.474197149276733, |
| "lp_gold": -17.457672357559204, |
| "lp_dist": -12.98347520828247, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.4977235794067383, |
| "lp_gold": -15.609914779663086, |
| "lp_dist": -13.112191200256348, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-202", |
| "gold_norm": "9", |
| "dist_norm": "6", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.3756675720214844, |
| "lp_gold": -12.738621711730957, |
| "lp_dist": -11.362954139709473, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2889900207519531, |
| "lp_gold": -12.575027465820312, |
| "lp_dist": -11.28603744506836, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-203", |
| "gold_norm": "6", |
| "dist_norm": "100", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.780631840229034, |
| "lp_gold": -15.693442344665527, |
| "lp_dist": -22.47407418489456, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.30402946472168, |
| "lp_gold": -9.650990724563599, |
| "lp_dist": -16.95502018928528, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-204", |
| "gold_norm": "100", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.23692995309829712, |
| "lp_gold": -17.014364540576935, |
| "lp_dist": -16.777434587478638, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6824193000793457, |
| "lp_gold": -11.578391790390015, |
| "lp_dist": -10.895972490310669, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-205", |
| "gold_norm": "10", |
| "dist_norm": "15", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.3394050598144531, |
| "lp_gold": -18.575839042663574, |
| "lp_dist": -18.23643398284912, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.5095596313476562, |
| "lp_gold": -18.252729892730713, |
| "lp_dist": -18.76228952407837, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-206", |
| "gold_norm": "15", |
| "dist_norm": "22", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.2520769834518433, |
| "lp_gold": -16.89211142063141, |
| "lp_dist": -18.144188404083252, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.4231153726577759, |
| "lp_gold": -14.139848232269287, |
| "lp_dist": -15.562963604927063, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-207", |
| "gold_norm": "22", |
| "dist_norm": "16", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.621975004673004, |
| "lp_gold": -11.219844043254852, |
| "lp_dist": -13.841819047927856, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.833777904510498, |
| "lp_gold": -10.41726541519165, |
| "lp_dist": -9.583487510681152, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-208", |
| "gold_norm": "16", |
| "dist_norm": "16", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -11.36221162811853, |
| "lp_dist": -11.36221162811853, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -13.241074323654175, |
| "lp_dist": -13.241074323654175, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-209", |
| "gold_norm": "16", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.475744724273682, |
| "lp_gold": -19.41911506652832, |
| "lp_dist": -11.943370342254639, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.438729763031006, |
| "lp_gold": -15.213366270065308, |
| "lp_dist": -10.774636507034302, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-210", |
| "gold_norm": "5", |
| "dist_norm": "23", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.465651273727417, |
| "lp_gold": -15.02237606048584, |
| "lp_dist": -21.488027334213257, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.424657344818115, |
| "lp_gold": -8.115961074829102, |
| "lp_dist": -15.540618419647217, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-211", |
| "gold_norm": "23", |
| "dist_norm": "30", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.4010072350502014, |
| "lp_gold": -15.865891933441162, |
| "lp_dist": -15.46488469839096, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.2698206305503845, |
| "lp_gold": -18.3642840385437, |
| "lp_dist": -13.094463407993317, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-212", |
| "gold_norm": "30", |
| "dist_norm": "14000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 23.520719528198242, |
| "lp_gold": -19.58930778503418, |
| "lp_dist": -43.11002731323242, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 18.009515285491943, |
| "lp_gold": -13.2671217918396, |
| "lp_dist": -31.276637077331543, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 6 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-213", |
| "gold_norm": "14000", |
| "dist_norm": "60", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.45991032384336, |
| "lp_gold": -21.998028149828315, |
| "lp_dist": -15.538117825984955, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.547004419262521, |
| "lp_gold": -14.710521432454698, |
| "lp_dist": -13.163517013192177, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-214", |
| "gold_norm": "60", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.948975563049316, |
| "lp_gold": -23.77088451385498, |
| "lp_dist": -14.821908950805664, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.54961109161377, |
| "lp_gold": -19.638930320739746, |
| "lp_dist": -11.089319229125977, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-215", |
| "gold_norm": "2", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.387197434902191, |
| "lp_gold": -9.424871981143951, |
| "lp_dist": -12.812069416046143, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5975170135498047, |
| "lp_gold": -9.744040250778198, |
| "lp_dist": -8.146523237228394, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-216", |
| "gold_norm": "3", |
| "dist_norm": "30", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.872649192810059, |
| "lp_gold": -14.82950735092163, |
| "lp_dist": -20.70215654373169, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.080531597137451, |
| "lp_gold": -11.414220809936523, |
| "lp_dist": -13.494752407073975, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-217", |
| "gold_norm": "30", |
| "dist_norm": "1920", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 21.697412703186274, |
| "lp_gold": -13.986085917800665, |
| "lp_dist": -35.68349862098694, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.313387870788574, |
| "lp_gold": -12.878417491912842, |
| "lp_dist": -25.191805362701416, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-218", |
| "gold_norm": "1920", |
| "dist_norm": "84", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6947197169065475, |
| "lp_gold": -25.65590851008892, |
| "lp_dist": -23.961188793182373, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.878614127635956, |
| "lp_gold": -21.65609782934189, |
| "lp_dist": -14.777483701705933, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-219", |
| "gold_norm": "84", |
| "dist_norm": "8", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1246260404586792, |
| "lp_gold": -13.33847463130951, |
| "lp_dist": -12.21384859085083, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.7234134674072266, |
| "lp_gold": -12.977782487869263, |
| "lp_dist": -10.254369020462036, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-220", |
| "gold_norm": "8", |
| "dist_norm": "12", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.304245948791504, |
| "lp_gold": -15.825139999389648, |
| "lp_dist": -14.520894050598145, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.1817718744277954, |
| "lp_gold": -12.667408466339111, |
| "lp_dist": -12.849180340766907, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-221", |
| "gold_norm": "12", |
| "dist_norm": "260", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.42927235364914, |
| "lp_gold": -9.642007768154144, |
| "lp_dist": -18.071280121803284, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.9336384534835815, |
| "lp_gold": -7.166749358177185, |
| "lp_dist": -14.100387811660767, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-222", |
| "gold_norm": "260", |
| "dist_norm": "288", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.7796518057584763, |
| "lp_gold": -18.89673836529255, |
| "lp_dist": -22.676390171051025, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.0456210374832153, |
| "lp_gold": -16.525109887123108, |
| "lp_dist": -18.570730924606323, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-223", |
| "gold_norm": "288", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -19.030277393758297, |
| "lp_gold": -26.646236896514893, |
| "lp_dist": -7.615959502756596, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -15.758692368865013, |
| "lp_gold": -20.088956594467163, |
| "lp_dist": -4.33026422560215, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-224", |
| "gold_norm": "3", |
| "dist_norm": "1596", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.595462799072266, |
| "lp_gold": -14.905784606933594, |
| "lp_dist": -31.50124740600586, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.13599407672882, |
| "lp_gold": -13.950559616088867, |
| "lp_dist": -30.086553692817688, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-225", |
| "gold_norm": "1596", |
| "dist_norm": "81", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.9876238331198692, |
| "lp_gold": -17.49847326427698, |
| "lp_dist": -19.48609709739685, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.1633647084236145, |
| "lp_gold": -24.1348779797554, |
| "lp_dist": -17.971513271331787, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-226", |
| "gold_norm": "81", |
| "dist_norm": "56", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.605985850095749, |
| "lp_gold": -15.771342545747757, |
| "lp_dist": -19.377328395843506, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6331486701965332, |
| "lp_gold": -16.669665813446045, |
| "lp_dist": -15.036517143249512, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-227", |
| "gold_norm": "56", |
| "dist_norm": "1490", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.239468812942505, |
| "lp_gold": -14.987546801567078, |
| "lp_dist": -31.227015614509583, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.551798105239868, |
| "lp_gold": -13.551477909088135, |
| "lp_dist": -25.103276014328003, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-228", |
| "gold_norm": "1490", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -19.20861628651619, |
| "lp_gold": -30.783629894256592, |
| "lp_dist": -11.575013607740402, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -13.095399379730225, |
| "lp_gold": -24.58754062652588, |
| "lp_dist": -11.492141246795654, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-229", |
| "gold_norm": "2", |
| "dist_norm": "20", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.965806007385254, |
| "lp_gold": -11.508173823356628, |
| "lp_dist": -17.473979830741882, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.32473722100257874, |
| "lp_gold": -11.093923568725586, |
| "lp_dist": -11.418660789728165, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-230", |
| "gold_norm": "20", |
| "dist_norm": "11", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1644073724746704, |
| "lp_gold": -14.290618896484375, |
| "lp_dist": -13.126211524009705, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.0560493469238281, |
| "lp_gold": -12.166522026062012, |
| "lp_dist": -13.22257137298584, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-231", |
| "gold_norm": "11", |
| "dist_norm": "120", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.219505786895752, |
| "lp_gold": -14.914972305297852, |
| "lp_dist": -13.6954665184021, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.7893390655517578, |
| "lp_gold": -11.986905813217163, |
| "lp_dist": -13.776244878768921, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-232", |
| "gold_norm": "120", |
| "dist_norm": "45", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.419019672088325, |
| "lp_gold": -15.604393211193383, |
| "lp_dist": -20.023412883281708, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.708264172077179, |
| "lp_gold": -12.799233138561249, |
| "lp_dist": -13.507497310638428, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-233", |
| "gold_norm": "45", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5151035785675049, |
| "lp_gold": -11.359116911888123, |
| "lp_dist": -9.844013333320618, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.095129489898682, |
| "lp_gold": -14.39400601387024, |
| "lp_dist": -6.298876523971558, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-234", |
| "gold_norm": "10", |
| "dist_norm": "9", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.05903661251068115, |
| "lp_gold": -13.167555451393127, |
| "lp_dist": -13.226592063903809, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.7660583406686783, |
| "lp_gold": -6.627322778105736, |
| "lp_dist": -10.393381118774414, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-235", |
| "gold_norm": "9", |
| "dist_norm": "33", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.191148281097412, |
| "lp_gold": -14.321090459823608, |
| "lp_dist": -21.51223874092102, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.510578155517578, |
| "lp_gold": -13.556029319763184, |
| "lp_dist": -16.06660747528076, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-236", |
| "gold_norm": "33", |
| "dist_norm": "150", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.496721982955933, |
| "lp_gold": -19.231878995895386, |
| "lp_dist": -28.72860097885132, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.684651017189026, |
| "lp_gold": -13.231264114379883, |
| "lp_dist": -18.91591513156891, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-237", |
| "gold_norm": "150", |
| "dist_norm": "60", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.496449222322553, |
| "lp_gold": -12.51727462792769, |
| "lp_dist": -20.013723850250244, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.5342855900526047, |
| "lp_gold": -11.47604425251484, |
| "lp_dist": -12.010329842567444, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-238", |
| "gold_norm": "60", |
| "dist_norm": "4", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.38430750370025635, |
| "lp_gold": -13.078525424003601, |
| "lp_dist": -13.462832927703857, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5117335319519043, |
| "lp_gold": -10.930900573730469, |
| "lp_dist": -9.419167041778564, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-239", |
| "gold_norm": "4", |
| "dist_norm": "7", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.3680305480957031, |
| "lp_gold": -12.839935302734375, |
| "lp_dist": -13.207965850830078, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.0732803344726562, |
| "lp_gold": -11.00877571105957, |
| "lp_dist": -12.082056045532227, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-240", |
| "gold_norm": "7", |
| "dist_norm": "3140", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 22.65280568599701, |
| "lp_gold": -12.292606830596924, |
| "lp_dist": -34.94541251659393, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 18.48963165283203, |
| "lp_gold": -9.809606552124023, |
| "lp_dist": -28.299238204956055, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-241", |
| "gold_norm": "3140", |
| "dist_norm": "19", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.30290687084198, |
| "lp_gold": -22.36732530593872, |
| "lp_dist": -14.06441843509674, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -13.02580738067627, |
| "lp_gold": -25.325818061828613, |
| "lp_dist": -12.300010681152344, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-242", |
| "gold_norm": "19", |
| "dist_norm": "6", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.181618273258209, |
| "lp_gold": -12.157159745693207, |
| "lp_dist": -17.338778018951416, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4070416688919067, |
| "lp_gold": -10.997576355934143, |
| "lp_dist": -9.590534687042236, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-243", |
| "gold_norm": "6", |
| "dist_norm": "90", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.3495291471481323, |
| "lp_gold": -19.138280868530273, |
| "lp_dist": -22.487810015678406, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.812922954559326, |
| "lp_gold": -11.337668418884277, |
| "lp_dist": -17.150591373443604, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-244", |
| "gold_norm": "90", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6097938957027509, |
| "lp_gold": -13.304834717731865, |
| "lp_dist": -11.695040822029114, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.5438682280946523, |
| "lp_gold": -7.949862555367872, |
| "lp_dist": -10.493730783462524, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-245", |
| "gold_norm": "10", |
| "dist_norm": "130000", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 26.08018460869789, |
| "lp_gold": -8.254620164632797, |
| "lp_dist": -34.33480477333069, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 7 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.28964650630951, |
| "lp_gold": -7.219025731086731, |
| "lp_dist": -20.50867223739624, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 7 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-246", |
| "gold_norm": "130000", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.715070237376494, |
| "lp_gold": -12.472108629561262, |
| "lp_dist": -17.187178866937757, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.156454911455512, |
| "lp_gold": -13.647521084174514, |
| "lp_dist": -6.491066172719002, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-247", |
| "gold_norm": "10", |
| "dist_norm": "525", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 17.677427016198635, |
| "lp_gold": -15.68656424432993, |
| "lp_dist": -33.363991260528564, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.708948612213135, |
| "lp_gold": -11.453617930412292, |
| "lp_dist": -23.162566542625427, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-248", |
| "gold_norm": "525", |
| "dist_norm": "180", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.852982550859451, |
| "lp_gold": -11.824598759412766, |
| "lp_dist": -18.677581310272217, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.6725820302963257, |
| "lp_gold": -11.64935302734375, |
| "lp_dist": -12.321935057640076, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-249", |
| "gold_norm": "180", |
| "dist_norm": "1200", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.544742852449417, |
| "lp_gold": -8.964889764785767, |
| "lp_dist": -22.509632617235184, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 15.23933944106102, |
| "lp_gold": -7.498180732131004, |
| "lp_dist": -22.737520173192024, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-250", |
| "gold_norm": "1200", |
| "dist_norm": "25", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.737206868827343, |
| "lp_gold": -9.13471419364214, |
| "lp_dist": -18.871921062469482, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.1686492152512074, |
| "lp_gold": -8.043186407536268, |
| "lp_dist": -10.211835622787476, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-251", |
| "gold_norm": "25", |
| "dist_norm": "21", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8187389373779297, |
| "lp_gold": -12.140745043754578, |
| "lp_dist": -12.959483981132507, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.511936068534851, |
| "lp_gold": -6.530247092247009, |
| "lp_dist": -10.04218316078186, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-252", |
| "gold_norm": "21", |
| "dist_norm": "2304", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 16.503239154815674, |
| "lp_gold": -17.229759454727173, |
| "lp_dist": -33.73299860954285, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 14.275152683258057, |
| "lp_gold": -13.177631378173828, |
| "lp_dist": -27.452784061431885, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-253", |
| "gold_norm": "2304", |
| "dist_norm": "2325", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.856137990951538, |
| "lp_gold": -25.891463041305542, |
| "lp_dist": -30.74760103225708, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9878709018230438, |
| "lp_gold": -25.637850552797318, |
| "lp_dist": -26.62572145462036, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-254", |
| "gold_norm": "2325", |
| "dist_norm": "15", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.660225659608841, |
| "lp_gold": -13.980357348918915, |
| "lp_dist": -11.320131689310074, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.668605744838715, |
| "lp_gold": -16.375044524669647, |
| "lp_dist": -11.706438779830933, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-255", |
| "gold_norm": "15", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.416126787662506, |
| "lp_gold": -14.69802612066269, |
| "lp_dist": -21.114152908325195, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.287871062755585, |
| "lp_gold": -10.296239674091339, |
| "lp_dist": -17.584110736846924, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| } |
| ], |
| "flip_rows": [ |
| { |
| "ex_id": "gsm8k-test-1", |
| "gold_norm": "80", |
| "dist_norm": "12", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.218225084245205, |
| "lp_gold": -16.316218174993992, |
| "lp_dist": -17.534443259239197, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6342043727636337, |
| "lp_gold": -18.493512138724327, |
| "lp_dist": -17.859307765960693, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.4292978644371033, |
| "lp_gold": -17.579256772994995, |
| "lp_dist": -15.149958908557892, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.5465927198529243, |
| "lp_gold": -17.71218091994524, |
| "lp_dist": -15.165588200092316, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.1809745579957962, |
| "lp_gold": -14.873322412371635, |
| "lp_dist": -15.054296970367432, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.3117252141237259, |
| "lp_gold": -17.119899585843086, |
| "lp_dist": -16.80817437171936, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6342088878154755, |
| "lp_gold": -18.49351069331169, |
| "lp_dist": -17.859301805496216, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-5", |
| "gold_norm": "3200", |
| "dist_norm": "38", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.075981711270288, |
| "lp_gold": -15.808944131014869, |
| "lp_dist": -19.884925842285156, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.112908275797963, |
| "lp_gold": -17.281133087351918, |
| "lp_dist": -15.168224811553955, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.29585185274481773, |
| "lp_gold": -18.88566479459405, |
| "lp_dist": -19.181516647338867, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.4162398586049676, |
| "lp_gold": -18.945031284354627, |
| "lp_dist": -19.361271142959595, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.9830470234155655, |
| "lp_gold": -17.90198041498661, |
| "lp_dist": -16.918933391571045, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.479109164327383, |
| "lp_gold": -18.568949338048697, |
| "lp_dist": -16.089840173721313, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.112905891612172, |
| "lp_gold": -17.28112688846886, |
| "lp_dist": -15.16822099685669, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-14", |
| "gold_norm": "800", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.7909989710897207, |
| "lp_gold": -10.428668463602662, |
| "lp_dist": -13.219667434692383, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.28849396109581, |
| "lp_gold": -15.488610118627548, |
| "lp_dist": -9.200116157531738, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.221886307001114, |
| "lp_gold": -17.41419091820717, |
| "lp_dist": -14.192304611206055, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.1399324536323547, |
| "lp_gold": -16.989762604236603, |
| "lp_dist": -13.849830150604248, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.316341996192932, |
| "lp_gold": -16.00817358493805, |
| "lp_dist": -10.691831588745117, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.094054877758026, |
| "lp_gold": -17.299417197704315, |
| "lp_dist": -11.205362319946289, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.288493096828461, |
| "lp_gold": -15.488613307476044, |
| "lp_dist": -9.200120210647583, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-20", |
| "gold_norm": "106", |
| "dist_norm": "80", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.92336449585855, |
| "lp_gold": -15.76830449141562, |
| "lp_dist": -20.69166898727417, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6747859213501215, |
| "lp_gold": -17.988985607400537, |
| "lp_dist": -17.314199686050415, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.9746298789978027, |
| "lp_gold": -22.760347604751587, |
| "lp_dist": -18.785717725753784, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.769364595413208, |
| "lp_gold": -22.492818355560303, |
| "lp_dist": -18.723453760147095, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.8601390519179404, |
| "lp_gold": -22.44402221823111, |
| "lp_dist": -19.58388316631317, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.127611498348415, |
| "lp_gold": -16.912333111278713, |
| "lp_dist": -14.784721612930298, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6747779380530119, |
| "lp_gold": -17.988986684009433, |
| "lp_dist": -17.31420874595642, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-24", |
| "gold_norm": "9", |
| "dist_norm": "40", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.006132304668426514, |
| "lp_gold": -15.839151382446289, |
| "lp_dist": -15.845283687114716, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.110446274280548, |
| "lp_gold": -12.25863265991211, |
| "lp_dist": -10.148186385631561, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.04043316841125488, |
| "lp_gold": -14.208327531814575, |
| "lp_dist": -14.24876070022583, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.17713472247123718, |
| "lp_gold": -14.100894212722778, |
| "lp_dist": -14.278028935194016, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.736970603466034, |
| "lp_gold": -13.546704292297363, |
| "lp_dist": -11.80973368883133, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5443955063819885, |
| "lp_gold": -14.465348243713379, |
| "lp_dist": -12.92095273733139, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.110443741083145, |
| "lp_gold": -12.258633613586426, |
| "lp_dist": -10.14818987250328, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-27", |
| "gold_norm": "160", |
| "dist_norm": "6", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.597537249326706, |
| "lp_gold": -12.841732293367386, |
| "lp_dist": -14.439269542694092, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.338132083415985, |
| "lp_gold": -17.455387771129608, |
| "lp_dist": -10.117255687713623, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.827625960111618, |
| "lp_gold": -15.895989626646042, |
| "lp_dist": -13.068363666534424, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.9387161433696747, |
| "lp_gold": -16.01883837580681, |
| "lp_dist": -13.080122232437134, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.034337878227234, |
| "lp_gold": -23.63157594203949, |
| "lp_dist": -15.597238063812256, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.914303302764893, |
| "lp_gold": -19.091363430023193, |
| "lp_dist": -11.1770601272583, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.338137567043304, |
| "lp_gold": -17.455391585826874, |
| "lp_dist": -10.11725401878357, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-31", |
| "gold_norm": "68", |
| "dist_norm": "31", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.820281505584717, |
| "lp_gold": -15.837103843688965, |
| "lp_dist": -19.65738534927368, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.25214481353759766, |
| "lp_gold": -12.841001033782959, |
| "lp_dist": -12.588856220245361, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.767996072769165, |
| "lp_gold": -13.063536882400513, |
| "lp_dist": -17.831532955169678, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.778961181640625, |
| "lp_gold": -12.704540252685547, |
| "lp_dist": -17.483501434326172, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.2866086959838867, |
| "lp_gold": -13.741567134857178, |
| "lp_dist": -13.454958438873291, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.2309262752532959, |
| "lp_gold": -13.31624436378479, |
| "lp_dist": -13.085318088531494, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.2521398067474365, |
| "lp_gold": -12.840993165969849, |
| "lp_dist": -12.588853359222412, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-35", |
| "gold_norm": "480", |
| "dist_norm": "520", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.5111888945102692, |
| "lp_gold": -12.033819317817688, |
| "lp_dist": -13.545008212327957, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.8960548639297485, |
| "lp_gold": -19.025392055511475, |
| "lp_dist": -16.129337191581726, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.173134446144104, |
| "lp_gold": -16.326287806034088, |
| "lp_dist": -16.499422252178192, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.15610426664352417, |
| "lp_gold": -16.418395936489105, |
| "lp_dist": -16.57450020313263, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.612240791320801, |
| "lp_gold": -17.892987489700317, |
| "lp_dist": -15.280746698379517, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.1868066787719727, |
| "lp_gold": -17.928332090377808, |
| "lp_dist": -14.741525411605835, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.8960519433021545, |
| "lp_gold": -19.0253963470459, |
| "lp_dist": -16.129344403743744, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-45", |
| "gold_norm": "310", |
| "dist_norm": "100", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9456039071083069, |
| "lp_gold": -16.172270894050598, |
| "lp_dist": -17.117874801158905, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.856696009635925, |
| "lp_gold": -16.36608850955963, |
| "lp_dist": -10.509392499923706, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7440242022275925, |
| "lp_gold": -17.919243693351746, |
| "lp_dist": -16.175219491124153, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6509404331445694, |
| "lp_gold": -17.9195556640625, |
| "lp_dist": -16.26861523091793, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.5654631853103638, |
| "lp_gold": -15.380214095115662, |
| "lp_dist": -11.814750909805298, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.163789927959442, |
| "lp_gold": -17.801445245742798, |
| "lp_dist": -11.637655317783356, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.856698274612427, |
| "lp_gold": -16.366087794303894, |
| "lp_dist": -10.509389519691467, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-48", |
| "gold_norm": "25", |
| "dist_norm": "1400", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.125034153461456, |
| "lp_gold": -16.447975158691406, |
| "lp_dist": -23.573009312152863, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.08089584112167358, |
| "lp_gold": -13.430449962615967, |
| "lp_dist": -13.349554121494293, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.025165379047394, |
| "lp_gold": -15.498599290847778, |
| "lp_dist": -19.523764669895172, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.622505933046341, |
| "lp_gold": -15.329206466674805, |
| "lp_dist": -18.951712399721146, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6573466360569, |
| "lp_gold": -16.551159858703613, |
| "lp_dist": -14.893813222646713, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2268932834267616, |
| "lp_gold": -15.390444993972778, |
| "lp_dist": -14.163551710546017, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.08089244365692139, |
| "lp_gold": -13.43044662475586, |
| "lp_dist": -13.349554181098938, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-64", |
| "gold_norm": "655", |
| "dist_norm": "800", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.8698419332504272, |
| "lp_gold": -17.930187582969666, |
| "lp_dist": -19.800029516220093, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4062272310256958, |
| "lp_gold": -14.47088611125946, |
| "lp_dist": -13.064658880233765, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.005745887756347656, |
| "lp_gold": -18.56607985496521, |
| "lp_dist": -18.560333967208862, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.33770978450775146, |
| "lp_gold": -18.740556836128235, |
| "lp_dist": -18.402847051620483, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1954535841941833, |
| "lp_gold": -16.162434339523315, |
| "lp_dist": -14.966980755329132, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.086472988128662, |
| "lp_gold": -14.544449806213379, |
| "lp_dist": -13.457976818084717, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4062250852584839, |
| "lp_gold": -14.470888018608093, |
| "lp_dist": -13.06466293334961, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-100", |
| "gold_norm": "250", |
| "dist_norm": "12", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.937235951423645, |
| "lp_gold": -16.930358290672302, |
| "lp_dist": -18.867594242095947, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8317363262176514, |
| "lp_gold": -17.415368795394897, |
| "lp_dist": -15.583632469177246, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7292950004339218, |
| "lp_gold": -17.88250456750393, |
| "lp_dist": -16.153209567070007, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.9416460394859314, |
| "lp_gold": -17.420925438404083, |
| "lp_dist": -15.479279398918152, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7523078918457031, |
| "lp_gold": -15.994086980819702, |
| "lp_dist": -14.241779088973999, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.070462703704834, |
| "lp_gold": -15.625900983810425, |
| "lp_dist": -14.55543828010559, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8317327499389648, |
| "lp_gold": -17.415366888046265, |
| "lp_dist": -15.5836341381073, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-104", |
| "gold_norm": "26", |
| "dist_norm": "42", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.550231754779816, |
| "lp_gold": -16.079154193401337, |
| "lp_dist": -20.629385948181152, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.743180751800537, |
| "lp_gold": -20.936619758605957, |
| "lp_dist": -17.19343900680542, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.501150131225586, |
| "lp_gold": -20.479767084121704, |
| "lp_dist": -18.978616952896118, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.586869239807129, |
| "lp_gold": -20.261481761932373, |
| "lp_dist": -18.674612522125244, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.488091826438904, |
| "lp_gold": -20.40666627883911, |
| "lp_dist": -17.918574452400208, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.9733150005340576, |
| "lp_gold": -20.461706161499023, |
| "lp_dist": -17.488391160964966, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.7431836128234863, |
| "lp_gold": -20.936622619628906, |
| "lp_dist": -17.19343900680542, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-105", |
| "gold_norm": "42", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2448320388793945, |
| "lp_gold": -17.29369354248047, |
| "lp_dist": -19.538525581359863, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.48531031608581543, |
| "lp_gold": -14.891574144363403, |
| "lp_dist": -14.406263828277588, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6142416000366211, |
| "lp_gold": -20.099190711975098, |
| "lp_dist": -19.484949111938477, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.2368631362915039, |
| "lp_gold": -19.605250358581543, |
| "lp_dist": -19.36838722229004, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.025758981704712, |
| "lp_gold": -16.692798852920532, |
| "lp_dist": -15.66703987121582, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.37449169158935547, |
| "lp_gold": -14.766992568969727, |
| "lp_dist": -14.392500877380371, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.48531413078308105, |
| "lp_gold": -14.891570806503296, |
| "lp_dist": -14.406256675720215, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-107", |
| "gold_norm": "14400", |
| "dist_norm": "400", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8998411595821381, |
| "lp_gold": -19.079706698656082, |
| "lp_dist": -19.97954785823822, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.72913409024477, |
| "lp_gold": -22.703229255974293, |
| "lp_dist": -20.974095165729523, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.19682685285806656, |
| "lp_gold": -22.464857898652554, |
| "lp_dist": -22.66168475151062, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.14343415200710297, |
| "lp_gold": -22.713250055909157, |
| "lp_dist": -22.85668420791626, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.5759153068065643, |
| "lp_gold": -27.662475764751434, |
| "lp_dist": -24.08656045794487, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.206382006406784, |
| "lp_gold": -25.311469167470932, |
| "lp_dist": -23.105087161064148, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7291278392076492, |
| "lp_gold": -22.70323248207569, |
| "lp_dist": -20.974104642868042, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-110", |
| "gold_norm": "83", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9284783601760864, |
| "lp_gold": -14.152065396308899, |
| "lp_dist": -15.080543756484985, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.6964447498321533, |
| "lp_gold": -11.795601606369019, |
| "lp_dist": -8.099156856536865, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5404622554779053, |
| "lp_gold": -14.069255948066711, |
| "lp_dist": -12.528793692588806, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4889593124389648, |
| "lp_gold": -14.175909280776978, |
| "lp_dist": -12.686949968338013, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.8712440729141235, |
| "lp_gold": -14.255087852478027, |
| "lp_dist": -11.383843779563904, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.085860013961792, |
| "lp_gold": -12.02087950706482, |
| "lp_dist": -8.935019493103027, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.696447730064392, |
| "lp_gold": -11.795600891113281, |
| "lp_dist": -8.09915316104889, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-120", |
| "gold_norm": "335", |
| "dist_norm": "60", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.718258023262024, |
| "lp_gold": -16.84885323047638, |
| "lp_dist": -18.567111253738403, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.495100736618042, |
| "lp_gold": -16.78837823867798, |
| "lp_dist": -13.293277502059937, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.745102643966675, |
| "lp_gold": -21.85818600654602, |
| "lp_dist": -15.113083362579346, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.640980243682861, |
| "lp_gold": -21.43053102493286, |
| "lp_dist": -14.78955078125, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.621345937252045, |
| "lp_gold": -18.077104091644287, |
| "lp_dist": -13.455758154392242, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.602914571762085, |
| "lp_gold": -19.734163284301758, |
| "lp_dist": -15.131248712539673, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.4950921535491943, |
| "lp_gold": -16.78837251663208, |
| "lp_dist": -13.293280363082886, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-134", |
| "gold_norm": "18", |
| "dist_norm": "4", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.9239641074091196, |
| "lp_gold": -15.171829616650939, |
| "lp_dist": -19.09579372406006, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7061721086502075, |
| "lp_gold": -9.731460690498352, |
| "lp_dist": -9.025288581848145, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.6682674884796143, |
| "lp_gold": -13.212559461593628, |
| "lp_dist": -13.880826950073242, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.691381573677063, |
| "lp_gold": -13.084003806114197, |
| "lp_dist": -13.77538537979126, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.0653414726257324, |
| "lp_gold": -11.70475959777832, |
| "lp_dist": -8.639418125152588, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6070647239685059, |
| "lp_gold": -11.957983255386353, |
| "lp_dist": -10.350918531417847, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7061715722084045, |
| "lp_gold": -9.731466829776764, |
| "lp_dist": -9.02529525756836, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-138", |
| "gold_norm": "21", |
| "dist_norm": "25", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.6713391542434692, |
| "lp_gold": -17.788984179496765, |
| "lp_dist": -20.460323333740234, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8663175106048584, |
| "lp_gold": -14.334570407867432, |
| "lp_dist": -13.468252897262573, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.3505210876464844, |
| "lp_gold": -21.58456540107727, |
| "lp_dist": -21.935086488723755, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.2903881072998047, |
| "lp_gold": -22.286190509796143, |
| "lp_dist": -22.576578617095947, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.9333076477050781, |
| "lp_gold": -15.113465785980225, |
| "lp_dist": -14.180158138275146, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7268631458282471, |
| "lp_gold": -13.463084697723389, |
| "lp_dist": -12.736221551895142, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8663196563720703, |
| "lp_gold": -14.334563970565796, |
| "lp_dist": -13.468244314193726, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-145", |
| "gold_norm": "2", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.2492438331246376, |
| "lp_gold": -13.602060556411743, |
| "lp_dist": -14.85130438953638, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.6407327204942703, |
| "lp_gold": -12.194403648376465, |
| "lp_dist": -9.553670927882195, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.6734669059514999, |
| "lp_gold": -13.396984100341797, |
| "lp_dist": -14.070451006293297, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8539667278528214, |
| "lp_gold": -13.266191005706787, |
| "lp_dist": -14.120157733559608, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.3427896350622177, |
| "lp_gold": -10.751940488815308, |
| "lp_dist": -9.40915085375309, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.827908754348755, |
| "lp_gold": -14.305924892425537, |
| "lp_dist": -11.478016138076782, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.640733018517494, |
| "lp_gold": -12.194400548934937, |
| "lp_dist": -9.553667530417442, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-151", |
| "gold_norm": "803", |
| "dist_norm": "16", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.22600507736206055, |
| "lp_gold": -20.624857425689697, |
| "lp_dist": -20.850862503051758, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.131483793258667, |
| "lp_gold": -19.228359699249268, |
| "lp_dist": -14.0968759059906, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.2413750886917114, |
| "lp_gold": -19.84936547279358, |
| "lp_dist": -16.607990384101868, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.131054997444153, |
| "lp_gold": -19.767980694770813, |
| "lp_dist": -16.63692569732666, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.6819539070129395, |
| "lp_gold": -18.495857000350952, |
| "lp_dist": -13.813903093338013, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.957081317901611, |
| "lp_gold": -20.852898836135864, |
| "lp_dist": -15.895817518234253, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.131482243537903, |
| "lp_gold": -19.228361129760742, |
| "lp_dist": -14.09687888622284, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-153", |
| "gold_norm": "280", |
| "dist_norm": "13", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.694299184717238, |
| "lp_gold": -15.244264638982713, |
| "lp_dist": -18.93856382369995, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8231047093868256, |
| "lp_gold": -12.64960965514183, |
| "lp_dist": -11.826504945755005, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8308207541704178, |
| "lp_gold": -15.758303448557854, |
| "lp_dist": -16.58912420272827, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.7265715599060059, |
| "lp_gold": -16.042329788208008, |
| "lp_dist": -16.768901348114014, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1412931680679321, |
| "lp_gold": -14.319903492927551, |
| "lp_dist": -13.17861032485962, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4246297478675842, |
| "lp_gold": -15.16783195734024, |
| "lp_dist": -13.743202209472656, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8231084495782852, |
| "lp_gold": -12.649619355797768, |
| "lp_dist": -11.826510906219482, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-154", |
| "gold_norm": "13", |
| "dist_norm": "20", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.0602927803993225, |
| "lp_gold": -15.215918719768524, |
| "lp_dist": -17.276211500167847, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.633309006690979, |
| "lp_gold": -12.038846015930176, |
| "lp_dist": -11.405537009239197, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6033051013946533, |
| "lp_gold": -17.432909965515137, |
| "lp_dist": -16.829604864120483, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6372992992401123, |
| "lp_gold": -17.39775514602661, |
| "lp_dist": -16.7604558467865, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.48968505859375, |
| "lp_gold": -13.174037456512451, |
| "lp_dist": -11.684352397918701, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2143868207931519, |
| "lp_gold": -12.643470287322998, |
| "lp_dist": -11.429083466529846, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6333088874816895, |
| "lp_gold": -12.038848400115967, |
| "lp_dist": -11.405539512634277, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-175", |
| "gold_norm": "350", |
| "dist_norm": "50", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9250896275043488, |
| "lp_gold": -22.99281856417656, |
| "lp_dist": -23.917908191680908, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.236621737480164, |
| "lp_gold": -17.58333122730255, |
| "lp_dist": -10.346709489822388, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.966416835784912, |
| "lp_gold": -20.354339838027954, |
| "lp_dist": -17.387923002243042, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.5315762758255005, |
| "lp_gold": -21.129161953926086, |
| "lp_dist": -17.597585678100586, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.064225733280182, |
| "lp_gold": -18.191003382205963, |
| "lp_dist": -11.126777648925781, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.836197316646576, |
| "lp_gold": -21.282770097255707, |
| "lp_dist": -13.44657278060913, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.236626446247101, |
| "lp_gold": -17.583332121372223, |
| "lp_dist": -10.346705675125122, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-207", |
| "gold_norm": "22", |
| "dist_norm": "16", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.621975004673004, |
| "lp_gold": -11.219844043254852, |
| "lp_dist": -13.841819047927856, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.833777904510498, |
| "lp_gold": -10.41726541519165, |
| "lp_dist": -9.583487510681152, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4578208923339844, |
| "lp_gold": -15.501156091690063, |
| "lp_dist": -14.043335199356079, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.725459337234497, |
| "lp_gold": -15.87945008277893, |
| "lp_dist": -14.153990745544434, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1624937057495117, |
| "lp_gold": -15.793409585952759, |
| "lp_dist": -14.630915880203247, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6316690444946289, |
| "lp_gold": -10.619104146957397, |
| "lp_dist": -9.987435102462769, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8337790966033936, |
| "lp_gold": -10.417269945144653, |
| "lp_dist": -9.58349084854126, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-215", |
| "gold_norm": "2", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.387197434902191, |
| "lp_gold": -9.424871981143951, |
| "lp_dist": -12.812069416046143, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5975170135498047, |
| "lp_gold": -9.744040250778198, |
| "lp_dist": -8.146523237228394, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.2604313492774963, |
| "lp_gold": -12.714868068695068, |
| "lp_dist": -9.454436719417572, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.084445893764496, |
| "lp_gold": -12.945753574371338, |
| "lp_dist": -9.861307680606842, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5732927322387695, |
| "lp_gold": -12.206480503082275, |
| "lp_dist": -10.633187770843506, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1921443939208984, |
| "lp_gold": -10.054778575897217, |
| "lp_dist": -8.862634181976318, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5975122451782227, |
| "lp_gold": -9.744039058685303, |
| "lp_dist": -8.14652681350708, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-225", |
| "gold_norm": "1596", |
| "dist_norm": "81", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.9876238331198692, |
| "lp_gold": -17.49847326427698, |
| "lp_dist": -19.48609709739685, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.1633647084236145, |
| "lp_gold": -24.1348779797554, |
| "lp_dist": -17.971513271331787, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7931787371635437, |
| "lp_gold": -18.82551997900009, |
| "lp_dist": -17.032341241836548, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.224741965532303, |
| "lp_gold": -19.164868861436844, |
| "lp_dist": -16.94012689590454, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -10.08569085597992, |
| "lp_gold": -30.44549548625946, |
| "lp_dist": -20.35980463027954, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -9.3379967212677, |
| "lp_gold": -30.265968084335327, |
| "lp_dist": -20.927971363067627, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.163362383842468, |
| "lp_gold": -24.134878516197205, |
| "lp_dist": -17.971516132354736, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-226", |
| "gold_norm": "81", |
| "dist_norm": "56", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.605985850095749, |
| "lp_gold": -15.771342545747757, |
| "lp_dist": -19.377328395843506, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6331486701965332, |
| "lp_gold": -16.669665813446045, |
| "lp_dist": -15.036517143249512, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.214464545249939, |
| "lp_gold": -16.49272656440735, |
| "lp_dist": -15.27826201915741, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.9139895439147949, |
| "lp_gold": -16.460952043533325, |
| "lp_dist": -15.54696249961853, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7367334365844727, |
| "lp_gold": -19.27399492263794, |
| "lp_dist": -18.537261486053467, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8308906555175781, |
| "lp_gold": -16.610596179962158, |
| "lp_dist": -14.77970552444458, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6331486701965332, |
| "lp_gold": -16.66966152191162, |
| "lp_dist": -15.036512851715088, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-238", |
| "gold_norm": "60", |
| "dist_norm": "4", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.38430750370025635, |
| "lp_gold": -13.078525424003601, |
| "lp_dist": -13.462832927703857, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5117335319519043, |
| "lp_gold": -10.930900573730469, |
| "lp_dist": -9.419167041778564, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7834737300872803, |
| "lp_gold": -15.81191873550415, |
| "lp_dist": -14.02844500541687, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.6578322649002075, |
| "lp_gold": -15.757366299629211, |
| "lp_dist": -14.099534034729004, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.469817161560059, |
| "lp_gold": -15.78016185760498, |
| "lp_dist": -10.310344696044922, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.3997215032577515, |
| "lp_gold": -11.266274809837341, |
| "lp_dist": -9.86655330657959, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5117324590682983, |
| "lp_gold": -10.930898785591125, |
| "lp_dist": -9.419166326522827, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-242", |
| "gold_norm": "19", |
| "dist_norm": "6", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.181618273258209, |
| "lp_gold": -12.157159745693207, |
| "lp_dist": -17.338778018951416, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4070416688919067, |
| "lp_gold": -10.997576355934143, |
| "lp_dist": -9.590534687042236, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.4950295239686966, |
| "lp_gold": -14.810317918658257, |
| "lp_dist": -16.305347442626953, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.47235550545156, |
| "lp_gold": -14.627300599589944, |
| "lp_dist": -16.099656105041504, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.7640156745910645, |
| "lp_gold": -15.07509469985962, |
| "lp_dist": -12.311079025268555, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.5993056297302246, |
| "lp_gold": -12.642723798751831, |
| "lp_dist": -10.043418169021606, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4070385694503784, |
| "lp_gold": -10.997580409049988, |
| "lp_dist": -9.59054183959961, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "gsm8k-test-246", |
| "gold_norm": "130000", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.715070237376494, |
| "lp_gold": -12.472108629561262, |
| "lp_dist": -17.187178866937757, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.156454911455512, |
| "lp_gold": -13.647521084174514, |
| "lp_dist": -6.491066172719002, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.767381154000759, |
| "lp_gold": -21.409174405038357, |
| "lp_dist": -15.641793251037598, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.478326896904036, |
| "lp_gold": -22.25712094712071, |
| "lp_dist": -16.778794050216675, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.735332287847996, |
| "lp_gold": -17.688330195844173, |
| "lp_dist": -9.952997907996178, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.886516407132149, |
| "lp_gold": -16.19584783911705, |
| "lp_dist": -9.309331431984901, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.15645333006978, |
| "lp_gold": -13.64751996472478, |
| "lp_dist": -6.491066634654999, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 3 |
| } |
| } |
| ] |
| } |