{ "meta": { "model": "meta-llama/Llama-2-7b-chat-hf", "device": "cuda", "dtype": "fp32", "layer": 10, "layers_path": "model.layers", "seed": 123, "task": "humaneval", "eval_mode": "pair_logprob", "eval_meta": { "hf_id": "openai_humaneval", "split": "test", "n_total": 164 }, "n_eval_loaded": 164, "n_scanned": 164, "base_acc_scan": 0.6585365853658537, "ablt_acc_scan": 0.6402439024390244, "flips_total": 8, "flips_used": 8, "patch_steps": [ 0, 1, 2, 3 ], "patch_n_steps": 4, "Qs_path": "Q_shared_layer10.npy", "Qs_shape": [ 4096, 97 ], "gold_text_prefix": " ", "dist_text_prefix": " ", "gold_max_tokens": 128, "distractor_mode": "next_gold", "answer_prefix_effective": "\nFinal answer:", "max_new_tokens_effective": 64, "run_coeff_controls": false, "use_benchmark_loader": false, "hf_id": "openai_humaneval", "hf_split": "test" }, "summary_on_flips": { "patched_self": { "n": 8, "rescued": 6, "rescued_pct": 75.0, "mean_delta_margin_vs_ablated": 2.1062567234039307, "median_delta_margin_vs_ablated": 1.8823415040969849 }, "control_time_shuffled": { "n": 8, "rescued": 5, "rescued_pct": 62.5, "mean_delta_margin_vs_ablated": 1.611517310142517, "median_delta_margin_vs_ablated": 1.736205816268921 }, "control_shared_randvec": { "n": 8, "rescued": 0, "rescued_pct": 0.0, "mean_delta_margin_vs_ablated": -0.5279055237770081, "median_delta_margin_vs_ablated": -0.47387340664863586 }, "control_rand_subspace": { "n": 8, "rescued": 1, "rescued_pct": 12.5, "mean_delta_margin_vs_ablated": -0.11733363568782806, "median_delta_margin_vs_ablated": -0.24908697605133057 }, "control_patch_nonshared": { "n": 8, "rescued": 1, "rescued_pct": 12.5, "mean_delta_margin_vs_ablated": 0.048883724957704544, "median_delta_margin_vs_ablated": 0.0663357526063919 } }, "scan_rows": [ { "ex_id": "openai_humaneval-test-18", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.234334945678711, "lp_gold": -19.061742782592773, "lp_dist": -21.296077728271484, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.2801809310913086, "lp_gold": -21.540138244628906, "lp_dist": -23.820319175720215, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-31", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -24.834365844726562, "lp_dist": -24.834365844726562, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -26.02178192138672, "lp_dist": -26.02178192138672, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-158", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.071807861328125, "lp_dist": -20.071807861328125, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.830044746398926, "lp_dist": -20.830044746398926, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-43", "gold_norm": "0", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 0.9933905601501465, "lp_gold": -19.265485286712646, "lp_dist": -20.258875846862793, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.1291828155517578, "lp_gold": -20.913329124450684, "lp_dist": -21.04251194000244, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-39", "gold_norm": "-1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.44877076148986816, "lp_gold": -19.808196783065796, "lp_dist": -20.256967544555664, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.15900945663452148, "lp_gold": -22.082778453826904, "lp_dist": -21.923768997192383, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-15", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 1.9036493301391602, "lp_gold": -21.704959869384766, "lp_dist": -23.608609199523926, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.597311019897461, "lp_gold": -22.718390464782715, "lp_dist": -24.315701484680176, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-151", "gold_norm": "0", "dist_norm": "if not s:\n return []\n\n s_list = []\n\n for letter in s:\n if letter == ',':\n s_list.append(' ')\n else:\n s_list.append(letter)\n\n s_list = \"\".join(s_list)\n return s_list.split()", "baseline": { "pred": "gold", "correct": true, "margin": 121.7853691404589, "lp_gold": -18.80201482772827, "lp_dist": -140.58738396818717, "n_tokens_gold": 2, "n_tokens_dist": 79 }, "ablated": { "pred": "gold", "correct": true, "margin": 121.62771476514445, "lp_gold": -21.097187042236328, "lp_dist": -142.72490180738077, "n_tokens_gold": 2, "n_tokens_dist": 79 } }, { "ex_id": "openai_humaneval-test-101", "gold_norm": "if not s:\n return []\n\n s_list = []\n\n for letter in s:\n if letter == ',':\n s_list.append(' ')\n else:\n s_list.append(letter)\n\n s_list = \"\".join(s_list)\n return s_list.split()", "dist_norm": "return sorted(list(set(l)))", "baseline": { "pred": "dist", "correct": false, "margin": -31.890593631385258, "lp_gold": -82.64533315385356, "lp_dist": -50.7547395224683, "n_tokens_gold": 79, "n_tokens_dist": 10 }, "ablated": { "pred": "dist", "correct": false, "margin": -36.78753035544207, "lp_gold": -92.11622255231043, "lp_dist": -55.32869219686836, "n_tokens_gold": 79, "n_tokens_dist": 10 } }, { "ex_id": "openai_humaneval-test-34", "gold_norm": "return sorted(list(set(l)))", "dist_norm": "for e in l:\n if e >= t:\n return False\n return True", "baseline": { "pred": "gold", "correct": true, "margin": 49.12246614873311, "lp_gold": -11.745759465690753, "lp_dist": -60.868225614423864, "n_tokens_gold": 10, "n_tokens_dist": 21 }, "ablated": { "pred": "gold", "correct": true, "margin": 45.176009765720664, "lp_gold": -15.221526360244752, "lp_dist": -60.397536125965416, "n_tokens_gold": 10, "n_tokens_dist": 21 } }, { "ex_id": "openai_humaneval-test-52", "gold_norm": "for e in l:\n if e >= t:\n return False\n return True", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -18.150727199346875, "lp_gold": -34.72359650018916, "lp_dist": -16.572869300842285, "n_tokens_gold": 21, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -18.006402769460692, "lp_gold": -34.587387361898436, "lp_dist": -16.580984592437744, "n_tokens_gold": 21, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-113", "gold_norm": "1", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 2.4202613830566406, "lp_gold": -14.915795803070068, "lp_dist": -17.33605718612671, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.1670427322387695, "lp_gold": -17.457444190979004, "lp_dist": -19.624486923217773, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-119", "gold_norm": "2", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -19.728981971740723, "lp_dist": -19.728981971740723, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.550307273864746, "lp_dist": -20.550307273864746, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-83", "gold_norm": "2", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.055248260498047, "lp_gold": -21.746171951293945, "lp_dist": -19.6909236907959, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.8670825958251953, "lp_gold": -22.857107162475586, "lp_dist": -20.99002456665039, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-116", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.5492916107177734, "lp_gold": -13.82323694229126, "lp_dist": -16.372528553009033, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.914440155029297, "lp_gold": -15.276045799255371, "lp_dist": -19.190485954284668, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-56", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.711745262145996, "lp_dist": -20.711745262145996, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.994271278381348, "lp_dist": -22.994271278381348, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-131", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.21437931060791, "lp_dist": -20.21437931060791, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -21.886200428009033, "lp_dist": -21.886200428009033, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-1", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -23.005115509033203, "lp_dist": -23.005115509033203, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.74118995666504, "lp_dist": -22.74118995666504, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-159", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -4.005736351013184, "lp_gold": -21.52419900894165, "lp_dist": -17.518462657928467, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.665071487426758, "lp_gold": -20.96841859817505, "lp_dist": -17.30334711074829, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-123", "gold_norm": "1", "dist_norm": "return len(string)", "baseline": { "pred": "gold", "correct": true, "margin": 22.28865046799183, "lp_gold": -18.46173858642578, "lp_dist": -40.75038905441761, "n_tokens_gold": 2, "n_tokens_dist": 6 }, "ablated": { "pred": "gold", "correct": true, "margin": 21.772113933227956, "lp_gold": -18.439892768859863, "lp_dist": -40.21200670208782, "n_tokens_gold": 2, "n_tokens_dist": 6 } }, { "ex_id": "openai_humaneval-test-23", "gold_norm": "return len(string)", "dist_norm": "29", "baseline": { "pred": "gold", "correct": true, "margin": 12.790524657903006, "lp_gold": -15.688145462336252, "lp_dist": -28.478670120239258, "n_tokens_gold": 6, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 13.356130968688376, "lp_gold": -16.101557362915628, "lp_dist": -29.457688331604004, "n_tokens_gold": 6, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-124", "gold_norm": "29", "dist_norm": "return encode_cyclic(encode_cyclic(s))", "baseline": { "pred": "gold", "correct": true, "margin": 50.18577869143337, "lp_gold": -21.59093189239502, "lp_dist": -71.77671058382839, "n_tokens_gold": 3, "n_tokens_dist": 14 }, "ablated": { "pred": "gold", "correct": true, "margin": 56.64552312903106, "lp_gold": -22.290241479873657, "lp_dist": -78.93576460890472, "n_tokens_gold": 3, "n_tokens_dist": 14 } }, { "ex_id": "openai_humaneval-test-38", "gold_norm": "return encode_cyclic(encode_cyclic(s))", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -2.7194257393894077, "lp_gold": -29.536041797739017, "lp_dist": -26.81661605834961, "n_tokens_gold": 14, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.8930526294279844, "lp_gold": -29.922911218600348, "lp_dist": -26.029858589172363, "n_tokens_gold": 14, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-84", "gold_norm": "2", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -19.775970458984375, "lp_dist": -19.775970458984375, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.496947288513184, "lp_dist": -20.496947288513184, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-41", "gold_norm": "2", "dist_norm": "122", "baseline": { "pred": "gold", "correct": true, "margin": 8.278376579284668, "lp_gold": -22.390517234802246, "lp_dist": -30.668893814086914, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 9.729191303253174, "lp_gold": -21.189680099487305, "lp_dist": -30.91887140274048, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-134", "gold_norm": "122", "dist_norm": "1.0", "baseline": { "pred": "gold", "correct": true, "margin": 0.8403654620051384, "lp_gold": -27.5423321723938, "lp_dist": -28.382697634398937, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.457980029284954, "lp_gold": -29.874002933502197, "lp_dist": -27.416022904217243, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-2", "gold_norm": "1.0", "dist_norm": "+2", "baseline": { "pred": "gold", "correct": true, "margin": 5.533992663025856, "lp_gold": -19.524202451109886, "lp_dist": -25.058195114135742, "n_tokens_gold": 4, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.357768684625626, "lp_gold": -17.571494430303574, "lp_dist": -25.9292631149292, "n_tokens_gold": 4, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-80", "gold_norm": "+2", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -5.821111679077148, "lp_gold": -24.557270050048828, "lp_dist": -18.73615837097168, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.8031325340271, "lp_gold": -26.246718406677246, "lp_dist": -20.443585872650146, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-74", "gold_norm": "2", "dist_norm": "5", "baseline": { "pred": "gold", "correct": true, "margin": 1.2735700607299805, "lp_gold": -17.82392930984497, "lp_dist": -19.09749937057495, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.1347217559814453, "lp_gold": -18.707550048828125, "lp_dist": -19.84227180480957, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-162", "gold_norm": "5", "dist_norm": "8", "baseline": { "pred": "gold", "correct": true, "margin": 0.4634990692138672, "lp_gold": -19.24656867980957, "lp_dist": -19.710067749023438, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.9875469207763672, "lp_gold": -21.240734100341797, "lp_dist": -22.228281021118164, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-138", "gold_norm": "8", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -1.1303815841674805, "lp_gold": -21.265185356140137, "lp_dist": -20.134803771972656, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.3591747283935547, "lp_gold": -23.056735038757324, "lp_dist": -21.69756031036377, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-87", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -21.202168941497803, "lp_dist": -21.202168941497803, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.12106418609619, "lp_dist": -22.12106418609619, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-145", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.991208076477051, "lp_gold": -22.26430034637451, "lp_dist": -19.27309226989746, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.47486686706543, "lp_gold": -24.317991256713867, "lp_dist": -19.843124389648438, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-54", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.8734645843505859, "lp_gold": -22.476731300354004, "lp_dist": -23.35019588470459, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.6492033004760742, "lp_gold": -24.899346351623535, "lp_dist": -26.54854965209961, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-109", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.480426788330078, "lp_gold": -19.18559741973877, "lp_dist": -16.70517063140869, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.790494918823242, "lp_gold": -21.397884845733643, "lp_dist": -16.6073899269104, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-102", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -19.423691272735596, "lp_dist": -19.423691272735596, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.474623680114746, "lp_dist": -20.474623680114746, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-62", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -17.160552978515625, "lp_dist": -17.160552978515625, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -15.657040119171143, "lp_dist": -15.657040119171143, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-129", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -19.168509483337402, "lp_dist": -19.168509483337402, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -19.092866897583008, "lp_dist": -19.092866897583008, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-110", "gold_norm": "1", "dist_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)", "baseline": { "pred": "gold", "correct": true, "margin": 56.64819990084652, "lp_gold": -12.325343608856201, "lp_dist": -68.97354350970272, "n_tokens_gold": 2, "n_tokens_dist": 33 }, "ablated": { "pred": "gold", "correct": true, "margin": 60.25500547605043, "lp_gold": -11.27043867111206, "lp_dist": -71.52544414716249, "n_tokens_gold": 2, "n_tokens_dist": 33 } }, { "ex_id": "openai_humaneval-test-4", "gold_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.45850569046490364, "lp_gold": -22.989404618372987, "lp_dist": -23.44791030883789, "n_tokens_gold": 33, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.605282360978833, "lp_gold": -30.1749826020677, "lp_dist": -21.569700241088867, "n_tokens_gold": 33, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-8", "gold_norm": "1", "dist_norm": "10", "baseline": { "pred": "gold", "correct": true, "margin": 3.0166356563568115, "lp_gold": -18.3596510887146, "lp_dist": -21.37628674507141, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.0787312984466553, "lp_gold": -18.83952569961548, "lp_dist": -21.918256998062134, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-97", "gold_norm": "10", "dist_norm": "temp_a, temp_b = a, b\n if isinstance(temp_a, str): temp_a = temp_a.replace(',','.')\n if isinstance(temp_b, str): temp_b = temp_b.replace(',','.')\n if float(temp_a) == float(temp_b): return None\n return a if float(temp_a) > float(temp_b) else b", "baseline": { "pred": "gold", "correct": true, "margin": 94.84300347900216, "lp_gold": -21.25941014289856, "lp_dist": -116.10241362190072, "n_tokens_gold": 3, "n_tokens_dist": 100 }, "ablated": { "pred": "gold", "correct": true, "margin": 98.10631249527887, "lp_gold": -23.950812816619873, "lp_dist": -122.05712531189874, "n_tokens_gold": 3, "n_tokens_dist": 100 } }, { "ex_id": "openai_humaneval-test-137", "gold_norm": "temp_a, temp_b = a, b\n if isinstance(temp_a, str): temp_a = temp_a.replace(',','.')\n if isinstance(temp_b, str): temp_b = temp_b.replace(',','.')\n if float(temp_a) == float(temp_b): return None\n return a if float(temp_a) > float(temp_b) else b", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -67.42598734535835, "lp_gold": -83.42734728493355, "lp_dist": -16.001359939575195, "n_tokens_gold": 100, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -67.17782856585814, "lp_gold": -82.82997252108885, "lp_dist": -15.652143955230713, "n_tokens_gold": 100, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-96", "gold_norm": "0", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 3.6102466583251953, "lp_gold": -18.234556198120117, "lp_dist": -21.844802856445312, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.688672065734863, "lp_gold": -19.75603199005127, "lp_dist": -24.444704055786133, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-65", "gold_norm": "-1", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -0.8471782207489014, "lp_gold": -22.299251317977905, "lp_dist": -21.452073097229004, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.24910449981689453, "lp_gold": -23.086091995239258, "lp_dist": -22.836987495422363, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-114", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -1.8158721923828125, "lp_gold": -22.96446418762207, "lp_dist": -21.148591995239258, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.1736278533935547, "lp_gold": -25.378070831298828, "lp_dist": -22.204442977905273, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-117", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -15.196922779083252, "lp_dist": -15.196922779083252, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -15.9734787940979, "lp_dist": -15.9734787940979, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-155", "gold_norm": "1", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 2.606046199798584, "lp_gold": -18.941298484802246, "lp_dist": -21.54734468460083, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.5056796073913574, "lp_gold": -20.002915382385254, "lp_dist": -23.50859498977661, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-37", "gold_norm": "-1", "dist_norm": "return sum([math.ceil(sum(arr)/capacity) for arr in grid])", "baseline": { "pred": "gold", "correct": true, "margin": 86.6937904068327, "lp_gold": -20.384744882583618, "lp_dist": -107.07853528941632, "n_tokens_gold": 3, "n_tokens_dist": 20 }, "ablated": { "pred": "gold", "correct": true, "margin": 87.06271385207947, "lp_gold": -21.526390075683594, "lp_dist": -108.58910392776306, "n_tokens_gold": 3, "n_tokens_dist": 20 } }, { "ex_id": "openai_humaneval-test-115", "gold_norm": "return sum([math.ceil(sum(arr)/capacity) for arr in grid])", "dist_norm": "return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])", "baseline": { "pred": "gold", "correct": true, "margin": 24.88441585241617, "lp_gold": -46.10199257756449, "lp_dist": -70.98640842998066, "n_tokens_gold": 20, "n_tokens_dist": 24 }, "ablated": { "pred": "gold", "correct": true, "margin": 29.905949345506542, "lp_gold": -44.645553992972054, "lp_dist": -74.5515033384786, "n_tokens_gold": 20, "n_tokens_dist": 24 } }, { "ex_id": "openai_humaneval-test-86", "gold_norm": "return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -14.466940372163663, "lp_gold": -36.1265668560809, "lp_dist": -21.659626483917236, "n_tokens_gold": 24, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -15.646357826058647, "lp_gold": -39.13540583020904, "lp_dist": -23.48904800415039, "n_tokens_gold": 24, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-85", "gold_norm": "0", "dist_norm": "return x + y", "baseline": { "pred": "gold", "correct": true, "margin": 6.443425707519054, "lp_gold": -16.92650079727173, "lp_dist": -23.369926504790783, "n_tokens_gold": 2, "n_tokens_dist": 5 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.885157495737076, "lp_gold": -17.333487033843994, "lp_dist": -23.21864452958107, "n_tokens_gold": 2, "n_tokens_dist": 5 } }, { "ex_id": "openai_humaneval-test-53", "gold_norm": "return x + y", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 9.916433400284404, "lp_gold": -10.99827282987917, "lp_dist": -20.914706230163574, "n_tokens_gold": 5, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 9.794849171276837, "lp_gold": -10.414248690966815, "lp_dist": -20.209097862243652, "n_tokens_gold": 5, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-130", "gold_norm": "2", "dist_norm": "+1", "baseline": { "pred": "gold", "correct": true, "margin": 2.369633674621582, "lp_gold": -20.579893589019775, "lp_dist": -22.949527263641357, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.291049003601074, "lp_gold": -21.44382381439209, "lp_dist": -24.734872817993164, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-139", "gold_norm": "+1", "dist_norm": "0.0", "baseline": { "pred": "gold", "correct": true, "margin": 1.2652113437652588, "lp_gold": -22.48181676864624, "lp_dist": -23.7470281124115, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.1343896389007568, "lp_gold": -23.847239017486572, "lp_dist": -22.712849378585815, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-81", "gold_norm": "0.0", "dist_norm": "9", "baseline": { "pred": "dist", "correct": false, "margin": -0.3655529320240021, "lp_gold": -22.439337760210037, "lp_dist": -22.073784828186035, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.13648897409439087, "lp_gold": -23.279959738254547, "lp_dist": -23.143470764160156, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-19", "gold_norm": "9", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.07897377014160156, "lp_gold": -22.66366195678711, "lp_dist": -22.74263572692871, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4677810668945312, "lp_gold": -23.768733024597168, "lp_dist": -22.300951957702637, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-69", "gold_norm": "1", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 2.650723457336426, "lp_gold": -15.441932678222656, "lp_dist": -18.092656135559082, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.28917121887207, "lp_gold": -14.679984092712402, "lp_dist": -18.969155311584473, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-122", "gold_norm": "2", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -0.9139537811279297, "lp_gold": -18.55172109603882, "lp_dist": -17.63776731491089, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.36802101135253906, "lp_gold": -18.541946411132812, "lp_dist": -18.173925399780273, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-108", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.612565040588379, "lp_gold": -22.057893753051758, "lp_dist": -19.44532871246338, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.8640785217285156, "lp_gold": -23.09704303741455, "lp_dist": -19.232964515686035, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-48", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -21.96462059020996, "lp_dist": -21.96462059020996, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -24.449016571044922, "lp_dist": -24.449016571044922, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-17", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 1.34698486328125, "lp_gold": -21.55142116546631, "lp_dist": -22.89840602874756, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.0376691818237305, "lp_gold": -20.85892963409424, "lp_dist": -21.89659881591797, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-95", "gold_norm": "0", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -1.8023662567138672, "lp_gold": -20.999281883239746, "lp_dist": -19.19691562652588, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.70697021484375, "lp_gold": -21.713737964630127, "lp_dist": -20.006767749786377, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-91", "gold_norm": "2", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -21.7028169631958, "lp_dist": -21.7028169631958, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.45036220550537, "lp_dist": -22.45036220550537, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-79", "gold_norm": "2", "dist_norm": "+1", "baseline": { "pred": "gold", "correct": true, "margin": 6.754400730133057, "lp_gold": -17.42607831954956, "lp_dist": -24.180479049682617, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.1651506423950195, "lp_gold": -19.892897605895996, "lp_dist": -26.058048248291016, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-14", "gold_norm": "+1", "dist_norm": "3", "baseline": { "pred": "dist", "correct": false, "margin": -3.5707898139953613, "lp_gold": -29.604674816131592, "lp_dist": -26.03388500213623, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.903932571411133, "lp_gold": -27.936614990234375, "lp_dist": -22.032682418823242, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-77", "gold_norm": "3", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.08347511291503906, "lp_gold": -20.79250478744507, "lp_dist": -20.875979900360107, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.4646930694580078, "lp_gold": -21.741466522216797, "lp_dist": -22.206159591674805, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-3", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.1768722534179688, "lp_gold": -23.416447639465332, "lp_dist": -21.239575386047363, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.3070783615112305, "lp_gold": -20.640877723693848, "lp_dist": -16.333799362182617, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-146", "gold_norm": "1", "dist_norm": "if l == sorted(l) or l == sorted(l, reverse=True):\n return True\n return False", "baseline": { "pred": "gold", "correct": true, "margin": 52.92330244462937, "lp_gold": -17.06475257873535, "lp_dist": -69.98805502336472, "n_tokens_gold": 2, "n_tokens_dist": 27 }, "ablated": { "pred": "gold", "correct": true, "margin": 50.48234937642701, "lp_gold": -18.093742847442627, "lp_dist": -68.57609222386964, "n_tokens_gold": 2, "n_tokens_dist": 27 } }, { "ex_id": "openai_humaneval-test-57", "gold_norm": "if l == sorted(l) or l == sorted(l, reverse=True):\n return True\n return False", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -19.544386039720848, "lp_gold": -39.81637967680581, "lp_dist": -20.27199363708496, "n_tokens_gold": 27, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -23.215539787866874, "lp_gold": -47.03316483364324, "lp_dist": -23.817625045776367, "n_tokens_gold": 27, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-143", "gold_norm": "2", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -18.56871795654297, "lp_dist": -18.56871795654297, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -18.614916801452637, "lp_dist": -18.614916801452637, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-0", "gold_norm": "2", "dist_norm": "+1", "baseline": { "pred": "gold", "correct": true, "margin": 7.107102870941162, "lp_gold": -24.08918285369873, "lp_dist": -31.196285724639893, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.970984935760498, "lp_gold": -24.804469108581543, "lp_dist": -31.77545404434204, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-106", "gold_norm": "+1", "dist_norm": "26", "baseline": { "pred": "gold", "correct": true, "margin": 2.2219808101654053, "lp_gold": -21.646430253982544, "lp_dist": -23.86841106414795, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4264450073242188, "lp_gold": -24.843106269836426, "lp_dist": -23.416661262512207, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-50", "gold_norm": "26", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -0.6608308553695679, "lp_gold": -26.048061728477478, "lp_dist": -25.38723087310791, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.0806753635406494, "lp_gold": -25.965544939041138, "lp_dist": -23.88486957550049, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-58", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.0009937286376953, "lp_gold": -17.66666841506958, "lp_dist": -19.667662143707275, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.9970569610595703, "lp_gold": -19.63721752166748, "lp_dist": -21.63427448272705, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-147", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.277851104736328, "lp_gold": -21.182048797607422, "lp_dist": -17.904197692871094, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.9687089920043945, "lp_gold": -23.235244750976562, "lp_dist": -19.266535758972168, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-160", "gold_norm": "1", "dist_norm": "lis = list()\n for i in s.split(' '):\n if i.isdigit():\n lis.append(int(i))\n return n - sum(lis)", "baseline": { "pred": "gold", "correct": true, "margin": 82.88829492655168, "lp_gold": -16.034985065460205, "lp_dist": -98.92327999201189, "n_tokens_gold": 2, "n_tokens_dist": 45 }, "ablated": { "pred": "gold", "correct": true, "margin": 79.75362334529109, "lp_gold": -16.3989200592041, "lp_dist": -96.15254340449519, "n_tokens_gold": 2, "n_tokens_dist": 45 } }, { "ex_id": "openai_humaneval-test-67", "gold_norm": "lis = list()\n for i in s.split(' '):\n if i.isdigit():\n lis.append(int(i))\n return n - sum(lis)", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -49.84833694786221, "lp_gold": -66.66380525917202, "lp_dist": -16.815468311309814, "n_tokens_gold": 45, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -48.502373244005184, "lp_gold": -65.89947893782232, "lp_dist": -17.39710569381714, "n_tokens_gold": 45, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-55", "gold_norm": "2", "dist_norm": "-1", "baseline": { "pred": "dist", "correct": false, "margin": -0.25999921560287476, "lp_gold": -24.11659049987793, "lp_dist": -23.856591284275055, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.37139952182769775, "lp_gold": -24.66786289215088, "lp_dist": -25.039262413978577, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-118", "gold_norm": "-1", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -6.466064929962158, "lp_gold": -23.7993106842041, "lp_dist": -17.333245754241943, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.198596954345703, "lp_gold": -26.275648593902588, "lp_dist": -18.077051639556885, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-154", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -14.774529933929443, "lp_dist": -14.774529933929443, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -16.673503875732422, "lp_dist": -16.673503875732422, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-25", "gold_norm": "1", "dist_norm": "7", "baseline": { "pred": "gold", "correct": true, "margin": 3.230724334716797, "lp_gold": -22.422992706298828, "lp_dist": -25.653717041015625, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.7021970748901367, "lp_gold": -19.818692207336426, "lp_dist": -23.520889282226562, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-36", "gold_norm": "7", "dist_norm": "3", "baseline": { "pred": "gold", "correct": true, "margin": 2.6618576049804688, "lp_gold": -20.638415336608887, "lp_dist": -23.300272941589355, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.1126937866210938, "lp_gold": -22.601848602294922, "lp_dist": -25.714542388916016, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-63", "gold_norm": "3", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -1.4981985092163086, "lp_gold": -23.173019409179688, "lp_dist": -21.67482089996338, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.069631576538086, "lp_gold": -24.268108367919922, "lp_dist": -23.198476791381836, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-132", "gold_norm": "2", "dist_norm": "min_number = min(numbers)\n max_number = max(numbers)\n return [(x - min_number) / (max_number - min_number) for x in numbers]", "baseline": { "pred": "gold", "correct": true, "margin": 68.18116059236388, "lp_gold": -18.65871238708496, "lp_dist": -86.83987297944884, "n_tokens_gold": 2, "n_tokens_dist": 44 }, "ablated": { "pred": "gold", "correct": true, "margin": 67.7937639305901, "lp_gold": -20.1024751663208, "lp_dist": -87.8962390969109, "n_tokens_gold": 2, "n_tokens_dist": 44 } }, { "ex_id": "openai_humaneval-test-21", "gold_norm": "min_number = min(numbers)\n max_number = max(numbers)\n return [(x - min_number) / (max_number - min_number) for x in numbers]", "dist_norm": "running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result", "baseline": { "pred": "gold", "correct": true, "margin": 23.721003586428438, "lp_gold": -41.06687730719631, "lp_dist": -64.78788089362475, "n_tokens_gold": 44, "n_tokens_dist": 69 }, "ablated": { "pred": "gold", "correct": true, "margin": 32.38431728373951, "lp_gold": -42.50547727979096, "lp_dist": -74.88979456353047, "n_tokens_gold": 44, "n_tokens_dist": 69 } }, { "ex_id": "openai_humaneval-test-9", "gold_norm": "running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -29.805189722821147, "lp_gold": -50.413360232159526, "lp_dist": -20.60817050933838, "n_tokens_gold": 69, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -51.59206402557629, "lp_gold": -70.91829026001233, "lp_dist": -19.326226234436035, "n_tokens_gold": 69, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-72", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 5.30158805847168, "lp_gold": -13.809431552886963, "lp_dist": -19.111019611358643, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 4.650420188903809, "lp_gold": -15.45082139968872, "lp_dist": -20.10124158859253, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-128", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.34075927734375, "lp_gold": -21.099153995513916, "lp_dist": -17.758394718170166, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -4.7813920974731445, "lp_gold": -22.55471706390381, "lp_dist": -17.773324966430664, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-126", "gold_norm": "1", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 7.619829177856445, "lp_gold": -16.480793476104736, "lp_dist": -24.10062265396118, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 7.267373561859131, "lp_gold": -18.10063362121582, "lp_dist": -25.36800718307495, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-70", "gold_norm": "-1", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -2.621180534362793, "lp_gold": -23.137717247009277, "lp_dist": -20.516536712646484, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.558529853820801, "lp_gold": -25.226367950439453, "lp_dist": -22.667838096618652, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-40", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -18.633878707885742, "lp_dist": -18.633878707885742, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -21.059532165527344, "lp_dist": -21.059532165527344, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-66", "gold_norm": "0", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -0.13283920288085938, "lp_gold": -17.218024253845215, "lp_dist": -17.085185050964355, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.1121377944946289, "lp_gold": -18.084847927093506, "lp_dist": -17.972710132598877, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-71", "gold_norm": "2", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -1.2154407501220703, "lp_gold": -18.285062789916992, "lp_dist": -17.069622039794922, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.0220775604248047, "lp_gold": -19.058651447296143, "lp_dist": -18.036573886871338, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-107", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.3218297958374023, "lp_gold": -17.986241340637207, "lp_dist": -20.30807113647461, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.979066848754883, "lp_gold": -16.702817916870117, "lp_dist": -19.681884765625, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-32", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 2.1487884521484375, "lp_gold": -22.8189640045166, "lp_dist": -24.96775245666504, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.087052345275879, "lp_gold": -22.50095844268799, "lp_dist": -24.588010787963867, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-11", "gold_norm": "1", "dist_norm": "2.0", "baseline": { "pred": "gold", "correct": true, "margin": 13.553829669952393, "lp_gold": -20.95840072631836, "lp_dist": -34.51223039627075, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 11.875772953033447, "lp_gold": -22.25577449798584, "lp_dist": -34.13154745101929, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-47", "gold_norm": "2.0", "dist_norm": "return [x for x in values if isinstance(x, int)]", "baseline": { "pred": "gold", "correct": true, "margin": 22.397947419434786, "lp_gold": -22.972108960151672, "lp_dist": -45.37005637958646, "n_tokens_gold": 4, "n_tokens_dist": 16 }, "ablated": { "pred": "gold", "correct": true, "margin": 21.156506193903624, "lp_gold": -25.384591817855835, "lp_dist": -46.54109801175946, "n_tokens_gold": 4, "n_tokens_dist": 16 } }, { "ex_id": "openai_humaneval-test-22", "gold_norm": "return [x for x in values if isinstance(x, int)]", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.9452141776912413, "lp_gold": -21.9394551262028, "lp_dist": -24.884669303894043, "n_tokens_gold": 16, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.21686965267372216, "lp_gold": -23.520365059778214, "lp_dist": -23.303495407104492, "n_tokens_gold": 16, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-140", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.853110790252686, "lp_dist": -22.853110790252686, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -24.441052436828613, "lp_dist": -24.441052436828613, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-30", "gold_norm": "0", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 1.6810879707336426, "lp_gold": -22.556360244750977, "lp_dist": -24.23744821548462, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.2648732662200928, "lp_gold": -24.702096939086914, "lp_dist": -26.966970205307007, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-10", "gold_norm": "-1", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -26.95520782470703, "lp_dist": -26.95520782470703, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -24.403673887252808, "lp_dist": -24.403673887252808, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-112", "gold_norm": "-1", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -6.778575420379639, "lp_gold": -21.71489953994751, "lp_dist": -14.936324119567871, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.789752006530762, "lp_gold": -22.717198848724365, "lp_dist": -15.927446842193604, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-111", "gold_norm": "1", "dist_norm": "return ''.join(strings)", "baseline": { "pred": "gold", "correct": true, "margin": 23.62041076645255, "lp_gold": -18.88068723678589, "lp_dist": -42.50109800323844, "n_tokens_gold": 2, "n_tokens_dist": 8 }, "ablated": { "pred": "gold", "correct": true, "margin": 20.59145486354828, "lp_gold": -19.92290210723877, "lp_dist": -40.51435697078705, "n_tokens_gold": 2, "n_tokens_dist": 8 } }, { "ex_id": "openai_humaneval-test-28", "gold_norm": "return ''.join(strings)", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -1.083352681776887, "lp_gold": -25.75984155811966, "lp_dist": -24.676488876342773, "n_tokens_gold": 8, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.2199663514859367, "lp_gold": -22.93246036817959, "lp_dist": -24.152426719665527, "n_tokens_gold": 8, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-135", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -16.229734420776367, "lp_dist": -16.229734420776367, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -16.821220874786377, "lp_dist": -16.821220874786377, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-94", "gold_norm": "1", "dist_norm": "while b:\n a, b = b, a % b\n return a", "baseline": { "pred": "gold", "correct": true, "margin": 27.14890766143799, "lp_gold": -17.605591773986816, "lp_dist": -44.754499435424805, "n_tokens_gold": 2, "n_tokens_dist": 19 }, "ablated": { "pred": "gold", "correct": true, "margin": 32.34489552024752, "lp_gold": -17.090249061584473, "lp_dist": -49.43514458183199, "n_tokens_gold": 2, "n_tokens_dist": 19 } }, { "ex_id": "openai_humaneval-test-13", "gold_norm": "while b:\n a, b = b, a % b\n return a", "dist_norm": "return [x for x in strings if substring in x]", "baseline": { "pred": "gold", "correct": true, "margin": 27.657666564541046, "lp_gold": -24.580688443994973, "lp_dist": -52.23835500853602, "n_tokens_gold": 19, "n_tokens_dist": 13 }, "ablated": { "pred": "gold", "correct": true, "margin": 26.75688247599828, "lp_gold": -29.810987717977696, "lp_dist": -56.56787019397598, "n_tokens_gold": 19, "n_tokens_dist": 13 } }, { "ex_id": "openai_humaneval-test-7", "gold_norm": "return [x for x in strings if substring in x]", "dist_norm": "return a*a == b*b + c*c or b*b == a*a + c*c or c*c == a*a + b*b", "baseline": { "pred": "gold", "correct": true, "margin": 60.87681590977445, "lp_gold": -23.353178574070625, "lp_dist": -84.22999448384508, "n_tokens_gold": 13, "n_tokens_dist": 37 }, "ablated": { "pred": "gold", "correct": true, "margin": 83.19848300620379, "lp_gold": -21.97879713297425, "lp_dist": -105.17728013917804, "n_tokens_gold": 13, "n_tokens_dist": 37 } }, { "ex_id": "openai_humaneval-test-157", "gold_norm": "return a*a == b*b + c*c or b*b == a*a + c*c or c*c == a*a + b*b", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -22.37864390958157, "lp_gold": -42.11520473111477, "lp_dist": -19.736560821533203, "n_tokens_gold": 37, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -21.383852594010023, "lp_gold": -42.70623838197389, "lp_dist": -21.322385787963867, "n_tokens_gold": 37, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-49", "gold_norm": "2", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 6.1286115646362305, "lp_gold": -14.368673324584961, "lp_dist": -20.49728488922119, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.370944976806641, "lp_gold": -15.46406078338623, "lp_dist": -21.83500576019287, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-120", "gold_norm": "0", "dist_norm": "26", "baseline": { "pred": "gold", "correct": true, "margin": 4.207955360412598, "lp_gold": -19.637977600097656, "lp_dist": -23.845932960510254, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.25955867767334, "lp_gold": -19.60517692565918, "lp_dist": -21.86473560333252, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-89", "gold_norm": "26", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -7.005466938018799, "lp_gold": -25.530439853668213, "lp_dist": -18.524972915649414, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.288686752319336, "lp_gold": -25.664198875427246, "lp_dist": -19.37551212310791, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-98", "gold_norm": "1", "dist_norm": "2", "baseline": { "pred": "gold", "correct": true, "margin": 1.9043750762939453, "lp_gold": -21.17328405380249, "lp_dist": -23.077659130096436, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.2112646102905273, "lp_gold": -22.75312328338623, "lp_dist": -24.964387893676758, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-100", "gold_norm": "2", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.1701860427856445, "lp_gold": -22.101744651794434, "lp_dist": -19.93155860900879, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.7842912673950195, "lp_gold": -23.208542823791504, "lp_dist": -21.424251556396484, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-64", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -17.467191696166992, "lp_dist": -17.467191696166992, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -17.525324821472168, "lp_dist": -17.525324821472168, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-103", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.922170639038086, "lp_gold": -16.26321840286255, "lp_dist": -19.185389041900635, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.1872482299804688, "lp_gold": -18.55193328857422, "lp_dist": -21.739181518554688, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-125", "gold_norm": "0", "dist_norm": "return \"\".join([s for s in text if s.lower() not in [\"a\", \"e\", \"i\", \"o\", \"u\"]])", "baseline": { "pred": "gold", "correct": true, "margin": 48.67979192888379, "lp_gold": -21.47081232070923, "lp_dist": -70.15060424959302, "n_tokens_gold": 2, "n_tokens_dist": 34 }, "ablated": { "pred": "gold", "correct": true, "margin": 55.4912621024414, "lp_gold": -21.899943828582764, "lp_dist": -77.39120593102416, "n_tokens_gold": 2, "n_tokens_dist": 34 } }, { "ex_id": "openai_humaneval-test-51", "gold_norm": "return \"\".join([s for s in text if s.lower() not in [\"a\", \"e\", \"i\", \"o\", \"u\"]])", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -15.132140384576928, "lp_gold": -33.735738979242456, "lp_dist": -18.603598594665527, "n_tokens_gold": 34, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -17.125044157398406, "lp_gold": -38.15538339842624, "lp_dist": -21.030339241027832, "n_tokens_gold": 34, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-90", "gold_norm": "1", "dist_norm": "return [abs(x-y) for x,y in zip(game,guess)]", "baseline": { "pred": "gold", "correct": true, "margin": 53.59280555654732, "lp_gold": -17.83990478515625, "lp_dist": -71.43271034170357, "n_tokens_gold": 2, "n_tokens_dist": 21 }, "ablated": { "pred": "gold", "correct": true, "margin": 53.46250804614101, "lp_gold": -17.562668800354004, "lp_dist": -71.02517684649501, "n_tokens_gold": 2, "n_tokens_dist": 21 } }, { "ex_id": "openai_humaneval-test-152", "gold_norm": "return [abs(x-y) for x,y in zip(game,guess)]", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 5.4121496778377605, "lp_gold": -12.769094695483773, "lp_dist": -18.181244373321533, "n_tokens_gold": 21, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 6.846039920872499, "lp_gold": -11.968789428645323, "lp_dist": -18.814829349517822, "n_tokens_gold": 21, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-24", "gold_norm": "0", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -1.762319564819336, "lp_gold": -21.209729194641113, "lp_dist": -19.447409629821777, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.050973892211914, "lp_gold": -23.011981964111328, "lp_dist": -20.961008071899414, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-20", "gold_norm": "2", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.5170793533325195, "lp_gold": -21.059219360351562, "lp_dist": -18.542140007019043, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.842561721801758, "lp_gold": -24.164966583251953, "lp_dist": -20.322404861450195, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-148", "gold_norm": "1", "dist_norm": "if isinstance(x,int) and isinstance(y,int) and isinstance(z,int):\n if (x+y==z) or (x+z==y) or (y+z==x):\n return True\n return False\n return False", "baseline": { "pred": "gold", "correct": true, "margin": 84.42804804586194, "lp_gold": -12.750839233398438, "lp_dist": -97.17888727926038, "n_tokens_gold": 2, "n_tokens_dist": 63 }, "ablated": { "pred": "gold", "correct": true, "margin": 85.3895359868402, "lp_gold": -12.885719776153564, "lp_dist": -98.27525576299377, "n_tokens_gold": 2, "n_tokens_dist": 63 } }, { "ex_id": "openai_humaneval-test-92", "gold_norm": "if isinstance(x,int) and isinstance(y,int) and isinstance(z,int):\n if (x+y==z) or (x+z==y) or (y+z==x):\n return True\n return False\n return False", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -15.070005173284699, "lp_gold": -29.284181828100373, "lp_dist": -14.214176654815674, "n_tokens_gold": 63, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -16.731429186844252, "lp_gold": -33.610841837906264, "lp_dist": -16.87941265106201, "n_tokens_gold": 63, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-26", "gold_norm": "1", "dist_norm": "return len(set(string.lower()))", "baseline": { "pred": "gold", "correct": true, "margin": 39.93833448708756, "lp_gold": -21.593355178833008, "lp_dist": -61.53168966592057, "n_tokens_gold": 2, "n_tokens_dist": 10 }, "ablated": { "pred": "gold", "correct": true, "margin": 44.07288610804244, "lp_gold": -19.37473964691162, "lp_dist": -63.44762575495406, "n_tokens_gold": 2, "n_tokens_dist": 10 } }, { "ex_id": "openai_humaneval-test-16", "gold_norm": "return len(set(string.lower()))", "dist_norm": "3", "baseline": { "pred": "dist", "correct": false, "margin": -0.23131456784904003, "lp_gold": -21.334825424477458, "lp_dist": -21.103510856628418, "n_tokens_gold": 10, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 1.4797544392640702, "lp_gold": -20.777993210882414, "lp_dist": -22.257747650146484, "n_tokens_gold": 10, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-142", "gold_norm": "3", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -1.5028867721557617, "lp_gold": -20.428752899169922, "lp_dist": -18.92586612701416, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.940119743347168, "lp_gold": -20.01767873764038, "lp_dist": -19.077558994293213, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-99", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.057826042175293, "lp_dist": -20.057826042175293, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -21.03865337371826, "lp_dist": -21.03865337371826, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-127", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.7136526107788086, "lp_gold": -19.18015956878662, "lp_dist": -15.466506958007812, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.9199934005737305, "lp_gold": -19.79235076904297, "lp_dist": -15.872357368469238, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-156", "gold_norm": "1", "dist_norm": "3", "baseline": { "pred": "gold", "correct": true, "margin": 2.246551513671875, "lp_gold": -19.624220848083496, "lp_dist": -21.87077236175537, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.5017471313476562, "lp_gold": -21.236965656280518, "lp_dist": -23.738712787628174, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-141", "gold_norm": "3", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.184619903564453, "lp_gold": -20.60717535018921, "lp_dist": -17.422555446624756, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.860507011413574, "lp_gold": -21.143166542053223, "lp_dist": -18.28265953063965, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-78", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.2415552139282227, "lp_gold": -19.249483585357666, "lp_dist": -21.49103879928589, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.3341121673583984, "lp_gold": -19.18357276916504, "lp_dist": -21.517684936523438, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-68", "gold_norm": "0", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -1.2041006088256836, "lp_gold": -18.397984981536865, "lp_dist": -17.19388437271118, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.9792289733886719, "lp_gold": -17.703375339508057, "lp_dist": -16.724146366119385, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-93", "gold_norm": "2", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -2.300654411315918, "lp_gold": -20.44687557220459, "lp_dist": -18.146221160888672, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.002674102783203, "lp_gold": -19.50245952606201, "lp_dist": -17.49978542327881, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-60", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.0008726119995117, "lp_gold": -17.277544021606445, "lp_dist": -19.278416633605957, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 2.495342254638672, "lp_gold": -20.217015266418457, "lp_dist": -22.71235752105713, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-82", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -18.858778476715088, "lp_dist": -18.858778476715088, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.178369522094727, "lp_dist": -20.178369522094727, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-59", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -24.201942443847656, "lp_dist": -24.201942443847656, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -25.577584266662598, "lp_dist": -25.577584266662598, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-149", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.985135078430176, "lp_gold": -18.369908809661865, "lp_dist": -14.38477373123169, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -3.8884334564208984, "lp_gold": -19.89932155609131, "lp_dist": -16.01088809967041, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-42", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 3.361058235168457, "lp_gold": -16.945013999938965, "lp_dist": -20.306072235107422, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.650448799133301, "lp_gold": -18.96267032623291, "lp_dist": -22.61311912536621, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-163", "gold_norm": "0", "dist_norm": "2", "baseline": { "pred": "dist", "correct": false, "margin": -0.24646282196044922, "lp_gold": -21.461342811584473, "lp_dist": -21.214879989624023, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.1632556915283203, "lp_gold": -22.695805549621582, "lp_dist": -22.859061241149902, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-133", "gold_norm": "2", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 5.3052263259887695, "lp_gold": -19.276253700256348, "lp_dist": -24.581480026245117, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.072274208068848, "lp_gold": -20.15150022506714, "lp_dist": -25.223774433135986, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-161", "gold_norm": "-1", "dist_norm": "return [x for x in strings if x.startswith(prefix)]", "baseline": { "pred": "gold", "correct": true, "margin": 34.15456820592226, "lp_gold": -21.663207292556763, "lp_dist": -55.81777549847902, "n_tokens_gold": 3, "n_tokens_dist": 16 }, "ablated": { "pred": "gold", "correct": true, "margin": 39.88924138105904, "lp_gold": -23.96406078338623, "lp_dist": -63.85330216444527, "n_tokens_gold": 3, "n_tokens_dist": 16 } }, { "ex_id": "openai_humaneval-test-29", "gold_norm": "return [x for x in strings if x.startswith(prefix)]", "dist_norm": "return string.swapcase()", "baseline": { "pred": "gold", "correct": true, "margin": 29.161806431533478, "lp_gold": -23.64715713548503, "lp_dist": -52.80896356701851, "n_tokens_gold": 16, "n_tokens_dist": 7 }, "ablated": { "pred": "gold", "correct": true, "margin": 27.964344927147977, "lp_gold": -21.276650241537936, "lp_dist": -49.24099516868591, "n_tokens_gold": 16, "n_tokens_dist": 7 } }, { "ex_id": "openai_humaneval-test-27", "gold_norm": "return string.swapcase()", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 5.143668868753593, "lp_gold": -18.235226890828926, "lp_dist": -23.37889575958252, "n_tokens_gold": 7, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.050230901411851, "lp_gold": -19.54451664192311, "lp_dist": -22.59474754333496, "n_tokens_gold": 7, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-61", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.488422393798828, "lp_gold": -21.18766164779663, "lp_dist": -17.699239253997803, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.8497695922851562, "lp_gold": -23.119681358337402, "lp_dist": -20.269911766052246, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-104", "gold_norm": "1", "dist_norm": "2.0", "baseline": { "pred": "gold", "correct": true, "margin": 8.777350425720215, "lp_gold": -21.164722442626953, "lp_dist": -29.942072868347168, "n_tokens_gold": 2, "n_tokens_dist": 4 }, "ablated": { "pred": "gold", "correct": true, "margin": 8.533888697624207, "lp_gold": -22.994476318359375, "lp_dist": -31.52836501598358, "n_tokens_gold": 2, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-45", "gold_norm": "2.0", "dist_norm": "101", "baseline": { "pred": "gold", "correct": true, "margin": 5.062639269977808, "lp_gold": -26.36994906887412, "lp_dist": -31.43258833885193, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.08874404430389404, "lp_gold": -27.106158316135406, "lp_dist": -27.017414271831512, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-75", "gold_norm": "101", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -7.495836019515991, "lp_gold": -24.720885515213013, "lp_dist": -17.22504949569702, "n_tokens_gold": 4, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -6.730545282363892, "lp_gold": -24.493883848190308, "lp_dist": -17.763338565826416, "n_tokens_gold": 4, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-6", "gold_norm": "1", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 3.3637475967407227, "lp_gold": -19.45144271850586, "lp_dist": -22.815190315246582, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 3.392610549926758, "lp_gold": -19.24812602996826, "lp_dist": -22.64073657989502, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-136", "gold_norm": "0", "dist_norm": "3", "baseline": { "pred": "dist", "correct": false, "margin": -0.40959739685058594, "lp_gold": -18.25609064102173, "lp_dist": -17.846493244171143, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0858154296875, "lp_gold": -19.771966457366943, "lp_dist": -19.857781887054443, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-33", "gold_norm": "3", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.021329879760742188, "lp_gold": -19.846437454223633, "lp_dist": -19.867767333984375, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.7291717529296875, "lp_gold": -21.070161819458008, "lp_dist": -21.799333572387695, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-44", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.9867525100708, "lp_dist": -22.9867525100708, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.455985069274902, "lp_dist": -22.455985069274902, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-88", "gold_norm": "0", "dist_norm": "if not strings:\n return None\n\n maxlen = max(len(x) for x in strings)\n for s in strings:\n if len(s) == maxlen:\n return s", "baseline": { "pred": "gold", "correct": true, "margin": 91.28553534628554, "lp_gold": -19.742467880249023, "lp_dist": -111.02800322653457, "n_tokens_gold": 2, "n_tokens_dist": 48 }, "ablated": { "pred": "gold", "correct": true, "margin": 100.06942167917623, "lp_gold": -20.66820240020752, "lp_dist": -120.73762407938375, "n_tokens_gold": 2, "n_tokens_dist": 48 } }, { "ex_id": "openai_humaneval-test-12", "gold_norm": "if not strings:\n return None\n\n maxlen = max(len(x) for x in strings)\n for s in strings:\n if len(s) == maxlen:\n return s", "dist_norm": "9", "baseline": { "pred": "dist", "correct": false, "margin": -26.818854912871643, "lp_gold": -51.12532578384332, "lp_dist": -24.30647087097168, "n_tokens_gold": 48, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -32.62358831267056, "lp_gold": -56.370979059740876, "lp_dist": -23.747390747070312, "n_tokens_gold": 48, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-105", "gold_norm": "9", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.5095605850219727, "lp_gold": -20.889235973358154, "lp_dist": -17.37967538833618, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.047847747802734, "lp_gold": -22.52873468399048, "lp_dist": -17.480886936187744, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-121", "gold_norm": "1", "dist_norm": "a, b = x.split(\"/\")\n c, d = n.split(\"/\")\n numerator = int(a) * int(c)\n denom = int(b) * int(d)\n if (numerator/denom == int(numerator/denom)):\n return True\n return False", "baseline": { "pred": "gold", "correct": true, "margin": 97.35365189886329, "lp_gold": -20.428816318511963, "lp_dist": -117.78246821737525, "n_tokens_gold": 2, "n_tokens_dist": 75 }, "ablated": { "pred": "gold", "correct": true, "margin": 109.46097758087126, "lp_gold": -22.744394302368164, "lp_dist": -132.20537188323942, "n_tokens_gold": 2, "n_tokens_dist": 75 } }, { "ex_id": "openai_humaneval-test-144", "gold_norm": "a, b = x.split(\"/\")\n c, d = n.split(\"/\")\n numerator = int(a) * int(c)\n denom = int(b) * int(d)\n if (numerator/denom == int(numerator/denom)):\n return True\n return False", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -39.772302772080536, "lp_gold": -56.47381558564865, "lp_dist": -16.701512813568115, "n_tokens_gold": 75, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -56.80937039689411, "lp_gold": -74.05506884888996, "lp_dist": -17.24569845199585, "n_tokens_gold": 75, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-73", "gold_norm": "1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -19.193764686584473, "lp_dist": -19.193764686584473, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -20.223108291625977, "lp_dist": -20.223108291625977, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-76", "gold_norm": "1", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 4.929764986038208, "lp_gold": -15.572730541229248, "lp_dist": -20.502495527267456, "n_tokens_gold": 2, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 5.190648078918457, "lp_gold": -16.13015127182007, "lp_dist": -21.320799350738525, "n_tokens_gold": 2, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-5", "gold_norm": "-1", "dist_norm": "-1", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -25.595508337020874, "lp_dist": -25.595508337020874, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -30.356321334838867, "lp_dist": -30.356321334838867, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-46", "gold_norm": "-1", "dist_norm": "0", "baseline": { "pred": "dist", "correct": false, "margin": -3.3807437419891357, "lp_gold": -22.49609923362732, "lp_dist": -19.115355491638184, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.765854597091675, "lp_gold": -24.438928365707397, "lp_dist": -21.673073768615723, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-150", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.250157833099365, "lp_dist": -22.250157833099365, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -22.30048179626465, "lp_dist": -22.30048179626465, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-35", "gold_norm": "0", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -21.539960861206055, "lp_dist": -21.539960861206055, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "gold", "correct": true, "margin": 0.0, "lp_gold": -23.488576889038086, "lp_dist": -23.488576889038086, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-153", "gold_norm": "0", "dist_norm": "1", "baseline": { "pred": "dist", "correct": false, "margin": -3.9339113235473633, "lp_gold": -19.433120727539062, "lp_dist": -15.4992094039917, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -5.801613807678223, "lp_gold": -20.252619743347168, "lp_dist": -14.451005935668945, "n_tokens_gold": 2, "n_tokens_dist": 2 } } ], "flip_rows": [ { "ex_id": "openai_humaneval-test-39", "gold_norm": "-1", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.44877076148986816, "lp_gold": -19.808196783065796, "lp_dist": -20.256967544555664, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.15900945663452148, "lp_gold": -22.082778453826904, "lp_dist": -21.923768997192383, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.70906001329422, "lp_gold": -19.808191001415253, "lp_dist": -20.517251014709473, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.2600139379501343, "lp_gold": -20.45387899875641, "lp_dist": -20.713892936706543, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.896909236907959, "lp_gold": -22.914924144744873, "lp_dist": -21.018014907836914, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "gold", "correct": true, "margin": 0.4487643241882324, "lp_gold": -22.053846836090088, "lp_dist": -22.50261116027832, "n_tokens_gold": 3, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.5229349136352539, "lp_gold": -22.082777976989746, "lp_dist": -21.559843063354492, "n_tokens_gold": 3, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-134", "gold_norm": "122", "dist_norm": "1.0", "baseline": { "pred": "gold", "correct": true, "margin": 0.8403654620051384, "lp_gold": -27.5423321723938, "lp_dist": -28.382697634398937, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -2.457980029284954, "lp_gold": -29.874002933502197, "lp_dist": -27.416022904217243, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "patched_self": { "pred": "dist", "correct": false, "margin": -2.6199172660708427, "lp_gold": -27.5423264503479, "lp_dist": -24.922409184277058, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.1696217805147171, "lp_gold": -28.275829792022705, "lp_dist": -28.445451572537422, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -2.7151931561529636, "lp_gold": -29.58313512802124, "lp_dist": -26.867941971868277, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -2.835217572748661, "lp_gold": -29.671478271484375, "lp_dist": -26.836260698735714, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -2.1110972091555595, "lp_gold": -29.87399911880493, "lp_dist": -27.762901909649372, "n_tokens_gold": 4, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-4", "gold_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.45850569046490364, "lp_gold": -22.989404618372987, "lp_dist": -23.44791030883789, "n_tokens_gold": 33, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -8.605282360978833, "lp_gold": -30.1749826020677, "lp_dist": -21.569700241088867, "n_tokens_gold": 33, "n_tokens_dist": 2 }, "patched_self": { "pred": "dist", "correct": false, "margin": -6.219808316351077, "lp_gold": -29.551916813970706, "lp_dist": -23.33210849761963, "n_tokens_gold": 33, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -6.893468835648207, "lp_gold": -30.730242708023695, "lp_dist": -23.83677387237549, "n_tokens_gold": 33, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -8.074352583655013, "lp_gold": -29.26745828128685, "lp_dist": -21.193105697631836, "n_tokens_gold": 33, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -9.070045394459783, "lp_gold": -29.0644835658465, "lp_dist": -19.99443817138672, "n_tokens_gold": 33, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -8.754907058842889, "lp_gold": -30.174978660710565, "lp_dist": -21.420071601867676, "n_tokens_gold": 33, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-139", "gold_norm": "+1", "dist_norm": "0.0", "baseline": { "pred": "gold", "correct": true, "margin": 1.2652113437652588, "lp_gold": -22.48181676864624, "lp_dist": -23.7470281124115, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.1343896389007568, "lp_gold": -23.847239017486572, "lp_dist": -22.712849378585815, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.8445468246936798, "lp_gold": -22.48181438446045, "lp_dist": -23.32636120915413, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -1.0516579747200012, "lp_gold": -24.160611152648926, "lp_dist": -23.108953177928925, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -2.6032419204711914, "lp_gold": -22.94077157974243, "lp_dist": -20.33752965927124, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.2553260326385498, "lp_gold": -24.39438009262085, "lp_dist": -23.1390540599823, "n_tokens_gold": 3, "n_tokens_dist": 4 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.7304326295852661, "lp_gold": -23.84722900390625, "lp_dist": -23.116796374320984, "n_tokens_gold": 3, "n_tokens_dist": 4 } }, { "ex_id": "openai_humaneval-test-19", "gold_norm": "9", "dist_norm": "1", "baseline": { "pred": "gold", "correct": true, "margin": 0.07897377014160156, "lp_gold": -22.66366195678711, "lp_dist": -22.74263572692871, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4677810668945312, "lp_gold": -23.768733024597168, "lp_dist": -22.300951957702637, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.07897186279296875, "lp_gold": -22.663668632507324, "lp_dist": -22.742640495300293, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 0.2928171157836914, "lp_gold": -23.486221313476562, "lp_dist": -23.779038429260254, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.1306524276733398, "lp_gold": -22.043014526367188, "lp_dist": -20.912362098693848, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.335036277770996, "lp_gold": -23.414420127868652, "lp_dist": -22.079383850097656, "n_tokens_gold": 2, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.4677820205688477, "lp_gold": -23.768739700317383, "lp_dist": -22.300957679748535, "n_tokens_gold": 2, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-106", "gold_norm": "+1", "dist_norm": "26", "baseline": { "pred": "gold", "correct": true, "margin": 2.2219808101654053, "lp_gold": -21.646430253982544, "lp_dist": -23.86841106414795, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "ablated": { "pred": "dist", "correct": false, "margin": -1.4264450073242188, "lp_gold": -24.843106269836426, "lp_dist": -23.416661262512207, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "patched_self": { "pred": "gold", "correct": true, "margin": 0.3593015670776367, "lp_gold": -21.646427631378174, "lp_dist": -22.00572919845581, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_time_shuffled": { "pred": "dist", "correct": false, "margin": -2.295396327972412, "lp_gold": -24.159332752227783, "lp_dist": -21.86393642425537, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -1.9753856658935547, "lp_gold": -25.059080600738525, "lp_dist": -23.08369493484497, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -1.8524909019470215, "lp_gold": -25.18923282623291, "lp_dist": -23.33674192428589, "n_tokens_gold": 3, "n_tokens_dist": 3 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -1.216343879699707, "lp_gold": -24.843104362487793, "lp_dist": -23.626760482788086, "n_tokens_gold": 3, "n_tokens_dist": 3 } }, { "ex_id": "openai_humaneval-test-22", "gold_norm": "return [x for x in values if isinstance(x, int)]", "dist_norm": "0", "baseline": { "pred": "gold", "correct": true, "margin": 2.9452141776912413, "lp_gold": -21.9394551262028, "lp_dist": -24.884669303894043, "n_tokens_gold": 16, "n_tokens_dist": 2 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.21686965267372216, "lp_gold": -23.520365059778214, "lp_dist": -23.303495407104492, "n_tokens_gold": 16, "n_tokens_dist": 2 }, "patched_self": { "pred": "gold", "correct": true, "margin": 2.3960064357215742, "lp_gold": -21.941916614205184, "lp_dist": -24.337923049926758, "n_tokens_gold": 16, "n_tokens_dist": 2 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 3.289606383860928, "lp_gold": -21.179310509144443, "lp_dist": -24.46891689300537, "n_tokens_gold": 16, "n_tokens_dist": 2 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -0.6156758015997639, "lp_gold": -24.370192403162264, "lp_dist": -23.7545166015625, "n_tokens_gold": 16, "n_tokens_dist": 2 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -0.07147166511367686, "lp_gold": -25.524140808790435, "lp_dist": -25.452669143676758, "n_tokens_gold": 16, "n_tokens_dist": 2 }, "control_patch_nonshared": { "pred": "dist", "correct": false, "margin": -0.40586215297889794, "lp_gold": -23.520359337732316, "lp_dist": -23.114497184753418, "n_tokens_gold": 16, "n_tokens_dist": 2 } }, { "ex_id": "openai_humaneval-test-45", "gold_norm": "2.0", "dist_norm": "101", "baseline": { "pred": "gold", "correct": true, "margin": 5.062639269977808, "lp_gold": -26.36994906887412, "lp_dist": -31.43258833885193, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "ablated": { "pred": "dist", "correct": false, "margin": -0.08874404430389404, "lp_gold": -27.106158316135406, "lp_dist": -27.017414271831512, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "patched_self": { "pred": "gold", "correct": true, "margin": 5.745390687137842, "lp_gold": -26.369948115199804, "lp_dist": -32.11533880233765, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_time_shuffled": { "pred": "gold", "correct": true, "margin": 3.564100921154022, "lp_gold": -25.915751039981842, "lp_dist": -29.479851961135864, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_shared_randvec": { "pred": "dist", "correct": false, "margin": -0.768334724009037, "lp_gold": -28.143043376505375, "lp_dist": -27.374708652496338, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_rand_subspace": { "pred": "dist", "correct": false, "margin": -0.5243468135595322, "lp_gold": -28.490523919463158, "lp_dist": -27.966177105903625, "n_tokens_gold": 4, "n_tokens_dist": 4 }, "control_patch_nonshared": { "pred": "gold", "correct": true, "margin": 0.04392840713262558, "lp_gold": -27.1061624661088, "lp_dist": -27.150090873241425, "n_tokens_gold": 4, "n_tokens_dist": 4 } } ] }