decodeshare / artifacts /patch_back /results /openanswer /humaneval_pairlogprob.json
Zishan-Shao's picture
Upload folder using huggingface_hub
aa0e435 verified
{
"meta": {
"model": "meta-llama/Llama-2-7b-chat-hf",
"device": "cuda",
"dtype": "fp32",
"layer": 10,
"layers_path": "model.layers",
"seed": 123,
"task": "humaneval",
"eval_mode": "pair_logprob",
"eval_meta": {
"hf_id": "openai_humaneval",
"split": "test",
"n_total": 164
},
"n_eval_loaded": 164,
"n_scanned": 164,
"base_acc_scan": 0.6585365853658537,
"ablt_acc_scan": 0.6402439024390244,
"flips_total": 8,
"flips_used": 8,
"patch_steps": [
0,
1,
2,
3
],
"patch_n_steps": 4,
"Qs_path": "Q_shared_layer10.npy",
"Qs_shape": [
4096,
97
],
"gold_text_prefix": " ",
"dist_text_prefix": " ",
"gold_max_tokens": 128,
"distractor_mode": "next_gold",
"answer_prefix_effective": "\nFinal answer:",
"max_new_tokens_effective": 64,
"run_coeff_controls": false,
"use_benchmark_loader": false,
"hf_id": "openai_humaneval",
"hf_split": "test"
},
"summary_on_flips": {
"patched_self": {
"n": 8,
"rescued": 6,
"rescued_pct": 75.0,
"mean_delta_margin_vs_ablated": 2.1062567234039307,
"median_delta_margin_vs_ablated": 1.8823415040969849
},
"control_time_shuffled": {
"n": 8,
"rescued": 5,
"rescued_pct": 62.5,
"mean_delta_margin_vs_ablated": 1.611517310142517,
"median_delta_margin_vs_ablated": 1.736205816268921
},
"control_shared_randvec": {
"n": 8,
"rescued": 0,
"rescued_pct": 0.0,
"mean_delta_margin_vs_ablated": -0.5279055237770081,
"median_delta_margin_vs_ablated": -0.47387340664863586
},
"control_rand_subspace": {
"n": 8,
"rescued": 1,
"rescued_pct": 12.5,
"mean_delta_margin_vs_ablated": -0.11733363568782806,
"median_delta_margin_vs_ablated": -0.24908697605133057
},
"control_patch_nonshared": {
"n": 8,
"rescued": 1,
"rescued_pct": 12.5,
"mean_delta_margin_vs_ablated": 0.048883724957704544,
"median_delta_margin_vs_ablated": 0.0663357526063919
}
},
"scan_rows": [
{
"ex_id": "openai_humaneval-test-18",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.234334945678711,
"lp_gold": -19.061742782592773,
"lp_dist": -21.296077728271484,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.2801809310913086,
"lp_gold": -21.540138244628906,
"lp_dist": -23.820319175720215,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-31",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -24.834365844726562,
"lp_dist": -24.834365844726562,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -26.02178192138672,
"lp_dist": -26.02178192138672,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-158",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.071807861328125,
"lp_dist": -20.071807861328125,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.830044746398926,
"lp_dist": -20.830044746398926,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-43",
"gold_norm": "0",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.9933905601501465,
"lp_gold": -19.265485286712646,
"lp_dist": -20.258875846862793,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.1291828155517578,
"lp_gold": -20.913329124450684,
"lp_dist": -21.04251194000244,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-39",
"gold_norm": "-1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.44877076148986816,
"lp_gold": -19.808196783065796,
"lp_dist": -20.256967544555664,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.15900945663452148,
"lp_gold": -22.082778453826904,
"lp_dist": -21.923768997192383,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-15",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.9036493301391602,
"lp_gold": -21.704959869384766,
"lp_dist": -23.608609199523926,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.597311019897461,
"lp_gold": -22.718390464782715,
"lp_dist": -24.315701484680176,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-151",
"gold_norm": "0",
"dist_norm": "if not s:\n return []\n\n s_list = []\n\n for letter in s:\n if letter == ',':\n s_list.append(' ')\n else:\n s_list.append(letter)\n\n s_list = \"\".join(s_list)\n return s_list.split()",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 121.7853691404589,
"lp_gold": -18.80201482772827,
"lp_dist": -140.58738396818717,
"n_tokens_gold": 2,
"n_tokens_dist": 79
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 121.62771476514445,
"lp_gold": -21.097187042236328,
"lp_dist": -142.72490180738077,
"n_tokens_gold": 2,
"n_tokens_dist": 79
}
},
{
"ex_id": "openai_humaneval-test-101",
"gold_norm": "if not s:\n return []\n\n s_list = []\n\n for letter in s:\n if letter == ',':\n s_list.append(' ')\n else:\n s_list.append(letter)\n\n s_list = \"\".join(s_list)\n return s_list.split()",
"dist_norm": "return sorted(list(set(l)))",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -31.890593631385258,
"lp_gold": -82.64533315385356,
"lp_dist": -50.7547395224683,
"n_tokens_gold": 79,
"n_tokens_dist": 10
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -36.78753035544207,
"lp_gold": -92.11622255231043,
"lp_dist": -55.32869219686836,
"n_tokens_gold": 79,
"n_tokens_dist": 10
}
},
{
"ex_id": "openai_humaneval-test-34",
"gold_norm": "return sorted(list(set(l)))",
"dist_norm": "for e in l:\n if e >= t:\n return False\n return True",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 49.12246614873311,
"lp_gold": -11.745759465690753,
"lp_dist": -60.868225614423864,
"n_tokens_gold": 10,
"n_tokens_dist": 21
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 45.176009765720664,
"lp_gold": -15.221526360244752,
"lp_dist": -60.397536125965416,
"n_tokens_gold": 10,
"n_tokens_dist": 21
}
},
{
"ex_id": "openai_humaneval-test-52",
"gold_norm": "for e in l:\n if e >= t:\n return False\n return True",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -18.150727199346875,
"lp_gold": -34.72359650018916,
"lp_dist": -16.572869300842285,
"n_tokens_gold": 21,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -18.006402769460692,
"lp_gold": -34.587387361898436,
"lp_dist": -16.580984592437744,
"n_tokens_gold": 21,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-113",
"gold_norm": "1",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.4202613830566406,
"lp_gold": -14.915795803070068,
"lp_dist": -17.33605718612671,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.1670427322387695,
"lp_gold": -17.457444190979004,
"lp_dist": -19.624486923217773,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-119",
"gold_norm": "2",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -19.728981971740723,
"lp_dist": -19.728981971740723,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.550307273864746,
"lp_dist": -20.550307273864746,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-83",
"gold_norm": "2",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.055248260498047,
"lp_gold": -21.746171951293945,
"lp_dist": -19.6909236907959,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.8670825958251953,
"lp_gold": -22.857107162475586,
"lp_dist": -20.99002456665039,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-116",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.5492916107177734,
"lp_gold": -13.82323694229126,
"lp_dist": -16.372528553009033,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.914440155029297,
"lp_gold": -15.276045799255371,
"lp_dist": -19.190485954284668,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-56",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.711745262145996,
"lp_dist": -20.711745262145996,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.994271278381348,
"lp_dist": -22.994271278381348,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-131",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.21437931060791,
"lp_dist": -20.21437931060791,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -21.886200428009033,
"lp_dist": -21.886200428009033,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-1",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -23.005115509033203,
"lp_dist": -23.005115509033203,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.74118995666504,
"lp_dist": -22.74118995666504,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-159",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -4.005736351013184,
"lp_gold": -21.52419900894165,
"lp_dist": -17.518462657928467,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.665071487426758,
"lp_gold": -20.96841859817505,
"lp_dist": -17.30334711074829,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-123",
"gold_norm": "1",
"dist_norm": "return len(string)",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 22.28865046799183,
"lp_gold": -18.46173858642578,
"lp_dist": -40.75038905441761,
"n_tokens_gold": 2,
"n_tokens_dist": 6
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 21.772113933227956,
"lp_gold": -18.439892768859863,
"lp_dist": -40.21200670208782,
"n_tokens_gold": 2,
"n_tokens_dist": 6
}
},
{
"ex_id": "openai_humaneval-test-23",
"gold_norm": "return len(string)",
"dist_norm": "29",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 12.790524657903006,
"lp_gold": -15.688145462336252,
"lp_dist": -28.478670120239258,
"n_tokens_gold": 6,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 13.356130968688376,
"lp_gold": -16.101557362915628,
"lp_dist": -29.457688331604004,
"n_tokens_gold": 6,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-124",
"gold_norm": "29",
"dist_norm": "return encode_cyclic(encode_cyclic(s))",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 50.18577869143337,
"lp_gold": -21.59093189239502,
"lp_dist": -71.77671058382839,
"n_tokens_gold": 3,
"n_tokens_dist": 14
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 56.64552312903106,
"lp_gold": -22.290241479873657,
"lp_dist": -78.93576460890472,
"n_tokens_gold": 3,
"n_tokens_dist": 14
}
},
{
"ex_id": "openai_humaneval-test-38",
"gold_norm": "return encode_cyclic(encode_cyclic(s))",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.7194257393894077,
"lp_gold": -29.536041797739017,
"lp_dist": -26.81661605834961,
"n_tokens_gold": 14,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.8930526294279844,
"lp_gold": -29.922911218600348,
"lp_dist": -26.029858589172363,
"n_tokens_gold": 14,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-84",
"gold_norm": "2",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -19.775970458984375,
"lp_dist": -19.775970458984375,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.496947288513184,
"lp_dist": -20.496947288513184,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-41",
"gold_norm": "2",
"dist_norm": "122",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 8.278376579284668,
"lp_gold": -22.390517234802246,
"lp_dist": -30.668893814086914,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 9.729191303253174,
"lp_gold": -21.189680099487305,
"lp_dist": -30.91887140274048,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-134",
"gold_norm": "122",
"dist_norm": "1.0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.8403654620051384,
"lp_gold": -27.5423321723938,
"lp_dist": -28.382697634398937,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.457980029284954,
"lp_gold": -29.874002933502197,
"lp_dist": -27.416022904217243,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-2",
"gold_norm": "1.0",
"dist_norm": "+2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.533992663025856,
"lp_gold": -19.524202451109886,
"lp_dist": -25.058195114135742,
"n_tokens_gold": 4,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.357768684625626,
"lp_gold": -17.571494430303574,
"lp_dist": -25.9292631149292,
"n_tokens_gold": 4,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-80",
"gold_norm": "+2",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -5.821111679077148,
"lp_gold": -24.557270050048828,
"lp_dist": -18.73615837097168,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.8031325340271,
"lp_gold": -26.246718406677246,
"lp_dist": -20.443585872650146,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-74",
"gold_norm": "2",
"dist_norm": "5",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.2735700607299805,
"lp_gold": -17.82392930984497,
"lp_dist": -19.09749937057495,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.1347217559814453,
"lp_gold": -18.707550048828125,
"lp_dist": -19.84227180480957,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-162",
"gold_norm": "5",
"dist_norm": "8",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.4634990692138672,
"lp_gold": -19.24656867980957,
"lp_dist": -19.710067749023438,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.9875469207763672,
"lp_gold": -21.240734100341797,
"lp_dist": -22.228281021118164,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-138",
"gold_norm": "8",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.1303815841674805,
"lp_gold": -21.265185356140137,
"lp_dist": -20.134803771972656,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.3591747283935547,
"lp_gold": -23.056735038757324,
"lp_dist": -21.69756031036377,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-87",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -21.202168941497803,
"lp_dist": -21.202168941497803,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.12106418609619,
"lp_dist": -22.12106418609619,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-145",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.991208076477051,
"lp_gold": -22.26430034637451,
"lp_dist": -19.27309226989746,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.47486686706543,
"lp_gold": -24.317991256713867,
"lp_dist": -19.843124389648438,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-54",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.8734645843505859,
"lp_gold": -22.476731300354004,
"lp_dist": -23.35019588470459,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.6492033004760742,
"lp_gold": -24.899346351623535,
"lp_dist": -26.54854965209961,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-109",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.480426788330078,
"lp_gold": -19.18559741973877,
"lp_dist": -16.70517063140869,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.790494918823242,
"lp_gold": -21.397884845733643,
"lp_dist": -16.6073899269104,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-102",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -19.423691272735596,
"lp_dist": -19.423691272735596,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.474623680114746,
"lp_dist": -20.474623680114746,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-62",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -17.160552978515625,
"lp_dist": -17.160552978515625,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -15.657040119171143,
"lp_dist": -15.657040119171143,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-129",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -19.168509483337402,
"lp_dist": -19.168509483337402,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -19.092866897583008,
"lp_dist": -19.092866897583008,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-110",
"gold_norm": "1",
"dist_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 56.64819990084652,
"lp_gold": -12.325343608856201,
"lp_dist": -68.97354350970272,
"n_tokens_gold": 2,
"n_tokens_dist": 33
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 60.25500547605043,
"lp_gold": -11.27043867111206,
"lp_dist": -71.52544414716249,
"n_tokens_gold": 2,
"n_tokens_dist": 33
}
},
{
"ex_id": "openai_humaneval-test-4",
"gold_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.45850569046490364,
"lp_gold": -22.989404618372987,
"lp_dist": -23.44791030883789,
"n_tokens_gold": 33,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.605282360978833,
"lp_gold": -30.1749826020677,
"lp_dist": -21.569700241088867,
"n_tokens_gold": 33,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-8",
"gold_norm": "1",
"dist_norm": "10",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.0166356563568115,
"lp_gold": -18.3596510887146,
"lp_dist": -21.37628674507141,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.0787312984466553,
"lp_gold": -18.83952569961548,
"lp_dist": -21.918256998062134,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-97",
"gold_norm": "10",
"dist_norm": "temp_a, temp_b = a, b\n if isinstance(temp_a, str): temp_a = temp_a.replace(',','.')\n if isinstance(temp_b, str): temp_b = temp_b.replace(',','.')\n if float(temp_a) == float(temp_b): return None\n return a if float(temp_a) > float(temp_b) else b",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 94.84300347900216,
"lp_gold": -21.25941014289856,
"lp_dist": -116.10241362190072,
"n_tokens_gold": 3,
"n_tokens_dist": 100
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 98.10631249527887,
"lp_gold": -23.950812816619873,
"lp_dist": -122.05712531189874,
"n_tokens_gold": 3,
"n_tokens_dist": 100
}
},
{
"ex_id": "openai_humaneval-test-137",
"gold_norm": "temp_a, temp_b = a, b\n if isinstance(temp_a, str): temp_a = temp_a.replace(',','.')\n if isinstance(temp_b, str): temp_b = temp_b.replace(',','.')\n if float(temp_a) == float(temp_b): return None\n return a if float(temp_a) > float(temp_b) else b",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -67.42598734535835,
"lp_gold": -83.42734728493355,
"lp_dist": -16.001359939575195,
"n_tokens_gold": 100,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -67.17782856585814,
"lp_gold": -82.82997252108885,
"lp_dist": -15.652143955230713,
"n_tokens_gold": 100,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-96",
"gold_norm": "0",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.6102466583251953,
"lp_gold": -18.234556198120117,
"lp_dist": -21.844802856445312,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.688672065734863,
"lp_gold": -19.75603199005127,
"lp_dist": -24.444704055786133,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-65",
"gold_norm": "-1",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.8471782207489014,
"lp_gold": -22.299251317977905,
"lp_dist": -21.452073097229004,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.24910449981689453,
"lp_gold": -23.086091995239258,
"lp_dist": -22.836987495422363,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-114",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.8158721923828125,
"lp_gold": -22.96446418762207,
"lp_dist": -21.148591995239258,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.1736278533935547,
"lp_gold": -25.378070831298828,
"lp_dist": -22.204442977905273,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-117",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -15.196922779083252,
"lp_dist": -15.196922779083252,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -15.9734787940979,
"lp_dist": -15.9734787940979,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-155",
"gold_norm": "1",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.606046199798584,
"lp_gold": -18.941298484802246,
"lp_dist": -21.54734468460083,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.5056796073913574,
"lp_gold": -20.002915382385254,
"lp_dist": -23.50859498977661,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-37",
"gold_norm": "-1",
"dist_norm": "return sum([math.ceil(sum(arr)/capacity) for arr in grid])",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 86.6937904068327,
"lp_gold": -20.384744882583618,
"lp_dist": -107.07853528941632,
"n_tokens_gold": 3,
"n_tokens_dist": 20
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 87.06271385207947,
"lp_gold": -21.526390075683594,
"lp_dist": -108.58910392776306,
"n_tokens_gold": 3,
"n_tokens_dist": 20
}
},
{
"ex_id": "openai_humaneval-test-115",
"gold_norm": "return sum([math.ceil(sum(arr)/capacity) for arr in grid])",
"dist_norm": "return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 24.88441585241617,
"lp_gold": -46.10199257756449,
"lp_dist": -70.98640842998066,
"n_tokens_gold": 20,
"n_tokens_dist": 24
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 29.905949345506542,
"lp_gold": -44.645553992972054,
"lp_dist": -74.5515033384786,
"n_tokens_gold": 20,
"n_tokens_dist": 24
}
},
{
"ex_id": "openai_humaneval-test-86",
"gold_norm": "return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -14.466940372163663,
"lp_gold": -36.1265668560809,
"lp_dist": -21.659626483917236,
"n_tokens_gold": 24,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -15.646357826058647,
"lp_gold": -39.13540583020904,
"lp_dist": -23.48904800415039,
"n_tokens_gold": 24,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-85",
"gold_norm": "0",
"dist_norm": "return x + y",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.443425707519054,
"lp_gold": -16.92650079727173,
"lp_dist": -23.369926504790783,
"n_tokens_gold": 2,
"n_tokens_dist": 5
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.885157495737076,
"lp_gold": -17.333487033843994,
"lp_dist": -23.21864452958107,
"n_tokens_gold": 2,
"n_tokens_dist": 5
}
},
{
"ex_id": "openai_humaneval-test-53",
"gold_norm": "return x + y",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 9.916433400284404,
"lp_gold": -10.99827282987917,
"lp_dist": -20.914706230163574,
"n_tokens_gold": 5,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 9.794849171276837,
"lp_gold": -10.414248690966815,
"lp_dist": -20.209097862243652,
"n_tokens_gold": 5,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-130",
"gold_norm": "2",
"dist_norm": "+1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.369633674621582,
"lp_gold": -20.579893589019775,
"lp_dist": -22.949527263641357,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.291049003601074,
"lp_gold": -21.44382381439209,
"lp_dist": -24.734872817993164,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-139",
"gold_norm": "+1",
"dist_norm": "0.0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.2652113437652588,
"lp_gold": -22.48181676864624,
"lp_dist": -23.7470281124115,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.1343896389007568,
"lp_gold": -23.847239017486572,
"lp_dist": -22.712849378585815,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-81",
"gold_norm": "0.0",
"dist_norm": "9",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.3655529320240021,
"lp_gold": -22.439337760210037,
"lp_dist": -22.073784828186035,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.13648897409439087,
"lp_gold": -23.279959738254547,
"lp_dist": -23.143470764160156,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-19",
"gold_norm": "9",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.07897377014160156,
"lp_gold": -22.66366195678711,
"lp_dist": -22.74263572692871,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4677810668945312,
"lp_gold": -23.768733024597168,
"lp_dist": -22.300951957702637,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-69",
"gold_norm": "1",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.650723457336426,
"lp_gold": -15.441932678222656,
"lp_dist": -18.092656135559082,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.28917121887207,
"lp_gold": -14.679984092712402,
"lp_dist": -18.969155311584473,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-122",
"gold_norm": "2",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.9139537811279297,
"lp_gold": -18.55172109603882,
"lp_dist": -17.63776731491089,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.36802101135253906,
"lp_gold": -18.541946411132812,
"lp_dist": -18.173925399780273,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-108",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.612565040588379,
"lp_gold": -22.057893753051758,
"lp_dist": -19.44532871246338,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.8640785217285156,
"lp_gold": -23.09704303741455,
"lp_dist": -19.232964515686035,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-48",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -21.96462059020996,
"lp_dist": -21.96462059020996,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -24.449016571044922,
"lp_dist": -24.449016571044922,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-17",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.34698486328125,
"lp_gold": -21.55142116546631,
"lp_dist": -22.89840602874756,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.0376691818237305,
"lp_gold": -20.85892963409424,
"lp_dist": -21.89659881591797,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-95",
"gold_norm": "0",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.8023662567138672,
"lp_gold": -20.999281883239746,
"lp_dist": -19.19691562652588,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.70697021484375,
"lp_gold": -21.713737964630127,
"lp_dist": -20.006767749786377,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-91",
"gold_norm": "2",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -21.7028169631958,
"lp_dist": -21.7028169631958,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.45036220550537,
"lp_dist": -22.45036220550537,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-79",
"gold_norm": "2",
"dist_norm": "+1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.754400730133057,
"lp_gold": -17.42607831954956,
"lp_dist": -24.180479049682617,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.1651506423950195,
"lp_gold": -19.892897605895996,
"lp_dist": -26.058048248291016,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-14",
"gold_norm": "+1",
"dist_norm": "3",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.5707898139953613,
"lp_gold": -29.604674816131592,
"lp_dist": -26.03388500213623,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.903932571411133,
"lp_gold": -27.936614990234375,
"lp_dist": -22.032682418823242,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-77",
"gold_norm": "3",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.08347511291503906,
"lp_gold": -20.79250478744507,
"lp_dist": -20.875979900360107,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.4646930694580078,
"lp_gold": -21.741466522216797,
"lp_dist": -22.206159591674805,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-3",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.1768722534179688,
"lp_gold": -23.416447639465332,
"lp_dist": -21.239575386047363,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.3070783615112305,
"lp_gold": -20.640877723693848,
"lp_dist": -16.333799362182617,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-146",
"gold_norm": "1",
"dist_norm": "if l == sorted(l) or l == sorted(l, reverse=True):\n return True\n return False",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 52.92330244462937,
"lp_gold": -17.06475257873535,
"lp_dist": -69.98805502336472,
"n_tokens_gold": 2,
"n_tokens_dist": 27
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 50.48234937642701,
"lp_gold": -18.093742847442627,
"lp_dist": -68.57609222386964,
"n_tokens_gold": 2,
"n_tokens_dist": 27
}
},
{
"ex_id": "openai_humaneval-test-57",
"gold_norm": "if l == sorted(l) or l == sorted(l, reverse=True):\n return True\n return False",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -19.544386039720848,
"lp_gold": -39.81637967680581,
"lp_dist": -20.27199363708496,
"n_tokens_gold": 27,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -23.215539787866874,
"lp_gold": -47.03316483364324,
"lp_dist": -23.817625045776367,
"n_tokens_gold": 27,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-143",
"gold_norm": "2",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -18.56871795654297,
"lp_dist": -18.56871795654297,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -18.614916801452637,
"lp_dist": -18.614916801452637,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-0",
"gold_norm": "2",
"dist_norm": "+1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.107102870941162,
"lp_gold": -24.08918285369873,
"lp_dist": -31.196285724639893,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.970984935760498,
"lp_gold": -24.804469108581543,
"lp_dist": -31.77545404434204,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-106",
"gold_norm": "+1",
"dist_norm": "26",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.2219808101654053,
"lp_gold": -21.646430253982544,
"lp_dist": -23.86841106414795,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4264450073242188,
"lp_gold": -24.843106269836426,
"lp_dist": -23.416661262512207,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-50",
"gold_norm": "26",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.6608308553695679,
"lp_gold": -26.048061728477478,
"lp_dist": -25.38723087310791,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.0806753635406494,
"lp_gold": -25.965544939041138,
"lp_dist": -23.88486957550049,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-58",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.0009937286376953,
"lp_gold": -17.66666841506958,
"lp_dist": -19.667662143707275,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.9970569610595703,
"lp_gold": -19.63721752166748,
"lp_dist": -21.63427448272705,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-147",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.277851104736328,
"lp_gold": -21.182048797607422,
"lp_dist": -17.904197692871094,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.9687089920043945,
"lp_gold": -23.235244750976562,
"lp_dist": -19.266535758972168,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-160",
"gold_norm": "1",
"dist_norm": "lis = list()\n for i in s.split(' '):\n if i.isdigit():\n lis.append(int(i))\n return n - sum(lis)",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 82.88829492655168,
"lp_gold": -16.034985065460205,
"lp_dist": -98.92327999201189,
"n_tokens_gold": 2,
"n_tokens_dist": 45
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 79.75362334529109,
"lp_gold": -16.3989200592041,
"lp_dist": -96.15254340449519,
"n_tokens_gold": 2,
"n_tokens_dist": 45
}
},
{
"ex_id": "openai_humaneval-test-67",
"gold_norm": "lis = list()\n for i in s.split(' '):\n if i.isdigit():\n lis.append(int(i))\n return n - sum(lis)",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -49.84833694786221,
"lp_gold": -66.66380525917202,
"lp_dist": -16.815468311309814,
"n_tokens_gold": 45,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -48.502373244005184,
"lp_gold": -65.89947893782232,
"lp_dist": -17.39710569381714,
"n_tokens_gold": 45,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-55",
"gold_norm": "2",
"dist_norm": "-1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.25999921560287476,
"lp_gold": -24.11659049987793,
"lp_dist": -23.856591284275055,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.37139952182769775,
"lp_gold": -24.66786289215088,
"lp_dist": -25.039262413978577,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-118",
"gold_norm": "-1",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.466064929962158,
"lp_gold": -23.7993106842041,
"lp_dist": -17.333245754241943,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.198596954345703,
"lp_gold": -26.275648593902588,
"lp_dist": -18.077051639556885,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-154",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -14.774529933929443,
"lp_dist": -14.774529933929443,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -16.673503875732422,
"lp_dist": -16.673503875732422,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-25",
"gold_norm": "1",
"dist_norm": "7",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.230724334716797,
"lp_gold": -22.422992706298828,
"lp_dist": -25.653717041015625,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.7021970748901367,
"lp_gold": -19.818692207336426,
"lp_dist": -23.520889282226562,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-36",
"gold_norm": "7",
"dist_norm": "3",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.6618576049804688,
"lp_gold": -20.638415336608887,
"lp_dist": -23.300272941589355,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.1126937866210938,
"lp_gold": -22.601848602294922,
"lp_dist": -25.714542388916016,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-63",
"gold_norm": "3",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.4981985092163086,
"lp_gold": -23.173019409179688,
"lp_dist": -21.67482089996338,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.069631576538086,
"lp_gold": -24.268108367919922,
"lp_dist": -23.198476791381836,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-132",
"gold_norm": "2",
"dist_norm": "min_number = min(numbers)\n max_number = max(numbers)\n return [(x - min_number) / (max_number - min_number) for x in numbers]",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 68.18116059236388,
"lp_gold": -18.65871238708496,
"lp_dist": -86.83987297944884,
"n_tokens_gold": 2,
"n_tokens_dist": 44
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 67.7937639305901,
"lp_gold": -20.1024751663208,
"lp_dist": -87.8962390969109,
"n_tokens_gold": 2,
"n_tokens_dist": 44
}
},
{
"ex_id": "openai_humaneval-test-21",
"gold_norm": "min_number = min(numbers)\n max_number = max(numbers)\n return [(x - min_number) / (max_number - min_number) for x in numbers]",
"dist_norm": "running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 23.721003586428438,
"lp_gold": -41.06687730719631,
"lp_dist": -64.78788089362475,
"n_tokens_gold": 44,
"n_tokens_dist": 69
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 32.38431728373951,
"lp_gold": -42.50547727979096,
"lp_dist": -74.88979456353047,
"n_tokens_gold": 44,
"n_tokens_dist": 69
}
},
{
"ex_id": "openai_humaneval-test-9",
"gold_norm": "running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -29.805189722821147,
"lp_gold": -50.413360232159526,
"lp_dist": -20.60817050933838,
"n_tokens_gold": 69,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -51.59206402557629,
"lp_gold": -70.91829026001233,
"lp_dist": -19.326226234436035,
"n_tokens_gold": 69,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-72",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.30158805847168,
"lp_gold": -13.809431552886963,
"lp_dist": -19.111019611358643,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 4.650420188903809,
"lp_gold": -15.45082139968872,
"lp_dist": -20.10124158859253,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-128",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.34075927734375,
"lp_gold": -21.099153995513916,
"lp_dist": -17.758394718170166,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -4.7813920974731445,
"lp_gold": -22.55471706390381,
"lp_dist": -17.773324966430664,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-126",
"gold_norm": "1",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 7.619829177856445,
"lp_gold": -16.480793476104736,
"lp_dist": -24.10062265396118,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 7.267373561859131,
"lp_gold": -18.10063362121582,
"lp_dist": -25.36800718307495,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-70",
"gold_norm": "-1",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.621180534362793,
"lp_gold": -23.137717247009277,
"lp_dist": -20.516536712646484,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.558529853820801,
"lp_gold": -25.226367950439453,
"lp_dist": -22.667838096618652,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-40",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -18.633878707885742,
"lp_dist": -18.633878707885742,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -21.059532165527344,
"lp_dist": -21.059532165527344,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-66",
"gold_norm": "0",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.13283920288085938,
"lp_gold": -17.218024253845215,
"lp_dist": -17.085185050964355,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.1121377944946289,
"lp_gold": -18.084847927093506,
"lp_dist": -17.972710132598877,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-71",
"gold_norm": "2",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.2154407501220703,
"lp_gold": -18.285062789916992,
"lp_dist": -17.069622039794922,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.0220775604248047,
"lp_gold": -19.058651447296143,
"lp_dist": -18.036573886871338,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-107",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.3218297958374023,
"lp_gold": -17.986241340637207,
"lp_dist": -20.30807113647461,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.979066848754883,
"lp_gold": -16.702817916870117,
"lp_dist": -19.681884765625,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-32",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.1487884521484375,
"lp_gold": -22.8189640045166,
"lp_dist": -24.96775245666504,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.087052345275879,
"lp_gold": -22.50095844268799,
"lp_dist": -24.588010787963867,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-11",
"gold_norm": "1",
"dist_norm": "2.0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 13.553829669952393,
"lp_gold": -20.95840072631836,
"lp_dist": -34.51223039627075,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 11.875772953033447,
"lp_gold": -22.25577449798584,
"lp_dist": -34.13154745101929,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-47",
"gold_norm": "2.0",
"dist_norm": "return [x for x in values if isinstance(x, int)]",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 22.397947419434786,
"lp_gold": -22.972108960151672,
"lp_dist": -45.37005637958646,
"n_tokens_gold": 4,
"n_tokens_dist": 16
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 21.156506193903624,
"lp_gold": -25.384591817855835,
"lp_dist": -46.54109801175946,
"n_tokens_gold": 4,
"n_tokens_dist": 16
}
},
{
"ex_id": "openai_humaneval-test-22",
"gold_norm": "return [x for x in values if isinstance(x, int)]",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.9452141776912413,
"lp_gold": -21.9394551262028,
"lp_dist": -24.884669303894043,
"n_tokens_gold": 16,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.21686965267372216,
"lp_gold": -23.520365059778214,
"lp_dist": -23.303495407104492,
"n_tokens_gold": 16,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-140",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.853110790252686,
"lp_dist": -22.853110790252686,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -24.441052436828613,
"lp_dist": -24.441052436828613,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-30",
"gold_norm": "0",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.6810879707336426,
"lp_gold": -22.556360244750977,
"lp_dist": -24.23744821548462,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.2648732662200928,
"lp_gold": -24.702096939086914,
"lp_dist": -26.966970205307007,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-10",
"gold_norm": "-1",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -26.95520782470703,
"lp_dist": -26.95520782470703,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -24.403673887252808,
"lp_dist": -24.403673887252808,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-112",
"gold_norm": "-1",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -6.778575420379639,
"lp_gold": -21.71489953994751,
"lp_dist": -14.936324119567871,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.789752006530762,
"lp_gold": -22.717198848724365,
"lp_dist": -15.927446842193604,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-111",
"gold_norm": "1",
"dist_norm": "return ''.join(strings)",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 23.62041076645255,
"lp_gold": -18.88068723678589,
"lp_dist": -42.50109800323844,
"n_tokens_gold": 2,
"n_tokens_dist": 8
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 20.59145486354828,
"lp_gold": -19.92290210723877,
"lp_dist": -40.51435697078705,
"n_tokens_gold": 2,
"n_tokens_dist": 8
}
},
{
"ex_id": "openai_humaneval-test-28",
"gold_norm": "return ''.join(strings)",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.083352681776887,
"lp_gold": -25.75984155811966,
"lp_dist": -24.676488876342773,
"n_tokens_gold": 8,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.2199663514859367,
"lp_gold": -22.93246036817959,
"lp_dist": -24.152426719665527,
"n_tokens_gold": 8,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-135",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -16.229734420776367,
"lp_dist": -16.229734420776367,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -16.821220874786377,
"lp_dist": -16.821220874786377,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-94",
"gold_norm": "1",
"dist_norm": "while b:\n a, b = b, a % b\n return a",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 27.14890766143799,
"lp_gold": -17.605591773986816,
"lp_dist": -44.754499435424805,
"n_tokens_gold": 2,
"n_tokens_dist": 19
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 32.34489552024752,
"lp_gold": -17.090249061584473,
"lp_dist": -49.43514458183199,
"n_tokens_gold": 2,
"n_tokens_dist": 19
}
},
{
"ex_id": "openai_humaneval-test-13",
"gold_norm": "while b:\n a, b = b, a % b\n return a",
"dist_norm": "return [x for x in strings if substring in x]",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 27.657666564541046,
"lp_gold": -24.580688443994973,
"lp_dist": -52.23835500853602,
"n_tokens_gold": 19,
"n_tokens_dist": 13
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 26.75688247599828,
"lp_gold": -29.810987717977696,
"lp_dist": -56.56787019397598,
"n_tokens_gold": 19,
"n_tokens_dist": 13
}
},
{
"ex_id": "openai_humaneval-test-7",
"gold_norm": "return [x for x in strings if substring in x]",
"dist_norm": "return a*a == b*b + c*c or b*b == a*a + c*c or c*c == a*a + b*b",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 60.87681590977445,
"lp_gold": -23.353178574070625,
"lp_dist": -84.22999448384508,
"n_tokens_gold": 13,
"n_tokens_dist": 37
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 83.19848300620379,
"lp_gold": -21.97879713297425,
"lp_dist": -105.17728013917804,
"n_tokens_gold": 13,
"n_tokens_dist": 37
}
},
{
"ex_id": "openai_humaneval-test-157",
"gold_norm": "return a*a == b*b + c*c or b*b == a*a + c*c or c*c == a*a + b*b",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -22.37864390958157,
"lp_gold": -42.11520473111477,
"lp_dist": -19.736560821533203,
"n_tokens_gold": 37,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -21.383852594010023,
"lp_gold": -42.70623838197389,
"lp_dist": -21.322385787963867,
"n_tokens_gold": 37,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-49",
"gold_norm": "2",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 6.1286115646362305,
"lp_gold": -14.368673324584961,
"lp_dist": -20.49728488922119,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.370944976806641,
"lp_gold": -15.46406078338623,
"lp_dist": -21.83500576019287,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-120",
"gold_norm": "0",
"dist_norm": "26",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.207955360412598,
"lp_gold": -19.637977600097656,
"lp_dist": -23.845932960510254,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.25955867767334,
"lp_gold": -19.60517692565918,
"lp_dist": -21.86473560333252,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-89",
"gold_norm": "26",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -7.005466938018799,
"lp_gold": -25.530439853668213,
"lp_dist": -18.524972915649414,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.288686752319336,
"lp_gold": -25.664198875427246,
"lp_dist": -19.37551212310791,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-98",
"gold_norm": "1",
"dist_norm": "2",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.9043750762939453,
"lp_gold": -21.17328405380249,
"lp_dist": -23.077659130096436,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.2112646102905273,
"lp_gold": -22.75312328338623,
"lp_dist": -24.964387893676758,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-100",
"gold_norm": "2",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.1701860427856445,
"lp_gold": -22.101744651794434,
"lp_dist": -19.93155860900879,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.7842912673950195,
"lp_gold": -23.208542823791504,
"lp_dist": -21.424251556396484,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-64",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -17.467191696166992,
"lp_dist": -17.467191696166992,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -17.525324821472168,
"lp_dist": -17.525324821472168,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-103",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.922170639038086,
"lp_gold": -16.26321840286255,
"lp_dist": -19.185389041900635,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.1872482299804688,
"lp_gold": -18.55193328857422,
"lp_dist": -21.739181518554688,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-125",
"gold_norm": "0",
"dist_norm": "return \"\".join([s for s in text if s.lower() not in [\"a\", \"e\", \"i\", \"o\", \"u\"]])",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 48.67979192888379,
"lp_gold": -21.47081232070923,
"lp_dist": -70.15060424959302,
"n_tokens_gold": 2,
"n_tokens_dist": 34
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 55.4912621024414,
"lp_gold": -21.899943828582764,
"lp_dist": -77.39120593102416,
"n_tokens_gold": 2,
"n_tokens_dist": 34
}
},
{
"ex_id": "openai_humaneval-test-51",
"gold_norm": "return \"\".join([s for s in text if s.lower() not in [\"a\", \"e\", \"i\", \"o\", \"u\"]])",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -15.132140384576928,
"lp_gold": -33.735738979242456,
"lp_dist": -18.603598594665527,
"n_tokens_gold": 34,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -17.125044157398406,
"lp_gold": -38.15538339842624,
"lp_dist": -21.030339241027832,
"n_tokens_gold": 34,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-90",
"gold_norm": "1",
"dist_norm": "return [abs(x-y) for x,y in zip(game,guess)]",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 53.59280555654732,
"lp_gold": -17.83990478515625,
"lp_dist": -71.43271034170357,
"n_tokens_gold": 2,
"n_tokens_dist": 21
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 53.46250804614101,
"lp_gold": -17.562668800354004,
"lp_dist": -71.02517684649501,
"n_tokens_gold": 2,
"n_tokens_dist": 21
}
},
{
"ex_id": "openai_humaneval-test-152",
"gold_norm": "return [abs(x-y) for x,y in zip(game,guess)]",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.4121496778377605,
"lp_gold": -12.769094695483773,
"lp_dist": -18.181244373321533,
"n_tokens_gold": 21,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 6.846039920872499,
"lp_gold": -11.968789428645323,
"lp_dist": -18.814829349517822,
"n_tokens_gold": 21,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-24",
"gold_norm": "0",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.762319564819336,
"lp_gold": -21.209729194641113,
"lp_dist": -19.447409629821777,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.050973892211914,
"lp_gold": -23.011981964111328,
"lp_dist": -20.961008071899414,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-20",
"gold_norm": "2",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.5170793533325195,
"lp_gold": -21.059219360351562,
"lp_dist": -18.542140007019043,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.842561721801758,
"lp_gold": -24.164966583251953,
"lp_dist": -20.322404861450195,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-148",
"gold_norm": "1",
"dist_norm": "if isinstance(x,int) and isinstance(y,int) and isinstance(z,int):\n if (x+y==z) or (x+z==y) or (y+z==x):\n return True\n return False\n return False",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 84.42804804586194,
"lp_gold": -12.750839233398438,
"lp_dist": -97.17888727926038,
"n_tokens_gold": 2,
"n_tokens_dist": 63
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 85.3895359868402,
"lp_gold": -12.885719776153564,
"lp_dist": -98.27525576299377,
"n_tokens_gold": 2,
"n_tokens_dist": 63
}
},
{
"ex_id": "openai_humaneval-test-92",
"gold_norm": "if isinstance(x,int) and isinstance(y,int) and isinstance(z,int):\n if (x+y==z) or (x+z==y) or (y+z==x):\n return True\n return False\n return False",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -15.070005173284699,
"lp_gold": -29.284181828100373,
"lp_dist": -14.214176654815674,
"n_tokens_gold": 63,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -16.731429186844252,
"lp_gold": -33.610841837906264,
"lp_dist": -16.87941265106201,
"n_tokens_gold": 63,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-26",
"gold_norm": "1",
"dist_norm": "return len(set(string.lower()))",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 39.93833448708756,
"lp_gold": -21.593355178833008,
"lp_dist": -61.53168966592057,
"n_tokens_gold": 2,
"n_tokens_dist": 10
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 44.07288610804244,
"lp_gold": -19.37473964691162,
"lp_dist": -63.44762575495406,
"n_tokens_gold": 2,
"n_tokens_dist": 10
}
},
{
"ex_id": "openai_humaneval-test-16",
"gold_norm": "return len(set(string.lower()))",
"dist_norm": "3",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.23131456784904003,
"lp_gold": -21.334825424477458,
"lp_dist": -21.103510856628418,
"n_tokens_gold": 10,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 1.4797544392640702,
"lp_gold": -20.777993210882414,
"lp_dist": -22.257747650146484,
"n_tokens_gold": 10,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-142",
"gold_norm": "3",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.5028867721557617,
"lp_gold": -20.428752899169922,
"lp_dist": -18.92586612701416,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.940119743347168,
"lp_gold": -20.01767873764038,
"lp_dist": -19.077558994293213,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-99",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.057826042175293,
"lp_dist": -20.057826042175293,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -21.03865337371826,
"lp_dist": -21.03865337371826,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-127",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.7136526107788086,
"lp_gold": -19.18015956878662,
"lp_dist": -15.466506958007812,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.9199934005737305,
"lp_gold": -19.79235076904297,
"lp_dist": -15.872357368469238,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-156",
"gold_norm": "1",
"dist_norm": "3",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.246551513671875,
"lp_gold": -19.624220848083496,
"lp_dist": -21.87077236175537,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.5017471313476562,
"lp_gold": -21.236965656280518,
"lp_dist": -23.738712787628174,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-141",
"gold_norm": "3",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.184619903564453,
"lp_gold": -20.60717535018921,
"lp_dist": -17.422555446624756,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.860507011413574,
"lp_gold": -21.143166542053223,
"lp_dist": -18.28265953063965,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-78",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.2415552139282227,
"lp_gold": -19.249483585357666,
"lp_dist": -21.49103879928589,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.3341121673583984,
"lp_gold": -19.18357276916504,
"lp_dist": -21.517684936523438,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-68",
"gold_norm": "0",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -1.2041006088256836,
"lp_gold": -18.397984981536865,
"lp_dist": -17.19388437271118,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.9792289733886719,
"lp_gold": -17.703375339508057,
"lp_dist": -16.724146366119385,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-93",
"gold_norm": "2",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -2.300654411315918,
"lp_gold": -20.44687557220459,
"lp_dist": -18.146221160888672,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.002674102783203,
"lp_gold": -19.50245952606201,
"lp_dist": -17.49978542327881,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-60",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.0008726119995117,
"lp_gold": -17.277544021606445,
"lp_dist": -19.278416633605957,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 2.495342254638672,
"lp_gold": -20.217015266418457,
"lp_dist": -22.71235752105713,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-82",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -18.858778476715088,
"lp_dist": -18.858778476715088,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.178369522094727,
"lp_dist": -20.178369522094727,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-59",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -24.201942443847656,
"lp_dist": -24.201942443847656,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -25.577584266662598,
"lp_dist": -25.577584266662598,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-149",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.985135078430176,
"lp_gold": -18.369908809661865,
"lp_dist": -14.38477373123169,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -3.8884334564208984,
"lp_gold": -19.89932155609131,
"lp_dist": -16.01088809967041,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-42",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.361058235168457,
"lp_gold": -16.945013999938965,
"lp_dist": -20.306072235107422,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.650448799133301,
"lp_gold": -18.96267032623291,
"lp_dist": -22.61311912536621,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-163",
"gold_norm": "0",
"dist_norm": "2",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.24646282196044922,
"lp_gold": -21.461342811584473,
"lp_dist": -21.214879989624023,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.1632556915283203,
"lp_gold": -22.695805549621582,
"lp_dist": -22.859061241149902,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-133",
"gold_norm": "2",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.3052263259887695,
"lp_gold": -19.276253700256348,
"lp_dist": -24.581480026245117,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.072274208068848,
"lp_gold": -20.15150022506714,
"lp_dist": -25.223774433135986,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-161",
"gold_norm": "-1",
"dist_norm": "return [x for x in strings if x.startswith(prefix)]",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 34.15456820592226,
"lp_gold": -21.663207292556763,
"lp_dist": -55.81777549847902,
"n_tokens_gold": 3,
"n_tokens_dist": 16
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 39.88924138105904,
"lp_gold": -23.96406078338623,
"lp_dist": -63.85330216444527,
"n_tokens_gold": 3,
"n_tokens_dist": 16
}
},
{
"ex_id": "openai_humaneval-test-29",
"gold_norm": "return [x for x in strings if x.startswith(prefix)]",
"dist_norm": "return string.swapcase()",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 29.161806431533478,
"lp_gold": -23.64715713548503,
"lp_dist": -52.80896356701851,
"n_tokens_gold": 16,
"n_tokens_dist": 7
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 27.964344927147977,
"lp_gold": -21.276650241537936,
"lp_dist": -49.24099516868591,
"n_tokens_gold": 16,
"n_tokens_dist": 7
}
},
{
"ex_id": "openai_humaneval-test-27",
"gold_norm": "return string.swapcase()",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.143668868753593,
"lp_gold": -18.235226890828926,
"lp_dist": -23.37889575958252,
"n_tokens_gold": 7,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.050230901411851,
"lp_gold": -19.54451664192311,
"lp_dist": -22.59474754333496,
"n_tokens_gold": 7,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-61",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.488422393798828,
"lp_gold": -21.18766164779663,
"lp_dist": -17.699239253997803,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.8497695922851562,
"lp_gold": -23.119681358337402,
"lp_dist": -20.269911766052246,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-104",
"gold_norm": "1",
"dist_norm": "2.0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 8.777350425720215,
"lp_gold": -21.164722442626953,
"lp_dist": -29.942072868347168,
"n_tokens_gold": 2,
"n_tokens_dist": 4
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 8.533888697624207,
"lp_gold": -22.994476318359375,
"lp_dist": -31.52836501598358,
"n_tokens_gold": 2,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-45",
"gold_norm": "2.0",
"dist_norm": "101",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.062639269977808,
"lp_gold": -26.36994906887412,
"lp_dist": -31.43258833885193,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.08874404430389404,
"lp_gold": -27.106158316135406,
"lp_dist": -27.017414271831512,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-75",
"gold_norm": "101",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -7.495836019515991,
"lp_gold": -24.720885515213013,
"lp_dist": -17.22504949569702,
"n_tokens_gold": 4,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -6.730545282363892,
"lp_gold": -24.493883848190308,
"lp_dist": -17.763338565826416,
"n_tokens_gold": 4,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-6",
"gold_norm": "1",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 3.3637475967407227,
"lp_gold": -19.45144271850586,
"lp_dist": -22.815190315246582,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 3.392610549926758,
"lp_gold": -19.24812602996826,
"lp_dist": -22.64073657989502,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-136",
"gold_norm": "0",
"dist_norm": "3",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -0.40959739685058594,
"lp_gold": -18.25609064102173,
"lp_dist": -17.846493244171143,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0858154296875,
"lp_gold": -19.771966457366943,
"lp_dist": -19.857781887054443,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-33",
"gold_norm": "3",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.021329879760742188,
"lp_gold": -19.846437454223633,
"lp_dist": -19.867767333984375,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.7291717529296875,
"lp_gold": -21.070161819458008,
"lp_dist": -21.799333572387695,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-44",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.9867525100708,
"lp_dist": -22.9867525100708,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.455985069274902,
"lp_dist": -22.455985069274902,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-88",
"gold_norm": "0",
"dist_norm": "if not strings:\n return None\n\n maxlen = max(len(x) for x in strings)\n for s in strings:\n if len(s) == maxlen:\n return s",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 91.28553534628554,
"lp_gold": -19.742467880249023,
"lp_dist": -111.02800322653457,
"n_tokens_gold": 2,
"n_tokens_dist": 48
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 100.06942167917623,
"lp_gold": -20.66820240020752,
"lp_dist": -120.73762407938375,
"n_tokens_gold": 2,
"n_tokens_dist": 48
}
},
{
"ex_id": "openai_humaneval-test-12",
"gold_norm": "if not strings:\n return None\n\n maxlen = max(len(x) for x in strings)\n for s in strings:\n if len(s) == maxlen:\n return s",
"dist_norm": "9",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -26.818854912871643,
"lp_gold": -51.12532578384332,
"lp_dist": -24.30647087097168,
"n_tokens_gold": 48,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -32.62358831267056,
"lp_gold": -56.370979059740876,
"lp_dist": -23.747390747070312,
"n_tokens_gold": 48,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-105",
"gold_norm": "9",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.5095605850219727,
"lp_gold": -20.889235973358154,
"lp_dist": -17.37967538833618,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.047847747802734,
"lp_gold": -22.52873468399048,
"lp_dist": -17.480886936187744,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-121",
"gold_norm": "1",
"dist_norm": "a, b = x.split(\"/\")\n c, d = n.split(\"/\")\n numerator = int(a) * int(c)\n denom = int(b) * int(d)\n if (numerator/denom == int(numerator/denom)):\n return True\n return False",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 97.35365189886329,
"lp_gold": -20.428816318511963,
"lp_dist": -117.78246821737525,
"n_tokens_gold": 2,
"n_tokens_dist": 75
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 109.46097758087126,
"lp_gold": -22.744394302368164,
"lp_dist": -132.20537188323942,
"n_tokens_gold": 2,
"n_tokens_dist": 75
}
},
{
"ex_id": "openai_humaneval-test-144",
"gold_norm": "a, b = x.split(\"/\")\n c, d = n.split(\"/\")\n numerator = int(a) * int(c)\n denom = int(b) * int(d)\n if (numerator/denom == int(numerator/denom)):\n return True\n return False",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -39.772302772080536,
"lp_gold": -56.47381558564865,
"lp_dist": -16.701512813568115,
"n_tokens_gold": 75,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -56.80937039689411,
"lp_gold": -74.05506884888996,
"lp_dist": -17.24569845199585,
"n_tokens_gold": 75,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-73",
"gold_norm": "1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -19.193764686584473,
"lp_dist": -19.193764686584473,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -20.223108291625977,
"lp_dist": -20.223108291625977,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-76",
"gold_norm": "1",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 4.929764986038208,
"lp_gold": -15.572730541229248,
"lp_dist": -20.502495527267456,
"n_tokens_gold": 2,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 5.190648078918457,
"lp_gold": -16.13015127182007,
"lp_dist": -21.320799350738525,
"n_tokens_gold": 2,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-5",
"gold_norm": "-1",
"dist_norm": "-1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -25.595508337020874,
"lp_dist": -25.595508337020874,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -30.356321334838867,
"lp_dist": -30.356321334838867,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-46",
"gold_norm": "-1",
"dist_norm": "0",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.3807437419891357,
"lp_gold": -22.49609923362732,
"lp_dist": -19.115355491638184,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.765854597091675,
"lp_gold": -24.438928365707397,
"lp_dist": -21.673073768615723,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-150",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.250157833099365,
"lp_dist": -22.250157833099365,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -22.30048179626465,
"lp_dist": -22.30048179626465,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-35",
"gold_norm": "0",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -21.539960861206055,
"lp_dist": -21.539960861206055,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "gold",
"correct": true,
"margin": 0.0,
"lp_gold": -23.488576889038086,
"lp_dist": -23.488576889038086,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-153",
"gold_norm": "0",
"dist_norm": "1",
"baseline": {
"pred": "dist",
"correct": false,
"margin": -3.9339113235473633,
"lp_gold": -19.433120727539062,
"lp_dist": -15.4992094039917,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -5.801613807678223,
"lp_gold": -20.252619743347168,
"lp_dist": -14.451005935668945,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
}
],
"flip_rows": [
{
"ex_id": "openai_humaneval-test-39",
"gold_norm": "-1",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.44877076148986816,
"lp_gold": -19.808196783065796,
"lp_dist": -20.256967544555664,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.15900945663452148,
"lp_gold": -22.082778453826904,
"lp_dist": -21.923768997192383,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.70906001329422,
"lp_gold": -19.808191001415253,
"lp_dist": -20.517251014709473,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.2600139379501343,
"lp_gold": -20.45387899875641,
"lp_dist": -20.713892936706543,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.896909236907959,
"lp_gold": -22.914924144744873,
"lp_dist": -21.018014907836914,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "gold",
"correct": true,
"margin": 0.4487643241882324,
"lp_gold": -22.053846836090088,
"lp_dist": -22.50261116027832,
"n_tokens_gold": 3,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.5229349136352539,
"lp_gold": -22.082777976989746,
"lp_dist": -21.559843063354492,
"n_tokens_gold": 3,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-134",
"gold_norm": "122",
"dist_norm": "1.0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.8403654620051384,
"lp_gold": -27.5423321723938,
"lp_dist": -28.382697634398937,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -2.457980029284954,
"lp_gold": -29.874002933502197,
"lp_dist": -27.416022904217243,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -2.6199172660708427,
"lp_gold": -27.5423264503479,
"lp_dist": -24.922409184277058,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.1696217805147171,
"lp_gold": -28.275829792022705,
"lp_dist": -28.445451572537422,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -2.7151931561529636,
"lp_gold": -29.58313512802124,
"lp_dist": -26.867941971868277,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -2.835217572748661,
"lp_gold": -29.671478271484375,
"lp_dist": -26.836260698735714,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -2.1110972091555595,
"lp_gold": -29.87399911880493,
"lp_dist": -27.762901909649372,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-4",
"gold_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.45850569046490364,
"lp_gold": -22.989404618372987,
"lp_dist": -23.44791030883789,
"n_tokens_gold": 33,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -8.605282360978833,
"lp_gold": -30.1749826020677,
"lp_dist": -21.569700241088867,
"n_tokens_gold": 33,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "dist",
"correct": false,
"margin": -6.219808316351077,
"lp_gold": -29.551916813970706,
"lp_dist": -23.33210849761963,
"n_tokens_gold": 33,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -6.893468835648207,
"lp_gold": -30.730242708023695,
"lp_dist": -23.83677387237549,
"n_tokens_gold": 33,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -8.074352583655013,
"lp_gold": -29.26745828128685,
"lp_dist": -21.193105697631836,
"n_tokens_gold": 33,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -9.070045394459783,
"lp_gold": -29.0644835658465,
"lp_dist": -19.99443817138672,
"n_tokens_gold": 33,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -8.754907058842889,
"lp_gold": -30.174978660710565,
"lp_dist": -21.420071601867676,
"n_tokens_gold": 33,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-139",
"gold_norm": "+1",
"dist_norm": "0.0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 1.2652113437652588,
"lp_gold": -22.48181676864624,
"lp_dist": -23.7470281124115,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.1343896389007568,
"lp_gold": -23.847239017486572,
"lp_dist": -22.712849378585815,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.8445468246936798,
"lp_gold": -22.48181438446045,
"lp_dist": -23.32636120915413,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -1.0516579747200012,
"lp_gold": -24.160611152648926,
"lp_dist": -23.108953177928925,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -2.6032419204711914,
"lp_gold": -22.94077157974243,
"lp_dist": -20.33752965927124,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.2553260326385498,
"lp_gold": -24.39438009262085,
"lp_dist": -23.1390540599823,
"n_tokens_gold": 3,
"n_tokens_dist": 4
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.7304326295852661,
"lp_gold": -23.84722900390625,
"lp_dist": -23.116796374320984,
"n_tokens_gold": 3,
"n_tokens_dist": 4
}
},
{
"ex_id": "openai_humaneval-test-19",
"gold_norm": "9",
"dist_norm": "1",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 0.07897377014160156,
"lp_gold": -22.66366195678711,
"lp_dist": -22.74263572692871,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4677810668945312,
"lp_gold": -23.768733024597168,
"lp_dist": -22.300951957702637,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.07897186279296875,
"lp_gold": -22.663668632507324,
"lp_dist": -22.742640495300293,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 0.2928171157836914,
"lp_gold": -23.486221313476562,
"lp_dist": -23.779038429260254,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.1306524276733398,
"lp_gold": -22.043014526367188,
"lp_dist": -20.912362098693848,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.335036277770996,
"lp_gold": -23.414420127868652,
"lp_dist": -22.079383850097656,
"n_tokens_gold": 2,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.4677820205688477,
"lp_gold": -23.768739700317383,
"lp_dist": -22.300957679748535,
"n_tokens_gold": 2,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-106",
"gold_norm": "+1",
"dist_norm": "26",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.2219808101654053,
"lp_gold": -21.646430253982544,
"lp_dist": -23.86841106414795,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -1.4264450073242188,
"lp_gold": -24.843106269836426,
"lp_dist": -23.416661262512207,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 0.3593015670776367,
"lp_gold": -21.646427631378174,
"lp_dist": -22.00572919845581,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_time_shuffled": {
"pred": "dist",
"correct": false,
"margin": -2.295396327972412,
"lp_gold": -24.159332752227783,
"lp_dist": -21.86393642425537,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -1.9753856658935547,
"lp_gold": -25.059080600738525,
"lp_dist": -23.08369493484497,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -1.8524909019470215,
"lp_gold": -25.18923282623291,
"lp_dist": -23.33674192428589,
"n_tokens_gold": 3,
"n_tokens_dist": 3
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -1.216343879699707,
"lp_gold": -24.843104362487793,
"lp_dist": -23.626760482788086,
"n_tokens_gold": 3,
"n_tokens_dist": 3
}
},
{
"ex_id": "openai_humaneval-test-22",
"gold_norm": "return [x for x in values if isinstance(x, int)]",
"dist_norm": "0",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 2.9452141776912413,
"lp_gold": -21.9394551262028,
"lp_dist": -24.884669303894043,
"n_tokens_gold": 16,
"n_tokens_dist": 2
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.21686965267372216,
"lp_gold": -23.520365059778214,
"lp_dist": -23.303495407104492,
"n_tokens_gold": 16,
"n_tokens_dist": 2
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 2.3960064357215742,
"lp_gold": -21.941916614205184,
"lp_dist": -24.337923049926758,
"n_tokens_gold": 16,
"n_tokens_dist": 2
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 3.289606383860928,
"lp_gold": -21.179310509144443,
"lp_dist": -24.46891689300537,
"n_tokens_gold": 16,
"n_tokens_dist": 2
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -0.6156758015997639,
"lp_gold": -24.370192403162264,
"lp_dist": -23.7545166015625,
"n_tokens_gold": 16,
"n_tokens_dist": 2
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -0.07147166511367686,
"lp_gold": -25.524140808790435,
"lp_dist": -25.452669143676758,
"n_tokens_gold": 16,
"n_tokens_dist": 2
},
"control_patch_nonshared": {
"pred": "dist",
"correct": false,
"margin": -0.40586215297889794,
"lp_gold": -23.520359337732316,
"lp_dist": -23.114497184753418,
"n_tokens_gold": 16,
"n_tokens_dist": 2
}
},
{
"ex_id": "openai_humaneval-test-45",
"gold_norm": "2.0",
"dist_norm": "101",
"baseline": {
"pred": "gold",
"correct": true,
"margin": 5.062639269977808,
"lp_gold": -26.36994906887412,
"lp_dist": -31.43258833885193,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"ablated": {
"pred": "dist",
"correct": false,
"margin": -0.08874404430389404,
"lp_gold": -27.106158316135406,
"lp_dist": -27.017414271831512,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"patched_self": {
"pred": "gold",
"correct": true,
"margin": 5.745390687137842,
"lp_gold": -26.369948115199804,
"lp_dist": -32.11533880233765,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_time_shuffled": {
"pred": "gold",
"correct": true,
"margin": 3.564100921154022,
"lp_gold": -25.915751039981842,
"lp_dist": -29.479851961135864,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_shared_randvec": {
"pred": "dist",
"correct": false,
"margin": -0.768334724009037,
"lp_gold": -28.143043376505375,
"lp_dist": -27.374708652496338,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_rand_subspace": {
"pred": "dist",
"correct": false,
"margin": -0.5243468135595322,
"lp_gold": -28.490523919463158,
"lp_dist": -27.966177105903625,
"n_tokens_gold": 4,
"n_tokens_dist": 4
},
"control_patch_nonshared": {
"pred": "gold",
"correct": true,
"margin": 0.04392840713262558,
"lp_gold": -27.1061624661088,
"lp_dist": -27.150090873241425,
"n_tokens_gold": 4,
"n_tokens_dist": 4
}
}
]
}