| { |
| "meta": { |
| "model": "meta-llama/Llama-2-7b-chat-hf", |
| "device": "cuda", |
| "dtype": "fp32", |
| "layer": 10, |
| "layers_path": "model.layers", |
| "seed": 123, |
| "task": "humaneval", |
| "eval_mode": "pair_logprob", |
| "eval_meta": { |
| "hf_id": "openai_humaneval", |
| "split": "test", |
| "n_total": 164 |
| }, |
| "n_eval_loaded": 164, |
| "n_scanned": 164, |
| "base_acc_scan": 0.6585365853658537, |
| "ablt_acc_scan": 0.6402439024390244, |
| "flips_total": 8, |
| "flips_used": 8, |
| "patch_steps": [ |
| 0, |
| 1, |
| 2, |
| 3 |
| ], |
| "patch_n_steps": 4, |
| "Qs_path": "Q_shared_layer10.npy", |
| "Qs_shape": [ |
| 4096, |
| 97 |
| ], |
| "gold_text_prefix": " ", |
| "dist_text_prefix": " ", |
| "gold_max_tokens": 128, |
| "distractor_mode": "next_gold", |
| "answer_prefix_effective": "\nFinal answer:", |
| "max_new_tokens_effective": 64, |
| "run_coeff_controls": false, |
| "use_benchmark_loader": false, |
| "hf_id": "openai_humaneval", |
| "hf_split": "test" |
| }, |
| "summary_on_flips": { |
| "patched_self": { |
| "n": 8, |
| "rescued": 6, |
| "rescued_pct": 75.0, |
| "mean_delta_margin_vs_ablated": 2.1062567234039307, |
| "median_delta_margin_vs_ablated": 1.8823415040969849 |
| }, |
| "control_time_shuffled": { |
| "n": 8, |
| "rescued": 5, |
| "rescued_pct": 62.5, |
| "mean_delta_margin_vs_ablated": 1.611517310142517, |
| "median_delta_margin_vs_ablated": 1.736205816268921 |
| }, |
| "control_shared_randvec": { |
| "n": 8, |
| "rescued": 0, |
| "rescued_pct": 0.0, |
| "mean_delta_margin_vs_ablated": -0.5279055237770081, |
| "median_delta_margin_vs_ablated": -0.47387340664863586 |
| }, |
| "control_rand_subspace": { |
| "n": 8, |
| "rescued": 1, |
| "rescued_pct": 12.5, |
| "mean_delta_margin_vs_ablated": -0.11733363568782806, |
| "median_delta_margin_vs_ablated": -0.24908697605133057 |
| }, |
| "control_patch_nonshared": { |
| "n": 8, |
| "rescued": 1, |
| "rescued_pct": 12.5, |
| "mean_delta_margin_vs_ablated": 0.048883724957704544, |
| "median_delta_margin_vs_ablated": 0.0663357526063919 |
| } |
| }, |
| "scan_rows": [ |
| { |
| "ex_id": "openai_humaneval-test-18", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.234334945678711, |
| "lp_gold": -19.061742782592773, |
| "lp_dist": -21.296077728271484, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2801809310913086, |
| "lp_gold": -21.540138244628906, |
| "lp_dist": -23.820319175720215, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-31", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -24.834365844726562, |
| "lp_dist": -24.834365844726562, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -26.02178192138672, |
| "lp_dist": -26.02178192138672, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-158", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.071807861328125, |
| "lp_dist": -20.071807861328125, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.830044746398926, |
| "lp_dist": -20.830044746398926, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-43", |
| "gold_norm": "0", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9933905601501465, |
| "lp_gold": -19.265485286712646, |
| "lp_dist": -20.258875846862793, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.1291828155517578, |
| "lp_gold": -20.913329124450684, |
| "lp_dist": -21.04251194000244, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-39", |
| "gold_norm": "-1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.44877076148986816, |
| "lp_gold": -19.808196783065796, |
| "lp_dist": -20.256967544555664, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.15900945663452148, |
| "lp_gold": -22.082778453826904, |
| "lp_dist": -21.923768997192383, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-15", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.9036493301391602, |
| "lp_gold": -21.704959869384766, |
| "lp_dist": -23.608609199523926, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.597311019897461, |
| "lp_gold": -22.718390464782715, |
| "lp_dist": -24.315701484680176, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-151", |
| "gold_norm": "0", |
| "dist_norm": "if not s:\n return []\n\n s_list = []\n\n for letter in s:\n if letter == ',':\n s_list.append(' ')\n else:\n s_list.append(letter)\n\n s_list = \"\".join(s_list)\n return s_list.split()", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 121.7853691404589, |
| "lp_gold": -18.80201482772827, |
| "lp_dist": -140.58738396818717, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 79 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 121.62771476514445, |
| "lp_gold": -21.097187042236328, |
| "lp_dist": -142.72490180738077, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 79 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-101", |
| "gold_norm": "if not s:\n return []\n\n s_list = []\n\n for letter in s:\n if letter == ',':\n s_list.append(' ')\n else:\n s_list.append(letter)\n\n s_list = \"\".join(s_list)\n return s_list.split()", |
| "dist_norm": "return sorted(list(set(l)))", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -31.890593631385258, |
| "lp_gold": -82.64533315385356, |
| "lp_dist": -50.7547395224683, |
| "n_tokens_gold": 79, |
| "n_tokens_dist": 10 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -36.78753035544207, |
| "lp_gold": -92.11622255231043, |
| "lp_dist": -55.32869219686836, |
| "n_tokens_gold": 79, |
| "n_tokens_dist": 10 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-34", |
| "gold_norm": "return sorted(list(set(l)))", |
| "dist_norm": "for e in l:\n if e >= t:\n return False\n return True", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 49.12246614873311, |
| "lp_gold": -11.745759465690753, |
| "lp_dist": -60.868225614423864, |
| "n_tokens_gold": 10, |
| "n_tokens_dist": 21 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 45.176009765720664, |
| "lp_gold": -15.221526360244752, |
| "lp_dist": -60.397536125965416, |
| "n_tokens_gold": 10, |
| "n_tokens_dist": 21 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-52", |
| "gold_norm": "for e in l:\n if e >= t:\n return False\n return True", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -18.150727199346875, |
| "lp_gold": -34.72359650018916, |
| "lp_dist": -16.572869300842285, |
| "n_tokens_gold": 21, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -18.006402769460692, |
| "lp_gold": -34.587387361898436, |
| "lp_dist": -16.580984592437744, |
| "n_tokens_gold": 21, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-113", |
| "gold_norm": "1", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.4202613830566406, |
| "lp_gold": -14.915795803070068, |
| "lp_dist": -17.33605718612671, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.1670427322387695, |
| "lp_gold": -17.457444190979004, |
| "lp_dist": -19.624486923217773, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-119", |
| "gold_norm": "2", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -19.728981971740723, |
| "lp_dist": -19.728981971740723, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.550307273864746, |
| "lp_dist": -20.550307273864746, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-83", |
| "gold_norm": "2", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.055248260498047, |
| "lp_gold": -21.746171951293945, |
| "lp_dist": -19.6909236907959, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8670825958251953, |
| "lp_gold": -22.857107162475586, |
| "lp_dist": -20.99002456665039, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-116", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.5492916107177734, |
| "lp_gold": -13.82323694229126, |
| "lp_dist": -16.372528553009033, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.914440155029297, |
| "lp_gold": -15.276045799255371, |
| "lp_dist": -19.190485954284668, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-56", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.711745262145996, |
| "lp_dist": -20.711745262145996, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.994271278381348, |
| "lp_dist": -22.994271278381348, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-131", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.21437931060791, |
| "lp_dist": -20.21437931060791, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -21.886200428009033, |
| "lp_dist": -21.886200428009033, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-1", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -23.005115509033203, |
| "lp_dist": -23.005115509033203, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.74118995666504, |
| "lp_dist": -22.74118995666504, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-159", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.005736351013184, |
| "lp_gold": -21.52419900894165, |
| "lp_dist": -17.518462657928467, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.665071487426758, |
| "lp_gold": -20.96841859817505, |
| "lp_dist": -17.30334711074829, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-123", |
| "gold_norm": "1", |
| "dist_norm": "return len(string)", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 22.28865046799183, |
| "lp_gold": -18.46173858642578, |
| "lp_dist": -40.75038905441761, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 6 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 21.772113933227956, |
| "lp_gold": -18.439892768859863, |
| "lp_dist": -40.21200670208782, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 6 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-23", |
| "gold_norm": "return len(string)", |
| "dist_norm": "29", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 12.790524657903006, |
| "lp_gold": -15.688145462336252, |
| "lp_dist": -28.478670120239258, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.356130968688376, |
| "lp_gold": -16.101557362915628, |
| "lp_dist": -29.457688331604004, |
| "n_tokens_gold": 6, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-124", |
| "gold_norm": "29", |
| "dist_norm": "return encode_cyclic(encode_cyclic(s))", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 50.18577869143337, |
| "lp_gold": -21.59093189239502, |
| "lp_dist": -71.77671058382839, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 14 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 56.64552312903106, |
| "lp_gold": -22.290241479873657, |
| "lp_dist": -78.93576460890472, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 14 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-38", |
| "gold_norm": "return encode_cyclic(encode_cyclic(s))", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.7194257393894077, |
| "lp_gold": -29.536041797739017, |
| "lp_dist": -26.81661605834961, |
| "n_tokens_gold": 14, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.8930526294279844, |
| "lp_gold": -29.922911218600348, |
| "lp_dist": -26.029858589172363, |
| "n_tokens_gold": 14, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-84", |
| "gold_norm": "2", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -19.775970458984375, |
| "lp_dist": -19.775970458984375, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.496947288513184, |
| "lp_dist": -20.496947288513184, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-41", |
| "gold_norm": "2", |
| "dist_norm": "122", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.278376579284668, |
| "lp_gold": -22.390517234802246, |
| "lp_dist": -30.668893814086914, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.729191303253174, |
| "lp_gold": -21.189680099487305, |
| "lp_dist": -30.91887140274048, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-134", |
| "gold_norm": "122", |
| "dist_norm": "1.0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8403654620051384, |
| "lp_gold": -27.5423321723938, |
| "lp_dist": -28.382697634398937, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.457980029284954, |
| "lp_gold": -29.874002933502197, |
| "lp_dist": -27.416022904217243, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-2", |
| "gold_norm": "1.0", |
| "dist_norm": "+2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.533992663025856, |
| "lp_gold": -19.524202451109886, |
| "lp_dist": -25.058195114135742, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.357768684625626, |
| "lp_gold": -17.571494430303574, |
| "lp_dist": -25.9292631149292, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-80", |
| "gold_norm": "+2", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.821111679077148, |
| "lp_gold": -24.557270050048828, |
| "lp_dist": -18.73615837097168, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.8031325340271, |
| "lp_gold": -26.246718406677246, |
| "lp_dist": -20.443585872650146, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-74", |
| "gold_norm": "2", |
| "dist_norm": "5", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.2735700607299805, |
| "lp_gold": -17.82392930984497, |
| "lp_dist": -19.09749937057495, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.1347217559814453, |
| "lp_gold": -18.707550048828125, |
| "lp_dist": -19.84227180480957, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-162", |
| "gold_norm": "5", |
| "dist_norm": "8", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.4634990692138672, |
| "lp_gold": -19.24656867980957, |
| "lp_dist": -19.710067749023438, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.9875469207763672, |
| "lp_gold": -21.240734100341797, |
| "lp_dist": -22.228281021118164, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-138", |
| "gold_norm": "8", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1303815841674805, |
| "lp_gold": -21.265185356140137, |
| "lp_dist": -20.134803771972656, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.3591747283935547, |
| "lp_gold": -23.056735038757324, |
| "lp_dist": -21.69756031036377, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-87", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -21.202168941497803, |
| "lp_dist": -21.202168941497803, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.12106418609619, |
| "lp_dist": -22.12106418609619, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-145", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.991208076477051, |
| "lp_gold": -22.26430034637451, |
| "lp_dist": -19.27309226989746, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.47486686706543, |
| "lp_gold": -24.317991256713867, |
| "lp_dist": -19.843124389648438, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-54", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8734645843505859, |
| "lp_gold": -22.476731300354004, |
| "lp_dist": -23.35019588470459, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.6492033004760742, |
| "lp_gold": -24.899346351623535, |
| "lp_dist": -26.54854965209961, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-109", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.480426788330078, |
| "lp_gold": -19.18559741973877, |
| "lp_dist": -16.70517063140869, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.790494918823242, |
| "lp_gold": -21.397884845733643, |
| "lp_dist": -16.6073899269104, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-102", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -19.423691272735596, |
| "lp_dist": -19.423691272735596, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.474623680114746, |
| "lp_dist": -20.474623680114746, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-62", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -17.160552978515625, |
| "lp_dist": -17.160552978515625, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -15.657040119171143, |
| "lp_dist": -15.657040119171143, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-129", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -19.168509483337402, |
| "lp_dist": -19.168509483337402, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -19.092866897583008, |
| "lp_dist": -19.092866897583008, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-110", |
| "gold_norm": "1", |
| "dist_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 56.64819990084652, |
| "lp_gold": -12.325343608856201, |
| "lp_dist": -68.97354350970272, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 33 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 60.25500547605043, |
| "lp_gold": -11.27043867111206, |
| "lp_dist": -71.52544414716249, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 33 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-4", |
| "gold_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.45850569046490364, |
| "lp_gold": -22.989404618372987, |
| "lp_dist": -23.44791030883789, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.605282360978833, |
| "lp_gold": -30.1749826020677, |
| "lp_dist": -21.569700241088867, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-8", |
| "gold_norm": "1", |
| "dist_norm": "10", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.0166356563568115, |
| "lp_gold": -18.3596510887146, |
| "lp_dist": -21.37628674507141, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.0787312984466553, |
| "lp_gold": -18.83952569961548, |
| "lp_dist": -21.918256998062134, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-97", |
| "gold_norm": "10", |
| "dist_norm": "temp_a, temp_b = a, b\n if isinstance(temp_a, str): temp_a = temp_a.replace(',','.')\n if isinstance(temp_b, str): temp_b = temp_b.replace(',','.')\n if float(temp_a) == float(temp_b): return None\n return a if float(temp_a) > float(temp_b) else b", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 94.84300347900216, |
| "lp_gold": -21.25941014289856, |
| "lp_dist": -116.10241362190072, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 100 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 98.10631249527887, |
| "lp_gold": -23.950812816619873, |
| "lp_dist": -122.05712531189874, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 100 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-137", |
| "gold_norm": "temp_a, temp_b = a, b\n if isinstance(temp_a, str): temp_a = temp_a.replace(',','.')\n if isinstance(temp_b, str): temp_b = temp_b.replace(',','.')\n if float(temp_a) == float(temp_b): return None\n return a if float(temp_a) > float(temp_b) else b", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -67.42598734535835, |
| "lp_gold": -83.42734728493355, |
| "lp_dist": -16.001359939575195, |
| "n_tokens_gold": 100, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -67.17782856585814, |
| "lp_gold": -82.82997252108885, |
| "lp_dist": -15.652143955230713, |
| "n_tokens_gold": 100, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-96", |
| "gold_norm": "0", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.6102466583251953, |
| "lp_gold": -18.234556198120117, |
| "lp_dist": -21.844802856445312, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.688672065734863, |
| "lp_gold": -19.75603199005127, |
| "lp_dist": -24.444704055786133, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-65", |
| "gold_norm": "-1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.8471782207489014, |
| "lp_gold": -22.299251317977905, |
| "lp_dist": -21.452073097229004, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.24910449981689453, |
| "lp_gold": -23.086091995239258, |
| "lp_dist": -22.836987495422363, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-114", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8158721923828125, |
| "lp_gold": -22.96446418762207, |
| "lp_dist": -21.148591995239258, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.1736278533935547, |
| "lp_gold": -25.378070831298828, |
| "lp_dist": -22.204442977905273, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-117", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -15.196922779083252, |
| "lp_dist": -15.196922779083252, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -15.9734787940979, |
| "lp_dist": -15.9734787940979, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-155", |
| "gold_norm": "1", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.606046199798584, |
| "lp_gold": -18.941298484802246, |
| "lp_dist": -21.54734468460083, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.5056796073913574, |
| "lp_gold": -20.002915382385254, |
| "lp_dist": -23.50859498977661, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-37", |
| "gold_norm": "-1", |
| "dist_norm": "return sum([math.ceil(sum(arr)/capacity) for arr in grid])", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 86.6937904068327, |
| "lp_gold": -20.384744882583618, |
| "lp_dist": -107.07853528941632, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 20 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 87.06271385207947, |
| "lp_gold": -21.526390075683594, |
| "lp_dist": -108.58910392776306, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 20 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-115", |
| "gold_norm": "return sum([math.ceil(sum(arr)/capacity) for arr in grid])", |
| "dist_norm": "return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 24.88441585241617, |
| "lp_gold": -46.10199257756449, |
| "lp_dist": -70.98640842998066, |
| "n_tokens_gold": 20, |
| "n_tokens_dist": 24 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 29.905949345506542, |
| "lp_gold": -44.645553992972054, |
| "lp_dist": -74.5515033384786, |
| "n_tokens_gold": 20, |
| "n_tokens_dist": 24 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-86", |
| "gold_norm": "return ' '.join([''.join(sorted(list(i))) for i in s.split(' ')])", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -14.466940372163663, |
| "lp_gold": -36.1265668560809, |
| "lp_dist": -21.659626483917236, |
| "n_tokens_gold": 24, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -15.646357826058647, |
| "lp_gold": -39.13540583020904, |
| "lp_dist": -23.48904800415039, |
| "n_tokens_gold": 24, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-85", |
| "gold_norm": "0", |
| "dist_norm": "return x + y", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.443425707519054, |
| "lp_gold": -16.92650079727173, |
| "lp_dist": -23.369926504790783, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.885157495737076, |
| "lp_gold": -17.333487033843994, |
| "lp_dist": -23.21864452958107, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 5 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-53", |
| "gold_norm": "return x + y", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.916433400284404, |
| "lp_gold": -10.99827282987917, |
| "lp_dist": -20.914706230163574, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 9.794849171276837, |
| "lp_gold": -10.414248690966815, |
| "lp_dist": -20.209097862243652, |
| "n_tokens_gold": 5, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-130", |
| "gold_norm": "2", |
| "dist_norm": "+1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.369633674621582, |
| "lp_gold": -20.579893589019775, |
| "lp_dist": -22.949527263641357, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.291049003601074, |
| "lp_gold": -21.44382381439209, |
| "lp_dist": -24.734872817993164, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-139", |
| "gold_norm": "+1", |
| "dist_norm": "0.0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.2652113437652588, |
| "lp_gold": -22.48181676864624, |
| "lp_dist": -23.7470281124115, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1343896389007568, |
| "lp_gold": -23.847239017486572, |
| "lp_dist": -22.712849378585815, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-81", |
| "gold_norm": "0.0", |
| "dist_norm": "9", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.3655529320240021, |
| "lp_gold": -22.439337760210037, |
| "lp_dist": -22.073784828186035, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.13648897409439087, |
| "lp_gold": -23.279959738254547, |
| "lp_dist": -23.143470764160156, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-19", |
| "gold_norm": "9", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.07897377014160156, |
| "lp_gold": -22.66366195678711, |
| "lp_dist": -22.74263572692871, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4677810668945312, |
| "lp_gold": -23.768733024597168, |
| "lp_dist": -22.300951957702637, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-69", |
| "gold_norm": "1", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.650723457336426, |
| "lp_gold": -15.441932678222656, |
| "lp_dist": -18.092656135559082, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.28917121887207, |
| "lp_gold": -14.679984092712402, |
| "lp_dist": -18.969155311584473, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-122", |
| "gold_norm": "2", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.9139537811279297, |
| "lp_gold": -18.55172109603882, |
| "lp_dist": -17.63776731491089, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.36802101135253906, |
| "lp_gold": -18.541946411132812, |
| "lp_dist": -18.173925399780273, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-108", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.612565040588379, |
| "lp_gold": -22.057893753051758, |
| "lp_dist": -19.44532871246338, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.8640785217285156, |
| "lp_gold": -23.09704303741455, |
| "lp_dist": -19.232964515686035, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-48", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -21.96462059020996, |
| "lp_dist": -21.96462059020996, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -24.449016571044922, |
| "lp_dist": -24.449016571044922, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-17", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.34698486328125, |
| "lp_gold": -21.55142116546631, |
| "lp_dist": -22.89840602874756, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.0376691818237305, |
| "lp_gold": -20.85892963409424, |
| "lp_dist": -21.89659881591797, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-95", |
| "gold_norm": "0", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8023662567138672, |
| "lp_gold": -20.999281883239746, |
| "lp_dist": -19.19691562652588, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.70697021484375, |
| "lp_gold": -21.713737964630127, |
| "lp_dist": -20.006767749786377, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-91", |
| "gold_norm": "2", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -21.7028169631958, |
| "lp_dist": -21.7028169631958, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.45036220550537, |
| "lp_dist": -22.45036220550537, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-79", |
| "gold_norm": "2", |
| "dist_norm": "+1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.754400730133057, |
| "lp_gold": -17.42607831954956, |
| "lp_dist": -24.180479049682617, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.1651506423950195, |
| "lp_gold": -19.892897605895996, |
| "lp_dist": -26.058048248291016, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-14", |
| "gold_norm": "+1", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.5707898139953613, |
| "lp_gold": -29.604674816131592, |
| "lp_dist": -26.03388500213623, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.903932571411133, |
| "lp_gold": -27.936614990234375, |
| "lp_dist": -22.032682418823242, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-77", |
| "gold_norm": "3", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.08347511291503906, |
| "lp_gold": -20.79250478744507, |
| "lp_dist": -20.875979900360107, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.4646930694580078, |
| "lp_gold": -21.741466522216797, |
| "lp_dist": -22.206159591674805, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-3", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.1768722534179688, |
| "lp_gold": -23.416447639465332, |
| "lp_dist": -21.239575386047363, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.3070783615112305, |
| "lp_gold": -20.640877723693848, |
| "lp_dist": -16.333799362182617, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-146", |
| "gold_norm": "1", |
| "dist_norm": "if l == sorted(l) or l == sorted(l, reverse=True):\n return True\n return False", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 52.92330244462937, |
| "lp_gold": -17.06475257873535, |
| "lp_dist": -69.98805502336472, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 27 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 50.48234937642701, |
| "lp_gold": -18.093742847442627, |
| "lp_dist": -68.57609222386964, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 27 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-57", |
| "gold_norm": "if l == sorted(l) or l == sorted(l, reverse=True):\n return True\n return False", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -19.544386039720848, |
| "lp_gold": -39.81637967680581, |
| "lp_dist": -20.27199363708496, |
| "n_tokens_gold": 27, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -23.215539787866874, |
| "lp_gold": -47.03316483364324, |
| "lp_dist": -23.817625045776367, |
| "n_tokens_gold": 27, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-143", |
| "gold_norm": "2", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -18.56871795654297, |
| "lp_dist": -18.56871795654297, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -18.614916801452637, |
| "lp_dist": -18.614916801452637, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-0", |
| "gold_norm": "2", |
| "dist_norm": "+1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.107102870941162, |
| "lp_gold": -24.08918285369873, |
| "lp_dist": -31.196285724639893, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.970984935760498, |
| "lp_gold": -24.804469108581543, |
| "lp_dist": -31.77545404434204, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-106", |
| "gold_norm": "+1", |
| "dist_norm": "26", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2219808101654053, |
| "lp_gold": -21.646430253982544, |
| "lp_dist": -23.86841106414795, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4264450073242188, |
| "lp_gold": -24.843106269836426, |
| "lp_dist": -23.416661262512207, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-50", |
| "gold_norm": "26", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6608308553695679, |
| "lp_gold": -26.048061728477478, |
| "lp_dist": -25.38723087310791, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.0806753635406494, |
| "lp_gold": -25.965544939041138, |
| "lp_dist": -23.88486957550049, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-58", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.0009937286376953, |
| "lp_gold": -17.66666841506958, |
| "lp_dist": -19.667662143707275, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.9970569610595703, |
| "lp_gold": -19.63721752166748, |
| "lp_dist": -21.63427448272705, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-147", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.277851104736328, |
| "lp_gold": -21.182048797607422, |
| "lp_dist": -17.904197692871094, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.9687089920043945, |
| "lp_gold": -23.235244750976562, |
| "lp_dist": -19.266535758972168, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-160", |
| "gold_norm": "1", |
| "dist_norm": "lis = list()\n for i in s.split(' '):\n if i.isdigit():\n lis.append(int(i))\n return n - sum(lis)", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 82.88829492655168, |
| "lp_gold": -16.034985065460205, |
| "lp_dist": -98.92327999201189, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 45 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 79.75362334529109, |
| "lp_gold": -16.3989200592041, |
| "lp_dist": -96.15254340449519, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 45 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-67", |
| "gold_norm": "lis = list()\n for i in s.split(' '):\n if i.isdigit():\n lis.append(int(i))\n return n - sum(lis)", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -49.84833694786221, |
| "lp_gold": -66.66380525917202, |
| "lp_dist": -16.815468311309814, |
| "n_tokens_gold": 45, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -48.502373244005184, |
| "lp_gold": -65.89947893782232, |
| "lp_dist": -17.39710569381714, |
| "n_tokens_gold": 45, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-55", |
| "gold_norm": "2", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.25999921560287476, |
| "lp_gold": -24.11659049987793, |
| "lp_dist": -23.856591284275055, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.37139952182769775, |
| "lp_gold": -24.66786289215088, |
| "lp_dist": -25.039262413978577, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-118", |
| "gold_norm": "-1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.466064929962158, |
| "lp_gold": -23.7993106842041, |
| "lp_dist": -17.333245754241943, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.198596954345703, |
| "lp_gold": -26.275648593902588, |
| "lp_dist": -18.077051639556885, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-154", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -14.774529933929443, |
| "lp_dist": -14.774529933929443, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -16.673503875732422, |
| "lp_dist": -16.673503875732422, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-25", |
| "gold_norm": "1", |
| "dist_norm": "7", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.230724334716797, |
| "lp_gold": -22.422992706298828, |
| "lp_dist": -25.653717041015625, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.7021970748901367, |
| "lp_gold": -19.818692207336426, |
| "lp_dist": -23.520889282226562, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-36", |
| "gold_norm": "7", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.6618576049804688, |
| "lp_gold": -20.638415336608887, |
| "lp_dist": -23.300272941589355, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.1126937866210938, |
| "lp_gold": -22.601848602294922, |
| "lp_dist": -25.714542388916016, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-63", |
| "gold_norm": "3", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4981985092163086, |
| "lp_gold": -23.173019409179688, |
| "lp_dist": -21.67482089996338, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.069631576538086, |
| "lp_gold": -24.268108367919922, |
| "lp_dist": -23.198476791381836, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-132", |
| "gold_norm": "2", |
| "dist_norm": "min_number = min(numbers)\n max_number = max(numbers)\n return [(x - min_number) / (max_number - min_number) for x in numbers]", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 68.18116059236388, |
| "lp_gold": -18.65871238708496, |
| "lp_dist": -86.83987297944884, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 44 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 67.7937639305901, |
| "lp_gold": -20.1024751663208, |
| "lp_dist": -87.8962390969109, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 44 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-21", |
| "gold_norm": "min_number = min(numbers)\n max_number = max(numbers)\n return [(x - min_number) / (max_number - min_number) for x in numbers]", |
| "dist_norm": "running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 23.721003586428438, |
| "lp_gold": -41.06687730719631, |
| "lp_dist": -64.78788089362475, |
| "n_tokens_gold": 44, |
| "n_tokens_dist": 69 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 32.38431728373951, |
| "lp_gold": -42.50547727979096, |
| "lp_dist": -74.88979456353047, |
| "n_tokens_gold": 44, |
| "n_tokens_dist": 69 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-9", |
| "gold_norm": "running_max = None\n result = []\n\n for n in numbers:\n if running_max is None:\n running_max = n\n else:\n running_max = max(running_max, n)\n\n result.append(running_max)\n\n return result", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -29.805189722821147, |
| "lp_gold": -50.413360232159526, |
| "lp_dist": -20.60817050933838, |
| "n_tokens_gold": 69, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -51.59206402557629, |
| "lp_gold": -70.91829026001233, |
| "lp_dist": -19.326226234436035, |
| "n_tokens_gold": 69, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-72", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.30158805847168, |
| "lp_gold": -13.809431552886963, |
| "lp_dist": -19.111019611358643, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.650420188903809, |
| "lp_gold": -15.45082139968872, |
| "lp_dist": -20.10124158859253, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-128", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.34075927734375, |
| "lp_gold": -21.099153995513916, |
| "lp_dist": -17.758394718170166, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -4.7813920974731445, |
| "lp_gold": -22.55471706390381, |
| "lp_dist": -17.773324966430664, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-126", |
| "gold_norm": "1", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.619829177856445, |
| "lp_gold": -16.480793476104736, |
| "lp_dist": -24.10062265396118, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 7.267373561859131, |
| "lp_gold": -18.10063362121582, |
| "lp_dist": -25.36800718307495, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-70", |
| "gold_norm": "-1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.621180534362793, |
| "lp_gold": -23.137717247009277, |
| "lp_dist": -20.516536712646484, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.558529853820801, |
| "lp_gold": -25.226367950439453, |
| "lp_dist": -22.667838096618652, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-40", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -18.633878707885742, |
| "lp_dist": -18.633878707885742, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -21.059532165527344, |
| "lp_dist": -21.059532165527344, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-66", |
| "gold_norm": "0", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.13283920288085938, |
| "lp_gold": -17.218024253845215, |
| "lp_dist": -17.085185050964355, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.1121377944946289, |
| "lp_gold": -18.084847927093506, |
| "lp_dist": -17.972710132598877, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-71", |
| "gold_norm": "2", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2154407501220703, |
| "lp_gold": -18.285062789916992, |
| "lp_dist": -17.069622039794922, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.0220775604248047, |
| "lp_gold": -19.058651447296143, |
| "lp_dist": -18.036573886871338, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-107", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.3218297958374023, |
| "lp_gold": -17.986241340637207, |
| "lp_dist": -20.30807113647461, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.979066848754883, |
| "lp_gold": -16.702817916870117, |
| "lp_dist": -19.681884765625, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-32", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.1487884521484375, |
| "lp_gold": -22.8189640045166, |
| "lp_dist": -24.96775245666504, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.087052345275879, |
| "lp_gold": -22.50095844268799, |
| "lp_dist": -24.588010787963867, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-11", |
| "gold_norm": "1", |
| "dist_norm": "2.0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 13.553829669952393, |
| "lp_gold": -20.95840072631836, |
| "lp_dist": -34.51223039627075, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 11.875772953033447, |
| "lp_gold": -22.25577449798584, |
| "lp_dist": -34.13154745101929, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-47", |
| "gold_norm": "2.0", |
| "dist_norm": "return [x for x in values if isinstance(x, int)]", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 22.397947419434786, |
| "lp_gold": -22.972108960151672, |
| "lp_dist": -45.37005637958646, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 16 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 21.156506193903624, |
| "lp_gold": -25.384591817855835, |
| "lp_dist": -46.54109801175946, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 16 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-22", |
| "gold_norm": "return [x for x in values if isinstance(x, int)]", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.9452141776912413, |
| "lp_gold": -21.9394551262028, |
| "lp_dist": -24.884669303894043, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.21686965267372216, |
| "lp_gold": -23.520365059778214, |
| "lp_dist": -23.303495407104492, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-140", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.853110790252686, |
| "lp_dist": -22.853110790252686, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -24.441052436828613, |
| "lp_dist": -24.441052436828613, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-30", |
| "gold_norm": "0", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.6810879707336426, |
| "lp_gold": -22.556360244750977, |
| "lp_dist": -24.23744821548462, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2648732662200928, |
| "lp_gold": -24.702096939086914, |
| "lp_dist": -26.966970205307007, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-10", |
| "gold_norm": "-1", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -26.95520782470703, |
| "lp_dist": -26.95520782470703, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -24.403673887252808, |
| "lp_dist": -24.403673887252808, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-112", |
| "gold_norm": "-1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.778575420379639, |
| "lp_gold": -21.71489953994751, |
| "lp_dist": -14.936324119567871, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.789752006530762, |
| "lp_gold": -22.717198848724365, |
| "lp_dist": -15.927446842193604, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-111", |
| "gold_norm": "1", |
| "dist_norm": "return ''.join(strings)", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 23.62041076645255, |
| "lp_gold": -18.88068723678589, |
| "lp_dist": -42.50109800323844, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 8 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 20.59145486354828, |
| "lp_gold": -19.92290210723877, |
| "lp_dist": -40.51435697078705, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 8 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-28", |
| "gold_norm": "return ''.join(strings)", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.083352681776887, |
| "lp_gold": -25.75984155811966, |
| "lp_dist": -24.676488876342773, |
| "n_tokens_gold": 8, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.2199663514859367, |
| "lp_gold": -22.93246036817959, |
| "lp_dist": -24.152426719665527, |
| "n_tokens_gold": 8, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-135", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -16.229734420776367, |
| "lp_dist": -16.229734420776367, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -16.821220874786377, |
| "lp_dist": -16.821220874786377, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-94", |
| "gold_norm": "1", |
| "dist_norm": "while b:\n a, b = b, a % b\n return a", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 27.14890766143799, |
| "lp_gold": -17.605591773986816, |
| "lp_dist": -44.754499435424805, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 19 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 32.34489552024752, |
| "lp_gold": -17.090249061584473, |
| "lp_dist": -49.43514458183199, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 19 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-13", |
| "gold_norm": "while b:\n a, b = b, a % b\n return a", |
| "dist_norm": "return [x for x in strings if substring in x]", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 27.657666564541046, |
| "lp_gold": -24.580688443994973, |
| "lp_dist": -52.23835500853602, |
| "n_tokens_gold": 19, |
| "n_tokens_dist": 13 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 26.75688247599828, |
| "lp_gold": -29.810987717977696, |
| "lp_dist": -56.56787019397598, |
| "n_tokens_gold": 19, |
| "n_tokens_dist": 13 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-7", |
| "gold_norm": "return [x for x in strings if substring in x]", |
| "dist_norm": "return a*a == b*b + c*c or b*b == a*a + c*c or c*c == a*a + b*b", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 60.87681590977445, |
| "lp_gold": -23.353178574070625, |
| "lp_dist": -84.22999448384508, |
| "n_tokens_gold": 13, |
| "n_tokens_dist": 37 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 83.19848300620379, |
| "lp_gold": -21.97879713297425, |
| "lp_dist": -105.17728013917804, |
| "n_tokens_gold": 13, |
| "n_tokens_dist": 37 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-157", |
| "gold_norm": "return a*a == b*b + c*c or b*b == a*a + c*c or c*c == a*a + b*b", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -22.37864390958157, |
| "lp_gold": -42.11520473111477, |
| "lp_dist": -19.736560821533203, |
| "n_tokens_gold": 37, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -21.383852594010023, |
| "lp_gold": -42.70623838197389, |
| "lp_dist": -21.322385787963867, |
| "n_tokens_gold": 37, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-49", |
| "gold_norm": "2", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.1286115646362305, |
| "lp_gold": -14.368673324584961, |
| "lp_dist": -20.49728488922119, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.370944976806641, |
| "lp_gold": -15.46406078338623, |
| "lp_dist": -21.83500576019287, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-120", |
| "gold_norm": "0", |
| "dist_norm": "26", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.207955360412598, |
| "lp_gold": -19.637977600097656, |
| "lp_dist": -23.845932960510254, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.25955867767334, |
| "lp_gold": -19.60517692565918, |
| "lp_dist": -21.86473560333252, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-89", |
| "gold_norm": "26", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.005466938018799, |
| "lp_gold": -25.530439853668213, |
| "lp_dist": -18.524972915649414, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.288686752319336, |
| "lp_gold": -25.664198875427246, |
| "lp_dist": -19.37551212310791, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-98", |
| "gold_norm": "1", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.9043750762939453, |
| "lp_gold": -21.17328405380249, |
| "lp_dist": -23.077659130096436, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2112646102905273, |
| "lp_gold": -22.75312328338623, |
| "lp_dist": -24.964387893676758, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-100", |
| "gold_norm": "2", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.1701860427856445, |
| "lp_gold": -22.101744651794434, |
| "lp_dist": -19.93155860900879, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.7842912673950195, |
| "lp_gold": -23.208542823791504, |
| "lp_dist": -21.424251556396484, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-64", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -17.467191696166992, |
| "lp_dist": -17.467191696166992, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -17.525324821472168, |
| "lp_dist": -17.525324821472168, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-103", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.922170639038086, |
| "lp_gold": -16.26321840286255, |
| "lp_dist": -19.185389041900635, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.1872482299804688, |
| "lp_gold": -18.55193328857422, |
| "lp_dist": -21.739181518554688, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-125", |
| "gold_norm": "0", |
| "dist_norm": "return \"\".join([s for s in text if s.lower() not in [\"a\", \"e\", \"i\", \"o\", \"u\"]])", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 48.67979192888379, |
| "lp_gold": -21.47081232070923, |
| "lp_dist": -70.15060424959302, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 34 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 55.4912621024414, |
| "lp_gold": -21.899943828582764, |
| "lp_dist": -77.39120593102416, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 34 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-51", |
| "gold_norm": "return \"\".join([s for s in text if s.lower() not in [\"a\", \"e\", \"i\", \"o\", \"u\"]])", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -15.132140384576928, |
| "lp_gold": -33.735738979242456, |
| "lp_dist": -18.603598594665527, |
| "n_tokens_gold": 34, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -17.125044157398406, |
| "lp_gold": -38.15538339842624, |
| "lp_dist": -21.030339241027832, |
| "n_tokens_gold": 34, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-90", |
| "gold_norm": "1", |
| "dist_norm": "return [abs(x-y) for x,y in zip(game,guess)]", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 53.59280555654732, |
| "lp_gold": -17.83990478515625, |
| "lp_dist": -71.43271034170357, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 21 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 53.46250804614101, |
| "lp_gold": -17.562668800354004, |
| "lp_dist": -71.02517684649501, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 21 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-152", |
| "gold_norm": "return [abs(x-y) for x,y in zip(game,guess)]", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.4121496778377605, |
| "lp_gold": -12.769094695483773, |
| "lp_dist": -18.181244373321533, |
| "n_tokens_gold": 21, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 6.846039920872499, |
| "lp_gold": -11.968789428645323, |
| "lp_dist": -18.814829349517822, |
| "n_tokens_gold": 21, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-24", |
| "gold_norm": "0", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.762319564819336, |
| "lp_gold": -21.209729194641113, |
| "lp_dist": -19.447409629821777, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.050973892211914, |
| "lp_gold": -23.011981964111328, |
| "lp_dist": -20.961008071899414, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-20", |
| "gold_norm": "2", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.5170793533325195, |
| "lp_gold": -21.059219360351562, |
| "lp_dist": -18.542140007019043, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.842561721801758, |
| "lp_gold": -24.164966583251953, |
| "lp_dist": -20.322404861450195, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-148", |
| "gold_norm": "1", |
| "dist_norm": "if isinstance(x,int) and isinstance(y,int) and isinstance(z,int):\n if (x+y==z) or (x+z==y) or (y+z==x):\n return True\n return False\n return False", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 84.42804804586194, |
| "lp_gold": -12.750839233398438, |
| "lp_dist": -97.17888727926038, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 63 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 85.3895359868402, |
| "lp_gold": -12.885719776153564, |
| "lp_dist": -98.27525576299377, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 63 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-92", |
| "gold_norm": "if isinstance(x,int) and isinstance(y,int) and isinstance(z,int):\n if (x+y==z) or (x+z==y) or (y+z==x):\n return True\n return False\n return False", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -15.070005173284699, |
| "lp_gold": -29.284181828100373, |
| "lp_dist": -14.214176654815674, |
| "n_tokens_gold": 63, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -16.731429186844252, |
| "lp_gold": -33.610841837906264, |
| "lp_dist": -16.87941265106201, |
| "n_tokens_gold": 63, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-26", |
| "gold_norm": "1", |
| "dist_norm": "return len(set(string.lower()))", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 39.93833448708756, |
| "lp_gold": -21.593355178833008, |
| "lp_dist": -61.53168966592057, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 10 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 44.07288610804244, |
| "lp_gold": -19.37473964691162, |
| "lp_dist": -63.44762575495406, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 10 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-16", |
| "gold_norm": "return len(set(string.lower()))", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.23131456784904003, |
| "lp_gold": -21.334825424477458, |
| "lp_dist": -21.103510856628418, |
| "n_tokens_gold": 10, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.4797544392640702, |
| "lp_gold": -20.777993210882414, |
| "lp_dist": -22.257747650146484, |
| "n_tokens_gold": 10, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-142", |
| "gold_norm": "3", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.5028867721557617, |
| "lp_gold": -20.428752899169922, |
| "lp_dist": -18.92586612701416, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.940119743347168, |
| "lp_gold": -20.01767873764038, |
| "lp_dist": -19.077558994293213, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-99", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.057826042175293, |
| "lp_dist": -20.057826042175293, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -21.03865337371826, |
| "lp_dist": -21.03865337371826, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-127", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.7136526107788086, |
| "lp_gold": -19.18015956878662, |
| "lp_dist": -15.466506958007812, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.9199934005737305, |
| "lp_gold": -19.79235076904297, |
| "lp_dist": -15.872357368469238, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-156", |
| "gold_norm": "1", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.246551513671875, |
| "lp_gold": -19.624220848083496, |
| "lp_dist": -21.87077236175537, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.5017471313476562, |
| "lp_gold": -21.236965656280518, |
| "lp_dist": -23.738712787628174, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-141", |
| "gold_norm": "3", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.184619903564453, |
| "lp_gold": -20.60717535018921, |
| "lp_dist": -17.422555446624756, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.860507011413574, |
| "lp_gold": -21.143166542053223, |
| "lp_dist": -18.28265953063965, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-78", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2415552139282227, |
| "lp_gold": -19.249483585357666, |
| "lp_dist": -21.49103879928589, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.3341121673583984, |
| "lp_gold": -19.18357276916504, |
| "lp_dist": -21.517684936523438, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-68", |
| "gold_norm": "0", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2041006088256836, |
| "lp_gold": -18.397984981536865, |
| "lp_dist": -17.19388437271118, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.9792289733886719, |
| "lp_gold": -17.703375339508057, |
| "lp_dist": -16.724146366119385, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-93", |
| "gold_norm": "2", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.300654411315918, |
| "lp_gold": -20.44687557220459, |
| "lp_dist": -18.146221160888672, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.002674102783203, |
| "lp_gold": -19.50245952606201, |
| "lp_dist": -17.49978542327881, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-60", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.0008726119995117, |
| "lp_gold": -17.277544021606445, |
| "lp_dist": -19.278416633605957, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.495342254638672, |
| "lp_gold": -20.217015266418457, |
| "lp_dist": -22.71235752105713, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-82", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -18.858778476715088, |
| "lp_dist": -18.858778476715088, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.178369522094727, |
| "lp_dist": -20.178369522094727, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-59", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -24.201942443847656, |
| "lp_dist": -24.201942443847656, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -25.577584266662598, |
| "lp_dist": -25.577584266662598, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-149", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.985135078430176, |
| "lp_gold": -18.369908809661865, |
| "lp_dist": -14.38477373123169, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.8884334564208984, |
| "lp_gold": -19.89932155609131, |
| "lp_dist": -16.01088809967041, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-42", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.361058235168457, |
| "lp_gold": -16.945013999938965, |
| "lp_dist": -20.306072235107422, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.650448799133301, |
| "lp_gold": -18.96267032623291, |
| "lp_dist": -22.61311912536621, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-163", |
| "gold_norm": "0", |
| "dist_norm": "2", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.24646282196044922, |
| "lp_gold": -21.461342811584473, |
| "lp_dist": -21.214879989624023, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.1632556915283203, |
| "lp_gold": -22.695805549621582, |
| "lp_dist": -22.859061241149902, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-133", |
| "gold_norm": "2", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.3052263259887695, |
| "lp_gold": -19.276253700256348, |
| "lp_dist": -24.581480026245117, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.072274208068848, |
| "lp_gold": -20.15150022506714, |
| "lp_dist": -25.223774433135986, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-161", |
| "gold_norm": "-1", |
| "dist_norm": "return [x for x in strings if x.startswith(prefix)]", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 34.15456820592226, |
| "lp_gold": -21.663207292556763, |
| "lp_dist": -55.81777549847902, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 16 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 39.88924138105904, |
| "lp_gold": -23.96406078338623, |
| "lp_dist": -63.85330216444527, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 16 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-29", |
| "gold_norm": "return [x for x in strings if x.startswith(prefix)]", |
| "dist_norm": "return string.swapcase()", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 29.161806431533478, |
| "lp_gold": -23.64715713548503, |
| "lp_dist": -52.80896356701851, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 7 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 27.964344927147977, |
| "lp_gold": -21.276650241537936, |
| "lp_dist": -49.24099516868591, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 7 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-27", |
| "gold_norm": "return string.swapcase()", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.143668868753593, |
| "lp_gold": -18.235226890828926, |
| "lp_dist": -23.37889575958252, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.050230901411851, |
| "lp_gold": -19.54451664192311, |
| "lp_dist": -22.59474754333496, |
| "n_tokens_gold": 7, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-61", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.488422393798828, |
| "lp_gold": -21.18766164779663, |
| "lp_dist": -17.699239253997803, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.8497695922851562, |
| "lp_gold": -23.119681358337402, |
| "lp_dist": -20.269911766052246, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-104", |
| "gold_norm": "1", |
| "dist_norm": "2.0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.777350425720215, |
| "lp_gold": -21.164722442626953, |
| "lp_dist": -29.942072868347168, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 8.533888697624207, |
| "lp_gold": -22.994476318359375, |
| "lp_dist": -31.52836501598358, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-45", |
| "gold_norm": "2.0", |
| "dist_norm": "101", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.062639269977808, |
| "lp_gold": -26.36994906887412, |
| "lp_dist": -31.43258833885193, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.08874404430389404, |
| "lp_gold": -27.106158316135406, |
| "lp_dist": -27.017414271831512, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-75", |
| "gold_norm": "101", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -7.495836019515991, |
| "lp_gold": -24.720885515213013, |
| "lp_dist": -17.22504949569702, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.730545282363892, |
| "lp_gold": -24.493883848190308, |
| "lp_dist": -17.763338565826416, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-6", |
| "gold_norm": "1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.3637475967407227, |
| "lp_gold": -19.45144271850586, |
| "lp_dist": -22.815190315246582, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.392610549926758, |
| "lp_gold": -19.24812602996826, |
| "lp_dist": -22.64073657989502, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-136", |
| "gold_norm": "0", |
| "dist_norm": "3", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.40959739685058594, |
| "lp_gold": -18.25609064102173, |
| "lp_dist": -17.846493244171143, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0858154296875, |
| "lp_gold": -19.771966457366943, |
| "lp_dist": -19.857781887054443, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-33", |
| "gold_norm": "3", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.021329879760742188, |
| "lp_gold": -19.846437454223633, |
| "lp_dist": -19.867767333984375, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.7291717529296875, |
| "lp_gold": -21.070161819458008, |
| "lp_dist": -21.799333572387695, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-44", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.9867525100708, |
| "lp_dist": -22.9867525100708, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.455985069274902, |
| "lp_dist": -22.455985069274902, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-88", |
| "gold_norm": "0", |
| "dist_norm": "if not strings:\n return None\n\n maxlen = max(len(x) for x in strings)\n for s in strings:\n if len(s) == maxlen:\n return s", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 91.28553534628554, |
| "lp_gold": -19.742467880249023, |
| "lp_dist": -111.02800322653457, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 48 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 100.06942167917623, |
| "lp_gold": -20.66820240020752, |
| "lp_dist": -120.73762407938375, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 48 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-12", |
| "gold_norm": "if not strings:\n return None\n\n maxlen = max(len(x) for x in strings)\n for s in strings:\n if len(s) == maxlen:\n return s", |
| "dist_norm": "9", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -26.818854912871643, |
| "lp_gold": -51.12532578384332, |
| "lp_dist": -24.30647087097168, |
| "n_tokens_gold": 48, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -32.62358831267056, |
| "lp_gold": -56.370979059740876, |
| "lp_dist": -23.747390747070312, |
| "n_tokens_gold": 48, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-105", |
| "gold_norm": "9", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.5095605850219727, |
| "lp_gold": -20.889235973358154, |
| "lp_dist": -17.37967538833618, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.047847747802734, |
| "lp_gold": -22.52873468399048, |
| "lp_dist": -17.480886936187744, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-121", |
| "gold_norm": "1", |
| "dist_norm": "a, b = x.split(\"/\")\n c, d = n.split(\"/\")\n numerator = int(a) * int(c)\n denom = int(b) * int(d)\n if (numerator/denom == int(numerator/denom)):\n return True\n return False", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 97.35365189886329, |
| "lp_gold": -20.428816318511963, |
| "lp_dist": -117.78246821737525, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 75 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 109.46097758087126, |
| "lp_gold": -22.744394302368164, |
| "lp_dist": -132.20537188323942, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 75 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-144", |
| "gold_norm": "a, b = x.split(\"/\")\n c, d = n.split(\"/\")\n numerator = int(a) * int(c)\n denom = int(b) * int(d)\n if (numerator/denom == int(numerator/denom)):\n return True\n return False", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -39.772302772080536, |
| "lp_gold": -56.47381558564865, |
| "lp_dist": -16.701512813568115, |
| "n_tokens_gold": 75, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -56.80937039689411, |
| "lp_gold": -74.05506884888996, |
| "lp_dist": -17.24569845199585, |
| "n_tokens_gold": 75, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-73", |
| "gold_norm": "1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -19.193764686584473, |
| "lp_dist": -19.193764686584473, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -20.223108291625977, |
| "lp_dist": -20.223108291625977, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-76", |
| "gold_norm": "1", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 4.929764986038208, |
| "lp_gold": -15.572730541229248, |
| "lp_dist": -20.502495527267456, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.190648078918457, |
| "lp_gold": -16.13015127182007, |
| "lp_dist": -21.320799350738525, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-5", |
| "gold_norm": "-1", |
| "dist_norm": "-1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -25.595508337020874, |
| "lp_dist": -25.595508337020874, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -30.356321334838867, |
| "lp_dist": -30.356321334838867, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-46", |
| "gold_norm": "-1", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.3807437419891357, |
| "lp_gold": -22.49609923362732, |
| "lp_dist": -19.115355491638184, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.765854597091675, |
| "lp_gold": -24.438928365707397, |
| "lp_dist": -21.673073768615723, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-150", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.250157833099365, |
| "lp_dist": -22.250157833099365, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -22.30048179626465, |
| "lp_dist": -22.30048179626465, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-35", |
| "gold_norm": "0", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -21.539960861206055, |
| "lp_dist": -21.539960861206055, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.0, |
| "lp_gold": -23.488576889038086, |
| "lp_dist": -23.488576889038086, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-153", |
| "gold_norm": "0", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -3.9339113235473633, |
| "lp_gold": -19.433120727539062, |
| "lp_dist": -15.4992094039917, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -5.801613807678223, |
| "lp_gold": -20.252619743347168, |
| "lp_dist": -14.451005935668945, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| } |
| ], |
| "flip_rows": [ |
| { |
| "ex_id": "openai_humaneval-test-39", |
| "gold_norm": "-1", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.44877076148986816, |
| "lp_gold": -19.808196783065796, |
| "lp_dist": -20.256967544555664, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.15900945663452148, |
| "lp_gold": -22.082778453826904, |
| "lp_dist": -21.923768997192383, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.70906001329422, |
| "lp_gold": -19.808191001415253, |
| "lp_dist": -20.517251014709473, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.2600139379501343, |
| "lp_gold": -20.45387899875641, |
| "lp_dist": -20.713892936706543, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.896909236907959, |
| "lp_gold": -22.914924144744873, |
| "lp_dist": -21.018014907836914, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.4487643241882324, |
| "lp_gold": -22.053846836090088, |
| "lp_dist": -22.50261116027832, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.5229349136352539, |
| "lp_gold": -22.082777976989746, |
| "lp_dist": -21.559843063354492, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-134", |
| "gold_norm": "122", |
| "dist_norm": "1.0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8403654620051384, |
| "lp_gold": -27.5423321723938, |
| "lp_dist": -28.382697634398937, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.457980029284954, |
| "lp_gold": -29.874002933502197, |
| "lp_dist": -27.416022904217243, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.6199172660708427, |
| "lp_gold": -27.5423264503479, |
| "lp_dist": -24.922409184277058, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.1696217805147171, |
| "lp_gold": -28.275829792022705, |
| "lp_dist": -28.445451572537422, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.7151931561529636, |
| "lp_gold": -29.58313512802124, |
| "lp_dist": -26.867941971868277, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.835217572748661, |
| "lp_gold": -29.671478271484375, |
| "lp_dist": -26.836260698735714, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.1110972091555595, |
| "lp_gold": -29.87399911880493, |
| "lp_dist": -27.762901909649372, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-4", |
| "gold_norm": "mean = sum(numbers) / len(numbers)\n return sum(abs(x - mean) for x in numbers) / len(numbers)", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.45850569046490364, |
| "lp_gold": -22.989404618372987, |
| "lp_dist": -23.44791030883789, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.605282360978833, |
| "lp_gold": -30.1749826020677, |
| "lp_dist": -21.569700241088867, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.219808316351077, |
| "lp_gold": -29.551916813970706, |
| "lp_dist": -23.33210849761963, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -6.893468835648207, |
| "lp_gold": -30.730242708023695, |
| "lp_dist": -23.83677387237549, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.074352583655013, |
| "lp_gold": -29.26745828128685, |
| "lp_dist": -21.193105697631836, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -9.070045394459783, |
| "lp_gold": -29.0644835658465, |
| "lp_dist": -19.99443817138672, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -8.754907058842889, |
| "lp_gold": -30.174978660710565, |
| "lp_dist": -21.420071601867676, |
| "n_tokens_gold": 33, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-139", |
| "gold_norm": "+1", |
| "dist_norm": "0.0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 1.2652113437652588, |
| "lp_gold": -22.48181676864624, |
| "lp_dist": -23.7470281124115, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1343896389007568, |
| "lp_gold": -23.847239017486572, |
| "lp_dist": -22.712849378585815, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.8445468246936798, |
| "lp_gold": -22.48181438446045, |
| "lp_dist": -23.32636120915413, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.0516579747200012, |
| "lp_gold": -24.160611152648926, |
| "lp_dist": -23.108953177928925, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.6032419204711914, |
| "lp_gold": -22.94077157974243, |
| "lp_dist": -20.33752965927124, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.2553260326385498, |
| "lp_gold": -24.39438009262085, |
| "lp_dist": -23.1390540599823, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.7304326295852661, |
| "lp_gold": -23.84722900390625, |
| "lp_dist": -23.116796374320984, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 4 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-19", |
| "gold_norm": "9", |
| "dist_norm": "1", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.07897377014160156, |
| "lp_gold": -22.66366195678711, |
| "lp_dist": -22.74263572692871, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4677810668945312, |
| "lp_gold": -23.768733024597168, |
| "lp_dist": -22.300951957702637, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.07897186279296875, |
| "lp_gold": -22.663668632507324, |
| "lp_dist": -22.742640495300293, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.2928171157836914, |
| "lp_gold": -23.486221313476562, |
| "lp_dist": -23.779038429260254, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.1306524276733398, |
| "lp_gold": -22.043014526367188, |
| "lp_dist": -20.912362098693848, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.335036277770996, |
| "lp_gold": -23.414420127868652, |
| "lp_dist": -22.079383850097656, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4677820205688477, |
| "lp_gold": -23.768739700317383, |
| "lp_dist": -22.300957679748535, |
| "n_tokens_gold": 2, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-106", |
| "gold_norm": "+1", |
| "dist_norm": "26", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.2219808101654053, |
| "lp_gold": -21.646430253982544, |
| "lp_dist": -23.86841106414795, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.4264450073242188, |
| "lp_gold": -24.843106269836426, |
| "lp_dist": -23.416661262512207, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.3593015670776367, |
| "lp_gold": -21.646427631378174, |
| "lp_dist": -22.00572919845581, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_time_shuffled": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -2.295396327972412, |
| "lp_gold": -24.159332752227783, |
| "lp_dist": -21.86393642425537, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.9753856658935547, |
| "lp_gold": -25.059080600738525, |
| "lp_dist": -23.08369493484497, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.8524909019470215, |
| "lp_gold": -25.18923282623291, |
| "lp_dist": -23.33674192428589, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -1.216343879699707, |
| "lp_gold": -24.843104362487793, |
| "lp_dist": -23.626760482788086, |
| "n_tokens_gold": 3, |
| "n_tokens_dist": 3 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-22", |
| "gold_norm": "return [x for x in values if isinstance(x, int)]", |
| "dist_norm": "0", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.9452141776912413, |
| "lp_gold": -21.9394551262028, |
| "lp_dist": -24.884669303894043, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.21686965267372216, |
| "lp_gold": -23.520365059778214, |
| "lp_dist": -23.303495407104492, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 2.3960064357215742, |
| "lp_gold": -21.941916614205184, |
| "lp_dist": -24.337923049926758, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.289606383860928, |
| "lp_gold": -21.179310509144443, |
| "lp_dist": -24.46891689300537, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.6156758015997639, |
| "lp_gold": -24.370192403162264, |
| "lp_dist": -23.7545166015625, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.07147166511367686, |
| "lp_gold": -25.524140808790435, |
| "lp_dist": -25.452669143676758, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| }, |
| "control_patch_nonshared": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.40586215297889794, |
| "lp_gold": -23.520359337732316, |
| "lp_dist": -23.114497184753418, |
| "n_tokens_gold": 16, |
| "n_tokens_dist": 2 |
| } |
| }, |
| { |
| "ex_id": "openai_humaneval-test-45", |
| "gold_norm": "2.0", |
| "dist_norm": "101", |
| "baseline": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.062639269977808, |
| "lp_gold": -26.36994906887412, |
| "lp_dist": -31.43258833885193, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "ablated": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.08874404430389404, |
| "lp_gold": -27.106158316135406, |
| "lp_dist": -27.017414271831512, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "patched_self": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 5.745390687137842, |
| "lp_gold": -26.369948115199804, |
| "lp_dist": -32.11533880233765, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_time_shuffled": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 3.564100921154022, |
| "lp_gold": -25.915751039981842, |
| "lp_dist": -29.479851961135864, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_shared_randvec": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.768334724009037, |
| "lp_gold": -28.143043376505375, |
| "lp_dist": -27.374708652496338, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_rand_subspace": { |
| "pred": "dist", |
| "correct": false, |
| "margin": -0.5243468135595322, |
| "lp_gold": -28.490523919463158, |
| "lp_dist": -27.966177105903625, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| }, |
| "control_patch_nonshared": { |
| "pred": "gold", |
| "correct": true, |
| "margin": 0.04392840713262558, |
| "lp_gold": -27.1061624661088, |
| "lp_dist": -27.150090873241425, |
| "n_tokens_gold": 4, |
| "n_tokens_dist": 4 |
| } |
| } |
| ] |
| } |