Zishan-Shao committed on
Commit
aa0e435
·
verified ·
1 Parent(s): 985685a

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/Q_shared_layer10_seed123.npy +3 -0
  2. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/Q_shared_layer10_seed456.npy +3 -0
  3. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/alpha_sweep.csv +18 -0
  4. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/alpha_sweep.md +19 -0
  5. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/analyze_qwen_results.py +236 -0
  6. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/paper_table.md +17 -0
  7. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/alpha_sweep_deltam.pdf +0 -0
  8. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/alpha_sweep_fliprate.pdf +0 -0
  9. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/mc_controls_gap.pdf +0 -0
  10. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/mc_patched0_rescue.pdf +0 -0
  11. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/openanswer_patchedself_rescue.pdf +0 -0
  12. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/summary.csv +16 -0
  13. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/summary.md +17 -0
  14. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/compute_Qs_seed123.json +0 -0
  15. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/compute_Qs_seed456.json +0 -0
  16. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_alpha_sweep_seed123.json +0 -0
  17. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_alpha_sweep_seed456.json +0 -0
  18. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_transfer_cross_mc_baselinecorrect_seed123.json +0 -0
  19. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_transfer_same_task_seed123.json +0 -0
  20. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/gsm8k_genmath.json +3888 -0
  21. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/gsm8k_pairlogprob.json +0 -0
  22. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/humaneval_gencode_compile.json +2336 -0
  23. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/humaneval_pairlogprob.json +0 -0
  24. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/aqua.json +0 -0
  25. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/arc_challenge.json +0 -0
  26. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/commonsenseqa.json +0 -0
  27. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/logiqa.json +0 -0
  28. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/openbookqa.json +0 -0
  29. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/piqa.json +0 -0
  30. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/qasc.json +0 -0
  31. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/Q_shared_layer24_seed123.npy +3 -0
  32. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/Q_shared_layer24_seed456.npy +3 -0
  33. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/alpha_sweep.csv +18 -0
  34. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/alpha_sweep.md +19 -0
  35. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/analyze_qwen_results.py +236 -0
  36. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/paper_table.md +17 -0
  37. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/alpha_sweep_deltam.pdf +0 -0
  38. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/alpha_sweep_fliprate.pdf +0 -0
  39. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/mc_controls_gap.pdf +0 -0
  40. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/mc_patched0_rescue.pdf +0 -0
  41. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/openanswer_patchedself_rescue.pdf +0 -0
  42. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/qwen_report.md +65 -0
  43. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/qwen_tables.tex +71 -0
  44. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/summary.csv +16 -0
  45. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/summary.md +17 -0
  46. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/compute_Qs_seed123.json +0 -0
  47. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/compute_Qs_seed456.json +0 -0
  48. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/flipset/aqua_alpha_sweep_seed123.json +0 -0
  49. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/flipset/aqua_alpha_sweep_seed456.json +0 -0
  50. artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/flipset/aqua_transfer_cross_mc_baselinecorrect_seed123.json +0 -0
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/Q_shared_layer10_seed123.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:906481c53730b1c5b8c1aabc43e90570c76385cd57e7a2a1a4ea6b81d5d5788c
3
+ size 2723968
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/Q_shared_layer10_seed456.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab2ecc1600f61fefe53f12f2b1c8014973c7c4b92cd815d6d3e19024457673c5
3
+ size 2422912
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/alpha_sweep.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ file,task,layer,seed,alpha,n,flip_rate,ablated_acc,pred_change_rate,mean_margin,mean_delta_margin_vs_baseline
2
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.0,64,0.0,1.0,0.0,2.773576259613037,0.0
3
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.02,64,0.0,1.0,0.0,2.7706217765808105,-0.002954694442451
4
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.05,64,0.015625,0.984375,0.015625,2.7653543949127197,-0.008222113363444805
5
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.1,64,0.03125,0.96875,0.03125,2.7537312507629395,-0.019845208153128624
6
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.2,64,0.046875,0.953125,0.046875,2.7078418731689453,-0.0657346323132515
7
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.3,64,0.0625,0.9375,0.0625,2.611804962158203,-0.1617715060710907
8
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.5,64,0.15625,0.84375,0.15625,2.0545363426208496,-0.7190403342247009
9
+ aqua_alpha_sweep_seed123.json,aqua,10,123,0.75,64,0.375,0.625,0.375,1.0690979957580566,-1.7044786214828491
10
+ aqua_alpha_sweep_seed123.json,aqua,10,123,1.0,64,1.0,0.0,1.0,-2.415497064590454,-5.18907356262207
11
+ aqua_alpha_sweep_seed456.json,aqua,10,456,0.0,65,0.0,1.0,0.0,2.5807881355285645,0.0
12
+ aqua_alpha_sweep_seed456.json,aqua,10,456,0.05,65,0.0,1.0,0.0,2.557753562927246,-0.023034870624542236
13
+ aqua_alpha_sweep_seed456.json,aqua,10,456,0.1,65,0.0,1.0,0.0,2.530287265777588,-0.050501130521297455
14
+ aqua_alpha_sweep_seed456.json,aqua,10,456,0.2,65,0.015384615384615385,0.9846153846153847,0.015384615384615385,2.4566214084625244,-0.12416719645261765
15
+ aqua_alpha_sweep_seed456.json,aqua,10,456,0.3,65,0.09230769230769231,0.9076923076923077,0.09230769230769231,2.3547489643096924,-0.22603949904441833
16
+ aqua_alpha_sweep_seed456.json,aqua,10,456,0.5,65,0.2153846153846154,0.7846153846153846,0.2153846153846154,1.979431390762329,-0.601357102394104
17
+ aqua_alpha_sweep_seed456.json,aqua,10,456,0.75,65,0.3230769230769231,0.676923076923077,0.3230769230769231,1.2684406042099,-1.3123478889465332
18
+ aqua_alpha_sweep_seed456.json,aqua,10,456,1.0,65,1.0,0.0,1.0,-2.193070411682129,-4.773859024047852
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/alpha_sweep.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | task | file | alpha | n | flip_rate | ablated_acc | mean_delta_margin_vs_baseline |
2
+ | --- | --- | --- | --- | --- | --- | --- |
3
+ | aqua | aqua_alpha_sweep_seed123.json | 0.0 | 64 | 0.0 | 100.0 | 0.000 |
4
+ | aqua | aqua_alpha_sweep_seed123.json | 0.02 | 64 | 0.0 | 100.0 | -0.003 |
5
+ | aqua | aqua_alpha_sweep_seed123.json | 0.05 | 64 | 1.6 | 98.4 | -0.008 |
6
+ | aqua | aqua_alpha_sweep_seed123.json | 0.1 | 64 | 3.1 | 96.9 | -0.020 |
7
+ | aqua | aqua_alpha_sweep_seed123.json | 0.2 | 64 | 4.7 | 95.3 | -0.066 |
8
+ | aqua | aqua_alpha_sweep_seed123.json | 0.3 | 64 | 6.2 | 93.8 | -0.162 |
9
+ | aqua | aqua_alpha_sweep_seed123.json | 0.5 | 64 | 15.6 | 84.4 | -0.719 |
10
+ | aqua | aqua_alpha_sweep_seed123.json | 0.75 | 64 | 37.5 | 62.5 | -1.704 |
11
+ | aqua | aqua_alpha_sweep_seed123.json | 1.0 | 64 | 100.0 | 0.0 | -5.189 |
12
+ | aqua | aqua_alpha_sweep_seed456.json | 0.0 | 65 | 0.0 | 100.0 | 0.000 |
13
+ | aqua | aqua_alpha_sweep_seed456.json | 0.05 | 65 | 0.0 | 100.0 | -0.023 |
14
+ | aqua | aqua_alpha_sweep_seed456.json | 0.1 | 65 | 0.0 | 100.0 | -0.051 |
15
+ | aqua | aqua_alpha_sweep_seed456.json | 0.2 | 65 | 1.5 | 98.5 | -0.124 |
16
+ | aqua | aqua_alpha_sweep_seed456.json | 0.3 | 65 | 9.2 | 90.8 | -0.226 |
17
+ | aqua | aqua_alpha_sweep_seed456.json | 0.5 | 65 | 21.5 | 78.5 | -0.601 |
18
+ | aqua | aqua_alpha_sweep_seed456.json | 0.75 | 65 | 32.3 | 67.7 | -1.312 |
19
+ | aqua | aqua_alpha_sweep_seed456.json | 1.0 | 65 | 100.0 | 0.0 | -4.774 |
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/analyze_qwen_results.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+
9
+ SUMMARY_CSV = os.environ.get("SUMMARY_CSV", "summary.csv")
10
+ ALPHA_CSV = os.environ.get("ALPHA_CSV", "alpha_sweep.csv")
11
+ OUT_MD = os.environ.get("OUT_MD", "qwen_report.md")
12
+ OUT_TEX = os.environ.get("OUT_TEX", "qwen_tables.tex")
13
+ PLOT_DIR = os.environ.get("PLOT_DIR", "plots")
14
+
15
+ os.makedirs(PLOT_DIR, exist_ok=True)
16
+
17
+ df = pd.read_csv(SUMMARY_CSV)
18
+ alpha_df = pd.read_csv(ALPHA_CSV) if os.path.exists(ALPHA_CSV) else pd.DataFrame()
19
+
20
+ # Helpers
21
+ def f(x, nd=3):
22
+ if pd.isna(x): return ""
23
+ if isinstance(x, (int, np.integer)): return str(int(x))
24
+ if isinstance(x, (float, np.floating)): return f"{float(x):.{nd}f}"
25
+ return str(x)
26
+
27
+ def pct(x, nd=1):
28
+ if pd.isna(x): return ""
29
+ return f"{float(x):.{nd}f}"
30
+
31
+ # Split by kind
32
+ mc = df[df["kind"] == "subspace_mc"].copy()
33
+ oa = df[df["kind"] == "openanswer"].copy()
34
+ fs = df[df["kind"] == "flipset"].copy()
35
+
36
+ # Choose representative columns
37
+ def select_cols(kind_df, cols):
38
+ keep = [c for c in cols if c in kind_df.columns]
39
+ return kind_df[keep].copy()
40
+
41
+ mc_cols = [
42
+ "task","eval_mode","seed","base_acc_scan","ablt_acc_scan","flips_scan",
43
+ "patched_0_rescued_pct","patched_full_rescued_pct",
44
+ "control_time_shuffled_rescued_pct","control_shared_randvec_rescued_pct",
45
+ "control_rand_subspace_rescued_pct","control_patch_nonshared_rescued_pct",
46
+ ]
47
+ oa_cols = [
48
+ "task","eval_mode","seed","base_acc_scan","ablt_acc_scan","flips_scan",
49
+ "patched_self_rescued_pct","control_time_shuffled_rescued_pct",
50
+ "control_shared_randvec_rescued_pct","control_rand_subspace_rescued_pct",
51
+ "control_patch_nonshared_rescued_pct",
52
+ ]
53
+ fs_cols = [
54
+ "file","seed","task","base_acc_scan","ablt_acc_scan","flips_scan",
55
+ "patched_self_rescued_pct","patched_transfer_rescued_pct",
56
+ ]
57
+
58
+ mc_tbl = select_cols(mc, mc_cols).sort_values(["task","eval_mode","seed"])
59
+ oa_tbl = select_cols(oa, oa_cols).sort_values(["task","eval_mode","seed"])
60
+ fs_tbl = select_cols(fs, fs_cols).sort_values(["seed","file"])
61
+
62
+ # ---- Plot 1: MC patched_0 rescue by task ----
63
+ if len(mc_tbl) > 0 and "patched_0_rescued_pct" in mc_tbl.columns:
64
+ mc_plot = mc_tbl.groupby("task", as_index=False)["patched_0_rescued_pct"].mean()
65
+ plt.figure()
66
+ plt.bar(mc_plot["task"], mc_plot["patched_0_rescued_pct"])
67
+ plt.xticks(rotation=45, ha="right")
68
+ plt.ylabel("Rescue% on flips (patched_0)")
69
+ plt.tight_layout()
70
+ plt.savefig(os.path.join(PLOT_DIR, "mc_patched0_rescue.pdf"), dpi=300)
71
+ plt.close()
72
+
73
+ # ---- Plot 2: MC controls gap (patched_0 vs rand-in-shared vs nonshared) ----
74
+ if len(mc_tbl) > 0 and "patched_0_rescued_pct" in mc_tbl.columns:
75
+ tmp = mc_tbl.groupby("task", as_index=False).agg({
76
+ "patched_0_rescued_pct":"mean",
77
+ "control_shared_randvec_rescued_pct":"mean",
78
+ "control_patch_nonshared_rescued_pct":"mean",
79
+ "control_rand_subspace_rescued_pct":"mean",
80
+ })
81
+ plt.figure()
82
+ x = np.arange(len(tmp))
83
+ w = 0.2
84
+ plt.bar(x - 1.5*w, tmp["patched_0_rescued_pct"], width=w, label="patched_0")
85
+ plt.bar(x - 0.5*w, tmp["control_shared_randvec_rescued_pct"], width=w, label="rand vec in shared")
86
+ plt.bar(x + 0.5*w, tmp["control_rand_subspace_rescued_pct"], width=w, label="rand subspace")
87
+ plt.bar(x + 1.5*w, tmp["control_patch_nonshared_rescued_pct"], width=w, label="nonshared patch")
88
+ plt.xticks(x, tmp["task"], rotation=45, ha="right")
89
+ plt.ylabel("Rescue% on flips")
90
+ plt.legend()
91
+ plt.tight_layout()
92
+ plt.savefig(os.path.join(PLOT_DIR, "mc_controls_gap.pdf"), dpi=300)
93
+ plt.close()
94
+
95
+ # ---- Plot 3: Alpha sweep flip_rate curves ----
96
+ if len(alpha_df) > 0 and {"alpha","flip_rate","seed"}.issubset(alpha_df.columns):
97
+ alpha_df2 = alpha_df.copy()
98
+ alpha_df2["alpha"] = pd.to_numeric(alpha_df2["alpha"], errors="coerce")
99
+ alpha_df2 = alpha_df2.dropna(subset=["alpha"])
100
+ plt.figure()
101
+ for seed in sorted(alpha_df2["seed"].dropna().unique()):
102
+ sub = alpha_df2[alpha_df2["seed"] == seed].sort_values("alpha")
103
+ plt.plot(sub["alpha"], sub["flip_rate"]*100.0, marker="o", label=f"seed={int(seed)}")
104
+ plt.xlabel("alpha")
105
+ plt.ylabel("Flip rate on flip-set (%)")
106
+ plt.legend()
107
+ plt.tight_layout()
108
+ plt.savefig(os.path.join(PLOT_DIR, "alpha_sweep_fliprate.pdf"), dpi=300)
109
+ plt.close()
110
+
111
+ # ---- Plot 4: Alpha sweep mean delta margin curves ----
112
+ if len(alpha_df) > 0 and {"alpha","mean_delta_margin_vs_baseline","seed"}.issubset(alpha_df.columns):
113
+ alpha_df2 = alpha_df.copy()
114
+ alpha_df2["alpha"] = pd.to_numeric(alpha_df2["alpha"], errors="coerce")
115
+ alpha_df2 = alpha_df2.dropna(subset=["alpha"])
116
+ plt.figure()
117
+ for seed in sorted(alpha_df2["seed"].dropna().unique()):
118
+ sub = alpha_df2[alpha_df2["seed"] == seed].sort_values("alpha")
119
+ plt.plot(sub["alpha"], sub["mean_delta_margin_vs_baseline"], marker="o", label=f"seed={int(seed)}")
120
+ plt.xlabel("alpha")
121
+ plt.ylabel("Mean Δmargin vs baseline (on flip-set)")
122
+ plt.legend()
123
+ plt.tight_layout()
124
+ plt.savefig(os.path.join(PLOT_DIR, "alpha_sweep_deltam.pdf"), dpi=300)
125
+ plt.close()
126
+
127
+ # ---- Plot 5: Open-answer patched_self rescue (pair_logprob vs gen) ----
128
+ if len(oa_tbl) > 0 and "patched_self_rescued_pct" in oa_tbl.columns:
129
+ oa_plot = oa_tbl.copy()
130
+ oa_plot["label"] = oa_plot["task"].astype(str) + ":" + oa_plot["eval_mode"].astype(str)
131
+ plt.figure()
132
+ plt.bar(oa_plot["label"], oa_plot["patched_self_rescued_pct"])
133
+ plt.xticks(rotation=45, ha="right")
134
+ plt.ylabel("Rescue% on flips (patched_self)")
135
+ plt.tight_layout()
136
+ plt.savefig(os.path.join(PLOT_DIR, "openanswer_patchedself_rescue.pdf"), dpi=300)
137
+ plt.close()
138
+
139
+ # ---- Markdown report ----
140
+ def df_to_md_table(dfx: pd.DataFrame, max_rows: int = 30) -> str:
141
+ if dfx is None or len(dfx) == 0:
142
+ return "_(none)_"
143
+ d = dfx.copy()
144
+ if len(d) > max_rows:
145
+ d = d.head(max_rows)
146
+ return d.to_markdown(index=False)
147
+
148
+ lines = []
149
+ lines.append(f"# Qwen subspace patching + flipset report\n")
150
+ lines.append(f"Generated from `{os.path.basename(SUMMARY_CSV)}` and `{os.path.basename(ALPHA_CSV)}`.\n")
151
+ lines.append("## Overview\n")
152
+ lines.append(f"- Runs: {len(df)} total JSON summaries\n")
153
+ lines.append(f"- MC runs: {len(mc)}; Open-answer runs: {len(oa)}; Flipset runs: {len(fs)}\n")
154
+ lines.append("## Key plots (PDF, dpi=300)\n")
155
+ for fn in [
156
+ "mc_patched0_rescue.pdf",
157
+ "mc_controls_gap.pdf",
158
+ "alpha_sweep_fliprate.pdf",
159
+ "alpha_sweep_deltam.pdf",
160
+ "openanswer_patchedself_rescue.pdf",
161
+ ]:
162
+ p = os.path.join(PLOT_DIR, fn)
163
+ if os.path.exists(p):
164
+ lines.append(f"- `{fn}`")
165
+ lines.append("\n")
166
+
167
+ lines.append("## Multiple-choice patchback (subspace_mc)\n")
168
+ lines.append(df_to_md_table(mc_tbl))
169
+ lines.append("\n")
170
+
171
+ lines.append("## Open-answer patchback (openanswer)\n")
172
+ lines.append(df_to_md_table(oa_tbl))
173
+ lines.append("\n")
174
+
175
+ lines.append("## Flipset transfer patching (flipset)\n")
176
+ lines.append(df_to_md_table(fs_tbl))
177
+ lines.append("\n")
178
+
179
+ if len(alpha_df) > 0:
180
+ lines.append("## Alpha sweep (flip-set)\n")
181
+ # show a compact subset (alpha=0,0.5,0.75,1.0) if present
182
+ a = alpha_df.copy()
183
+ a["alpha"] = pd.to_numeric(a["alpha"], errors="coerce")
184
+ a = a.dropna(subset=["alpha"])
185
+ keep = a[a["alpha"].isin([0.0, 0.5, 0.75, 1.0])].copy()
186
+ if len(keep) == 0:
187
+ keep = a
188
+ keep = keep.sort_values(["seed","alpha"])
189
+ cols = [c for c in ["file","seed","alpha","n","flip_rate","ablated_acc","mean_delta_margin_vs_baseline"] if c in keep.columns]
190
+ lines.append(df_to_md_table(keep[cols], max_rows=60))
191
+ lines.append("\n")
192
+
193
+ with open(OUT_MD, "w", encoding="utf-8") as f:
194
+ f.write("\n".join(lines))
195
+
196
+ # ---- LaTeX tables (quick export) ----
197
+ # We write two compact tables: MC summary + Open-answer summary
198
+ def to_latex_table(dfx: pd.DataFrame, caption: str, label: str) -> str:
199
+ if dfx is None or len(dfx) == 0:
200
+ return f"% {caption}\n% (empty)\n"
201
+ return dfx.to_latex(index=False, escape=True, caption=caption, label=label)
202
+
203
+ tex_lines = []
204
+ tex_lines.append("% Auto-generated LaTeX tables for Qwen results\n")
205
+ tex_lines.append("% Requires \\usepackage{booktabs}\n\n")
206
+
207
+ if len(mc_tbl) > 0:
208
+ tex_lines.append(to_latex_table(
209
+ mc_tbl,
210
+ caption="Qwen: multiple-choice (subspace\\_patch) summary.",
211
+ label="tab:qwen_mc_summary"
212
+ ))
213
+ tex_lines.append("\n")
214
+
215
+ if len(oa_tbl) > 0:
216
+ tex_lines.append(to_latex_table(
217
+ oa_tbl,
218
+ caption="Qwen: open-answer (openanswer\\_subspace\\_patching) summary.",
219
+ label="tab:qwen_openanswer_summary"
220
+ ))
221
+ tex_lines.append("\n")
222
+
223
+ if len(alpha_df) > 0:
224
+ tex_lines.append(to_latex_table(
225
+ alpha_df.sort_values(["seed","alpha"]).head(40),
226
+ caption="Qwen: alpha sweep (first 40 rows shown).",
227
+ label="tab:qwen_alpha_sweep_head"
228
+ ))
229
+ tex_lines.append("\n")
230
+
231
+ with open(OUT_TEX, "w", encoding="utf-8") as f:
232
+ f.write("\n".join(tex_lines))
233
+
234
+ print(f"[OK] Wrote report: {OUT_MD}")
235
+ print(f"[OK] Wrote LaTeX tables: {OUT_TEX}")
236
+ print(f"[OK] Plots in: {PLOT_DIR}")
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/paper_table.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | kind | task | eval_mode | base_acc_scan | ablt_acc_scan | flips_scan | Patched@0 (rescue%, Δm) | Patched@full (rescue%, Δm) | Patched(self) (rescue%, Δm) | Patched(transfer) (rescue%, Δm) | Cross-example donor (rescue%, Δm) | Donor mismatch (rescue%, Δm) | Shared coeff permute (rescue%, Δm) | Shared coeff signflip (rescue%, Δm) | Rand vec in shared (rescue%, Δm) | Rand subspace (rescue%, Δm) | Nonshared patch (rescue%, Δm) |
2
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
3
+ | flipset | aqua | | 0.413 | 0.276 | 64 | | | | | | | | | | | |
4
+ | flipset | aqua | | 0.402 | 0.291 | 65 | | | | | | | | | | | |
5
+ | flipset | aqua | | 0.413 | 0.276 | 64 | | | 100.0%, 5.189 | 95.3%, 5.191 | | | | | | | |
6
+ | flipset | aqua | | 0.413 | 0.276 | 64 | | | 100.0%, 5.189 | 96.9%, 5.143 | | | | | | | |
7
+ | openanswer | gsm8k | gen_math | 0.035 | 0.059 | 7 | | | 14.3%, - | | 14.3%, - | | | | 14.3%, - | 0.0%, - | 42.9%, - |
8
+ | openanswer | gsm8k | pair_logprob | 0.801 | 0.625 | 53 | | | 98.1%, 10.441 | | 94.3%, 10.053 | | | | 20.8%, 0.094 | 18.9%, 0.630 | 5.7%, 0.223 |
9
+ | openanswer | humaneval | gen_code_compile | | | 0 | | | 10.5%, - | | 10.5%, - | | | | 15.8%, - | 26.3%, - | 0.0%, - |
10
+ | openanswer | humaneval | pair_logprob | 0.683 | 0.695 | 5 | | | 80.0%, 3.919 | | 80.0%, 2.955 | | | | 0.0%, -0.728 | 20.0%, -0.119 | 0.0%, -0.053 |
11
+ | subspace_mc | aqua | | 0.413 | 0.276 | 64 | 100.0%, 5.189 | 100.0%, 5.189 | | | 98.4%, 5.171 | | | | 20.3%, 0.773 | 35.9%, 1.421 | 0.0%, 0.000 |
12
+ | subspace_mc | arc_challenge | | 0.906 | 0.514 | 108 | 100.0%, 9.373 | 100.0%, 9.373 | | | 100.0%, 9.361 | | | | 36.1%, 1.780 | 60.2%, 2.989 | 0.0%, -0.000 |
13
+ | subspace_mc | commonsenseqa | | 0.867 | 0.598 | 77 | 100.0%, 8.117 | 100.0%, 8.117 | | | 100.0%, 8.102 | | | | 39.0%, 1.555 | 55.8%, 2.602 | 0.0%, -0.000 |
14
+ | subspace_mc | logiqa | | 0.473 | 0.395 | 45 | 100.0%, 5.974 | 100.0%, 5.974 | | | 100.0%, 6.011 | | | | 35.6%, 0.588 | 40.0%, 0.821 | 0.0%, 0.000 |
15
+ | subspace_mc | openbookqa | | 0.859 | 0.457 | 107 | 100.0%, 8.290 | 100.0%, 8.290 | | | 99.1%, 8.249 | | | | 33.6%, 1.499 | 47.7%, 2.272 | 0.0%, 0.000 |
16
+ | subspace_mc | piqa | | 0.871 | 0.773 | 37 | 100.0%, 5.723 | 100.0%, 5.723 | | | 100.0%, 5.689 | | | | 56.8%, 1.174 | 48.6%, 1.417 | 0.0%, -0.000 |
17
+ | subspace_mc | qasc | | 0.809 | 0.402 | 110 | 100.0%, 8.833 | 100.0%, 8.833 | | | 100.0%, 8.808 | | | | 39.1%, 1.842 | 50.0%, 2.400 | 0.0%, -0.000 |
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/alpha_sweep_deltam.pdf ADDED
Binary file (15.1 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/alpha_sweep_fliprate.pdf ADDED
Binary file (13.6 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/mc_controls_gap.pdf ADDED
Binary file (14.3 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/mc_patched0_rescue.pdf ADDED
Binary file (13.8 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/plots/openanswer_patchedself_rescue.pdf ADDED
Binary file (13.7 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/summary.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ kind,file,task,eval_mode,layer,seed,hf_id,hf_split,candidate_labels,Qs_shape,patch_desc,donor_source,donor_tasks,donor_pick,n_donor_bank,scan_effective,scan_skipped,base_acc_scan,ablt_acc_scan,flips_scan,anti_flips_scan,both_correct_scan,both_wrong_scan,patched_primary_method,patched_primary_rescued_pct,patched_primary_mean_dmargin,diff_time_shuffled_minus_patched_primary_rescued_pct,diff_patched_primary_minus_shared_randvec_rescued_pct,patched_0_rescued,patched_0_n,patched_0_rescued_pct,patched_0_mean_dmargin,patched_01_rescued,patched_01_n,patched_01_rescued_pct,patched_01_mean_dmargin,patched_full_rescued,patched_full_n,patched_full_rescued_pct,patched_full_mean_dmargin,patched_self_rescued,patched_self_n,patched_self_rescued_pct,patched_self_mean_dmargin,patched_transfer_rescued,patched_transfer_n,patched_transfer_rescued_pct,patched_transfer_mean_dmargin,control_time_shuffled_rescued,control_time_shuffled_n,control_time_shuffled_rescued_pct,control_time_shuffled_mean_dmargin,control_shared_mismatch_rescued,control_shared_mismatch_n,control_shared_mismatch_rescued_pct,control_shared_mismatch_mean_dmargin,control_shared_perm_rescued,control_shared_perm_n,control_shared_perm_rescued_pct,control_shared_perm_mean_dmargin,control_shared_signflip_rescued,control_shared_signflip_n,control_shared_signflip_rescued_pct,control_shared_signflip_mean_dmargin,control_shared_randvec_rescued,control_shared_randvec_n,control_shared_randvec_rescued_pct,control_shared_randvec_mean_dmargin,control_rand_subspace_rescued,control_rand_subspace_n,control_rand_subspace_rescued_pct,control_rand_subspace_mean_dmargin,control_patch_nonshared_rescued,control_patch_nonshared_n,control_patch_nonshared_rescued_pct,control_patch_nonshared_mean_dmargin
2
+ flipset,aqua_alpha_sweep_seed123.json,aqua,,10,123,,,ABCDE,3584x190,steps=0,,,,,254,0,0.41338582677165353,0.2755905511811024,64,29,41,120,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3
+ flipset,aqua_alpha_sweep_seed456.json,aqua,,10,456,,,ABCDE,3584x169,steps=0,,,,,254,0,0.4015748031496063,0.29133858267716534,65,37,37,115,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4
+ flipset,aqua_transfer_cross_mc_baselinecorrect_seed123.json,aqua,,10,123,,,ABCDE,3584x190,steps=0,cross_task_eval,"commonsenseqa,openbookqa",random,256,254,0,0.41338582677165353,0.2755905511811024,64,29,41,120,patched_self,100.0,5.189074993133545,,,,,,,,,,,,,,,64,64,100.0,5.189074993133545,61,64,95.3125,5.190681457519531,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5
+ flipset,aqua_transfer_same_task_seed123.json,aqua,,10,123,,,ABCDE,3584x190,steps=0,same_task_eval,aqua,random,254,254,0,0.41338582677165353,0.2755905511811024,64,29,41,120,patched_self,100.0,5.189074993133545,,,,,,,,,,,,,,,64,64,100.0,5.189074993133545,62,64,96.875,5.143482208251953,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6
+ openanswer,gsm8k_genmath.json,gsm8k,gen_math,10,123,gsm8k/main,,,3584x190,"steps=0,1,2,3",,,,,256,0,0.03515625,0.05859375,7,13,2,234,patched_self,14.285714285714286,,0.0,0.0,,,,,,,,,,,,,1,7,14.285714285714286,,,,,,1,7,14.285714285714286,,,,,,,,,,,,,,1,7,14.285714285714286,,0,7,0.0,,3,7,42.857142857142854,
7
+ openanswer,gsm8k_pairlogprob.json,gsm8k,pair_logprob,10,123,gsm8k/main,,,3584x190,"steps=0,1,2,3",,,,,256,0,0.80078125,0.625,53,8,152,43,patched_self,98.11320754716981,10.441205978393555,-3.773584905660371,77.35849056603773,,,,,,,,,,,,,52,53,98.11320754716981,10.441205978393555,,,,,50,53,94.33962264150944,10.053207397460938,,,,,,,,,,,,,11,53,20.754716981132077,0.09415686130523682,10,53,18.867924528301888,0.6303381323814392,3,53,5.660377358490566,0.2226167768239975
8
+ openanswer,humaneval_gencode_compile.json,humaneval,gen_code_compile,10,123,openai_humaneval,test,,3584x190,"steps=0,1,2,3",,,,,0,164,,,0,0,0,0,patched_self,10.526315789473685,,0.0,-5.263157894736841,,,,,,,,,,,,,2,19,10.526315789473685,,,,,,2,19,10.526315789473685,,,,,,,,,,,,,,3,19,15.789473684210526,,5,19,26.31578947368421,,0,19,0.0,
9
+ openanswer,humaneval_pairlogprob.json,humaneval,pair_logprob,10,123,openai_humaneval,test,,3584x190,"steps=0,1,2,3",,,,,164,0,0.6829268292682927,0.6951219512195121,5,7,107,45,patched_self,80.0,3.919372081756592,0.0,80.0,,,,,,,,,,,,,4,5,80.0,3.919372081756592,,,,,4,5,80.0,2.954876661300659,,,,,,,,,,,,,0,5,0.0,-0.7276772260665894,1,5,20.0,-0.11909005790948868,0,5,0.0,-0.052741408348083496
10
+ subspace_mc,aqua.json,aqua,,10,123,,,ABCDE,3584x190,,,,,,254,0,0.41338582677165353,0.2755905511811024,64,29,41,120,patched_0,100.0,5.1890749065205455,-1.5625,79.6875,64,64,100.0,5.1890749065205455,64,64,100.0,5.1890749065205455,64,64,100.0,5.1890749065205455,,,,,,,,,63,64,98.4375,5.171447346918285,,,,,,,,,,,,,13,64,20.3125,0.7729076333343983,23,64,35.9375,1.4208112843334675,0,64,0.0,2.1606683731079102e-07
11
+ subspace_mc,arc_challenge.json,arc_challenge,,10,123,,,ABCD,3584x190,,,,,,255,1,0.9058823529411765,0.5137254901960784,108,8,123,16,patched_0,100.0,9.372807964682579,0.0,63.888888888888886,108,108,100.0,9.372807964682579,108,108,100.0,9.372807964682579,108,108,100.0,9.372807964682579,,,,,,,,,108,108,100.0,9.361393508535844,,,,,,,,,,,,,39,108,36.111111111111114,1.7799473873994969,65,108,60.18518518518518,2.988849757446183,0,108,0.0,-5.342342235423901e-07
12
+ subspace_mc,commonsenseqa.json,commonsenseqa,,10,123,,,ABCDE,3584x190,,,,,,256,0,0.8671875,0.59765625,77,8,145,26,patched_0,100.0,8.11723030232764,0.0,61.03896103896104,77,77,100.0,8.11723030232764,77,77,100.0,8.11723030232764,77,77,100.0,8.11723030232764,,,,,,,,,77,77,100.0,8.101711891688309,,,,,,,,,,,,,30,77,38.96103896103896,1.5545239510474267,43,77,55.84415584415584,2.602022914143352,0,77,0.0,-9.103254838423296e-07
13
+ subspace_mc,logiqa.json,logiqa,,10,123,,,ABCD,3584x190,,,,,,256,0,0.47265625,0.39453125,45,25,76,110,patched_0,100.0,5.974016189575195,0.0,64.44444444444444,45,45,100.0,5.974016189575195,45,45,100.0,5.974016189575195,45,45,100.0,5.974016189575195,,,,,,,,,45,45,100.0,6.01071400642395,,,,,,,,,,,,,16,45,35.55555555555556,0.588188378016154,18,45,40.0,0.8205998208787706,0,45,0.0,6.092919243706597e-07
14
+ subspace_mc,openbookqa.json,openbookqa,,10,123,,,ABCD,3584x190,,,,,,256,0,0.859375,0.45703125,107,4,113,32,patched_0,100.0,8.289876717830372,-0.9345794392523317,66.35514018691589,107,107,100.0,8.289876717830372,107,107,100.0,8.289876717830372,107,107,100.0,8.289876717830372,,,,,,,,,106,107,99.06542056074767,8.248568821733244,,,,,,,,,,,,,36,107,33.64485981308411,1.498662706847503,51,107,47.66355140186916,2.2723147490314233,0,107,0.0,5.214013785959404e-07
15
+ subspace_mc,piqa.json,piqa,,10,123,,,AB,3584x190,,,,,,256,0,0.87109375,0.7734375,37,12,186,21,patched_0,100.0,5.722948335312508,0.0,43.24324324324324,37,37,100.0,5.722948335312508,37,37,100.0,5.722948335312508,37,37,100.0,5.722948335312508,,,,,,,,,37,37,100.0,5.689002320573136,,,,,,,,,,,,,21,37,56.75675675675676,1.1743023846600507,18,37,48.648648648648646,1.417075691996394,0,37,0.0,-3.2218726905616553e-07
16
+ subspace_mc,qasc.json,qasc,,10,123,,,ABCDEFGH,3584x190,,,,,,256,0,0.80859375,0.40234375,110,6,97,43,patched_0,100.0,8.833465647697448,0.0,60.90909090909091,110,110,100.0,8.833465647697448,110,110,100.0,8.833465647697448,110,110,100.0,8.833465647697448,,,,,,,,,110,110,100.0,8.807955605333502,,,,,,,,,,,,,43,110,39.09090909090909,1.8416864731095053,55,110,50.0,2.400287093899467,0,110,0.0,-6.198883056640625e-07
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/_summary/summary.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | kind | task | eval_mode | file | base_acc_scan | ablt_acc_scan | flips_scan | patched_primary_method | patched_primary_rescued_pct | control_time_shuffled_rescued_pct | control_shared_randvec_rescued_pct | control_rand_subspace_rescued_pct | control_patch_nonshared_rescued_pct |
2
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
3
+ | flipset | aqua | | aqua_alpha_sweep_seed123.json | 0.413 | 0.276 | 64 | | | | | | |
4
+ | flipset | aqua | | aqua_alpha_sweep_seed456.json | 0.402 | 0.291 | 65 | | | | | | |
5
+ | flipset | aqua | | aqua_transfer_cross_mc_baselinecorrect_seed123.json | 0.413 | 0.276 | 64 | patched_self | 100.0 | | | | |
6
+ | flipset | aqua | | aqua_transfer_same_task_seed123.json | 0.413 | 0.276 | 64 | patched_self | 100.0 | | | | |
7
+ | openanswer | gsm8k | gen_math | gsm8k_genmath.json | 0.035 | 0.059 | 7 | patched_self | 14.3 | 14.3 | 14.3 | 0.0 | 42.9 |
8
+ | openanswer | gsm8k | pair_logprob | gsm8k_pairlogprob.json | 0.801 | 0.625 | 53 | patched_self | 98.1 | 94.3 | 20.8 | 18.9 | 5.7 |
9
+ | openanswer | humaneval | gen_code_compile | humaneval_gencode_compile.json | | | 0 | patched_self | 10.5 | 10.5 | 15.8 | 26.3 | 0.0 |
10
+ | openanswer | humaneval | pair_logprob | humaneval_pairlogprob.json | 0.683 | 0.695 | 5 | patched_self | 80.0 | 80.0 | 0.0 | 20.0 | 0.0 |
11
+ | subspace_mc | aqua | | aqua.json | 0.413 | 0.276 | 64 | patched_0 | 100.0 | 98.4 | 20.3 | 35.9 | 0.0 |
12
+ | subspace_mc | arc_challenge | | arc_challenge.json | 0.906 | 0.514 | 108 | patched_0 | 100.0 | 100.0 | 36.1 | 60.2 | 0.0 |
13
+ | subspace_mc | commonsenseqa | | commonsenseqa.json | 0.867 | 0.598 | 77 | patched_0 | 100.0 | 100.0 | 39.0 | 55.8 | 0.0 |
14
+ | subspace_mc | logiqa | | logiqa.json | 0.473 | 0.395 | 45 | patched_0 | 100.0 | 100.0 | 35.6 | 40.0 | 0.0 |
15
+ | subspace_mc | openbookqa | | openbookqa.json | 0.859 | 0.457 | 107 | patched_0 | 100.0 | 99.1 | 33.6 | 47.7 | 0.0 |
16
+ | subspace_mc | piqa | | piqa.json | 0.871 | 0.773 | 37 | patched_0 | 100.0 | 100.0 | 56.8 | 48.6 | 0.0 |
17
+ | subspace_mc | qasc | | qasc.json | 0.809 | 0.402 | 110 | patched_0 | 100.0 | 100.0 | 39.1 | 50.0 | 0.0 |
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/compute_Qs_seed123.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/compute_Qs_seed456.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_alpha_sweep_seed123.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_alpha_sweep_seed456.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_transfer_cross_mc_baselinecorrect_seed123.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/flipset/aqua_transfer_same_task_seed123.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/gsm8k_genmath.json ADDED
@@ -0,0 +1,3888 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "meta": {
3
+ "model": "Qwen/Qwen2.5-7B-Instruct",
4
+ "device": "cuda",
5
+ "dtype": "fp32",
6
+ "layer": 10,
7
+ "layers_path": "model.layers",
8
+ "seed": 123,
9
+ "task": "gsm8k",
10
+ "eval_mode": "gen_math",
11
+ "eval_meta": {
12
+ "subspace_split": null,
13
+ "eval_split": "test",
14
+ "available_splits": [
15
+ "train",
16
+ "test"
17
+ ],
18
+ "hf_id": "gsm8k/main"
19
+ },
20
+ "n_eval_loaded": 256,
21
+ "n_scanned": 256,
22
+ "base_acc_scan": 0.03515625,
23
+ "ablt_acc_scan": 0.05859375,
24
+ "flips_total": 7,
25
+ "flips_used": 7,
26
+ "patch_steps": [
27
+ 0,
28
+ 1,
29
+ 2,
30
+ 3
31
+ ],
32
+ "patch_n_steps": 4,
33
+ "Qs_path": "/home/zs89/decodeshare/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/Q_shared_layer10_seed123.npy",
34
+ "Qs_shape": [
35
+ 3584,
36
+ 190
37
+ ],
38
+ "gold_text_prefix": " ",
39
+ "dist_text_prefix": " ",
40
+ "gold_max_tokens": 0,
41
+ "distractor_mode": "next_gold",
42
+ "answer_prefix_effective": "\nLet's think step by step.\nFinal answer (number only):",
43
+ "max_new_tokens_effective": 64,
44
+ "run_coeff_controls": false,
45
+ "use_benchmark_loader": true,
46
+ "hf_id": "",
47
+ "hf_split": "test"
48
+ },
49
+ "summary_on_flips": {
50
+ "patched_self": {
51
+ "n": 7,
52
+ "rescued": 1,
53
+ "rescued_pct": 14.285714285714286
54
+ },
55
+ "control_time_shuffled": {
56
+ "n": 7,
57
+ "rescued": 1,
58
+ "rescued_pct": 14.285714285714286
59
+ },
60
+ "control_shared_randvec": {
61
+ "n": 7,
62
+ "rescued": 1,
63
+ "rescued_pct": 14.285714285714286
64
+ },
65
+ "control_rand_subspace": {
66
+ "n": 7,
67
+ "rescued": 0,
68
+ "rescued_pct": 0.0
69
+ },
70
+ "control_patch_nonshared": {
71
+ "n": 7,
72
+ "rescued": 3,
73
+ "rescued_pct": 42.857142857142854
74
+ }
75
+ },
76
+ "scan_rows": [
77
+ {
78
+ "ex_id": "gsm8k-test-0",
79
+ "gold_raw": "50",
80
+ "baseline": {
81
+ "pred_answer": "140",
82
+ "correct": false,
83
+ "n_gen_tokens": 64
84
+ },
85
+ "ablated": {
86
+ "pred_answer": "70",
87
+ "correct": false,
88
+ "n_gen_tokens": 64
89
+ }
90
+ },
91
+ {
92
+ "ex_id": "gsm8k-test-1",
93
+ "gold_raw": "80",
94
+ "baseline": {
95
+ "pred_answer": "30",
96
+ "correct": false,
97
+ "n_gen_tokens": 64
98
+ },
99
+ "ablated": {
100
+ "pred_answer": "0",
101
+ "correct": false,
102
+ "n_gen_tokens": 64
103
+ }
104
+ },
105
+ {
106
+ "ex_id": "gsm8k-test-2",
107
+ "gold_raw": "12",
108
+ "baseline": {
109
+ "pred_answer": "7",
110
+ "correct": false,
111
+ "n_gen_tokens": 64
112
+ },
113
+ "ablated": {
114
+ "pred_answer": "00",
115
+ "correct": false,
116
+ "n_gen_tokens": 64
117
+ }
118
+ },
119
+ {
120
+ "ex_id": "gsm8k-test-3",
121
+ "gold_raw": "140",
122
+ "baseline": {
123
+ "pred_answer": "3",
124
+ "correct": false,
125
+ "n_gen_tokens": 64
126
+ },
127
+ "ablated": {
128
+ "pred_answer": "33",
129
+ "correct": false,
130
+ "n_gen_tokens": 64
131
+ }
132
+ },
133
+ {
134
+ "ex_id": "gsm8k-test-4",
135
+ "gold_raw": "36",
136
+ "baseline": {
137
+ "pred_answer": "4",
138
+ "correct": false,
139
+ "n_gen_tokens": 64
140
+ },
141
+ "ablated": {
142
+ "pred_answer": "2",
143
+ "correct": false,
144
+ "n_gen_tokens": 64
145
+ }
146
+ },
147
+ {
148
+ "ex_id": "gsm8k-test-5",
149
+ "gold_raw": "3200",
150
+ "baseline": {
151
+ "pred_answer": "40",
152
+ "correct": false,
153
+ "n_gen_tokens": 64
154
+ },
155
+ "ablated": {
156
+ "pred_answer": "4800.0",
157
+ "correct": false,
158
+ "n_gen_tokens": 64
159
+ }
160
+ },
161
+ {
162
+ "ex_id": "gsm8k-test-6",
163
+ "gold_raw": "38",
164
+ "baseline": {
165
+ "pred_answer": "3",
166
+ "correct": false,
167
+ "n_gen_tokens": 64
168
+ },
169
+ "ablated": {
170
+ "pred_answer": "41",
171
+ "correct": false,
172
+ "n_gen_tokens": 64
173
+ }
174
+ },
175
+ {
176
+ "ex_id": "gsm8k-test-7",
177
+ "gold_raw": "32",
178
+ "baseline": {
179
+ "pred_answer": "15",
180
+ "correct": false,
181
+ "n_gen_tokens": 64
182
+ },
183
+ "ablated": {
184
+ "pred_answer": "4",
185
+ "correct": false,
186
+ "n_gen_tokens": 64
187
+ }
188
+ },
189
+ {
190
+ "ex_id": "gsm8k-test-8",
191
+ "gold_raw": "92",
192
+ "baseline": {
193
+ "pred_answer": "3",
194
+ "correct": false,
195
+ "n_gen_tokens": 64
196
+ },
197
+ "ablated": {
198
+ "pred_answer": "20",
199
+ "correct": false,
200
+ "n_gen_tokens": 64
201
+ }
202
+ },
203
+ {
204
+ "ex_id": "gsm8k-test-9",
205
+ "gold_raw": "16",
206
+ "baseline": {
207
+ "pred_answer": "4",
208
+ "correct": false,
209
+ "n_gen_tokens": 64
210
+ },
211
+ "ablated": {
212
+ "pred_answer": "1",
213
+ "correct": false,
214
+ "n_gen_tokens": 64
215
+ }
216
+ },
217
+ {
218
+ "ex_id": "gsm8k-test-10",
219
+ "gold_raw": "45",
220
+ "baseline": {
221
+ "pred_answer": "3",
222
+ "correct": false,
223
+ "n_gen_tokens": 64
224
+ },
225
+ "ablated": {
226
+ "pred_answer": "21.5",
227
+ "correct": false,
228
+ "n_gen_tokens": 64
229
+ }
230
+ },
231
+ {
232
+ "ex_id": "gsm8k-test-11",
233
+ "gold_raw": "270",
234
+ "baseline": {
235
+ "pred_answer": "3",
236
+ "correct": false,
237
+ "n_gen_tokens": 64
238
+ },
239
+ "ablated": {
240
+ "pred_answer": "0.44704",
241
+ "correct": false,
242
+ "n_gen_tokens": 64
243
+ }
244
+ },
245
+ {
246
+ "ex_id": "gsm8k-test-12",
247
+ "gold_raw": "100",
248
+ "baseline": {
249
+ "pred_answer": "60",
250
+ "correct": false,
251
+ "n_gen_tokens": 64
252
+ },
253
+ "ablated": {
254
+ "pred_answer": "4",
255
+ "correct": false,
256
+ "n_gen_tokens": 64
257
+ }
258
+ },
259
+ {
260
+ "ex_id": "gsm8k-test-13",
261
+ "gold_raw": "25",
262
+ "baseline": {
263
+ "pred_answer": "4",
264
+ "correct": false,
265
+ "n_gen_tokens": 64
266
+ },
267
+ "ablated": {
268
+ "pred_answer": "2",
269
+ "correct": false,
270
+ "n_gen_tokens": 64
271
+ }
272
+ },
273
+ {
274
+ "ex_id": "gsm8k-test-14",
275
+ "gold_raw": "800",
276
+ "baseline": {
277
+ "pred_answer": "40",
278
+ "correct": false,
279
+ "n_gen_tokens": 64
280
+ },
281
+ "ablated": {
282
+ "pred_answer": "4",
283
+ "correct": false,
284
+ "n_gen_tokens": 64
285
+ }
286
+ },
287
+ {
288
+ "ex_id": "gsm8k-test-15",
289
+ "gold_raw": "2",
290
+ "baseline": {
291
+ "pred_answer": "65",
292
+ "correct": false,
293
+ "n_gen_tokens": 64
294
+ },
295
+ "ablated": {
296
+ "pred_answer": "4",
297
+ "correct": false,
298
+ "n_gen_tokens": 64
299
+ }
300
+ },
301
+ {
302
+ "ex_id": "gsm8k-test-16",
303
+ "gold_raw": "7000",
304
+ "baseline": {
305
+ "pred_answer": "5",
306
+ "correct": false,
307
+ "n_gen_tokens": 64
308
+ },
309
+ "ablated": {
310
+ "pred_answer": "4000",
311
+ "correct": false,
312
+ "n_gen_tokens": 64
313
+ }
314
+ },
315
+ {
316
+ "ex_id": "gsm8k-test-17",
317
+ "gold_raw": "25",
318
+ "baseline": {
319
+ "pred_answer": "4",
320
+ "correct": false,
321
+ "n_gen_tokens": 64
322
+ },
323
+ "ablated": {
324
+ "pred_answer": "42.15",
325
+ "correct": false,
326
+ "n_gen_tokens": 64
327
+ }
328
+ },
329
+ {
330
+ "ex_id": "gsm8k-test-18",
331
+ "gold_raw": "3",
332
+ "baseline": {
333
+ "pred_answer": "2",
334
+ "correct": false,
335
+ "n_gen_tokens": 64
336
+ },
337
+ "ablated": {
338
+ "pred_answer": "2.0",
339
+ "correct": false,
340
+ "n_gen_tokens": 64
341
+ }
342
+ },
343
+ {
344
+ "ex_id": "gsm8k-test-19",
345
+ "gold_raw": "3430",
346
+ "baseline": {
347
+ "pred_answer": "10",
348
+ "correct": false,
349
+ "n_gen_tokens": 64
350
+ },
351
+ "ablated": {
352
+ "pred_answer": "2147483",
353
+ "correct": false,
354
+ "n_gen_tokens": 64
355
+ }
356
+ },
357
+ {
358
+ "ex_id": "gsm8k-test-20",
359
+ "gold_raw": "106",
360
+ "baseline": {
361
+ "pred_answer": "0",
362
+ "correct": false,
363
+ "n_gen_tokens": 64
364
+ },
365
+ "ablated": {
366
+ "pred_answer": "1",
367
+ "correct": false,
368
+ "n_gen_tokens": 64
369
+ }
370
+ },
371
+ {
372
+ "ex_id": "gsm8k-test-21",
373
+ "gold_raw": "80",
374
+ "baseline": {
375
+ "pred_answer": "40",
376
+ "correct": false,
377
+ "n_gen_tokens": 64
378
+ },
379
+ "ablated": {
380
+ "pred_answer": "20",
381
+ "correct": false,
382
+ "n_gen_tokens": 64
383
+ }
384
+ },
385
+ {
386
+ "ex_id": "gsm8k-test-22",
387
+ "gold_raw": "26",
388
+ "baseline": {
389
+ "pred_answer": "1",
390
+ "correct": false,
391
+ "n_gen_tokens": 64
392
+ },
393
+ "ablated": {
394
+ "pred_answer": "2",
395
+ "correct": false,
396
+ "n_gen_tokens": 64
397
+ }
398
+ },
399
+ {
400
+ "ex_id": "gsm8k-test-23",
401
+ "gold_raw": "750",
402
+ "baseline": {
403
+ "pred_answer": "3",
404
+ "correct": false,
405
+ "n_gen_tokens": 64
406
+ },
407
+ "ablated": {
408
+ "pred_answer": "59",
409
+ "correct": false,
410
+ "n_gen_tokens": 64
411
+ }
412
+ },
413
+ {
414
+ "ex_id": "gsm8k-test-24",
415
+ "gold_raw": "9",
416
+ "baseline": {
417
+ "pred_answer": "17",
418
+ "correct": false,
419
+ "n_gen_tokens": 64
420
+ },
421
+ "ablated": {
422
+ "pred_answer": "17",
423
+ "correct": false,
424
+ "n_gen_tokens": 64
425
+ }
426
+ },
427
+ {
428
+ "ex_id": "gsm8k-test-25",
429
+ "gold_raw": "40",
430
+ "baseline": {
431
+ "pred_answer": "3",
432
+ "correct": false,
433
+ "n_gen_tokens": 64
434
+ },
435
+ "ablated": {
436
+ "pred_answer": "6",
437
+ "correct": false,
438
+ "n_gen_tokens": 64
439
+ }
440
+ },
441
+ {
442
+ "ex_id": "gsm8k-test-26",
443
+ "gold_raw": "14",
444
+ "baseline": {
445
+ "pred_answer": "3",
446
+ "correct": false,
447
+ "n_gen_tokens": 64
448
+ },
449
+ "ablated": {
450
+ "pred_answer": "24.0",
451
+ "correct": false,
452
+ "n_gen_tokens": 64
453
+ }
454
+ },
455
+ {
456
+ "ex_id": "gsm8k-test-27",
457
+ "gold_raw": "160",
458
+ "baseline": {
459
+ "pred_answer": "2",
460
+ "correct": false,
461
+ "n_gen_tokens": 64
462
+ },
463
+ "ablated": {
464
+ "pred_answer": "12",
465
+ "correct": false,
466
+ "n_gen_tokens": 64
467
+ }
468
+ },
469
+ {
470
+ "ex_id": "gsm8k-test-28",
471
+ "gold_raw": "6",
472
+ "baseline": {
473
+ "pred_answer": "2",
474
+ "correct": false,
475
+ "n_gen_tokens": 64
476
+ },
477
+ "ablated": {
478
+ "pred_answer": "1",
479
+ "correct": false,
480
+ "n_gen_tokens": 64
481
+ }
482
+ },
483
+ {
484
+ "ex_id": "gsm8k-test-29",
485
+ "gold_raw": "132",
486
+ "baseline": {
487
+ "pred_answer": "16",
488
+ "correct": false,
489
+ "n_gen_tokens": 64
490
+ },
491
+ "ablated": {
492
+ "pred_answer": "16",
493
+ "correct": false,
494
+ "n_gen_tokens": 64
495
+ }
496
+ },
497
+ {
498
+ "ex_id": "gsm8k-test-30",
499
+ "gold_raw": "8",
500
+ "baseline": {
501
+ "pred_answer": "4",
502
+ "correct": false,
503
+ "n_gen_tokens": 64
504
+ },
505
+ "ablated": {
506
+ "pred_answer": "5.000000000000000000000000000000000000000000000000000000000000",
507
+ "correct": false,
508
+ "n_gen_tokens": 64
509
+ }
510
+ },
511
+ {
512
+ "ex_id": "gsm8k-test-31",
513
+ "gold_raw": "68",
514
+ "baseline": {
515
+ "pred_answer": "1",
516
+ "correct": false,
517
+ "n_gen_tokens": 64
518
+ },
519
+ "ablated": {
520
+ "pred_answer": "40",
521
+ "correct": false,
522
+ "n_gen_tokens": 64
523
+ }
524
+ },
525
+ {
526
+ "ex_id": "gsm8k-test-32",
527
+ "gold_raw": "31",
528
+ "baseline": {
529
+ "pred_answer": "0.02",
530
+ "correct": false,
531
+ "n_gen_tokens": 64
532
+ },
533
+ "ablated": {
534
+ "pred_answer": "00",
535
+ "correct": false,
536
+ "n_gen_tokens": 64
537
+ }
538
+ },
539
+ {
540
+ "ex_id": "gsm8k-test-33",
541
+ "gold_raw": "100",
542
+ "baseline": {
543
+ "pred_answer": "2",
544
+ "correct": false,
545
+ "n_gen_tokens": 64
546
+ },
547
+ "ablated": {
548
+ "pred_answer": "1",
549
+ "correct": false,
550
+ "n_gen_tokens": 64
551
+ }
552
+ },
553
+ {
554
+ "ex_id": "gsm8k-test-34",
555
+ "gold_raw": "1509",
556
+ "baseline": {
557
+ "pred_answer": "4",
558
+ "correct": false,
559
+ "n_gen_tokens": 64
560
+ },
561
+ "ablated": {
562
+ "pred_answer": "2",
563
+ "correct": false,
564
+ "n_gen_tokens": 64
565
+ }
566
+ },
567
+ {
568
+ "ex_id": "gsm8k-test-35",
569
+ "gold_raw": "480",
570
+ "baseline": {
571
+ "pred_answer": "35",
572
+ "correct": false,
573
+ "n_gen_tokens": 64
574
+ },
575
+ "ablated": {
576
+ "pred_answer": "1",
577
+ "correct": false,
578
+ "n_gen_tokens": 64
579
+ }
580
+ },
581
+ {
582
+ "ex_id": "gsm8k-test-36",
583
+ "gold_raw": "520",
584
+ "baseline": {
585
+ "pred_answer": "1.0",
586
+ "correct": false,
587
+ "n_gen_tokens": 64
588
+ },
589
+ "ablated": {
590
+ "pred_answer": "3.00",
591
+ "correct": false,
592
+ "n_gen_tokens": 64
593
+ }
594
+ },
595
+ {
596
+ "ex_id": "gsm8k-test-37",
597
+ "gold_raw": "3",
598
+ "baseline": {
599
+ "pred_answer": "2",
600
+ "correct": false,
601
+ "n_gen_tokens": 64
602
+ },
603
+ "ablated": {
604
+ "pred_answer": "3.0",
605
+ "correct": true,
606
+ "n_gen_tokens": 64
607
+ }
608
+ },
609
+ {
610
+ "ex_id": "gsm8k-test-38",
611
+ "gold_raw": "33",
612
+ "baseline": {
613
+ "pred_answer": "32",
614
+ "correct": false,
615
+ "n_gen_tokens": 64
616
+ },
617
+ "ablated": {
618
+ "pred_answer": "28",
619
+ "correct": false,
620
+ "n_gen_tokens": 64
621
+ }
622
+ },
623
+ {
624
+ "ex_id": "gsm8k-test-39",
625
+ "gold_raw": "120",
626
+ "baseline": {
627
+ "pred_answer": "12",
628
+ "correct": false,
629
+ "n_gen_tokens": 64
630
+ },
631
+ "ablated": {
632
+ "pred_answer": "280",
633
+ "correct": false,
634
+ "n_gen_tokens": 64
635
+ }
636
+ },
637
+ {
638
+ "ex_id": "gsm8k-test-40",
639
+ "gold_raw": "14",
640
+ "baseline": {
641
+ "pred_answer": "2",
642
+ "correct": false,
643
+ "n_gen_tokens": 64
644
+ },
645
+ "ablated": {
646
+ "pred_answer": "8",
647
+ "correct": false,
648
+ "n_gen_tokens": 64
649
+ }
650
+ },
651
+ {
652
+ "ex_id": "gsm8k-test-41",
653
+ "gold_raw": "20",
654
+ "baseline": {
655
+ "pred_answer": "100",
656
+ "correct": false,
657
+ "n_gen_tokens": 64
658
+ },
659
+ "ablated": {
660
+ "pred_answer": "60",
661
+ "correct": false,
662
+ "n_gen_tokens": 64
663
+ }
664
+ },
665
+ {
666
+ "ex_id": "gsm8k-test-42",
667
+ "gold_raw": "95200",
668
+ "baseline": {
669
+ "pred_answer": "30",
670
+ "correct": false,
671
+ "n_gen_tokens": 64
672
+ },
673
+ "ablated": {
674
+ "pred_answer": "5",
675
+ "correct": false,
676
+ "n_gen_tokens": 64
677
+ }
678
+ },
679
+ {
680
+ "ex_id": "gsm8k-test-43",
681
+ "gold_raw": "77",
682
+ "baseline": {
683
+ "pred_answer": "11",
684
+ "correct": false,
685
+ "n_gen_tokens": 64
686
+ },
687
+ "ablated": {
688
+ "pred_answer": "42",
689
+ "correct": false,
690
+ "n_gen_tokens": 64
691
+ }
692
+ },
693
+ {
694
+ "ex_id": "gsm8k-test-44",
695
+ "gold_raw": "81",
696
+ "baseline": {
697
+ "pred_answer": "3",
698
+ "correct": false,
699
+ "n_gen_tokens": 64
700
+ },
701
+ "ablated": {
702
+ "pred_answer": "81",
703
+ "correct": true,
704
+ "n_gen_tokens": 64
705
+ }
706
+ },
707
+ {
708
+ "ex_id": "gsm8k-test-45",
709
+ "gold_raw": "310",
710
+ "baseline": {
711
+ "pred_answer": "24",
712
+ "correct": false,
713
+ "n_gen_tokens": 64
714
+ },
715
+ "ablated": {
716
+ "pred_answer": "255",
717
+ "correct": false,
718
+ "n_gen_tokens": 64
719
+ }
720
+ },
721
+ {
722
+ "ex_id": "gsm8k-test-46",
723
+ "gold_raw": "100",
724
+ "baseline": {
725
+ "pred_answer": "2",
726
+ "correct": false,
727
+ "n_gen_tokens": 64
728
+ },
729
+ "ablated": {
730
+ "pred_answer": "3",
731
+ "correct": false,
732
+ "n_gen_tokens": 64
733
+ }
734
+ },
735
+ {
736
+ "ex_id": "gsm8k-test-47",
737
+ "gold_raw": "160",
738
+ "baseline": {
739
+ "pred_answer": "8",
740
+ "correct": false,
741
+ "n_gen_tokens": 64
742
+ },
743
+ "ablated": {
744
+ "pred_answer": "4",
745
+ "correct": false,
746
+ "n_gen_tokens": 64
747
+ }
748
+ },
749
+ {
750
+ "ex_id": "gsm8k-test-48",
751
+ "gold_raw": "25",
752
+ "baseline": {
753
+ "pred_answer": "1",
754
+ "correct": false,
755
+ "n_gen_tokens": 64
756
+ },
757
+ "ablated": {
758
+ "pred_answer": "2",
759
+ "correct": false,
760
+ "n_gen_tokens": 64
761
+ }
762
+ },
763
+ {
764
+ "ex_id": "gsm8k-test-49",
765
+ "gold_raw": "1400",
766
+ "baseline": {
767
+ "pred_answer": "2",
768
+ "correct": false,
769
+ "n_gen_tokens": 64
770
+ },
771
+ "ablated": {
772
+ "pred_answer": "2",
773
+ "correct": false,
774
+ "n_gen_tokens": 64
775
+ }
776
+ },
777
+ {
778
+ "ex_id": "gsm8k-test-50",
779
+ "gold_raw": "120",
780
+ "baseline": {
781
+ "pred_answer": "1",
782
+ "correct": false,
783
+ "n_gen_tokens": 64
784
+ },
785
+ "ablated": {
786
+ "pred_answer": "120",
787
+ "correct": true,
788
+ "n_gen_tokens": 64
789
+ }
790
+ },
791
+ {
792
+ "ex_id": "gsm8k-test-51",
793
+ "gold_raw": "48",
794
+ "baseline": {
795
+ "pred_answer": "48",
796
+ "correct": true,
797
+ "n_gen_tokens": 64
798
+ },
799
+ "ablated": {
800
+ "pred_answer": "12",
801
+ "correct": false,
802
+ "n_gen_tokens": 64
803
+ }
804
+ },
805
+ {
806
+ "ex_id": "gsm8k-test-52",
807
+ "gold_raw": "50",
808
+ "baseline": {
809
+ "pred_answer": "3",
810
+ "correct": false,
811
+ "n_gen_tokens": 64
812
+ },
813
+ "ablated": {
814
+ "pred_answer": "1",
815
+ "correct": false,
816
+ "n_gen_tokens": 64
817
+ }
818
+ },
819
+ {
820
+ "ex_id": "gsm8k-test-53",
821
+ "gold_raw": "15400",
822
+ "baseline": {
823
+ "pred_answer": "1",
824
+ "correct": false,
825
+ "n_gen_tokens": 64
826
+ },
827
+ "ablated": {
828
+ "pred_answer": "1500",
829
+ "correct": false,
830
+ "n_gen_tokens": 64
831
+ }
832
+ },
833
+ {
834
+ "ex_id": "gsm8k-test-54",
835
+ "gold_raw": "80",
836
+ "baseline": {
837
+ "pred_answer": "40",
838
+ "correct": false,
839
+ "n_gen_tokens": 64
840
+ },
841
+ "ablated": {
842
+ "pred_answer": "2",
843
+ "correct": false,
844
+ "n_gen_tokens": 64
845
+ }
846
+ },
847
+ {
848
+ "ex_id": "gsm8k-test-55",
849
+ "gold_raw": "5",
850
+ "baseline": {
851
+ "pred_answer": "2",
852
+ "correct": false,
853
+ "n_gen_tokens": 64
854
+ },
855
+ "ablated": {
856
+ "pred_answer": "4",
857
+ "correct": false,
858
+ "n_gen_tokens": 64
859
+ }
860
+ },
861
+ {
862
+ "ex_id": "gsm8k-test-56",
863
+ "gold_raw": "14",
864
+ "baseline": {
865
+ "pred_answer": "2",
866
+ "correct": false,
867
+ "n_gen_tokens": 64
868
+ },
869
+ "ablated": {
870
+ "pred_answer": "2",
871
+ "correct": false,
872
+ "n_gen_tokens": 64
873
+ }
874
+ },
875
+ {
876
+ "ex_id": "gsm8k-test-57",
877
+ "gold_raw": "31",
878
+ "baseline": {
879
+ "pred_answer": "7",
880
+ "correct": false,
881
+ "n_gen_tokens": 64
882
+ },
883
+ "ablated": {
884
+ "pred_answer": "7",
885
+ "correct": false,
886
+ "n_gen_tokens": 64
887
+ }
888
+ },
889
+ {
890
+ "ex_id": "gsm8k-test-58",
891
+ "gold_raw": "36",
892
+ "baseline": {
893
+ "pred_answer": "24",
894
+ "correct": false,
895
+ "n_gen_tokens": 64
896
+ },
897
+ "ablated": {
898
+ "pred_answer": "30",
899
+ "correct": false,
900
+ "n_gen_tokens": 64
901
+ }
902
+ },
903
+ {
904
+ "ex_id": "gsm8k-test-59",
905
+ "gold_raw": "144",
906
+ "baseline": {
907
+ "pred_answer": "1",
908
+ "correct": false,
909
+ "n_gen_tokens": 64
910
+ },
911
+ "ablated": {
912
+ "pred_answer": "20",
913
+ "correct": false,
914
+ "n_gen_tokens": 64
915
+ }
916
+ },
917
+ {
918
+ "ex_id": "gsm8k-test-60",
919
+ "gold_raw": "5",
920
+ "baseline": {
921
+ "pred_answer": "4",
922
+ "correct": false,
923
+ "n_gen_tokens": 64
924
+ },
925
+ "ablated": {
926
+ "pred_answer": "33.0",
927
+ "correct": false,
928
+ "n_gen_tokens": 64
929
+ }
930
+ },
931
+ {
932
+ "ex_id": "gsm8k-test-61",
933
+ "gold_raw": "750",
934
+ "baseline": {
935
+ "pred_answer": "2",
936
+ "correct": false,
937
+ "n_gen_tokens": 64
938
+ },
939
+ "ablated": {
940
+ "pred_answer": "180",
941
+ "correct": false,
942
+ "n_gen_tokens": 64
943
+ }
944
+ },
945
+ {
946
+ "ex_id": "gsm8k-test-62",
947
+ "gold_raw": "38",
948
+ "baseline": {
949
+ "pred_answer": "2",
950
+ "correct": false,
951
+ "n_gen_tokens": 64
952
+ },
953
+ "ablated": {
954
+ "pred_answer": "48",
955
+ "correct": false,
956
+ "n_gen_tokens": 64
957
+ }
958
+ },
959
+ {
960
+ "ex_id": "gsm8k-test-63",
961
+ "gold_raw": "48",
962
+ "baseline": {
963
+ "pred_answer": "4",
964
+ "correct": false,
965
+ "n_gen_tokens": 64
966
+ },
967
+ "ablated": {
968
+ "pred_answer": "1",
969
+ "correct": false,
970
+ "n_gen_tokens": 64
971
+ }
972
+ },
973
+ {
974
+ "ex_id": "gsm8k-test-64",
975
+ "gold_raw": "655",
976
+ "baseline": {
977
+ "pred_answer": "3",
978
+ "correct": false,
979
+ "n_gen_tokens": 64
980
+ },
981
+ "ablated": {
982
+ "pred_answer": "4",
983
+ "correct": false,
984
+ "n_gen_tokens": 64
985
+ }
986
+ },
987
+ {
988
+ "ex_id": "gsm8k-test-65",
989
+ "gold_raw": "800",
990
+ "baseline": {
991
+ "pred_answer": "500",
992
+ "correct": false,
993
+ "n_gen_tokens": 64
994
+ },
995
+ "ablated": {
996
+ "pred_answer": "1",
997
+ "correct": false,
998
+ "n_gen_tokens": 64
999
+ }
1000
+ },
1001
+ {
1002
+ "ex_id": "gsm8k-test-66",
1003
+ "gold_raw": "7300",
1004
+ "baseline": {
1005
+ "pred_answer": "100",
1006
+ "correct": false,
1007
+ "n_gen_tokens": 64
1008
+ },
1009
+ "ablated": {
1010
+ "pred_answer": "1",
1011
+ "correct": false,
1012
+ "n_gen_tokens": 64
1013
+ }
1014
+ },
1015
+ {
1016
+ "ex_id": "gsm8k-test-67",
1017
+ "gold_raw": "48",
1018
+ "baseline": {
1019
+ "pred_answer": "1",
1020
+ "correct": false,
1021
+ "n_gen_tokens": 64
1022
+ },
1023
+ "ablated": {
1024
+ "pred_answer": "3",
1025
+ "correct": false,
1026
+ "n_gen_tokens": 64
1027
+ }
1028
+ },
1029
+ {
1030
+ "ex_id": "gsm8k-test-68",
1031
+ "gold_raw": "4",
1032
+ "baseline": {
1033
+ "pred_answer": "2",
1034
+ "correct": false,
1035
+ "n_gen_tokens": 64
1036
+ },
1037
+ "ablated": {
1038
+ "pred_answer": "4",
1039
+ "correct": true,
1040
+ "n_gen_tokens": 64
1041
+ }
1042
+ },
1043
+ {
1044
+ "ex_id": "gsm8k-test-69",
1045
+ "gold_raw": "15",
1046
+ "baseline": {
1047
+ "pred_answer": "2",
1048
+ "correct": false,
1049
+ "n_gen_tokens": 64
1050
+ },
1051
+ "ablated": {
1052
+ "pred_answer": "12.0",
1053
+ "correct": false,
1054
+ "n_gen_tokens": 64
1055
+ }
1056
+ },
1057
+ {
1058
+ "ex_id": "gsm8k-test-70",
1059
+ "gold_raw": "23",
1060
+ "baseline": {
1061
+ "pred_answer": "3",
1062
+ "correct": false,
1063
+ "n_gen_tokens": 64
1064
+ },
1065
+ "ablated": {
1066
+ "pred_answer": "3",
1067
+ "correct": false,
1068
+ "n_gen_tokens": 64
1069
+ }
1070
+ },
1071
+ {
1072
+ "ex_id": "gsm8k-test-71",
1073
+ "gold_raw": "225",
1074
+ "baseline": {
1075
+ "pred_answer": "5",
1076
+ "correct": false,
1077
+ "n_gen_tokens": 64
1078
+ },
1079
+ "ablated": {
1080
+ "pred_answer": "25.0",
1081
+ "correct": false,
1082
+ "n_gen_tokens": 64
1083
+ }
1084
+ },
1085
+ {
1086
+ "ex_id": "gsm8k-test-72",
1087
+ "gold_raw": "15",
1088
+ "baseline": {
1089
+ "pred_answer": "4",
1090
+ "correct": false,
1091
+ "n_gen_tokens": 64
1092
+ },
1093
+ "ablated": {
1094
+ "pred_answer": "13",
1095
+ "correct": false,
1096
+ "n_gen_tokens": 64
1097
+ }
1098
+ },
1099
+ {
1100
+ "ex_id": "gsm8k-test-73",
1101
+ "gold_raw": "82",
1102
+ "baseline": {
1103
+ "pred_answer": "10",
1104
+ "correct": false,
1105
+ "n_gen_tokens": 64
1106
+ },
1107
+ "ablated": {
1108
+ "pred_answer": "3",
1109
+ "correct": false,
1110
+ "n_gen_tokens": 64
1111
+ }
1112
+ },
1113
+ {
1114
+ "ex_id": "gsm8k-test-74",
1115
+ "gold_raw": "1218",
1116
+ "baseline": {
1117
+ "pred_answer": "48",
1118
+ "correct": false,
1119
+ "n_gen_tokens": 64
1120
+ },
1121
+ "ablated": {
1122
+ "pred_answer": "1024",
1123
+ "correct": false,
1124
+ "n_gen_tokens": 64
1125
+ }
1126
+ },
1127
+ {
1128
+ "ex_id": "gsm8k-test-75",
1129
+ "gold_raw": "2",
1130
+ "baseline": {
1131
+ "pred_answer": "2.00",
1132
+ "correct": true,
1133
+ "n_gen_tokens": 64
1134
+ },
1135
+ "ablated": {
1136
+ "pred_answer": "6",
1137
+ "correct": false,
1138
+ "n_gen_tokens": 64
1139
+ }
1140
+ },
1141
+ {
1142
+ "ex_id": "gsm8k-test-76",
1143
+ "gold_raw": "36",
1144
+ "baseline": {
1145
+ "pred_answer": "192",
1146
+ "correct": false,
1147
+ "n_gen_tokens": 64
1148
+ },
1149
+ "ablated": {
1150
+ "pred_answer": "15",
1151
+ "correct": false,
1152
+ "n_gen_tokens": 64
1153
+ }
1154
+ },
1155
+ {
1156
+ "ex_id": "gsm8k-test-77",
1157
+ "gold_raw": "13",
1158
+ "baseline": {
1159
+ "pred_answer": "3",
1160
+ "correct": false,
1161
+ "n_gen_tokens": 64
1162
+ },
1163
+ "ablated": {
1164
+ "pred_answer": "20",
1165
+ "correct": false,
1166
+ "n_gen_tokens": 64
1167
+ }
1168
+ },
1169
+ {
1170
+ "ex_id": "gsm8k-test-78",
1171
+ "gold_raw": "11",
1172
+ "baseline": {
1173
+ "pred_answer": "2",
1174
+ "correct": false,
1175
+ "n_gen_tokens": 64
1176
+ },
1177
+ "ablated": {
1178
+ "pred_answer": "2.0",
1179
+ "correct": false,
1180
+ "n_gen_tokens": 64
1181
+ }
1182
+ },
1183
+ {
1184
+ "ex_id": "gsm8k-test-79",
1185
+ "gold_raw": "8",
1186
+ "baseline": {
1187
+ "pred_answer": "6",
1188
+ "correct": false,
1189
+ "n_gen_tokens": 64
1190
+ },
1191
+ "ablated": {
1192
+ "pred_answer": "8",
1193
+ "correct": true,
1194
+ "n_gen_tokens": 64
1195
+ }
1196
+ },
1197
+ {
1198
+ "ex_id": "gsm8k-test-80",
1199
+ "gold_raw": "440",
1200
+ "baseline": {
1201
+ "pred_answer": "0.7",
1202
+ "correct": false,
1203
+ "n_gen_tokens": 64
1204
+ },
1205
+ "ablated": {
1206
+ "pred_answer": "560",
1207
+ "correct": false,
1208
+ "n_gen_tokens": 64
1209
+ }
1210
+ },
1211
+ {
1212
+ "ex_id": "gsm8k-test-81",
1213
+ "gold_raw": "2",
1214
+ "baseline": {
1215
+ "pred_answer": "8",
1216
+ "correct": false,
1217
+ "n_gen_tokens": 64
1218
+ },
1219
+ "ablated": {
1220
+ "pred_answer": "1",
1221
+ "correct": false,
1222
+ "n_gen_tokens": 64
1223
+ }
1224
+ },
1225
+ {
1226
+ "ex_id": "gsm8k-test-82",
1227
+ "gold_raw": "45",
1228
+ "baseline": {
1229
+ "pred_answer": "3",
1230
+ "correct": false,
1231
+ "n_gen_tokens": 64
1232
+ },
1233
+ "ablated": {
1234
+ "pred_answer": "9",
1235
+ "correct": false,
1236
+ "n_gen_tokens": 64
1237
+ }
1238
+ },
1239
+ {
1240
+ "ex_id": "gsm8k-test-83",
1241
+ "gold_raw": "54",
1242
+ "baseline": {
1243
+ "pred_answer": "10",
1244
+ "correct": false,
1245
+ "n_gen_tokens": 64
1246
+ },
1247
+ "ablated": {
1248
+ "pred_answer": "35",
1249
+ "correct": false,
1250
+ "n_gen_tokens": 64
1251
+ }
1252
+ },
1253
+ {
1254
+ "ex_id": "gsm8k-test-84",
1255
+ "gold_raw": "6",
1256
+ "baseline": {
1257
+ "pred_answer": "6",
1258
+ "correct": true,
1259
+ "n_gen_tokens": 64
1260
+ },
1261
+ "ablated": {
1262
+ "pred_answer": "9",
1263
+ "correct": false,
1264
+ "n_gen_tokens": 64
1265
+ }
1266
+ },
1267
+ {
1268
+ "ex_id": "gsm8k-test-85",
1269
+ "gold_raw": "240",
1270
+ "baseline": {
1271
+ "pred_answer": "3",
1272
+ "correct": false,
1273
+ "n_gen_tokens": 64
1274
+ },
1275
+ "ablated": {
1276
+ "pred_answer": "12600.0",
1277
+ "correct": false,
1278
+ "n_gen_tokens": 64
1279
+ }
1280
+ },
1281
+ {
1282
+ "ex_id": "gsm8k-test-86",
1283
+ "gold_raw": "428",
1284
+ "baseline": {
1285
+ "pred_answer": "2",
1286
+ "correct": false,
1287
+ "n_gen_tokens": 64
1288
+ },
1289
+ "ablated": {
1290
+ "pred_answer": "278",
1291
+ "correct": false,
1292
+ "n_gen_tokens": 64
1293
+ }
1294
+ },
1295
+ {
1296
+ "ex_id": "gsm8k-test-87",
1297
+ "gold_raw": "5",
1298
+ "baseline": {
1299
+ "pred_answer": "1",
1300
+ "correct": false,
1301
+ "n_gen_tokens": 64
1302
+ },
1303
+ "ablated": {
1304
+ "pred_answer": "5",
1305
+ "correct": true,
1306
+ "n_gen_tokens": 64
1307
+ }
1308
+ },
1309
+ {
1310
+ "ex_id": "gsm8k-test-88",
1311
+ "gold_raw": "255",
1312
+ "baseline": {
1313
+ "pred_answer": "1",
1314
+ "correct": false,
1315
+ "n_gen_tokens": 64
1316
+ },
1317
+ "ablated": {
1318
+ "pred_answer": "120",
1319
+ "correct": false,
1320
+ "n_gen_tokens": 64
1321
+ }
1322
+ },
1323
+ {
1324
+ "ex_id": "gsm8k-test-89",
1325
+ "gold_raw": "10",
1326
+ "baseline": {
1327
+ "pred_answer": "1",
1328
+ "correct": false,
1329
+ "n_gen_tokens": 64
1330
+ },
1331
+ "ablated": {
1332
+ "pred_answer": "4",
1333
+ "correct": false,
1334
+ "n_gen_tokens": 64
1335
+ }
1336
+ },
1337
+ {
1338
+ "ex_id": "gsm8k-test-90",
1339
+ "gold_raw": "9",
1340
+ "baseline": {
1341
+ "pred_answer": "4",
1342
+ "correct": false,
1343
+ "n_gen_tokens": 64
1344
+ },
1345
+ "ablated": {
1346
+ "pred_answer": "1",
1347
+ "correct": false,
1348
+ "n_gen_tokens": 64
1349
+ }
1350
+ },
1351
+ {
1352
+ "ex_id": "gsm8k-test-91",
1353
+ "gold_raw": "157",
1354
+ "baseline": {
1355
+ "pred_answer": "3",
1356
+ "correct": false,
1357
+ "n_gen_tokens": 64
1358
+ },
1359
+ "ablated": {
1360
+ "pred_answer": "10",
1361
+ "correct": false,
1362
+ "n_gen_tokens": 64
1363
+ }
1364
+ },
1365
+ {
1366
+ "ex_id": "gsm8k-test-92",
1367
+ "gold_raw": "56",
1368
+ "baseline": {
1369
+ "pred_answer": "30",
1370
+ "correct": false,
1371
+ "n_gen_tokens": 64
1372
+ },
1373
+ "ablated": {
1374
+ "pred_answer": "1",
1375
+ "correct": false,
1376
+ "n_gen_tokens": 64
1377
+ }
1378
+ },
1379
+ {
1380
+ "ex_id": "gsm8k-test-93",
1381
+ "gold_raw": "5",
1382
+ "baseline": {
1383
+ "pred_answer": "10",
1384
+ "correct": false,
1385
+ "n_gen_tokens": 64
1386
+ },
1387
+ "ablated": {
1388
+ "pred_answer": "0.5",
1389
+ "correct": false,
1390
+ "n_gen_tokens": 64
1391
+ }
1392
+ },
1393
+ {
1394
+ "ex_id": "gsm8k-test-94",
1395
+ "gold_raw": "144",
1396
+ "baseline": {
1397
+ "pred_answer": "3",
1398
+ "correct": false,
1399
+ "n_gen_tokens": 64
1400
+ },
1401
+ "ablated": {
1402
+ "pred_answer": "1",
1403
+ "correct": false,
1404
+ "n_gen_tokens": 64
1405
+ }
1406
+ },
1407
+ {
1408
+ "ex_id": "gsm8k-test-95",
1409
+ "gold_raw": "50",
1410
+ "baseline": {
1411
+ "pred_answer": "30",
1412
+ "correct": false,
1413
+ "n_gen_tokens": 64
1414
+ },
1415
+ "ablated": {
1416
+ "pred_answer": "3600",
1417
+ "correct": false,
1418
+ "n_gen_tokens": 64
1419
+ }
1420
+ },
1421
+ {
1422
+ "ex_id": "gsm8k-test-96",
1423
+ "gold_raw": "4",
1424
+ "baseline": {
1425
+ "pred_answer": "6",
1426
+ "correct": false,
1427
+ "n_gen_tokens": 64
1428
+ },
1429
+ "ablated": {
1430
+ "pred_answer": "2",
1431
+ "correct": false,
1432
+ "n_gen_tokens": 64
1433
+ }
1434
+ },
1435
+ {
1436
+ "ex_id": "gsm8k-test-97",
1437
+ "gold_raw": "50",
1438
+ "baseline": {
1439
+ "pred_answer": "4",
1440
+ "correct": false,
1441
+ "n_gen_tokens": 64
1442
+ },
1443
+ "ablated": {
1444
+ "pred_answer": "70",
1445
+ "correct": false,
1446
+ "n_gen_tokens": 64
1447
+ }
1448
+ },
1449
+ {
1450
+ "ex_id": "gsm8k-test-98",
1451
+ "gold_raw": "42",
1452
+ "baseline": {
1453
+ "pred_answer": "10.00",
1454
+ "correct": false,
1455
+ "n_gen_tokens": 64
1456
+ },
1457
+ "ablated": {
1458
+ "pred_answer": "30.00",
1459
+ "correct": false,
1460
+ "n_gen_tokens": 64
1461
+ }
1462
+ },
1463
+ {
1464
+ "ex_id": "gsm8k-test-99",
1465
+ "gold_raw": "7",
1466
+ "baseline": {
1467
+ "pred_answer": "2",
1468
+ "correct": false,
1469
+ "n_gen_tokens": 64
1470
+ },
1471
+ "ablated": {
1472
+ "pred_answer": "4",
1473
+ "correct": false,
1474
+ "n_gen_tokens": 64
1475
+ }
1476
+ },
1477
+ {
1478
+ "ex_id": "gsm8k-test-100",
1479
+ "gold_raw": "250",
1480
+ "baseline": {
1481
+ "pred_answer": "11",
1482
+ "correct": false,
1483
+ "n_gen_tokens": 64
1484
+ },
1485
+ "ablated": {
1486
+ "pred_answer": "3",
1487
+ "correct": false,
1488
+ "n_gen_tokens": 64
1489
+ }
1490
+ },
1491
+ {
1492
+ "ex_id": "gsm8k-test-101",
1493
+ "gold_raw": "12",
1494
+ "baseline": {
1495
+ "pred_answer": "6",
1496
+ "correct": false,
1497
+ "n_gen_tokens": 64
1498
+ },
1499
+ "ablated": {
1500
+ "pred_answer": "29",
1501
+ "correct": false,
1502
+ "n_gen_tokens": 64
1503
+ }
1504
+ },
1505
+ {
1506
+ "ex_id": "gsm8k-test-102",
1507
+ "gold_raw": "7",
1508
+ "baseline": {
1509
+ "pred_answer": "7",
1510
+ "correct": true,
1511
+ "n_gen_tokens": 64
1512
+ },
1513
+ "ablated": {
1514
+ "pred_answer": "7.0",
1515
+ "correct": true,
1516
+ "n_gen_tokens": 64
1517
+ }
1518
+ },
1519
+ {
1520
+ "ex_id": "gsm8k-test-103",
1521
+ "gold_raw": "8",
1522
+ "baseline": {
1523
+ "pred_answer": "3",
1524
+ "correct": false,
1525
+ "n_gen_tokens": 64
1526
+ },
1527
+ "ablated": {
1528
+ "pred_answer": "5",
1529
+ "correct": false,
1530
+ "n_gen_tokens": 64
1531
+ }
1532
+ },
1533
+ {
1534
+ "ex_id": "gsm8k-test-104",
1535
+ "gold_raw": "26",
1536
+ "baseline": {
1537
+ "pred_answer": "75",
1538
+ "correct": false,
1539
+ "n_gen_tokens": 64
1540
+ },
1541
+ "ablated": {
1542
+ "pred_answer": "19",
1543
+ "correct": false,
1544
+ "n_gen_tokens": 64
1545
+ }
1546
+ },
1547
+ {
1548
+ "ex_id": "gsm8k-test-105",
1549
+ "gold_raw": "42",
1550
+ "baseline": {
1551
+ "pred_answer": "3",
1552
+ "correct": false,
1553
+ "n_gen_tokens": 64
1554
+ },
1555
+ "ablated": {
1556
+ "pred_answer": "2",
1557
+ "correct": false,
1558
+ "n_gen_tokens": 64
1559
+ }
1560
+ },
1561
+ {
1562
+ "ex_id": "gsm8k-test-106",
1563
+ "gold_raw": "5",
1564
+ "baseline": {
1565
+ "pred_answer": "1",
1566
+ "correct": false,
1567
+ "n_gen_tokens": 64
1568
+ },
1569
+ "ablated": {
1570
+ "pred_answer": "15",
1571
+ "correct": false,
1572
+ "n_gen_tokens": 64
1573
+ }
1574
+ },
1575
+ {
1576
+ "ex_id": "gsm8k-test-107",
1577
+ "gold_raw": "14400",
1578
+ "baseline": {
1579
+ "pred_answer": "4",
1580
+ "correct": false,
1581
+ "n_gen_tokens": 64
1582
+ },
1583
+ "ablated": {
1584
+ "pred_answer": "3640",
1585
+ "correct": false,
1586
+ "n_gen_tokens": 64
1587
+ }
1588
+ },
1589
+ {
1590
+ "ex_id": "gsm8k-test-108",
1591
+ "gold_raw": "400",
1592
+ "baseline": {
1593
+ "pred_answer": "3",
1594
+ "correct": false,
1595
+ "n_gen_tokens": 64
1596
+ },
1597
+ "ablated": {
1598
+ "pred_answer": "15",
1599
+ "correct": false,
1600
+ "n_gen_tokens": 64
1601
+ }
1602
+ },
1603
+ {
1604
+ "ex_id": "gsm8k-test-109",
1605
+ "gold_raw": "40",
1606
+ "baseline": {
1607
+ "pred_answer": "3",
1608
+ "correct": false,
1609
+ "n_gen_tokens": 64
1610
+ },
1611
+ "ablated": {
1612
+ "pred_answer": "3",
1613
+ "correct": false,
1614
+ "n_gen_tokens": 64
1615
+ }
1616
+ },
1617
+ {
1618
+ "ex_id": "gsm8k-test-110",
1619
+ "gold_raw": "83",
1620
+ "baseline": {
1621
+ "pred_answer": "2",
1622
+ "correct": false,
1623
+ "n_gen_tokens": 64
1624
+ },
1625
+ "ablated": {
1626
+ "pred_answer": "100",
1627
+ "correct": false,
1628
+ "n_gen_tokens": 64
1629
+ }
1630
+ },
1631
+ {
1632
+ "ex_id": "gsm8k-test-111",
1633
+ "gold_raw": "10",
1634
+ "baseline": {
1635
+ "pred_answer": "3",
1636
+ "correct": false,
1637
+ "n_gen_tokens": 64
1638
+ },
1639
+ "ablated": {
1640
+ "pred_answer": "1",
1641
+ "correct": false,
1642
+ "n_gen_tokens": 64
1643
+ }
1644
+ },
1645
+ {
1646
+ "ex_id": "gsm8k-test-112",
1647
+ "gold_raw": "80",
1648
+ "baseline": {
1649
+ "pred_answer": "50",
1650
+ "correct": false,
1651
+ "n_gen_tokens": 64
1652
+ },
1653
+ "ablated": {
1654
+ "pred_answer": "1",
1655
+ "correct": false,
1656
+ "n_gen_tokens": 64
1657
+ }
1658
+ },
1659
+ {
1660
+ "ex_id": "gsm8k-test-113",
1661
+ "gold_raw": "180",
1662
+ "baseline": {
1663
+ "pred_answer": "12",
1664
+ "correct": false,
1665
+ "n_gen_tokens": 64
1666
+ },
1667
+ "ablated": {
1668
+ "pred_answer": "5.00",
1669
+ "correct": false,
1670
+ "n_gen_tokens": 64
1671
+ }
1672
+ },
1673
+ {
1674
+ "ex_id": "gsm8k-test-114",
1675
+ "gold_raw": "1450000",
1676
+ "baseline": {
1677
+ "pred_answer": "5",
1678
+ "correct": false,
1679
+ "n_gen_tokens": 64
1680
+ },
1681
+ "ablated": {
1682
+ "pred_answer": "1",
1683
+ "correct": false,
1684
+ "n_gen_tokens": 64
1685
+ }
1686
+ },
1687
+ {
1688
+ "ex_id": "gsm8k-test-115",
1689
+ "gold_raw": "15",
1690
+ "baseline": {
1691
+ "pred_answer": "1",
1692
+ "correct": false,
1693
+ "n_gen_tokens": 64
1694
+ },
1695
+ "ablated": {
1696
+ "pred_answer": "20",
1697
+ "correct": false,
1698
+ "n_gen_tokens": 64
1699
+ }
1700
+ },
1701
+ {
1702
+ "ex_id": "gsm8k-test-116",
1703
+ "gold_raw": "1000",
1704
+ "baseline": {
1705
+ "pred_answer": "3",
1706
+ "correct": false,
1707
+ "n_gen_tokens": 64
1708
+ },
1709
+ "ablated": {
1710
+ "pred_answer": "1000",
1711
+ "correct": true,
1712
+ "n_gen_tokens": 64
1713
+ }
1714
+ },
1715
+ {
1716
+ "ex_id": "gsm8k-test-117",
1717
+ "gold_raw": "2",
1718
+ "baseline": {
1719
+ "pred_answer": "2",
1720
+ "correct": true,
1721
+ "n_gen_tokens": 64
1722
+ },
1723
+ "ablated": {
1724
+ "pred_answer": "10",
1725
+ "correct": false,
1726
+ "n_gen_tokens": 64
1727
+ }
1728
+ },
1729
+ {
1730
+ "ex_id": "gsm8k-test-118",
1731
+ "gold_raw": "15",
1732
+ "baseline": {
1733
+ "pred_answer": "5",
1734
+ "correct": false,
1735
+ "n_gen_tokens": 64
1736
+ },
1737
+ "ablated": {
1738
+ "pred_answer": "3",
1739
+ "correct": false,
1740
+ "n_gen_tokens": 64
1741
+ }
1742
+ },
1743
+ {
1744
+ "ex_id": "gsm8k-test-119",
1745
+ "gold_raw": "100",
1746
+ "baseline": {
1747
+ "pred_answer": "3",
1748
+ "correct": false,
1749
+ "n_gen_tokens": 64
1750
+ },
1751
+ "ablated": {
1752
+ "pred_answer": "250",
1753
+ "correct": false,
1754
+ "n_gen_tokens": 64
1755
+ }
1756
+ },
1757
+ {
1758
+ "ex_id": "gsm8k-test-120",
1759
+ "gold_raw": "335",
1760
+ "baseline": {
1761
+ "pred_answer": "2",
1762
+ "correct": false,
1763
+ "n_gen_tokens": 64
1764
+ },
1765
+ "ablated": {
1766
+ "pred_answer": "12",
1767
+ "correct": false,
1768
+ "n_gen_tokens": 64
1769
+ }
1770
+ },
1771
+ {
1772
+ "ex_id": "gsm8k-test-121",
1773
+ "gold_raw": "60",
1774
+ "baseline": {
1775
+ "pred_answer": "4",
1776
+ "correct": false,
1777
+ "n_gen_tokens": 64
1778
+ },
1779
+ "ablated": {
1780
+ "pred_answer": "3",
1781
+ "correct": false,
1782
+ "n_gen_tokens": 64
1783
+ }
1784
+ },
1785
+ {
1786
+ "ex_id": "gsm8k-test-122",
1787
+ "gold_raw": "5",
1788
+ "baseline": {
1789
+ "pred_answer": "1",
1790
+ "correct": false,
1791
+ "n_gen_tokens": 64
1792
+ },
1793
+ "ablated": {
1794
+ "pred_answer": "12",
1795
+ "correct": false,
1796
+ "n_gen_tokens": 64
1797
+ }
1798
+ },
1799
+ {
1800
+ "ex_id": "gsm8k-test-123",
1801
+ "gold_raw": "9500",
1802
+ "baseline": {
1803
+ "pred_answer": "000",
1804
+ "correct": false,
1805
+ "n_gen_tokens": 64
1806
+ },
1807
+ "ablated": {
1808
+ "pred_answer": "3400",
1809
+ "correct": false,
1810
+ "n_gen_tokens": 64
1811
+ }
1812
+ },
1813
+ {
1814
+ "ex_id": "gsm8k-test-124",
1815
+ "gold_raw": "160",
1816
+ "baseline": {
1817
+ "pred_answer": "2",
1818
+ "correct": false,
1819
+ "n_gen_tokens": 64
1820
+ },
1821
+ "ablated": {
1822
+ "pred_answer": "10",
1823
+ "correct": false,
1824
+ "n_gen_tokens": 64
1825
+ }
1826
+ },
1827
+ {
1828
+ "ex_id": "gsm8k-test-125",
1829
+ "gold_raw": "1050",
1830
+ "baseline": {
1831
+ "pred_answer": "2",
1832
+ "correct": false,
1833
+ "n_gen_tokens": 64
1834
+ },
1835
+ "ablated": {
1836
+ "pred_answer": "14",
1837
+ "correct": false,
1838
+ "n_gen_tokens": 64
1839
+ }
1840
+ },
1841
+ {
1842
+ "ex_id": "gsm8k-test-126",
1843
+ "gold_raw": "91",
1844
+ "baseline": {
1845
+ "pred_answer": "2",
1846
+ "correct": false,
1847
+ "n_gen_tokens": 64
1848
+ },
1849
+ "ablated": {
1850
+ "pred_answer": "10",
1851
+ "correct": false,
1852
+ "n_gen_tokens": 64
1853
+ }
1854
+ },
1855
+ {
1856
+ "ex_id": "gsm8k-test-127",
1857
+ "gold_raw": "21",
1858
+ "baseline": {
1859
+ "pred_answer": "4",
1860
+ "correct": false,
1861
+ "n_gen_tokens": 64
1862
+ },
1863
+ "ablated": {
1864
+ "pred_answer": "11844329550533333923277761503275729703614298252421958262785",
1865
+ "correct": false,
1866
+ "n_gen_tokens": 64
1867
+ }
1868
+ },
1869
+ {
1870
+ "ex_id": "gsm8k-test-128",
1871
+ "gold_raw": "20",
1872
+ "baseline": {
1873
+ "pred_answer": "3",
1874
+ "correct": false,
1875
+ "n_gen_tokens": 64
1876
+ },
1877
+ "ablated": {
1878
+ "pred_answer": "24.0000000000000000",
1879
+ "correct": false,
1880
+ "n_gen_tokens": 64
1881
+ }
1882
+ },
1883
+ {
1884
+ "ex_id": "gsm8k-test-129",
1885
+ "gold_raw": "36",
1886
+ "baseline": {
1887
+ "pred_answer": "38",
1888
+ "correct": false,
1889
+ "n_gen_tokens": 64
1890
+ },
1891
+ "ablated": {
1892
+ "pred_answer": "400",
1893
+ "correct": false,
1894
+ "n_gen_tokens": 64
1895
+ }
1896
+ },
1897
+ {
1898
+ "ex_id": "gsm8k-test-130",
1899
+ "gold_raw": "36",
1900
+ "baseline": {
1901
+ "pred_answer": "1",
1902
+ "correct": false,
1903
+ "n_gen_tokens": 64
1904
+ },
1905
+ "ablated": {
1906
+ "pred_answer": "3",
1907
+ "correct": false,
1908
+ "n_gen_tokens": 64
1909
+ }
1910
+ },
1911
+ {
1912
+ "ex_id": "gsm8k-test-131",
1913
+ "gold_raw": "10",
1914
+ "baseline": {
1915
+ "pred_answer": "2",
1916
+ "correct": false,
1917
+ "n_gen_tokens": 64
1918
+ },
1919
+ "ablated": {
1920
+ "pred_answer": "8",
1921
+ "correct": false,
1922
+ "n_gen_tokens": 64
1923
+ }
1924
+ },
1925
+ {
1926
+ "ex_id": "gsm8k-test-132",
1927
+ "gold_raw": "5",
1928
+ "baseline": {
1929
+ "pred_answer": "3",
1930
+ "correct": false,
1931
+ "n_gen_tokens": 64
1932
+ },
1933
+ "ablated": {
1934
+ "pred_answer": "1",
1935
+ "correct": false,
1936
+ "n_gen_tokens": 64
1937
+ }
1938
+ },
1939
+ {
1940
+ "ex_id": "gsm8k-test-133",
1941
+ "gold_raw": "32",
1942
+ "baseline": {
1943
+ "pred_answer": "5",
1944
+ "correct": false,
1945
+ "n_gen_tokens": 64
1946
+ },
1947
+ "ablated": {
1948
+ "pred_answer": "25",
1949
+ "correct": false,
1950
+ "n_gen_tokens": 64
1951
+ }
1952
+ },
1953
+ {
1954
+ "ex_id": "gsm8k-test-134",
1955
+ "gold_raw": "18",
1956
+ "baseline": {
1957
+ "pred_answer": "4",
1958
+ "correct": false,
1959
+ "n_gen_tokens": 64
1960
+ },
1961
+ "ablated": {
1962
+ "pred_answer": "3",
1963
+ "correct": false,
1964
+ "n_gen_tokens": 64
1965
+ }
1966
+ },
1967
+ {
1968
+ "ex_id": "gsm8k-test-135",
1969
+ "gold_raw": "4",
1970
+ "baseline": {
1971
+ "pred_answer": "3",
1972
+ "correct": false,
1973
+ "n_gen_tokens": 64
1974
+ },
1975
+ "ablated": {
1976
+ "pred_answer": "4",
1977
+ "correct": true,
1978
+ "n_gen_tokens": 64
1979
+ }
1980
+ },
1981
+ {
1982
+ "ex_id": "gsm8k-test-136",
1983
+ "gold_raw": "48",
1984
+ "baseline": {
1985
+ "pred_answer": "3",
1986
+ "correct": false,
1987
+ "n_gen_tokens": 64
1988
+ },
1989
+ "ablated": {
1990
+ "pred_answer": "20",
1991
+ "correct": false,
1992
+ "n_gen_tokens": 64
1993
+ }
1994
+ },
1995
+ {
1996
+ "ex_id": "gsm8k-test-137",
1997
+ "gold_raw": "8",
1998
+ "baseline": {
1999
+ "pred_answer": "2",
2000
+ "correct": false,
2001
+ "n_gen_tokens": 64
2002
+ },
2003
+ "ablated": {
2004
+ "pred_answer": "5",
2005
+ "correct": false,
2006
+ "n_gen_tokens": 64
2007
+ }
2008
+ },
2009
+ {
2010
+ "ex_id": "gsm8k-test-138",
2011
+ "gold_raw": "21",
2012
+ "baseline": {
2013
+ "pred_answer": "6",
2014
+ "correct": false,
2015
+ "n_gen_tokens": 64
2016
+ },
2017
+ "ablated": {
2018
+ "pred_answer": "21",
2019
+ "correct": true,
2020
+ "n_gen_tokens": 64
2021
+ }
2022
+ },
2023
+ {
2024
+ "ex_id": "gsm8k-test-139",
2025
+ "gold_raw": "25",
2026
+ "baseline": {
2027
+ "pred_answer": "3",
2028
+ "correct": false,
2029
+ "n_gen_tokens": 64
2030
+ },
2031
+ "ablated": {
2032
+ "pred_answer": "00",
2033
+ "correct": false,
2034
+ "n_gen_tokens": 64
2035
+ }
2036
+ },
2037
+ {
2038
+ "ex_id": "gsm8k-test-140",
2039
+ "gold_raw": "3000",
2040
+ "baseline": {
2041
+ "pred_answer": "000",
2042
+ "correct": false,
2043
+ "n_gen_tokens": 64
2044
+ },
2045
+ "ablated": {
2046
+ "pred_answer": "7000.0",
2047
+ "correct": false,
2048
+ "n_gen_tokens": 64
2049
+ }
2050
+ },
2051
+ {
2052
+ "ex_id": "gsm8k-test-141",
2053
+ "gold_raw": "40",
2054
+ "baseline": {
2055
+ "pred_answer": "2",
2056
+ "correct": false,
2057
+ "n_gen_tokens": 64
2058
+ },
2059
+ "ablated": {
2060
+ "pred_answer": "10",
2061
+ "correct": false,
2062
+ "n_gen_tokens": 64
2063
+ }
2064
+ },
2065
+ {
2066
+ "ex_id": "gsm8k-test-142",
2067
+ "gold_raw": "50",
2068
+ "baseline": {
2069
+ "pred_answer": "28",
2070
+ "correct": false,
2071
+ "n_gen_tokens": 64
2072
+ },
2073
+ "ablated": {
2074
+ "pred_answer": "34",
2075
+ "correct": false,
2076
+ "n_gen_tokens": 64
2077
+ }
2078
+ },
2079
+ {
2080
+ "ex_id": "gsm8k-test-143",
2081
+ "gold_raw": "90",
2082
+ "baseline": {
2083
+ "pred_answer": "75",
2084
+ "correct": false,
2085
+ "n_gen_tokens": 64
2086
+ },
2087
+ "ablated": {
2088
+ "pred_answer": "100",
2089
+ "correct": false,
2090
+ "n_gen_tokens": 64
2091
+ }
2092
+ },
2093
+ {
2094
+ "ex_id": "gsm8k-test-144",
2095
+ "gold_raw": "23",
2096
+ "baseline": {
2097
+ "pred_answer": "70",
2098
+ "correct": false,
2099
+ "n_gen_tokens": 64
2100
+ },
2101
+ "ablated": {
2102
+ "pred_answer": "12",
2103
+ "correct": false,
2104
+ "n_gen_tokens": 64
2105
+ }
2106
+ },
2107
+ {
2108
+ "ex_id": "gsm8k-test-145",
2109
+ "gold_raw": "2",
2110
+ "baseline": {
2111
+ "pred_answer": "50",
2112
+ "correct": false,
2113
+ "n_gen_tokens": 64
2114
+ },
2115
+ "ablated": {
2116
+ "pred_answer": ". \n:\nTo find the the number of watermelons, we need to know the the the the number of watermelons, we need to know the the the number of watermelons, we need to know the the the number of watermelons, we need to know the the the number of watermel",
2117
+ "correct": false,
2118
+ "n_gen_tokens": 64
2119
+ }
2120
+ },
2121
+ {
2122
+ "ex_id": "gsm8k-test-146",
2123
+ "gold_raw": "50",
2124
+ "baseline": {
2125
+ "pred_answer": "1",
2126
+ "correct": false,
2127
+ "n_gen_tokens": 64
2128
+ },
2129
+ "ablated": {
2130
+ "pred_answer": "3.0",
2131
+ "correct": false,
2132
+ "n_gen_tokens": 64
2133
+ }
2134
+ },
2135
+ {
2136
+ "ex_id": "gsm8k-test-147",
2137
+ "gold_raw": "122",
2138
+ "baseline": {
2139
+ "pred_answer": "4",
2140
+ "correct": false,
2141
+ "n_gen_tokens": 64
2142
+ },
2143
+ "ablated": {
2144
+ "pred_answer": "4",
2145
+ "correct": false,
2146
+ "n_gen_tokens": 64
2147
+ }
2148
+ },
2149
+ {
2150
+ "ex_id": "gsm8k-test-148",
2151
+ "gold_raw": "300",
2152
+ "baseline": {
2153
+ "pred_answer": "400",
2154
+ "correct": false,
2155
+ "n_gen_tokens": 64
2156
+ },
2157
+ "ablated": {
2158
+ "pred_answer": "700",
2159
+ "correct": false,
2160
+ "n_gen_tokens": 64
2161
+ }
2162
+ },
2163
+ {
2164
+ "ex_id": "gsm8k-test-149",
2165
+ "gold_raw": "448",
2166
+ "baseline": {
2167
+ "pred_answer": "2",
2168
+ "correct": false,
2169
+ "n_gen_tokens": 64
2170
+ },
2171
+ "ablated": {
2172
+ "pred_answer": "1",
2173
+ "correct": false,
2174
+ "n_gen_tokens": 64
2175
+ }
2176
+ },
2177
+ {
2178
+ "ex_id": "gsm8k-test-150",
2179
+ "gold_raw": "2450",
2180
+ "baseline": {
2181
+ "pred_answer": "1",
2182
+ "correct": false,
2183
+ "n_gen_tokens": 64
2184
+ },
2185
+ "ablated": {
2186
+ "pred_answer": "250",
2187
+ "correct": false,
2188
+ "n_gen_tokens": 64
2189
+ }
2190
+ },
2191
+ {
2192
+ "ex_id": "gsm8k-test-151",
2193
+ "gold_raw": "803",
2194
+ "baseline": {
2195
+ "pred_answer": "12",
2196
+ "correct": false,
2197
+ "n_gen_tokens": 64
2198
+ },
2199
+ "ablated": {
2200
+ "pred_answer": "5",
2201
+ "correct": false,
2202
+ "n_gen_tokens": 64
2203
+ }
2204
+ },
2205
+ {
2206
+ "ex_id": "gsm8k-test-152",
2207
+ "gold_raw": "16",
2208
+ "baseline": {
2209
+ "pred_answer": "2",
2210
+ "correct": false,
2211
+ "n_gen_tokens": 64
2212
+ },
2213
+ "ablated": {
2214
+ "pred_answer": "8",
2215
+ "correct": false,
2216
+ "n_gen_tokens": 64
2217
+ }
2218
+ },
2219
+ {
2220
+ "ex_id": "gsm8k-test-153",
2221
+ "gold_raw": "280",
2222
+ "baseline": {
2223
+ "pred_answer": "3",
2224
+ "correct": false,
2225
+ "n_gen_tokens": 64
2226
+ },
2227
+ "ablated": {
2228
+ "pred_answer": "8",
2229
+ "correct": false,
2230
+ "n_gen_tokens": 64
2231
+ }
2232
+ },
2233
+ {
2234
+ "ex_id": "gsm8k-test-154",
2235
+ "gold_raw": "13",
2236
+ "baseline": {
2237
+ "pred_answer": "1.5",
2238
+ "correct": false,
2239
+ "n_gen_tokens": 64
2240
+ },
2241
+ "ablated": {
2242
+ "pred_answer": "40",
2243
+ "correct": false,
2244
+ "n_gen_tokens": 64
2245
+ }
2246
+ },
2247
+ {
2248
+ "ex_id": "gsm8k-test-155",
2249
+ "gold_raw": "20",
2250
+ "baseline": {
2251
+ "pred_answer": "90",
2252
+ "correct": false,
2253
+ "n_gen_tokens": 64
2254
+ },
2255
+ "ablated": {
2256
+ "pred_answer": "2",
2257
+ "correct": false,
2258
+ "n_gen_tokens": 64
2259
+ }
2260
+ },
2261
+ {
2262
+ "ex_id": "gsm8k-test-156",
2263
+ "gold_raw": "14",
2264
+ "baseline": {
2265
+ "pred_answer": "3",
2266
+ "correct": false,
2267
+ "n_gen_tokens": 64
2268
+ },
2269
+ "ablated": {
2270
+ "pred_answer": "30",
2271
+ "correct": false,
2272
+ "n_gen_tokens": 64
2273
+ }
2274
+ },
2275
+ {
2276
+ "ex_id": "gsm8k-test-157",
2277
+ "gold_raw": "32",
2278
+ "baseline": {
2279
+ "pred_answer": "2",
2280
+ "correct": false,
2281
+ "n_gen_tokens": 64
2282
+ },
2283
+ "ablated": {
2284
+ "pred_answer": "1",
2285
+ "correct": false,
2286
+ "n_gen_tokens": 64
2287
+ }
2288
+ },
2289
+ {
2290
+ "ex_id": "gsm8k-test-158",
2291
+ "gold_raw": "105",
2292
+ "baseline": {
2293
+ "pred_answer": "40",
2294
+ "correct": false,
2295
+ "n_gen_tokens": 64
2296
+ },
2297
+ "ablated": {
2298
+ "pred_answer": "2",
2299
+ "correct": false,
2300
+ "n_gen_tokens": 64
2301
+ }
2302
+ },
2303
+ {
2304
+ "ex_id": "gsm8k-test-159",
2305
+ "gold_raw": "71",
2306
+ "baseline": {
2307
+ "pred_answer": "50",
2308
+ "correct": false,
2309
+ "n_gen_tokens": 64
2310
+ },
2311
+ "ablated": {
2312
+ "pred_answer": "00.0",
2313
+ "correct": false,
2314
+ "n_gen_tokens": 64
2315
+ }
2316
+ },
2317
+ {
2318
+ "ex_id": "gsm8k-test-160",
2319
+ "gold_raw": "5",
2320
+ "baseline": {
2321
+ "pred_answer": "4",
2322
+ "correct": false,
2323
+ "n_gen_tokens": 64
2324
+ },
2325
+ "ablated": {
2326
+ "pred_answer": "1",
2327
+ "correct": false,
2328
+ "n_gen_tokens": 64
2329
+ }
2330
+ },
2331
+ {
2332
+ "ex_id": "gsm8k-test-161",
2333
+ "gold_raw": "30",
2334
+ "baseline": {
2335
+ "pred_answer": "3",
2336
+ "correct": false,
2337
+ "n_gen_tokens": 64
2338
+ },
2339
+ "ablated": {
2340
+ "pred_answer": "0",
2341
+ "correct": false,
2342
+ "n_gen_tokens": 64
2343
+ }
2344
+ },
2345
+ {
2346
+ "ex_id": "gsm8k-test-162",
2347
+ "gold_raw": "95",
2348
+ "baseline": {
2349
+ "pred_answer": "60",
2350
+ "correct": false,
2351
+ "n_gen_tokens": 64
2352
+ },
2353
+ "ablated": {
2354
+ "pred_answer": "900.0",
2355
+ "correct": false,
2356
+ "n_gen_tokens": 64
2357
+ }
2358
+ },
2359
+ {
2360
+ "ex_id": "gsm8k-test-163",
2361
+ "gold_raw": "147",
2362
+ "baseline": {
2363
+ "pred_answer": "3",
2364
+ "correct": false,
2365
+ "n_gen_tokens": 64
2366
+ },
2367
+ "ablated": {
2368
+ "pred_answer": "12",
2369
+ "correct": false,
2370
+ "n_gen_tokens": 64
2371
+ }
2372
+ },
2373
+ {
2374
+ "ex_id": "gsm8k-test-164",
2375
+ "gold_raw": "10",
2376
+ "baseline": {
2377
+ "pred_answer": "100",
2378
+ "correct": false,
2379
+ "n_gen_tokens": 64
2380
+ },
2381
+ "ablated": {
2382
+ "pred_answer": "1.20",
2383
+ "correct": false,
2384
+ "n_gen_tokens": 64
2385
+ }
2386
+ },
2387
+ {
2388
+ "ex_id": "gsm8k-test-165",
2389
+ "gold_raw": "40000",
2390
+ "baseline": {
2391
+ "pred_answer": "4",
2392
+ "correct": false,
2393
+ "n_gen_tokens": 64
2394
+ },
2395
+ "ablated": {
2396
+ "pred_answer": "40",
2397
+ "correct": false,
2398
+ "n_gen_tokens": 64
2399
+ }
2400
+ },
2401
+ {
2402
+ "ex_id": "gsm8k-test-166",
2403
+ "gold_raw": "12",
2404
+ "baseline": {
2405
+ "pred_answer": "4",
2406
+ "correct": false,
2407
+ "n_gen_tokens": 64
2408
+ },
2409
+ "ablated": {
2410
+ "pred_answer": "24.0000000000000000000000000000000000000000000000000000000024",
2411
+ "correct": false,
2412
+ "n_gen_tokens": 64
2413
+ }
2414
+ },
2415
+ {
2416
+ "ex_id": "gsm8k-test-167",
2417
+ "gold_raw": "129200",
2418
+ "baseline": {
2419
+ "pred_answer": "30",
2420
+ "correct": false,
2421
+ "n_gen_tokens": 64
2422
+ },
2423
+ "ablated": {
2424
+ "pred_answer": "30",
2425
+ "correct": false,
2426
+ "n_gen_tokens": 64
2427
+ }
2428
+ },
2429
+ {
2430
+ "ex_id": "gsm8k-test-168",
2431
+ "gold_raw": "5",
2432
+ "baseline": {
2433
+ "pred_answer": "10",
2434
+ "correct": false,
2435
+ "n_gen_tokens": 64
2436
+ },
2437
+ "ablated": {
2438
+ "pred_answer": "5",
2439
+ "correct": true,
2440
+ "n_gen_tokens": 64
2441
+ }
2442
+ },
2443
+ {
2444
+ "ex_id": "gsm8k-test-169",
2445
+ "gold_raw": "45",
2446
+ "baseline": {
2447
+ "pred_answer": "3",
2448
+ "correct": false,
2449
+ "n_gen_tokens": 64
2450
+ },
2451
+ "ablated": {
2452
+ "pred_answer": "45",
2453
+ "correct": true,
2454
+ "n_gen_tokens": 64
2455
+ }
2456
+ },
2457
+ {
2458
+ "ex_id": "gsm8k-test-170",
2459
+ "gold_raw": "20",
2460
+ "baseline": {
2461
+ "pred_answer": "3",
2462
+ "correct": false,
2463
+ "n_gen_tokens": 64
2464
+ },
2465
+ "ablated": {
2466
+ "pred_answer": "6",
2467
+ "correct": false,
2468
+ "n_gen_tokens": 64
2469
+ }
2470
+ },
2471
+ {
2472
+ "ex_id": "gsm8k-test-171",
2473
+ "gold_raw": "1170",
2474
+ "baseline": {
2475
+ "pred_answer": "30",
2476
+ "correct": false,
2477
+ "n_gen_tokens": 64
2478
+ },
2479
+ "ablated": {
2480
+ "pred_answer": "100",
2481
+ "correct": false,
2482
+ "n_gen_tokens": 64
2483
+ }
2484
+ },
2485
+ {
2486
+ "ex_id": "gsm8k-test-172",
2487
+ "gold_raw": "192",
2488
+ "baseline": {
2489
+ "pred_answer": "2",
2490
+ "correct": false,
2491
+ "n_gen_tokens": 64
2492
+ },
2493
+ "ablated": {
2494
+ "pred_answer": "24",
2495
+ "correct": false,
2496
+ "n_gen_tokens": 64
2497
+ }
2498
+ },
2499
+ {
2500
+ "ex_id": "gsm8k-test-173",
2501
+ "gold_raw": "14",
2502
+ "baseline": {
2503
+ "pred_answer": "0.5",
2504
+ "correct": false,
2505
+ "n_gen_tokens": 64
2506
+ },
2507
+ "ablated": {
2508
+ "pred_answer": "10.0",
2509
+ "correct": false,
2510
+ "n_gen_tokens": 64
2511
+ }
2512
+ },
2513
+ {
2514
+ "ex_id": "gsm8k-test-174",
2515
+ "gold_raw": "144",
2516
+ "baseline": {
2517
+ "pred_answer": "4",
2518
+ "correct": false,
2519
+ "n_gen_tokens": 64
2520
+ },
2521
+ "ablated": {
2522
+ "pred_answer": "1",
2523
+ "correct": false,
2524
+ "n_gen_tokens": 64
2525
+ }
2526
+ },
2527
+ {
2528
+ "ex_id": "gsm8k-test-175",
2529
+ "gold_raw": "350",
2530
+ "baseline": {
2531
+ "pred_answer": "2",
2532
+ "correct": false,
2533
+ "n_gen_tokens": 64
2534
+ },
2535
+ "ablated": {
2536
+ "pred_answer": "1",
2537
+ "correct": false,
2538
+ "n_gen_tokens": 64
2539
+ }
2540
+ },
2541
+ {
2542
+ "ex_id": "gsm8k-test-176",
2543
+ "gold_raw": "50",
2544
+ "baseline": {
2545
+ "pred_answer": "5",
2546
+ "correct": false,
2547
+ "n_gen_tokens": 64
2548
+ },
2549
+ "ablated": {
2550
+ "pred_answer": "25",
2551
+ "correct": false,
2552
+ "n_gen_tokens": 64
2553
+ }
2554
+ },
2555
+ {
2556
+ "ex_id": "gsm8k-test-177",
2557
+ "gold_raw": "7",
2558
+ "baseline": {
2559
+ "pred_answer": "1",
2560
+ "correct": false,
2561
+ "n_gen_tokens": 64
2562
+ },
2563
+ "ablated": {
2564
+ "pred_answer": "12",
2565
+ "correct": false,
2566
+ "n_gen_tokens": 64
2567
+ }
2568
+ },
2569
+ {
2570
+ "ex_id": "gsm8k-test-178",
2571
+ "gold_raw": "50",
2572
+ "baseline": {
2573
+ "pred_answer": "2",
2574
+ "correct": false,
2575
+ "n_gen_tokens": 64
2576
+ },
2577
+ "ablated": {
2578
+ "pred_answer": "40.000000000000000000000000000000000000000000000000000000001",
2579
+ "correct": false,
2580
+ "n_gen_tokens": 64
2581
+ }
2582
+ },
2583
+ {
2584
+ "ex_id": "gsm8k-test-179",
2585
+ "gold_raw": "8",
2586
+ "baseline": {
2587
+ "pred_answer": "2",
2588
+ "correct": false,
2589
+ "n_gen_tokens": 64
2590
+ },
2591
+ "ablated": {
2592
+ "pred_answer": "1",
2593
+ "correct": false,
2594
+ "n_gen_tokens": 64
2595
+ }
2596
+ },
2597
+ {
2598
+ "ex_id": "gsm8k-test-180",
2599
+ "gold_raw": "3160",
2600
+ "baseline": {
2601
+ "pred_answer": "2",
2602
+ "correct": false,
2603
+ "n_gen_tokens": 64
2604
+ },
2605
+ "ablated": {
2606
+ "pred_answer": "1",
2607
+ "correct": false,
2608
+ "n_gen_tokens": 64
2609
+ }
2610
+ },
2611
+ {
2612
+ "ex_id": "gsm8k-test-181",
2613
+ "gold_raw": "80",
2614
+ "baseline": {
2615
+ "pred_answer": "2",
2616
+ "correct": false,
2617
+ "n_gen_tokens": 64
2618
+ },
2619
+ "ablated": {
2620
+ "pred_answer": "7",
2621
+ "correct": false,
2622
+ "n_gen_tokens": 64
2623
+ }
2624
+ },
2625
+ {
2626
+ "ex_id": "gsm8k-test-182",
2627
+ "gold_raw": "50",
2628
+ "baseline": {
2629
+ "pred_answer": "4",
2630
+ "correct": false,
2631
+ "n_gen_tokens": 64
2632
+ },
2633
+ "ablated": {
2634
+ "pred_answer": "25",
2635
+ "correct": false,
2636
+ "n_gen_tokens": 64
2637
+ }
2638
+ },
2639
+ {
2640
+ "ex_id": "gsm8k-test-183",
2641
+ "gold_raw": "40",
2642
+ "baseline": {
2643
+ "pred_answer": "100",
2644
+ "correct": false,
2645
+ "n_gen_tokens": 64
2646
+ },
2647
+ "ablated": {
2648
+ "pred_answer": "1",
2649
+ "correct": false,
2650
+ "n_gen_tokens": 64
2651
+ }
2652
+ },
2653
+ {
2654
+ "ex_id": "gsm8k-test-184",
2655
+ "gold_raw": "78",
2656
+ "baseline": {
2657
+ "pred_answer": "1",
2658
+ "correct": false,
2659
+ "n_gen_tokens": 64
2660
+ },
2661
+ "ablated": {
2662
+ "pred_answer": "5",
2663
+ "correct": false,
2664
+ "n_gen_tokens": 64
2665
+ }
2666
+ },
2667
+ {
2668
+ "ex_id": "gsm8k-test-185",
2669
+ "gold_raw": "273",
2670
+ "baseline": {
2671
+ "pred_answer": "364",
2672
+ "correct": false,
2673
+ "n_gen_tokens": 64
2674
+ },
2675
+ "ablated": {
2676
+ "pred_answer": "147.0",
2677
+ "correct": false,
2678
+ "n_gen_tokens": 64
2679
+ }
2680
+ },
2681
+ {
2682
+ "ex_id": "gsm8k-test-186",
2683
+ "gold_raw": "2",
2684
+ "baseline": {
2685
+ "pred_answer": "4",
2686
+ "correct": false,
2687
+ "n_gen_tokens": 64
2688
+ },
2689
+ "ablated": {
2690
+ "pred_answer": "11",
2691
+ "correct": false,
2692
+ "n_gen_tokens": 64
2693
+ }
2694
+ },
2695
+ {
2696
+ "ex_id": "gsm8k-test-187",
2697
+ "gold_raw": "195",
2698
+ "baseline": {
2699
+ "pred_answer": "1",
2700
+ "correct": false,
2701
+ "n_gen_tokens": 64
2702
+ },
2703
+ "ablated": {
2704
+ "pred_answer": "1",
2705
+ "correct": false,
2706
+ "n_gen_tokens": 64
2707
+ }
2708
+ },
2709
+ {
2710
+ "ex_id": "gsm8k-test-188",
2711
+ "gold_raw": "1128",
2712
+ "baseline": {
2713
+ "pred_answer": "72",
2714
+ "correct": false,
2715
+ "n_gen_tokens": 64
2716
+ },
2717
+ "ablated": {
2718
+ "pred_answer": "1",
2719
+ "correct": false,
2720
+ "n_gen_tokens": 64
2721
+ }
2722
+ },
2723
+ {
2724
+ "ex_id": "gsm8k-test-189",
2725
+ "gold_raw": "172",
2726
+ "baseline": {
2727
+ "pred_answer": "2",
2728
+ "correct": false,
2729
+ "n_gen_tokens": 64
2730
+ },
2731
+ "ablated": {
2732
+ "pred_answer": "3",
2733
+ "correct": false,
2734
+ "n_gen_tokens": 64
2735
+ }
2736
+ },
2737
+ {
2738
+ "ex_id": "gsm8k-test-190",
2739
+ "gold_raw": "30",
2740
+ "baseline": {
2741
+ "pred_answer": "3",
2742
+ "correct": false,
2743
+ "n_gen_tokens": 64
2744
+ },
2745
+ "ablated": {
2746
+ "pred_answer": "35",
2747
+ "correct": false,
2748
+ "n_gen_tokens": 64
2749
+ }
2750
+ },
2751
+ {
2752
+ "ex_id": "gsm8k-test-191",
2753
+ "gold_raw": "30",
2754
+ "baseline": {
2755
+ "pred_answer": "20",
2756
+ "correct": false,
2757
+ "n_gen_tokens": 64
2758
+ },
2759
+ "ablated": {
2760
+ "pred_answer": "19",
2761
+ "correct": false,
2762
+ "n_gen_tokens": 64
2763
+ }
2764
+ },
2765
+ {
2766
+ "ex_id": "gsm8k-test-192",
2767
+ "gold_raw": "92",
2768
+ "baseline": {
2769
+ "pred_answer": "20",
2770
+ "correct": false,
2771
+ "n_gen_tokens": 64
2772
+ },
2773
+ "ablated": {
2774
+ "pred_answer": "20",
2775
+ "correct": false,
2776
+ "n_gen_tokens": 64
2777
+ }
2778
+ },
2779
+ {
2780
+ "ex_id": "gsm8k-test-193",
2781
+ "gold_raw": "20",
2782
+ "baseline": {
2783
+ "pred_answer": "100",
2784
+ "correct": false,
2785
+ "n_gen_tokens": 64
2786
+ },
2787
+ "ablated": {
2788
+ "pred_answer": "2",
2789
+ "correct": false,
2790
+ "n_gen_tokens": 64
2791
+ }
2792
+ },
2793
+ {
2794
+ "ex_id": "gsm8k-test-194",
2795
+ "gold_raw": "540",
2796
+ "baseline": {
2797
+ "pred_answer": "5",
2798
+ "correct": false,
2799
+ "n_gen_tokens": 64
2800
+ },
2801
+ "ablated": {
2802
+ "pred_answer": "1",
2803
+ "correct": false,
2804
+ "n_gen_tokens": 64
2805
+ }
2806
+ },
2807
+ {
2808
+ "ex_id": "gsm8k-test-195",
2809
+ "gold_raw": "10",
2810
+ "baseline": {
2811
+ "pred_answer": "12",
2812
+ "correct": false,
2813
+ "n_gen_tokens": 64
2814
+ },
2815
+ "ablated": {
2816
+ "pred_answer": "1",
2817
+ "correct": false,
2818
+ "n_gen_tokens": 64
2819
+ }
2820
+ },
2821
+ {
2822
+ "ex_id": "gsm8k-test-196",
2823
+ "gold_raw": "10",
2824
+ "baseline": {
2825
+ "pred_answer": "40",
2826
+ "correct": false,
2827
+ "n_gen_tokens": 64
2828
+ },
2829
+ "ablated": {
2830
+ "pred_answer": "14",
2831
+ "correct": false,
2832
+ "n_gen_tokens": 64
2833
+ }
2834
+ },
2835
+ {
2836
+ "ex_id": "gsm8k-test-197",
2837
+ "gold_raw": "38",
2838
+ "baseline": {
2839
+ "pred_answer": "14",
2840
+ "correct": false,
2841
+ "n_gen_tokens": 64
2842
+ },
2843
+ "ablated": {
2844
+ "pred_answer": "14",
2845
+ "correct": false,
2846
+ "n_gen_tokens": 64
2847
+ }
2848
+ },
2849
+ {
2850
+ "ex_id": "gsm8k-test-198",
2851
+ "gold_raw": "4000",
2852
+ "baseline": {
2853
+ "pred_answer": "100",
2854
+ "correct": false,
2855
+ "n_gen_tokens": 64
2856
+ },
2857
+ "ablated": {
2858
+ "pred_answer": "2",
2859
+ "correct": false,
2860
+ "n_gen_tokens": 64
2861
+ }
2862
+ },
2863
+ {
2864
+ "ex_id": "gsm8k-test-199",
2865
+ "gold_raw": "594",
2866
+ "baseline": {
2867
+ "pred_answer": "1",
2868
+ "correct": false,
2869
+ "n_gen_tokens": 64
2870
+ },
2871
+ "ablated": {
2872
+ "pred_answer": "200",
2873
+ "correct": false,
2874
+ "n_gen_tokens": 64
2875
+ }
2876
+ },
2877
+ {
2878
+ "ex_id": "gsm8k-test-200",
2879
+ "gold_raw": "2",
2880
+ "baseline": {
2881
+ "pred_answer": "3",
2882
+ "correct": false,
2883
+ "n_gen_tokens": 64
2884
+ },
2885
+ "ablated": {
2886
+ "pred_answer": "1",
2887
+ "correct": false,
2888
+ "n_gen_tokens": 64
2889
+ }
2890
+ },
2891
+ {
2892
+ "ex_id": "gsm8k-test-201",
2893
+ "gold_raw": "142",
2894
+ "baseline": {
2895
+ "pred_answer": "5",
2896
+ "correct": false,
2897
+ "n_gen_tokens": 64
2898
+ },
2899
+ "ablated": {
2900
+ "pred_answer": "17",
2901
+ "correct": false,
2902
+ "n_gen_tokens": 64
2903
+ }
2904
+ },
2905
+ {
2906
+ "ex_id": "gsm8k-test-202",
2907
+ "gold_raw": "9",
2908
+ "baseline": {
2909
+ "pred_answer": "4",
2910
+ "correct": false,
2911
+ "n_gen_tokens": 64
2912
+ },
2913
+ "ablated": {
2914
+ "pred_answer": "3",
2915
+ "correct": false,
2916
+ "n_gen_tokens": 64
2917
+ }
2918
+ },
2919
+ {
2920
+ "ex_id": "gsm8k-test-203",
2921
+ "gold_raw": "6",
2922
+ "baseline": {
2923
+ "pred_answer": "6",
2924
+ "correct": true,
2925
+ "n_gen_tokens": 64
2926
+ },
2927
+ "ablated": {
2928
+ "pred_answer": "4.000",
2929
+ "correct": false,
2930
+ "n_gen_tokens": 64
2931
+ }
2932
+ },
2933
+ {
2934
+ "ex_id": "gsm8k-test-204",
2935
+ "gold_raw": "100",
2936
+ "baseline": {
2937
+ "pred_answer": "2",
2938
+ "correct": false,
2939
+ "n_gen_tokens": 64
2940
+ },
2941
+ "ablated": {
2942
+ "pred_answer": "1",
2943
+ "correct": false,
2944
+ "n_gen_tokens": 64
2945
+ }
2946
+ },
2947
+ {
2948
+ "ex_id": "gsm8k-test-205",
2949
+ "gold_raw": "10",
2950
+ "baseline": {
2951
+ "pred_answer": "2",
2952
+ "correct": false,
2953
+ "n_gen_tokens": 64
2954
+ },
2955
+ "ablated": {
2956
+ "pred_answer": "21.000000000000",
2957
+ "correct": false,
2958
+ "n_gen_tokens": 64
2959
+ }
2960
+ },
2961
+ {
2962
+ "ex_id": "gsm8k-test-206",
2963
+ "gold_raw": "15",
2964
+ "baseline": {
2965
+ "pred_answer": "1",
2966
+ "correct": false,
2967
+ "n_gen_tokens": 64
2968
+ },
2969
+ "ablated": {
2970
+ "pred_answer": "4",
2971
+ "correct": false,
2972
+ "n_gen_tokens": 64
2973
+ }
2974
+ },
2975
+ {
2976
+ "ex_id": "gsm8k-test-207",
2977
+ "gold_raw": "22",
2978
+ "baseline": {
2979
+ "pred_answer": "2",
2980
+ "correct": false,
2981
+ "n_gen_tokens": 64
2982
+ },
2983
+ "ablated": {
2984
+ "pred_answer": "21",
2985
+ "correct": false,
2986
+ "n_gen_tokens": 64
2987
+ }
2988
+ },
2989
+ {
2990
+ "ex_id": "gsm8k-test-208",
2991
+ "gold_raw": "16",
2992
+ "baseline": {
2993
+ "pred_answer": "4",
2994
+ "correct": false,
2995
+ "n_gen_tokens": 64
2996
+ },
2997
+ "ablated": {
2998
+ "pred_answer": "6",
2999
+ "correct": false,
3000
+ "n_gen_tokens": 64
3001
+ }
3002
+ },
3003
+ {
3004
+ "ex_id": "gsm8k-test-209",
3005
+ "gold_raw": "16",
3006
+ "baseline": {
3007
+ "pred_answer": "3",
3008
+ "correct": false,
3009
+ "n_gen_tokens": 64
3010
+ },
3011
+ "ablated": {
3012
+ "pred_answer": "21.00000000000000000000000000000000000000000000000000000000000",
3013
+ "correct": false,
3014
+ "n_gen_tokens": 64
3015
+ }
3016
+ },
3017
+ {
3018
+ "ex_id": "gsm8k-test-210",
3019
+ "gold_raw": "5",
3020
+ "baseline": {
3021
+ "pred_answer": "1.5",
3022
+ "correct": false,
3023
+ "n_gen_tokens": 64
3024
+ },
3025
+ "ablated": {
3026
+ "pred_answer": "6",
3027
+ "correct": false,
3028
+ "n_gen_tokens": 64
3029
+ }
3030
+ },
3031
+ {
3032
+ "ex_id": "gsm8k-test-211",
3033
+ "gold_raw": "23",
3034
+ "baseline": {
3035
+ "pred_answer": "2",
3036
+ "correct": false,
3037
+ "n_gen_tokens": 64
3038
+ },
3039
+ "ablated": {
3040
+ "pred_answer": "5.0",
3041
+ "correct": false,
3042
+ "n_gen_tokens": 64
3043
+ }
3044
+ },
3045
+ {
3046
+ "ex_id": "gsm8k-test-212",
3047
+ "gold_raw": "30",
3048
+ "baseline": {
3049
+ "pred_answer": "3",
3050
+ "correct": false,
3051
+ "n_gen_tokens": 64
3052
+ },
3053
+ "ablated": {
3054
+ "pred_answer": "1",
3055
+ "correct": false,
3056
+ "n_gen_tokens": 64
3057
+ }
3058
+ },
3059
+ {
3060
+ "ex_id": "gsm8k-test-213",
3061
+ "gold_raw": "14000",
3062
+ "baseline": {
3063
+ "pred_answer": "2",
3064
+ "correct": false,
3065
+ "n_gen_tokens": 64
3066
+ },
3067
+ "ablated": {
3068
+ "pred_answer": "6400",
3069
+ "correct": false,
3070
+ "n_gen_tokens": 64
3071
+ }
3072
+ },
3073
+ {
3074
+ "ex_id": "gsm8k-test-214",
3075
+ "gold_raw": "60",
3076
+ "baseline": {
3077
+ "pred_answer": "16",
3078
+ "correct": false,
3079
+ "n_gen_tokens": 64
3080
+ },
3081
+ "ablated": {
3082
+ "pred_answer": "2.0",
3083
+ "correct": false,
3084
+ "n_gen_tokens": 64
3085
+ }
3086
+ },
3087
+ {
3088
+ "ex_id": "gsm8k-test-215",
3089
+ "gold_raw": "2",
3090
+ "baseline": {
3091
+ "pred_answer": "6",
3092
+ "correct": false,
3093
+ "n_gen_tokens": 64
3094
+ },
3095
+ "ablated": {
3096
+ "pred_answer": "100",
3097
+ "correct": false,
3098
+ "n_gen_tokens": 64
3099
+ }
3100
+ },
3101
+ {
3102
+ "ex_id": "gsm8k-test-216",
3103
+ "gold_raw": "3",
3104
+ "baseline": {
3105
+ "pred_answer": "3",
3106
+ "correct": true,
3107
+ "n_gen_tokens": 64
3108
+ },
3109
+ "ablated": {
3110
+ "pred_answer": "2",
3111
+ "correct": false,
3112
+ "n_gen_tokens": 64
3113
+ }
3114
+ },
3115
+ {
3116
+ "ex_id": "gsm8k-test-217",
3117
+ "gold_raw": "30",
3118
+ "baseline": {
3119
+ "pred_answer": "3",
3120
+ "correct": false,
3121
+ "n_gen_tokens": 64
3122
+ },
3123
+ "ablated": {
3124
+ "pred_answer": "123",
3125
+ "correct": false,
3126
+ "n_gen_tokens": 64
3127
+ }
3128
+ },
3129
+ {
3130
+ "ex_id": "gsm8k-test-218",
3131
+ "gold_raw": "1920",
3132
+ "baseline": {
3133
+ "pred_answer": "1",
3134
+ "correct": false,
3135
+ "n_gen_tokens": 64
3136
+ },
3137
+ "ablated": {
3138
+ "pred_answer": "2",
3139
+ "correct": false,
3140
+ "n_gen_tokens": 64
3141
+ }
3142
+ },
3143
+ {
3144
+ "ex_id": "gsm8k-test-219",
3145
+ "gold_raw": "84",
3146
+ "baseline": {
3147
+ "pred_answer": "2",
3148
+ "correct": false,
3149
+ "n_gen_tokens": 64
3150
+ },
3151
+ "ablated": {
3152
+ "pred_answer": "124",
3153
+ "correct": false,
3154
+ "n_gen_tokens": 64
3155
+ }
3156
+ },
3157
+ {
3158
+ "ex_id": "gsm8k-test-220",
3159
+ "gold_raw": "8",
3160
+ "baseline": {
3161
+ "pred_answer": "2",
3162
+ "correct": false,
3163
+ "n_gen_tokens": 64
3164
+ },
3165
+ "ablated": {
3166
+ "pred_answer": "3.1",
3167
+ "correct": false,
3168
+ "n_gen_tokens": 64
3169
+ }
3170
+ },
3171
+ {
3172
+ "ex_id": "gsm8k-test-221",
3173
+ "gold_raw": "12",
3174
+ "baseline": {
3175
+ "pred_answer": "5",
3176
+ "correct": false,
3177
+ "n_gen_tokens": 64
3178
+ },
3179
+ "ablated": {
3180
+ "pred_answer": "18",
3181
+ "correct": false,
3182
+ "n_gen_tokens": 64
3183
+ }
3184
+ },
3185
+ {
3186
+ "ex_id": "gsm8k-test-222",
3187
+ "gold_raw": "260",
3188
+ "baseline": {
3189
+ "pred_answer": "2",
3190
+ "correct": false,
3191
+ "n_gen_tokens": 64
3192
+ },
3193
+ "ablated": {
3194
+ "pred_answer": "120",
3195
+ "correct": false,
3196
+ "n_gen_tokens": 64
3197
+ }
3198
+ },
3199
+ {
3200
+ "ex_id": "gsm8k-test-223",
3201
+ "gold_raw": "288",
3202
+ "baseline": {
3203
+ "pred_answer": "30",
3204
+ "correct": false,
3205
+ "n_gen_tokens": 64
3206
+ },
3207
+ "ablated": {
3208
+ "pred_answer": "270",
3209
+ "correct": false,
3210
+ "n_gen_tokens": 64
3211
+ }
3212
+ },
3213
+ {
3214
+ "ex_id": "gsm8k-test-224",
3215
+ "gold_raw": "3",
3216
+ "baseline": {
3217
+ "pred_answer": "3",
3218
+ "correct": true,
3219
+ "n_gen_tokens": 64
3220
+ },
3221
+ "ablated": {
3222
+ "pred_answer": "5",
3223
+ "correct": false,
3224
+ "n_gen_tokens": 64
3225
+ }
3226
+ },
3227
+ {
3228
+ "ex_id": "gsm8k-test-225",
3229
+ "gold_raw": "1596",
3230
+ "baseline": {
3231
+ "pred_answer": "6",
3232
+ "correct": false,
3233
+ "n_gen_tokens": 64
3234
+ },
3235
+ "ablated": {
3236
+ "pred_answer": "176",
3237
+ "correct": false,
3238
+ "n_gen_tokens": 64
3239
+ }
3240
+ },
3241
+ {
3242
+ "ex_id": "gsm8k-test-226",
3243
+ "gold_raw": "81",
3244
+ "baseline": {
3245
+ "pred_answer": "8",
3246
+ "correct": false,
3247
+ "n_gen_tokens": 64
3248
+ },
3249
+ "ablated": {
3250
+ "pred_answer": "1.0",
3251
+ "correct": false,
3252
+ "n_gen_tokens": 64
3253
+ }
3254
+ },
3255
+ {
3256
+ "ex_id": "gsm8k-test-227",
3257
+ "gold_raw": "56",
3258
+ "baseline": {
3259
+ "pred_answer": "38",
3260
+ "correct": false,
3261
+ "n_gen_tokens": 64
3262
+ },
3263
+ "ablated": {
3264
+ "pred_answer": "2",
3265
+ "correct": false,
3266
+ "n_gen_tokens": 64
3267
+ }
3268
+ },
3269
+ {
3270
+ "ex_id": "gsm8k-test-228",
3271
+ "gold_raw": "1490",
3272
+ "baseline": {
3273
+ "pred_answer": "3",
3274
+ "correct": false,
3275
+ "n_gen_tokens": 64
3276
+ },
3277
+ "ablated": {
3278
+ "pred_answer": "1200",
3279
+ "correct": false,
3280
+ "n_gen_tokens": 64
3281
+ }
3282
+ },
3283
+ {
3284
+ "ex_id": "gsm8k-test-229",
3285
+ "gold_raw": "2",
3286
+ "baseline": {
3287
+ "pred_answer": "2",
3288
+ "correct": true,
3289
+ "n_gen_tokens": 64
3290
+ },
3291
+ "ablated": {
3292
+ "pred_answer": "2",
3293
+ "correct": true,
3294
+ "n_gen_tokens": 64
3295
+ }
3296
+ },
3297
+ {
3298
+ "ex_id": "gsm8k-test-230",
3299
+ "gold_raw": "20",
3300
+ "baseline": {
3301
+ "pred_answer": "1",
3302
+ "correct": false,
3303
+ "n_gen_tokens": 64
3304
+ },
3305
+ "ablated": {
3306
+ "pred_answer": "14",
3307
+ "correct": false,
3308
+ "n_gen_tokens": 64
3309
+ }
3310
+ },
3311
+ {
3312
+ "ex_id": "gsm8k-test-231",
3313
+ "gold_raw": "11",
3314
+ "baseline": {
3315
+ "pred_answer": "3",
3316
+ "correct": false,
3317
+ "n_gen_tokens": 64
3318
+ },
3319
+ "ablated": {
3320
+ "pred_answer": "12.0",
3321
+ "correct": false,
3322
+ "n_gen_tokens": 64
3323
+ }
3324
+ },
3325
+ {
3326
+ "ex_id": "gsm8k-test-232",
3327
+ "gold_raw": "120",
3328
+ "baseline": {
3329
+ "pred_answer": "3",
3330
+ "correct": false,
3331
+ "n_gen_tokens": 64
3332
+ },
3333
+ "ablated": {
3334
+ "pred_answer": "20",
3335
+ "correct": false,
3336
+ "n_gen_tokens": 64
3337
+ }
3338
+ },
3339
+ {
3340
+ "ex_id": "gsm8k-test-233",
3341
+ "gold_raw": "45",
3342
+ "baseline": {
3343
+ "pred_answer": "60",
3344
+ "correct": false,
3345
+ "n_gen_tokens": 64
3346
+ },
3347
+ "ablated": {
3348
+ "pred_answer": "6",
3349
+ "correct": false,
3350
+ "n_gen_tokens": 64
3351
+ }
3352
+ },
3353
+ {
3354
+ "ex_id": "gsm8k-test-234",
3355
+ "gold_raw": "10",
3356
+ "baseline": {
3357
+ "pred_answer": "4",
3358
+ "correct": false,
3359
+ "n_gen_tokens": 64
3360
+ },
3361
+ "ablated": {
3362
+ "pred_answer": "3",
3363
+ "correct": false,
3364
+ "n_gen_tokens": 64
3365
+ }
3366
+ },
3367
+ {
3368
+ "ex_id": "gsm8k-test-235",
3369
+ "gold_raw": "9",
3370
+ "baseline": {
3371
+ "pred_answer": "4",
3372
+ "correct": false,
3373
+ "n_gen_tokens": 64
3374
+ },
3375
+ "ablated": {
3376
+ "pred_answer": "0.25",
3377
+ "correct": false,
3378
+ "n_gen_tokens": 64
3379
+ }
3380
+ },
3381
+ {
3382
+ "ex_id": "gsm8k-test-236",
3383
+ "gold_raw": "33",
3384
+ "baseline": {
3385
+ "pred_answer": "4",
3386
+ "correct": false,
3387
+ "n_gen_tokens": 64
3388
+ },
3389
+ "ablated": {
3390
+ "pred_answer": "3",
3391
+ "correct": false,
3392
+ "n_gen_tokens": 64
3393
+ }
3394
+ },
3395
+ {
3396
+ "ex_id": "gsm8k-test-237",
3397
+ "gold_raw": "150",
3398
+ "baseline": {
3399
+ "pred_answer": "3",
3400
+ "correct": false,
3401
+ "n_gen_tokens": 64
3402
+ },
3403
+ "ablated": {
3404
+ "pred_answer": "250.00",
3405
+ "correct": false,
3406
+ "n_gen_tokens": 64
3407
+ }
3408
+ },
3409
+ {
3410
+ "ex_id": "gsm8k-test-238",
3411
+ "gold_raw": "60",
3412
+ "baseline": {
3413
+ "pred_answer": "3",
3414
+ "correct": false,
3415
+ "n_gen_tokens": 64
3416
+ },
3417
+ "ablated": {
3418
+ "pred_answer": "20",
3419
+ "correct": false,
3420
+ "n_gen_tokens": 64
3421
+ }
3422
+ },
3423
+ {
3424
+ "ex_id": "gsm8k-test-239",
3425
+ "gold_raw": "4",
3426
+ "baseline": {
3427
+ "pred_answer": "2",
3428
+ "correct": false,
3429
+ "n_gen_tokens": 64
3430
+ },
3431
+ "ablated": {
3432
+ "pred_answer": "1",
3433
+ "correct": false,
3434
+ "n_gen_tokens": 64
3435
+ }
3436
+ },
3437
+ {
3438
+ "ex_id": "gsm8k-test-240",
3439
+ "gold_raw": "7",
3440
+ "baseline": {
3441
+ "pred_answer": "3",
3442
+ "correct": false,
3443
+ "n_gen_tokens": 64
3444
+ },
3445
+ "ablated": {
3446
+ "pred_answer": "2",
3447
+ "correct": false,
3448
+ "n_gen_tokens": 64
3449
+ }
3450
+ },
3451
+ {
3452
+ "ex_id": "gsm8k-test-241",
3453
+ "gold_raw": "3140",
3454
+ "baseline": {
3455
+ "pred_answer": "4",
3456
+ "correct": false,
3457
+ "n_gen_tokens": 64
3458
+ },
3459
+ "ablated": {
3460
+ "pred_answer": "20",
3461
+ "correct": false,
3462
+ "n_gen_tokens": 64
3463
+ }
3464
+ },
3465
+ {
3466
+ "ex_id": "gsm8k-test-242",
3467
+ "gold_raw": "19",
3468
+ "baseline": {
3469
+ "pred_answer": "5",
3470
+ "correct": false,
3471
+ "n_gen_tokens": 64
3472
+ },
3473
+ "ablated": {
3474
+ "pred_answer": "25262525",
3475
+ "correct": false,
3476
+ "n_gen_tokens": 64
3477
+ }
3478
+ },
3479
+ {
3480
+ "ex_id": "gsm8k-test-243",
3481
+ "gold_raw": "6",
3482
+ "baseline": {
3483
+ "pred_answer": "1",
3484
+ "correct": false,
3485
+ "n_gen_tokens": 64
3486
+ },
3487
+ "ablated": {
3488
+ "pred_answer": "5",
3489
+ "correct": false,
3490
+ "n_gen_tokens": 64
3491
+ }
3492
+ },
3493
+ {
3494
+ "ex_id": "gsm8k-test-244",
3495
+ "gold_raw": "90",
3496
+ "baseline": {
3497
+ "pred_answer": "3",
3498
+ "correct": false,
3499
+ "n_gen_tokens": 64
3500
+ },
3501
+ "ablated": {
3502
+ "pred_answer": "3",
3503
+ "correct": false,
3504
+ "n_gen_tokens": 64
3505
+ }
3506
+ },
3507
+ {
3508
+ "ex_id": "gsm8k-test-245",
3509
+ "gold_raw": "10",
3510
+ "baseline": {
3511
+ "pred_answer": "1.5",
3512
+ "correct": false,
3513
+ "n_gen_tokens": 64
3514
+ },
3515
+ "ablated": {
3516
+ "pred_answer": "12.0",
3517
+ "correct": false,
3518
+ "n_gen_tokens": 64
3519
+ }
3520
+ },
3521
+ {
3522
+ "ex_id": "gsm8k-test-246",
3523
+ "gold_raw": "130000",
3524
+ "baseline": {
3525
+ "pred_answer": "2",
3526
+ "correct": false,
3527
+ "n_gen_tokens": 64
3528
+ },
3529
+ "ablated": {
3530
+ "pred_answer": "1",
3531
+ "correct": false,
3532
+ "n_gen_tokens": 64
3533
+ }
3534
+ },
3535
+ {
3536
+ "ex_id": "gsm8k-test-247",
3537
+ "gold_raw": "10",
3538
+ "baseline": {
3539
+ "pred_answer": "3",
3540
+ "correct": false,
3541
+ "n_gen_tokens": 64
3542
+ },
3543
+ "ablated": {
3544
+ "pred_answer": "20",
3545
+ "correct": false,
3546
+ "n_gen_tokens": 64
3547
+ }
3548
+ },
3549
+ {
3550
+ "ex_id": "gsm8k-test-248",
3551
+ "gold_raw": "525",
3552
+ "baseline": {
3553
+ "pred_answer": "31",
3554
+ "correct": false,
3555
+ "n_gen_tokens": 64
3556
+ },
3557
+ "ablated": {
3558
+ "pred_answer": "31",
3559
+ "correct": false,
3560
+ "n_gen_tokens": 64
3561
+ }
3562
+ },
3563
+ {
3564
+ "ex_id": "gsm8k-test-249",
3565
+ "gold_raw": "180",
3566
+ "baseline": {
3567
+ "pred_answer": "130",
3568
+ "correct": false,
3569
+ "n_gen_tokens": 64
3570
+ },
3571
+ "ablated": {
3572
+ "pred_answer": "20",
3573
+ "correct": false,
3574
+ "n_gen_tokens": 64
3575
+ }
3576
+ },
3577
+ {
3578
+ "ex_id": "gsm8k-test-250",
3579
+ "gold_raw": "1200",
3580
+ "baseline": {
3581
+ "pred_answer": "3",
3582
+ "correct": false,
3583
+ "n_gen_tokens": 64
3584
+ },
3585
+ "ablated": {
3586
+ "pred_answer": "1200",
3587
+ "correct": true,
3588
+ "n_gen_tokens": 64
3589
+ }
3590
+ },
3591
+ {
3592
+ "ex_id": "gsm8k-test-251",
3593
+ "gold_raw": "25",
3594
+ "baseline": {
3595
+ "pred_answer": "45",
3596
+ "correct": false,
3597
+ "n_gen_tokens": 64
3598
+ },
3599
+ "ablated": {
3600
+ "pred_answer": "5",
3601
+ "correct": false,
3602
+ "n_gen_tokens": 64
3603
+ }
3604
+ },
3605
+ {
3606
+ "ex_id": "gsm8k-test-252",
3607
+ "gold_raw": "21",
3608
+ "baseline": {
3609
+ "pred_answer": "3",
3610
+ "correct": false,
3611
+ "n_gen_tokens": 64
3612
+ },
3613
+ "ablated": {
3614
+ "pred_answer": "4.00000000000",
3615
+ "correct": false,
3616
+ "n_gen_tokens": 64
3617
+ }
3618
+ },
3619
+ {
3620
+ "ex_id": "gsm8k-test-253",
3621
+ "gold_raw": "2304",
3622
+ "baseline": {
3623
+ "pred_answer": "12",
3624
+ "correct": false,
3625
+ "n_gen_tokens": 64
3626
+ },
3627
+ "ablated": {
3628
+ "pred_answer": "0",
3629
+ "correct": false,
3630
+ "n_gen_tokens": 64
3631
+ }
3632
+ },
3633
+ {
3634
+ "ex_id": "gsm8k-test-254",
3635
+ "gold_raw": "2325",
3636
+ "baseline": {
3637
+ "pred_answer": "4",
3638
+ "correct": false,
3639
+ "n_gen_tokens": 64
3640
+ },
3641
+ "ablated": {
3642
+ "pred_answer": "2325",
3643
+ "correct": true,
3644
+ "n_gen_tokens": 64
3645
+ }
3646
+ },
3647
+ {
3648
+ "ex_id": "gsm8k-test-255",
3649
+ "gold_raw": "15",
3650
+ "baseline": {
3651
+ "pred_answer": "3",
3652
+ "correct": false,
3653
+ "n_gen_tokens": 64
3654
+ },
3655
+ "ablated": {
3656
+ "pred_answer": "2",
3657
+ "correct": false,
3658
+ "n_gen_tokens": 64
3659
+ }
3660
+ }
3661
+ ],
3662
+ "flip_rows": [
3663
+ {
3664
+ "ex_id": "gsm8k-test-51",
3665
+ "gold_raw": "48",
3666
+ "baseline": {
3667
+ "pred_answer": "48",
3668
+ "correct": true
3669
+ },
3670
+ "ablated": {
3671
+ "pred_answer": "12",
3672
+ "correct": false
3673
+ },
3674
+ "patched_self": {
3675
+ "pred_answer": "5",
3676
+ "correct": false
3677
+ },
3678
+ "control_time_shuffled": {
3679
+ "pred_answer": "1",
3680
+ "correct": false
3681
+ },
3682
+ "control_shared_randvec": {
3683
+ "pred_answer": "2.0",
3684
+ "correct": false
3685
+ },
3686
+ "control_rand_subspace": {
3687
+ "pred_answer": "12",
3688
+ "correct": false
3689
+ },
3690
+ "control_patch_nonshared": {
3691
+ "pred_answer": "1",
3692
+ "correct": false
3693
+ }
3694
+ },
3695
+ {
3696
+ "ex_id": "gsm8k-test-75",
3697
+ "gold_raw": "2",
3698
+ "baseline": {
3699
+ "pred_answer": "2.00",
3700
+ "correct": true
3701
+ },
3702
+ "ablated": {
3703
+ "pred_answer": "6",
3704
+ "correct": false
3705
+ },
3706
+ "patched_self": {
3707
+ "pred_answer": "2.00",
3708
+ "correct": true
3709
+ },
3710
+ "control_time_shuffled": {
3711
+ "pred_answer": "2",
3712
+ "correct": true
3713
+ },
3714
+ "control_shared_randvec": {
3715
+ "pred_answer": "2023",
3716
+ "correct": false
3717
+ },
3718
+ "control_rand_subspace": {
3719
+ "pred_answer": "10.00000000000000000000000000000000000001231029995651998697525",
3720
+ "correct": false
3721
+ },
3722
+ "control_patch_nonshared": {
3723
+ "pred_answer": "6",
3724
+ "correct": false
3725
+ }
3726
+ },
3727
+ {
3728
+ "ex_id": "gsm8k-test-84",
3729
+ "gold_raw": "6",
3730
+ "baseline": {
3731
+ "pred_answer": "6",
3732
+ "correct": true
3733
+ },
3734
+ "ablated": {
3735
+ "pred_answer": "9",
3736
+ "correct": false
3737
+ },
3738
+ "patched_self": {
3739
+ "pred_answer": "9",
3740
+ "correct": false
3741
+ },
3742
+ "control_time_shuffled": {
3743
+ "pred_answer": "1",
3744
+ "correct": false
3745
+ },
3746
+ "control_shared_randvec": {
3747
+ "pred_answer": "27",
3748
+ "correct": false
3749
+ },
3750
+ "control_rand_subspace": {
3751
+ "pred_answer": "9",
3752
+ "correct": false
3753
+ },
3754
+ "control_patch_nonshared": {
3755
+ "pred_answer": "0.0",
3756
+ "correct": false
3757
+ }
3758
+ },
3759
+ {
3760
+ "ex_id": "gsm8k-test-117",
3761
+ "gold_raw": "2",
3762
+ "baseline": {
3763
+ "pred_answer": "2",
3764
+ "correct": true
3765
+ },
3766
+ "ablated": {
3767
+ "pred_answer": "10",
3768
+ "correct": false
3769
+ },
3770
+ "patched_self": {
3771
+ "pred_answer": "1",
3772
+ "correct": false
3773
+ },
3774
+ "control_time_shuffled": {
3775
+ "pred_answer": "10",
3776
+ "correct": false
3777
+ },
3778
+ "control_shared_randvec": {
3779
+ "pred_answer": "4",
3780
+ "correct": false
3781
+ },
3782
+ "control_rand_subspace": {
3783
+ "pred_answer": "00",
3784
+ "correct": false
3785
+ },
3786
+ "control_patch_nonshared": {
3787
+ "pred_answer": "2.00",
3788
+ "correct": true
3789
+ }
3790
+ },
3791
+ {
3792
+ "ex_id": "gsm8k-test-203",
3793
+ "gold_raw": "6",
3794
+ "baseline": {
3795
+ "pred_answer": "6",
3796
+ "correct": true
3797
+ },
3798
+ "ablated": {
3799
+ "pred_answer": "4.000",
3800
+ "correct": false
3801
+ },
3802
+ "patched_self": {
3803
+ "pred_answer": "3",
3804
+ "correct": false
3805
+ },
3806
+ "control_time_shuffled": {
3807
+ "pred_answer": "3",
3808
+ "correct": false
3809
+ },
3810
+ "control_shared_randvec": {
3811
+ "pred_answer": "6",
3812
+ "correct": true
3813
+ },
3814
+ "control_rand_subspace": {
3815
+ "pred_answer": "4.041666667",
3816
+ "correct": false
3817
+ },
3818
+ "control_patch_nonshared": {
3819
+ "pred_answer": "3",
3820
+ "correct": false
3821
+ }
3822
+ },
3823
+ {
3824
+ "ex_id": "gsm8k-test-216",
3825
+ "gold_raw": "3",
3826
+ "baseline": {
3827
+ "pred_answer": "3",
3828
+ "correct": true
3829
+ },
3830
+ "ablated": {
3831
+ "pred_answer": "2",
3832
+ "correct": false
3833
+ },
3834
+ "patched_self": {
3835
+ "pred_answer": "1",
3836
+ "correct": false
3837
+ },
3838
+ "control_time_shuffled": {
3839
+ "pred_answer": "1",
3840
+ "correct": false
3841
+ },
3842
+ "control_shared_randvec": {
3843
+ "pred_answer": "1",
3844
+ "correct": false
3845
+ },
3846
+ "control_rand_subspace": {
3847
+ "pred_answer": "1.5",
3848
+ "correct": false
3849
+ },
3850
+ "control_patch_nonshared": {
3851
+ "pred_answer": "3",
3852
+ "correct": true
3853
+ }
3854
+ },
3855
+ {
3856
+ "ex_id": "gsm8k-test-224",
3857
+ "gold_raw": "3",
3858
+ "baseline": {
3859
+ "pred_answer": "3",
3860
+ "correct": true
3861
+ },
3862
+ "ablated": {
3863
+ "pred_answer": "5",
3864
+ "correct": false
3865
+ },
3866
+ "patched_self": {
3867
+ "pred_answer": "4",
3868
+ "correct": false
3869
+ },
3870
+ "control_time_shuffled": {
3871
+ "pred_answer": "4.00",
3872
+ "correct": false
3873
+ },
3874
+ "control_shared_randvec": {
3875
+ "pred_answer": "12",
3876
+ "correct": false
3877
+ },
3878
+ "control_rand_subspace": {
3879
+ "pred_answer": "1",
3880
+ "correct": false
3881
+ },
3882
+ "control_patch_nonshared": {
3883
+ "pred_answer": "3.0",
3884
+ "correct": true
3885
+ }
3886
+ }
3887
+ ]
3888
+ }
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/gsm8k_pairlogprob.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/humaneval_gencode_compile.json ADDED
@@ -0,0 +1,2336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "meta": {
3
+ "model": "Qwen/Qwen2.5-7B-Instruct",
4
+ "device": "cuda",
5
+ "dtype": "fp32",
6
+ "layer": 10,
7
+ "layers_path": "model.layers",
8
+ "seed": 123,
9
+ "task": "humaneval",
10
+ "eval_mode": "gen_code_compile",
11
+ "eval_meta": {
12
+ "hf_id": "openai_humaneval",
13
+ "split": "test",
14
+ "n_total": 164
15
+ },
16
+ "n_eval_loaded": 164,
17
+ "n_scanned": 164,
18
+ "base_acc_scan": 0.13414634146341464,
19
+ "ablt_acc_scan": 0.23780487804878048,
20
+ "flips_total": 19,
21
+ "flips_used": 19,
22
+ "patch_steps": [
23
+ 0,
24
+ 1,
25
+ 2,
26
+ 3
27
+ ],
28
+ "patch_n_steps": 4,
29
+ "Qs_path": "/home/zs89/decodeshare/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/Q_shared_layer10_seed123.npy",
30
+ "Qs_shape": [
31
+ 3584,
32
+ 190
33
+ ],
34
+ "gold_text_prefix": " ",
35
+ "dist_text_prefix": " ",
36
+ "gold_max_tokens": 0,
37
+ "distractor_mode": "next_gold",
38
+ "answer_prefix_effective": "\nFinal answer:",
39
+ "max_new_tokens_effective": 256,
40
+ "run_coeff_controls": false,
41
+ "use_benchmark_loader": false,
42
+ "hf_id": "openai_humaneval",
43
+ "hf_split": "test"
44
+ },
45
+ "summary_on_flips": {
46
+ "patched_self": {
47
+ "n": 19,
48
+ "rescued": 2,
49
+ "rescued_pct": 10.526315789473685
50
+ },
51
+ "control_time_shuffled": {
52
+ "n": 19,
53
+ "rescued": 2,
54
+ "rescued_pct": 10.526315789473685
55
+ },
56
+ "control_shared_randvec": {
57
+ "n": 19,
58
+ "rescued": 3,
59
+ "rescued_pct": 15.789473684210526
60
+ },
61
+ "control_rand_subspace": {
62
+ "n": 19,
63
+ "rescued": 5,
64
+ "rescued_pct": 26.31578947368421
65
+ },
66
+ "control_patch_nonshared": {
67
+ "n": 19,
68
+ "rescued": 0,
69
+ "rescued_pct": 0.0
70
+ }
71
+ },
72
+ "scan_rows": [
73
+ {
74
+ "ex_id": "openai_humaneval-test-18",
75
+ "baseline": {
76
+ "compile_ok": false,
77
+ "n_gen_tokens": 256
78
+ },
79
+ "ablated": {
80
+ "compile_ok": false,
81
+ "n_gen_tokens": 256
82
+ }
83
+ },
84
+ {
85
+ "ex_id": "openai_humaneval-test-31",
86
+ "baseline": {
87
+ "compile_ok": false,
88
+ "n_gen_tokens": 256
89
+ },
90
+ "ablated": {
91
+ "compile_ok": false,
92
+ "n_gen_tokens": 256
93
+ }
94
+ },
95
+ {
96
+ "ex_id": "openai_humaneval-test-158",
97
+ "baseline": {
98
+ "compile_ok": false,
99
+ "n_gen_tokens": 256
100
+ },
101
+ "ablated": {
102
+ "compile_ok": false,
103
+ "n_gen_tokens": 256
104
+ }
105
+ },
106
+ {
107
+ "ex_id": "openai_humaneval-test-43",
108
+ "baseline": {
109
+ "compile_ok": false,
110
+ "n_gen_tokens": 256
111
+ },
112
+ "ablated": {
113
+ "compile_ok": false,
114
+ "n_gen_tokens": 256
115
+ }
116
+ },
117
+ {
118
+ "ex_id": "openai_humaneval-test-39",
119
+ "baseline": {
120
+ "compile_ok": false,
121
+ "n_gen_tokens": 256
122
+ },
123
+ "ablated": {
124
+ "compile_ok": false,
125
+ "n_gen_tokens": 256
126
+ }
127
+ },
128
+ {
129
+ "ex_id": "openai_humaneval-test-15",
130
+ "baseline": {
131
+ "compile_ok": false,
132
+ "n_gen_tokens": 256
133
+ },
134
+ "ablated": {
135
+ "compile_ok": false,
136
+ "n_gen_tokens": 256
137
+ }
138
+ },
139
+ {
140
+ "ex_id": "openai_humaneval-test-151",
141
+ "baseline": {
142
+ "compile_ok": false,
143
+ "n_gen_tokens": 256
144
+ },
145
+ "ablated": {
146
+ "compile_ok": false,
147
+ "n_gen_tokens": 256
148
+ }
149
+ },
150
+ {
151
+ "ex_id": "openai_humaneval-test-101",
152
+ "baseline": {
153
+ "compile_ok": false,
154
+ "n_gen_tokens": 256
155
+ },
156
+ "ablated": {
157
+ "compile_ok": false,
158
+ "n_gen_tokens": 256
159
+ }
160
+ },
161
+ {
162
+ "ex_id": "openai_humaneval-test-34",
163
+ "baseline": {
164
+ "compile_ok": false,
165
+ "n_gen_tokens": 256
166
+ },
167
+ "ablated": {
168
+ "compile_ok": true,
169
+ "n_gen_tokens": 256
170
+ }
171
+ },
172
+ {
173
+ "ex_id": "openai_humaneval-test-52",
174
+ "baseline": {
175
+ "compile_ok": false,
176
+ "n_gen_tokens": 256
177
+ },
178
+ "ablated": {
179
+ "compile_ok": true,
180
+ "n_gen_tokens": 256
181
+ }
182
+ },
183
+ {
184
+ "ex_id": "openai_humaneval-test-113",
185
+ "baseline": {
186
+ "compile_ok": false,
187
+ "n_gen_tokens": 256
188
+ },
189
+ "ablated": {
190
+ "compile_ok": false,
191
+ "n_gen_tokens": 256
192
+ }
193
+ },
194
+ {
195
+ "ex_id": "openai_humaneval-test-119",
196
+ "baseline": {
197
+ "compile_ok": false,
198
+ "n_gen_tokens": 256
199
+ },
200
+ "ablated": {
201
+ "compile_ok": false,
202
+ "n_gen_tokens": 256
203
+ }
204
+ },
205
+ {
206
+ "ex_id": "openai_humaneval-test-83",
207
+ "baseline": {
208
+ "compile_ok": false,
209
+ "n_gen_tokens": 256
210
+ },
211
+ "ablated": {
212
+ "compile_ok": true,
213
+ "n_gen_tokens": 256
214
+ }
215
+ },
216
+ {
217
+ "ex_id": "openai_humaneval-test-116",
218
+ "baseline": {
219
+ "compile_ok": false,
220
+ "n_gen_tokens": 256
221
+ },
222
+ "ablated": {
223
+ "compile_ok": false,
224
+ "n_gen_tokens": 256
225
+ }
226
+ },
227
+ {
228
+ "ex_id": "openai_humaneval-test-56",
229
+ "baseline": {
230
+ "compile_ok": false,
231
+ "n_gen_tokens": 256
232
+ },
233
+ "ablated": {
234
+ "compile_ok": false,
235
+ "n_gen_tokens": 256
236
+ }
237
+ },
238
+ {
239
+ "ex_id": "openai_humaneval-test-131",
240
+ "baseline": {
241
+ "compile_ok": false,
242
+ "n_gen_tokens": 256
243
+ },
244
+ "ablated": {
245
+ "compile_ok": true,
246
+ "n_gen_tokens": 256
247
+ }
248
+ },
249
+ {
250
+ "ex_id": "openai_humaneval-test-1",
251
+ "baseline": {
252
+ "compile_ok": false,
253
+ "n_gen_tokens": 256
254
+ },
255
+ "ablated": {
256
+ "compile_ok": false,
257
+ "n_gen_tokens": 256
258
+ }
259
+ },
260
+ {
261
+ "ex_id": "openai_humaneval-test-159",
262
+ "baseline": {
263
+ "compile_ok": false,
264
+ "n_gen_tokens": 256
265
+ },
266
+ "ablated": {
267
+ "compile_ok": false,
268
+ "n_gen_tokens": 256
269
+ }
270
+ },
271
+ {
272
+ "ex_id": "openai_humaneval-test-123",
273
+ "baseline": {
274
+ "compile_ok": false,
275
+ "n_gen_tokens": 256
276
+ },
277
+ "ablated": {
278
+ "compile_ok": false,
279
+ "n_gen_tokens": 256
280
+ }
281
+ },
282
+ {
283
+ "ex_id": "openai_humaneval-test-23",
284
+ "baseline": {
285
+ "compile_ok": false,
286
+ "n_gen_tokens": 256
287
+ },
288
+ "ablated": {
289
+ "compile_ok": false,
290
+ "n_gen_tokens": 256
291
+ }
292
+ },
293
+ {
294
+ "ex_id": "openai_humaneval-test-124",
295
+ "baseline": {
296
+ "compile_ok": false,
297
+ "n_gen_tokens": 256
298
+ },
299
+ "ablated": {
300
+ "compile_ok": false,
301
+ "n_gen_tokens": 256
302
+ }
303
+ },
304
+ {
305
+ "ex_id": "openai_humaneval-test-38",
306
+ "baseline": {
307
+ "compile_ok": false,
308
+ "n_gen_tokens": 256
309
+ },
310
+ "ablated": {
311
+ "compile_ok": false,
312
+ "n_gen_tokens": 256
313
+ }
314
+ },
315
+ {
316
+ "ex_id": "openai_humaneval-test-84",
317
+ "baseline": {
318
+ "compile_ok": false,
319
+ "n_gen_tokens": 256
320
+ },
321
+ "ablated": {
322
+ "compile_ok": false,
323
+ "n_gen_tokens": 256
324
+ }
325
+ },
326
+ {
327
+ "ex_id": "openai_humaneval-test-41",
328
+ "baseline": {
329
+ "compile_ok": false,
330
+ "n_gen_tokens": 256
331
+ },
332
+ "ablated": {
333
+ "compile_ok": true,
334
+ "n_gen_tokens": 256
335
+ }
336
+ },
337
+ {
338
+ "ex_id": "openai_humaneval-test-134",
339
+ "baseline": {
340
+ "compile_ok": false,
341
+ "n_gen_tokens": 256
342
+ },
343
+ "ablated": {
344
+ "compile_ok": false,
345
+ "n_gen_tokens": 256
346
+ }
347
+ },
348
+ {
349
+ "ex_id": "openai_humaneval-test-2",
350
+ "baseline": {
351
+ "compile_ok": false,
352
+ "n_gen_tokens": 256
353
+ },
354
+ "ablated": {
355
+ "compile_ok": true,
356
+ "n_gen_tokens": 256
357
+ }
358
+ },
359
+ {
360
+ "ex_id": "openai_humaneval-test-80",
361
+ "baseline": {
362
+ "compile_ok": false,
363
+ "n_gen_tokens": 256
364
+ },
365
+ "ablated": {
366
+ "compile_ok": false,
367
+ "n_gen_tokens": 256
368
+ }
369
+ },
370
+ {
371
+ "ex_id": "openai_humaneval-test-74",
372
+ "baseline": {
373
+ "compile_ok": true,
374
+ "n_gen_tokens": 256
375
+ },
376
+ "ablated": {
377
+ "compile_ok": false,
378
+ "n_gen_tokens": 256
379
+ }
380
+ },
381
+ {
382
+ "ex_id": "openai_humaneval-test-162",
383
+ "baseline": {
384
+ "compile_ok": false,
385
+ "n_gen_tokens": 256
386
+ },
387
+ "ablated": {
388
+ "compile_ok": false,
389
+ "n_gen_tokens": 256
390
+ }
391
+ },
392
+ {
393
+ "ex_id": "openai_humaneval-test-138",
394
+ "baseline": {
395
+ "compile_ok": false,
396
+ "n_gen_tokens": 256
397
+ },
398
+ "ablated": {
399
+ "compile_ok": true,
400
+ "n_gen_tokens": 256
401
+ }
402
+ },
403
+ {
404
+ "ex_id": "openai_humaneval-test-87",
405
+ "baseline": {
406
+ "compile_ok": true,
407
+ "n_gen_tokens": 256
408
+ },
409
+ "ablated": {
410
+ "compile_ok": false,
411
+ "n_gen_tokens": 256
412
+ }
413
+ },
414
+ {
415
+ "ex_id": "openai_humaneval-test-145",
416
+ "baseline": {
417
+ "compile_ok": false,
418
+ "n_gen_tokens": 256
419
+ },
420
+ "ablated": {
421
+ "compile_ok": false,
422
+ "n_gen_tokens": 256
423
+ }
424
+ },
425
+ {
426
+ "ex_id": "openai_humaneval-test-54",
427
+ "baseline": {
428
+ "compile_ok": false,
429
+ "n_gen_tokens": 256
430
+ },
431
+ "ablated": {
432
+ "compile_ok": false,
433
+ "n_gen_tokens": 256
434
+ }
435
+ },
436
+ {
437
+ "ex_id": "openai_humaneval-test-109",
438
+ "baseline": {
439
+ "compile_ok": false,
440
+ "n_gen_tokens": 256
441
+ },
442
+ "ablated": {
443
+ "compile_ok": false,
444
+ "n_gen_tokens": 256
445
+ }
446
+ },
447
+ {
448
+ "ex_id": "openai_humaneval-test-102",
449
+ "baseline": {
450
+ "compile_ok": false,
451
+ "n_gen_tokens": 256
452
+ },
453
+ "ablated": {
454
+ "compile_ok": false,
455
+ "n_gen_tokens": 256
456
+ }
457
+ },
458
+ {
459
+ "ex_id": "openai_humaneval-test-62",
460
+ "baseline": {
461
+ "compile_ok": false,
462
+ "n_gen_tokens": 256
463
+ },
464
+ "ablated": {
465
+ "compile_ok": false,
466
+ "n_gen_tokens": 256
467
+ }
468
+ },
469
+ {
470
+ "ex_id": "openai_humaneval-test-129",
471
+ "baseline": {
472
+ "compile_ok": false,
473
+ "n_gen_tokens": 256
474
+ },
475
+ "ablated": {
476
+ "compile_ok": false,
477
+ "n_gen_tokens": 256
478
+ }
479
+ },
480
+ {
481
+ "ex_id": "openai_humaneval-test-110",
482
+ "baseline": {
483
+ "compile_ok": false,
484
+ "n_gen_tokens": 256
485
+ },
486
+ "ablated": {
487
+ "compile_ok": false,
488
+ "n_gen_tokens": 256
489
+ }
490
+ },
491
+ {
492
+ "ex_id": "openai_humaneval-test-4",
493
+ "baseline": {
494
+ "compile_ok": false,
495
+ "n_gen_tokens": 256
496
+ },
497
+ "ablated": {
498
+ "compile_ok": false,
499
+ "n_gen_tokens": 256
500
+ }
501
+ },
502
+ {
503
+ "ex_id": "openai_humaneval-test-8",
504
+ "baseline": {
505
+ "compile_ok": false,
506
+ "n_gen_tokens": 256
507
+ },
508
+ "ablated": {
509
+ "compile_ok": false,
510
+ "n_gen_tokens": 256
511
+ }
512
+ },
513
+ {
514
+ "ex_id": "openai_humaneval-test-97",
515
+ "baseline": {
516
+ "compile_ok": false,
517
+ "n_gen_tokens": 256
518
+ },
519
+ "ablated": {
520
+ "compile_ok": true,
521
+ "n_gen_tokens": 256
522
+ }
523
+ },
524
+ {
525
+ "ex_id": "openai_humaneval-test-137",
526
+ "baseline": {
527
+ "compile_ok": false,
528
+ "n_gen_tokens": 256
529
+ },
530
+ "ablated": {
531
+ "compile_ok": true,
532
+ "n_gen_tokens": 256
533
+ }
534
+ },
535
+ {
536
+ "ex_id": "openai_humaneval-test-96",
537
+ "baseline": {
538
+ "compile_ok": false,
539
+ "n_gen_tokens": 256
540
+ },
541
+ "ablated": {
542
+ "compile_ok": true,
543
+ "n_gen_tokens": 256
544
+ }
545
+ },
546
+ {
547
+ "ex_id": "openai_humaneval-test-65",
548
+ "baseline": {
549
+ "compile_ok": false,
550
+ "n_gen_tokens": 256
551
+ },
552
+ "ablated": {
553
+ "compile_ok": false,
554
+ "n_gen_tokens": 256
555
+ }
556
+ },
557
+ {
558
+ "ex_id": "openai_humaneval-test-114",
559
+ "baseline": {
560
+ "compile_ok": false,
561
+ "n_gen_tokens": 256
562
+ },
563
+ "ablated": {
564
+ "compile_ok": false,
565
+ "n_gen_tokens": 256
566
+ }
567
+ },
568
+ {
569
+ "ex_id": "openai_humaneval-test-117",
570
+ "baseline": {
571
+ "compile_ok": false,
572
+ "n_gen_tokens": 256
573
+ },
574
+ "ablated": {
575
+ "compile_ok": false,
576
+ "n_gen_tokens": 256
577
+ }
578
+ },
579
+ {
580
+ "ex_id": "openai_humaneval-test-155",
581
+ "baseline": {
582
+ "compile_ok": false,
583
+ "n_gen_tokens": 256
584
+ },
585
+ "ablated": {
586
+ "compile_ok": true,
587
+ "n_gen_tokens": 256
588
+ }
589
+ },
590
+ {
591
+ "ex_id": "openai_humaneval-test-37",
592
+ "baseline": {
593
+ "compile_ok": false,
594
+ "n_gen_tokens": 256
595
+ },
596
+ "ablated": {
597
+ "compile_ok": false,
598
+ "n_gen_tokens": 256
599
+ }
600
+ },
601
+ {
602
+ "ex_id": "openai_humaneval-test-115",
603
+ "baseline": {
604
+ "compile_ok": false,
605
+ "n_gen_tokens": 256
606
+ },
607
+ "ablated": {
608
+ "compile_ok": true,
609
+ "n_gen_tokens": 256
610
+ }
611
+ },
612
+ {
613
+ "ex_id": "openai_humaneval-test-86",
614
+ "baseline": {
615
+ "compile_ok": false,
616
+ "n_gen_tokens": 256
617
+ },
618
+ "ablated": {
619
+ "compile_ok": true,
620
+ "n_gen_tokens": 256
621
+ }
622
+ },
623
+ {
624
+ "ex_id": "openai_humaneval-test-85",
625
+ "baseline": {
626
+ "compile_ok": false,
627
+ "n_gen_tokens": 256
628
+ },
629
+ "ablated": {
630
+ "compile_ok": false,
631
+ "n_gen_tokens": 256
632
+ }
633
+ },
634
+ {
635
+ "ex_id": "openai_humaneval-test-53",
636
+ "baseline": {
637
+ "compile_ok": false,
638
+ "n_gen_tokens": 256
639
+ },
640
+ "ablated": {
641
+ "compile_ok": false,
642
+ "n_gen_tokens": 256
643
+ }
644
+ },
645
+ {
646
+ "ex_id": "openai_humaneval-test-130",
647
+ "baseline": {
648
+ "compile_ok": false,
649
+ "n_gen_tokens": 256
650
+ },
651
+ "ablated": {
652
+ "compile_ok": false,
653
+ "n_gen_tokens": 256
654
+ }
655
+ },
656
+ {
657
+ "ex_id": "openai_humaneval-test-139",
658
+ "baseline": {
659
+ "compile_ok": false,
660
+ "n_gen_tokens": 256
661
+ },
662
+ "ablated": {
663
+ "compile_ok": true,
664
+ "n_gen_tokens": 256
665
+ }
666
+ },
667
+ {
668
+ "ex_id": "openai_humaneval-test-81",
669
+ "baseline": {
670
+ "compile_ok": false,
671
+ "n_gen_tokens": 256
672
+ },
673
+ "ablated": {
674
+ "compile_ok": false,
675
+ "n_gen_tokens": 256
676
+ }
677
+ },
678
+ {
679
+ "ex_id": "openai_humaneval-test-19",
680
+ "baseline": {
681
+ "compile_ok": false,
682
+ "n_gen_tokens": 256
683
+ },
684
+ "ablated": {
685
+ "compile_ok": false,
686
+ "n_gen_tokens": 256
687
+ }
688
+ },
689
+ {
690
+ "ex_id": "openai_humaneval-test-69",
691
+ "baseline": {
692
+ "compile_ok": false,
693
+ "n_gen_tokens": 256
694
+ },
695
+ "ablated": {
696
+ "compile_ok": false,
697
+ "n_gen_tokens": 256
698
+ }
699
+ },
700
+ {
701
+ "ex_id": "openai_humaneval-test-122",
702
+ "baseline": {
703
+ "compile_ok": false,
704
+ "n_gen_tokens": 256
705
+ },
706
+ "ablated": {
707
+ "compile_ok": true,
708
+ "n_gen_tokens": 256
709
+ }
710
+ },
711
+ {
712
+ "ex_id": "openai_humaneval-test-108",
713
+ "baseline": {
714
+ "compile_ok": false,
715
+ "n_gen_tokens": 256
716
+ },
717
+ "ablated": {
718
+ "compile_ok": false,
719
+ "n_gen_tokens": 256
720
+ }
721
+ },
722
+ {
723
+ "ex_id": "openai_humaneval-test-48",
724
+ "baseline": {
725
+ "compile_ok": false,
726
+ "n_gen_tokens": 256
727
+ },
728
+ "ablated": {
729
+ "compile_ok": false,
730
+ "n_gen_tokens": 256
731
+ }
732
+ },
733
+ {
734
+ "ex_id": "openai_humaneval-test-17",
735
+ "baseline": {
736
+ "compile_ok": false,
737
+ "n_gen_tokens": 256
738
+ },
739
+ "ablated": {
740
+ "compile_ok": true,
741
+ "n_gen_tokens": 256
742
+ }
743
+ },
744
+ {
745
+ "ex_id": "openai_humaneval-test-95",
746
+ "baseline": {
747
+ "compile_ok": false,
748
+ "n_gen_tokens": 256
749
+ },
750
+ "ablated": {
751
+ "compile_ok": true,
752
+ "n_gen_tokens": 256
753
+ }
754
+ },
755
+ {
756
+ "ex_id": "openai_humaneval-test-91",
757
+ "baseline": {
758
+ "compile_ok": false,
759
+ "n_gen_tokens": 256
760
+ },
761
+ "ablated": {
762
+ "compile_ok": true,
763
+ "n_gen_tokens": 256
764
+ }
765
+ },
766
+ {
767
+ "ex_id": "openai_humaneval-test-79",
768
+ "baseline": {
769
+ "compile_ok": false,
770
+ "n_gen_tokens": 256
771
+ },
772
+ "ablated": {
773
+ "compile_ok": true,
774
+ "n_gen_tokens": 256
775
+ }
776
+ },
777
+ {
778
+ "ex_id": "openai_humaneval-test-14",
779
+ "baseline": {
780
+ "compile_ok": true,
781
+ "n_gen_tokens": 256
782
+ },
783
+ "ablated": {
784
+ "compile_ok": false,
785
+ "n_gen_tokens": 256
786
+ }
787
+ },
788
+ {
789
+ "ex_id": "openai_humaneval-test-77",
790
+ "baseline": {
791
+ "compile_ok": false,
792
+ "n_gen_tokens": 256
793
+ },
794
+ "ablated": {
795
+ "compile_ok": false,
796
+ "n_gen_tokens": 256
797
+ }
798
+ },
799
+ {
800
+ "ex_id": "openai_humaneval-test-3",
801
+ "baseline": {
802
+ "compile_ok": false,
803
+ "n_gen_tokens": 256
804
+ },
805
+ "ablated": {
806
+ "compile_ok": false,
807
+ "n_gen_tokens": 256
808
+ }
809
+ },
810
+ {
811
+ "ex_id": "openai_humaneval-test-146",
812
+ "baseline": {
813
+ "compile_ok": false,
814
+ "n_gen_tokens": 256
815
+ },
816
+ "ablated": {
817
+ "compile_ok": true,
818
+ "n_gen_tokens": 256
819
+ }
820
+ },
821
+ {
822
+ "ex_id": "openai_humaneval-test-57",
823
+ "baseline": {
824
+ "compile_ok": false,
825
+ "n_gen_tokens": 256
826
+ },
827
+ "ablated": {
828
+ "compile_ok": false,
829
+ "n_gen_tokens": 256
830
+ }
831
+ },
832
+ {
833
+ "ex_id": "openai_humaneval-test-143",
834
+ "baseline": {
835
+ "compile_ok": false,
836
+ "n_gen_tokens": 256
837
+ },
838
+ "ablated": {
839
+ "compile_ok": false,
840
+ "n_gen_tokens": 256
841
+ }
842
+ },
843
+ {
844
+ "ex_id": "openai_humaneval-test-0",
845
+ "baseline": {
846
+ "compile_ok": false,
847
+ "n_gen_tokens": 256
848
+ },
849
+ "ablated": {
850
+ "compile_ok": false,
851
+ "n_gen_tokens": 256
852
+ }
853
+ },
854
+ {
855
+ "ex_id": "openai_humaneval-test-106",
856
+ "baseline": {
857
+ "compile_ok": false,
858
+ "n_gen_tokens": 256
859
+ },
860
+ "ablated": {
861
+ "compile_ok": false,
862
+ "n_gen_tokens": 256
863
+ }
864
+ },
865
+ {
866
+ "ex_id": "openai_humaneval-test-50",
867
+ "baseline": {
868
+ "compile_ok": false,
869
+ "n_gen_tokens": 256
870
+ },
871
+ "ablated": {
872
+ "compile_ok": false,
873
+ "n_gen_tokens": 256
874
+ }
875
+ },
876
+ {
877
+ "ex_id": "openai_humaneval-test-58",
878
+ "baseline": {
879
+ "compile_ok": true,
880
+ "n_gen_tokens": 256
881
+ },
882
+ "ablated": {
883
+ "compile_ok": false,
884
+ "n_gen_tokens": 256
885
+ }
886
+ },
887
+ {
888
+ "ex_id": "openai_humaneval-test-147",
889
+ "baseline": {
890
+ "compile_ok": false,
891
+ "n_gen_tokens": 256
892
+ },
893
+ "ablated": {
894
+ "compile_ok": true,
895
+ "n_gen_tokens": 256
896
+ }
897
+ },
898
+ {
899
+ "ex_id": "openai_humaneval-test-160",
900
+ "baseline": {
901
+ "compile_ok": false,
902
+ "n_gen_tokens": 256
903
+ },
904
+ "ablated": {
905
+ "compile_ok": false,
906
+ "n_gen_tokens": 256
907
+ }
908
+ },
909
+ {
910
+ "ex_id": "openai_humaneval-test-67",
911
+ "baseline": {
912
+ "compile_ok": false,
913
+ "n_gen_tokens": 256
914
+ },
915
+ "ablated": {
916
+ "compile_ok": true,
917
+ "n_gen_tokens": 256
918
+ }
919
+ },
920
+ {
921
+ "ex_id": "openai_humaneval-test-55",
922
+ "baseline": {
923
+ "compile_ok": false,
924
+ "n_gen_tokens": 256
925
+ },
926
+ "ablated": {
927
+ "compile_ok": true,
928
+ "n_gen_tokens": 256
929
+ }
930
+ },
931
+ {
932
+ "ex_id": "openai_humaneval-test-118",
933
+ "baseline": {
934
+ "compile_ok": false,
935
+ "n_gen_tokens": 256
936
+ },
937
+ "ablated": {
938
+ "compile_ok": false,
939
+ "n_gen_tokens": 256
940
+ }
941
+ },
942
+ {
943
+ "ex_id": "openai_humaneval-test-154",
944
+ "baseline": {
945
+ "compile_ok": false,
946
+ "n_gen_tokens": 256
947
+ },
948
+ "ablated": {
949
+ "compile_ok": true,
950
+ "n_gen_tokens": 256
951
+ }
952
+ },
953
+ {
954
+ "ex_id": "openai_humaneval-test-25",
955
+ "baseline": {
956
+ "compile_ok": false,
957
+ "n_gen_tokens": 256
958
+ },
959
+ "ablated": {
960
+ "compile_ok": false,
961
+ "n_gen_tokens": 256
962
+ }
963
+ },
964
+ {
965
+ "ex_id": "openai_humaneval-test-36",
966
+ "baseline": {
967
+ "compile_ok": false,
968
+ "n_gen_tokens": 256
969
+ },
970
+ "ablated": {
971
+ "compile_ok": false,
972
+ "n_gen_tokens": 256
973
+ }
974
+ },
975
+ {
976
+ "ex_id": "openai_humaneval-test-63",
977
+ "baseline": {
978
+ "compile_ok": false,
979
+ "n_gen_tokens": 256
980
+ },
981
+ "ablated": {
982
+ "compile_ok": false,
983
+ "n_gen_tokens": 256
984
+ }
985
+ },
986
+ {
987
+ "ex_id": "openai_humaneval-test-132",
988
+ "baseline": {
989
+ "compile_ok": false,
990
+ "n_gen_tokens": 256
991
+ },
992
+ "ablated": {
993
+ "compile_ok": true,
994
+ "n_gen_tokens": 256
995
+ }
996
+ },
997
+ {
998
+ "ex_id": "openai_humaneval-test-21",
999
+ "baseline": {
1000
+ "compile_ok": false,
1001
+ "n_gen_tokens": 256
1002
+ },
1003
+ "ablated": {
1004
+ "compile_ok": false,
1005
+ "n_gen_tokens": 256
1006
+ }
1007
+ },
1008
+ {
1009
+ "ex_id": "openai_humaneval-test-9",
1010
+ "baseline": {
1011
+ "compile_ok": false,
1012
+ "n_gen_tokens": 256
1013
+ },
1014
+ "ablated": {
1015
+ "compile_ok": true,
1016
+ "n_gen_tokens": 256
1017
+ }
1018
+ },
1019
+ {
1020
+ "ex_id": "openai_humaneval-test-72",
1021
+ "baseline": {
1022
+ "compile_ok": false,
1023
+ "n_gen_tokens": 256
1024
+ },
1025
+ "ablated": {
1026
+ "compile_ok": true,
1027
+ "n_gen_tokens": 256
1028
+ }
1029
+ },
1030
+ {
1031
+ "ex_id": "openai_humaneval-test-128",
1032
+ "baseline": {
1033
+ "compile_ok": false,
1034
+ "n_gen_tokens": 256
1035
+ },
1036
+ "ablated": {
1037
+ "compile_ok": false,
1038
+ "n_gen_tokens": 256
1039
+ }
1040
+ },
1041
+ {
1042
+ "ex_id": "openai_humaneval-test-126",
1043
+ "baseline": {
1044
+ "compile_ok": false,
1045
+ "n_gen_tokens": 256
1046
+ },
1047
+ "ablated": {
1048
+ "compile_ok": true,
1049
+ "n_gen_tokens": 256
1050
+ }
1051
+ },
1052
+ {
1053
+ "ex_id": "openai_humaneval-test-70",
1054
+ "baseline": {
1055
+ "compile_ok": false,
1056
+ "n_gen_tokens": 256
1057
+ },
1058
+ "ablated": {
1059
+ "compile_ok": true,
1060
+ "n_gen_tokens": 256
1061
+ }
1062
+ },
1063
+ {
1064
+ "ex_id": "openai_humaneval-test-40",
1065
+ "baseline": {
1066
+ "compile_ok": false,
1067
+ "n_gen_tokens": 256
1068
+ },
1069
+ "ablated": {
1070
+ "compile_ok": false,
1071
+ "n_gen_tokens": 256
1072
+ }
1073
+ },
1074
+ {
1075
+ "ex_id": "openai_humaneval-test-66",
1076
+ "baseline": {
1077
+ "compile_ok": false,
1078
+ "n_gen_tokens": 256
1079
+ },
1080
+ "ablated": {
1081
+ "compile_ok": false,
1082
+ "n_gen_tokens": 256
1083
+ }
1084
+ },
1085
+ {
1086
+ "ex_id": "openai_humaneval-test-71",
1087
+ "baseline": {
1088
+ "compile_ok": false,
1089
+ "n_gen_tokens": 256
1090
+ },
1091
+ "ablated": {
1092
+ "compile_ok": false,
1093
+ "n_gen_tokens": 256
1094
+ }
1095
+ },
1096
+ {
1097
+ "ex_id": "openai_humaneval-test-107",
1098
+ "baseline": {
1099
+ "compile_ok": false,
1100
+ "n_gen_tokens": 256
1101
+ },
1102
+ "ablated": {
1103
+ "compile_ok": false,
1104
+ "n_gen_tokens": 256
1105
+ }
1106
+ },
1107
+ {
1108
+ "ex_id": "openai_humaneval-test-32",
1109
+ "baseline": {
1110
+ "compile_ok": true,
1111
+ "n_gen_tokens": 256
1112
+ },
1113
+ "ablated": {
1114
+ "compile_ok": true,
1115
+ "n_gen_tokens": 256
1116
+ }
1117
+ },
1118
+ {
1119
+ "ex_id": "openai_humaneval-test-11",
1120
+ "baseline": {
1121
+ "compile_ok": false,
1122
+ "n_gen_tokens": 256
1123
+ },
1124
+ "ablated": {
1125
+ "compile_ok": false,
1126
+ "n_gen_tokens": 256
1127
+ }
1128
+ },
1129
+ {
1130
+ "ex_id": "openai_humaneval-test-47",
1131
+ "baseline": {
1132
+ "compile_ok": true,
1133
+ "n_gen_tokens": 256
1134
+ },
1135
+ "ablated": {
1136
+ "compile_ok": false,
1137
+ "n_gen_tokens": 256
1138
+ }
1139
+ },
1140
+ {
1141
+ "ex_id": "openai_humaneval-test-22",
1142
+ "baseline": {
1143
+ "compile_ok": false,
1144
+ "n_gen_tokens": 256
1145
+ },
1146
+ "ablated": {
1147
+ "compile_ok": false,
1148
+ "n_gen_tokens": 256
1149
+ }
1150
+ },
1151
+ {
1152
+ "ex_id": "openai_humaneval-test-140",
1153
+ "baseline": {
1154
+ "compile_ok": false,
1155
+ "n_gen_tokens": 256
1156
+ },
1157
+ "ablated": {
1158
+ "compile_ok": false,
1159
+ "n_gen_tokens": 256
1160
+ }
1161
+ },
1162
+ {
1163
+ "ex_id": "openai_humaneval-test-30",
1164
+ "baseline": {
1165
+ "compile_ok": true,
1166
+ "n_gen_tokens": 256
1167
+ },
1168
+ "ablated": {
1169
+ "compile_ok": false,
1170
+ "n_gen_tokens": 256
1171
+ }
1172
+ },
1173
+ {
1174
+ "ex_id": "openai_humaneval-test-10",
1175
+ "baseline": {
1176
+ "compile_ok": false,
1177
+ "n_gen_tokens": 256
1178
+ },
1179
+ "ablated": {
1180
+ "compile_ok": false,
1181
+ "n_gen_tokens": 256
1182
+ }
1183
+ },
1184
+ {
1185
+ "ex_id": "openai_humaneval-test-112",
1186
+ "baseline": {
1187
+ "compile_ok": true,
1188
+ "n_gen_tokens": 256
1189
+ },
1190
+ "ablated": {
1191
+ "compile_ok": false,
1192
+ "n_gen_tokens": 256
1193
+ }
1194
+ },
1195
+ {
1196
+ "ex_id": "openai_humaneval-test-111",
1197
+ "baseline": {
1198
+ "compile_ok": true,
1199
+ "n_gen_tokens": 256
1200
+ },
1201
+ "ablated": {
1202
+ "compile_ok": false,
1203
+ "n_gen_tokens": 256
1204
+ }
1205
+ },
1206
+ {
1207
+ "ex_id": "openai_humaneval-test-28",
1208
+ "baseline": {
1209
+ "compile_ok": false,
1210
+ "n_gen_tokens": 256
1211
+ },
1212
+ "ablated": {
1213
+ "compile_ok": false,
1214
+ "n_gen_tokens": 256
1215
+ }
1216
+ },
1217
+ {
1218
+ "ex_id": "openai_humaneval-test-135",
1219
+ "baseline": {
1220
+ "compile_ok": false,
1221
+ "n_gen_tokens": 256
1222
+ },
1223
+ "ablated": {
1224
+ "compile_ok": false,
1225
+ "n_gen_tokens": 256
1226
+ }
1227
+ },
1228
+ {
1229
+ "ex_id": "openai_humaneval-test-94",
1230
+ "baseline": {
1231
+ "compile_ok": false,
1232
+ "n_gen_tokens": 256
1233
+ },
1234
+ "ablated": {
1235
+ "compile_ok": false,
1236
+ "n_gen_tokens": 256
1237
+ }
1238
+ },
1239
+ {
1240
+ "ex_id": "openai_humaneval-test-13",
1241
+ "baseline": {
1242
+ "compile_ok": false,
1243
+ "n_gen_tokens": 256
1244
+ },
1245
+ "ablated": {
1246
+ "compile_ok": false,
1247
+ "n_gen_tokens": 256
1248
+ }
1249
+ },
1250
+ {
1251
+ "ex_id": "openai_humaneval-test-7",
1252
+ "baseline": {
1253
+ "compile_ok": false,
1254
+ "n_gen_tokens": 256
1255
+ },
1256
+ "ablated": {
1257
+ "compile_ok": false,
1258
+ "n_gen_tokens": 256
1259
+ }
1260
+ },
1261
+ {
1262
+ "ex_id": "openai_humaneval-test-157",
1263
+ "baseline": {
1264
+ "compile_ok": false,
1265
+ "n_gen_tokens": 256
1266
+ },
1267
+ "ablated": {
1268
+ "compile_ok": false,
1269
+ "n_gen_tokens": 256
1270
+ }
1271
+ },
1272
+ {
1273
+ "ex_id": "openai_humaneval-test-49",
1274
+ "baseline": {
1275
+ "compile_ok": false,
1276
+ "n_gen_tokens": 256
1277
+ },
1278
+ "ablated": {
1279
+ "compile_ok": false,
1280
+ "n_gen_tokens": 256
1281
+ }
1282
+ },
1283
+ {
1284
+ "ex_id": "openai_humaneval-test-120",
1285
+ "baseline": {
1286
+ "compile_ok": false,
1287
+ "n_gen_tokens": 256
1288
+ },
1289
+ "ablated": {
1290
+ "compile_ok": false,
1291
+ "n_gen_tokens": 256
1292
+ }
1293
+ },
1294
+ {
1295
+ "ex_id": "openai_humaneval-test-89",
1296
+ "baseline": {
1297
+ "compile_ok": false,
1298
+ "n_gen_tokens": 256
1299
+ },
1300
+ "ablated": {
1301
+ "compile_ok": false,
1302
+ "n_gen_tokens": 256
1303
+ }
1304
+ },
1305
+ {
1306
+ "ex_id": "openai_humaneval-test-98",
1307
+ "baseline": {
1308
+ "compile_ok": false,
1309
+ "n_gen_tokens": 256
1310
+ },
1311
+ "ablated": {
1312
+ "compile_ok": false,
1313
+ "n_gen_tokens": 256
1314
+ }
1315
+ },
1316
+ {
1317
+ "ex_id": "openai_humaneval-test-100",
1318
+ "baseline": {
1319
+ "compile_ok": false,
1320
+ "n_gen_tokens": 256
1321
+ },
1322
+ "ablated": {
1323
+ "compile_ok": true,
1324
+ "n_gen_tokens": 256
1325
+ }
1326
+ },
1327
+ {
1328
+ "ex_id": "openai_humaneval-test-64",
1329
+ "baseline": {
1330
+ "compile_ok": false,
1331
+ "n_gen_tokens": 256
1332
+ },
1333
+ "ablated": {
1334
+ "compile_ok": false,
1335
+ "n_gen_tokens": 256
1336
+ }
1337
+ },
1338
+ {
1339
+ "ex_id": "openai_humaneval-test-103",
1340
+ "baseline": {
1341
+ "compile_ok": false,
1342
+ "n_gen_tokens": 256
1343
+ },
1344
+ "ablated": {
1345
+ "compile_ok": false,
1346
+ "n_gen_tokens": 256
1347
+ }
1348
+ },
1349
+ {
1350
+ "ex_id": "openai_humaneval-test-125",
1351
+ "baseline": {
1352
+ "compile_ok": false,
1353
+ "n_gen_tokens": 256
1354
+ },
1355
+ "ablated": {
1356
+ "compile_ok": false,
1357
+ "n_gen_tokens": 256
1358
+ }
1359
+ },
1360
+ {
1361
+ "ex_id": "openai_humaneval-test-51",
1362
+ "baseline": {
1363
+ "compile_ok": false,
1364
+ "n_gen_tokens": 256
1365
+ },
1366
+ "ablated": {
1367
+ "compile_ok": false,
1368
+ "n_gen_tokens": 256
1369
+ }
1370
+ },
1371
+ {
1372
+ "ex_id": "openai_humaneval-test-90",
1373
+ "baseline": {
1374
+ "compile_ok": false,
1375
+ "n_gen_tokens": 256
1376
+ },
1377
+ "ablated": {
1378
+ "compile_ok": false,
1379
+ "n_gen_tokens": 256
1380
+ }
1381
+ },
1382
+ {
1383
+ "ex_id": "openai_humaneval-test-152",
1384
+ "baseline": {
1385
+ "compile_ok": true,
1386
+ "n_gen_tokens": 256
1387
+ },
1388
+ "ablated": {
1389
+ "compile_ok": false,
1390
+ "n_gen_tokens": 256
1391
+ }
1392
+ },
1393
+ {
1394
+ "ex_id": "openai_humaneval-test-24",
1395
+ "baseline": {
1396
+ "compile_ok": true,
1397
+ "n_gen_tokens": 256
1398
+ },
1399
+ "ablated": {
1400
+ "compile_ok": false,
1401
+ "n_gen_tokens": 256
1402
+ }
1403
+ },
1404
+ {
1405
+ "ex_id": "openai_humaneval-test-20",
1406
+ "baseline": {
1407
+ "compile_ok": false,
1408
+ "n_gen_tokens": 256
1409
+ },
1410
+ "ablated": {
1411
+ "compile_ok": true,
1412
+ "n_gen_tokens": 256
1413
+ }
1414
+ },
1415
+ {
1416
+ "ex_id": "openai_humaneval-test-148",
1417
+ "baseline": {
1418
+ "compile_ok": false,
1419
+ "n_gen_tokens": 256
1420
+ },
1421
+ "ablated": {
1422
+ "compile_ok": false,
1423
+ "n_gen_tokens": 256
1424
+ }
1425
+ },
1426
+ {
1427
+ "ex_id": "openai_humaneval-test-92",
1428
+ "baseline": {
1429
+ "compile_ok": false,
1430
+ "n_gen_tokens": 256
1431
+ },
1432
+ "ablated": {
1433
+ "compile_ok": true,
1434
+ "n_gen_tokens": 256
1435
+ }
1436
+ },
1437
+ {
1438
+ "ex_id": "openai_humaneval-test-26",
1439
+ "baseline": {
1440
+ "compile_ok": false,
1441
+ "n_gen_tokens": 256
1442
+ },
1443
+ "ablated": {
1444
+ "compile_ok": true,
1445
+ "n_gen_tokens": 256
1446
+ }
1447
+ },
1448
+ {
1449
+ "ex_id": "openai_humaneval-test-16",
1450
+ "baseline": {
1451
+ "compile_ok": false,
1452
+ "n_gen_tokens": 256
1453
+ },
1454
+ "ablated": {
1455
+ "compile_ok": false,
1456
+ "n_gen_tokens": 256
1457
+ }
1458
+ },
1459
+ {
1460
+ "ex_id": "openai_humaneval-test-142",
1461
+ "baseline": {
1462
+ "compile_ok": true,
1463
+ "n_gen_tokens": 256
1464
+ },
1465
+ "ablated": {
1466
+ "compile_ok": false,
1467
+ "n_gen_tokens": 256
1468
+ }
1469
+ },
1470
+ {
1471
+ "ex_id": "openai_humaneval-test-99",
1472
+ "baseline": {
1473
+ "compile_ok": false,
1474
+ "n_gen_tokens": 256
1475
+ },
1476
+ "ablated": {
1477
+ "compile_ok": false,
1478
+ "n_gen_tokens": 256
1479
+ }
1480
+ },
1481
+ {
1482
+ "ex_id": "openai_humaneval-test-127",
1483
+ "baseline": {
1484
+ "compile_ok": false,
1485
+ "n_gen_tokens": 256
1486
+ },
1487
+ "ablated": {
1488
+ "compile_ok": false,
1489
+ "n_gen_tokens": 256
1490
+ }
1491
+ },
1492
+ {
1493
+ "ex_id": "openai_humaneval-test-156",
1494
+ "baseline": {
1495
+ "compile_ok": true,
1496
+ "n_gen_tokens": 256
1497
+ },
1498
+ "ablated": {
1499
+ "compile_ok": false,
1500
+ "n_gen_tokens": 256
1501
+ }
1502
+ },
1503
+ {
1504
+ "ex_id": "openai_humaneval-test-141",
1505
+ "baseline": {
1506
+ "compile_ok": true,
1507
+ "n_gen_tokens": 256
1508
+ },
1509
+ "ablated": {
1510
+ "compile_ok": false,
1511
+ "n_gen_tokens": 256
1512
+ }
1513
+ },
1514
+ {
1515
+ "ex_id": "openai_humaneval-test-78",
1516
+ "baseline": {
1517
+ "compile_ok": false,
1518
+ "n_gen_tokens": 256
1519
+ },
1520
+ "ablated": {
1521
+ "compile_ok": true,
1522
+ "n_gen_tokens": 256
1523
+ }
1524
+ },
1525
+ {
1526
+ "ex_id": "openai_humaneval-test-68",
1527
+ "baseline": {
1528
+ "compile_ok": false,
1529
+ "n_gen_tokens": 256
1530
+ },
1531
+ "ablated": {
1532
+ "compile_ok": false,
1533
+ "n_gen_tokens": 256
1534
+ }
1535
+ },
1536
+ {
1537
+ "ex_id": "openai_humaneval-test-93",
1538
+ "baseline": {
1539
+ "compile_ok": false,
1540
+ "n_gen_tokens": 256
1541
+ },
1542
+ "ablated": {
1543
+ "compile_ok": false,
1544
+ "n_gen_tokens": 256
1545
+ }
1546
+ },
1547
+ {
1548
+ "ex_id": "openai_humaneval-test-60",
1549
+ "baseline": {
1550
+ "compile_ok": false,
1551
+ "n_gen_tokens": 256
1552
+ },
1553
+ "ablated": {
1554
+ "compile_ok": false,
1555
+ "n_gen_tokens": 256
1556
+ }
1557
+ },
1558
+ {
1559
+ "ex_id": "openai_humaneval-test-82",
1560
+ "baseline": {
1561
+ "compile_ok": true,
1562
+ "n_gen_tokens": 256
1563
+ },
1564
+ "ablated": {
1565
+ "compile_ok": true,
1566
+ "n_gen_tokens": 256
1567
+ }
1568
+ },
1569
+ {
1570
+ "ex_id": "openai_humaneval-test-59",
1571
+ "baseline": {
1572
+ "compile_ok": false,
1573
+ "n_gen_tokens": 256
1574
+ },
1575
+ "ablated": {
1576
+ "compile_ok": false,
1577
+ "n_gen_tokens": 256
1578
+ }
1579
+ },
1580
+ {
1581
+ "ex_id": "openai_humaneval-test-149",
1582
+ "baseline": {
1583
+ "compile_ok": false,
1584
+ "n_gen_tokens": 256
1585
+ },
1586
+ "ablated": {
1587
+ "compile_ok": false,
1588
+ "n_gen_tokens": 256
1589
+ }
1590
+ },
1591
+ {
1592
+ "ex_id": "openai_humaneval-test-42",
1593
+ "baseline": {
1594
+ "compile_ok": true,
1595
+ "n_gen_tokens": 256
1596
+ },
1597
+ "ablated": {
1598
+ "compile_ok": false,
1599
+ "n_gen_tokens": 256
1600
+ }
1601
+ },
1602
+ {
1603
+ "ex_id": "openai_humaneval-test-163",
1604
+ "baseline": {
1605
+ "compile_ok": false,
1606
+ "n_gen_tokens": 256
1607
+ },
1608
+ "ablated": {
1609
+ "compile_ok": false,
1610
+ "n_gen_tokens": 256
1611
+ }
1612
+ },
1613
+ {
1614
+ "ex_id": "openai_humaneval-test-133",
1615
+ "baseline": {
1616
+ "compile_ok": true,
1617
+ "n_gen_tokens": 256
1618
+ },
1619
+ "ablated": {
1620
+ "compile_ok": true,
1621
+ "n_gen_tokens": 256
1622
+ }
1623
+ },
1624
+ {
1625
+ "ex_id": "openai_humaneval-test-161",
1626
+ "baseline": {
1627
+ "compile_ok": false,
1628
+ "n_gen_tokens": 256
1629
+ },
1630
+ "ablated": {
1631
+ "compile_ok": false,
1632
+ "n_gen_tokens": 256
1633
+ }
1634
+ },
1635
+ {
1636
+ "ex_id": "openai_humaneval-test-29",
1637
+ "baseline": {
1638
+ "compile_ok": false,
1639
+ "n_gen_tokens": 256
1640
+ },
1641
+ "ablated": {
1642
+ "compile_ok": false,
1643
+ "n_gen_tokens": 256
1644
+ }
1645
+ },
1646
+ {
1647
+ "ex_id": "openai_humaneval-test-27",
1648
+ "baseline": {
1649
+ "compile_ok": true,
1650
+ "n_gen_tokens": 256
1651
+ },
1652
+ "ablated": {
1653
+ "compile_ok": false,
1654
+ "n_gen_tokens": 256
1655
+ }
1656
+ },
1657
+ {
1658
+ "ex_id": "openai_humaneval-test-61",
1659
+ "baseline": {
1660
+ "compile_ok": false,
1661
+ "n_gen_tokens": 256
1662
+ },
1663
+ "ablated": {
1664
+ "compile_ok": false,
1665
+ "n_gen_tokens": 256
1666
+ }
1667
+ },
1668
+ {
1669
+ "ex_id": "openai_humaneval-test-104",
1670
+ "baseline": {
1671
+ "compile_ok": false,
1672
+ "n_gen_tokens": 256
1673
+ },
1674
+ "ablated": {
1675
+ "compile_ok": false,
1676
+ "n_gen_tokens": 256
1677
+ }
1678
+ },
1679
+ {
1680
+ "ex_id": "openai_humaneval-test-45",
1681
+ "baseline": {
1682
+ "compile_ok": true,
1683
+ "n_gen_tokens": 256
1684
+ },
1685
+ "ablated": {
1686
+ "compile_ok": false,
1687
+ "n_gen_tokens": 256
1688
+ }
1689
+ },
1690
+ {
1691
+ "ex_id": "openai_humaneval-test-75",
1692
+ "baseline": {
1693
+ "compile_ok": false,
1694
+ "n_gen_tokens": 256
1695
+ },
1696
+ "ablated": {
1697
+ "compile_ok": false,
1698
+ "n_gen_tokens": 256
1699
+ }
1700
+ },
1701
+ {
1702
+ "ex_id": "openai_humaneval-test-6",
1703
+ "baseline": {
1704
+ "compile_ok": false,
1705
+ "n_gen_tokens": 256
1706
+ },
1707
+ "ablated": {
1708
+ "compile_ok": false,
1709
+ "n_gen_tokens": 256
1710
+ }
1711
+ },
1712
+ {
1713
+ "ex_id": "openai_humaneval-test-136",
1714
+ "baseline": {
1715
+ "compile_ok": false,
1716
+ "n_gen_tokens": 256
1717
+ },
1718
+ "ablated": {
1719
+ "compile_ok": false,
1720
+ "n_gen_tokens": 256
1721
+ }
1722
+ },
1723
+ {
1724
+ "ex_id": "openai_humaneval-test-33",
1725
+ "baseline": {
1726
+ "compile_ok": false,
1727
+ "n_gen_tokens": 256
1728
+ },
1729
+ "ablated": {
1730
+ "compile_ok": false,
1731
+ "n_gen_tokens": 256
1732
+ }
1733
+ },
1734
+ {
1735
+ "ex_id": "openai_humaneval-test-44",
1736
+ "baseline": {
1737
+ "compile_ok": false,
1738
+ "n_gen_tokens": 256
1739
+ },
1740
+ "ablated": {
1741
+ "compile_ok": true,
1742
+ "n_gen_tokens": 256
1743
+ }
1744
+ },
1745
+ {
1746
+ "ex_id": "openai_humaneval-test-88",
1747
+ "baseline": {
1748
+ "compile_ok": false,
1749
+ "n_gen_tokens": 256
1750
+ },
1751
+ "ablated": {
1752
+ "compile_ok": false,
1753
+ "n_gen_tokens": 256
1754
+ }
1755
+ },
1756
+ {
1757
+ "ex_id": "openai_humaneval-test-12",
1758
+ "baseline": {
1759
+ "compile_ok": false,
1760
+ "n_gen_tokens": 256
1761
+ },
1762
+ "ablated": {
1763
+ "compile_ok": false,
1764
+ "n_gen_tokens": 256
1765
+ }
1766
+ },
1767
+ {
1768
+ "ex_id": "openai_humaneval-test-105",
1769
+ "baseline": {
1770
+ "compile_ok": false,
1771
+ "n_gen_tokens": 256
1772
+ },
1773
+ "ablated": {
1774
+ "compile_ok": false,
1775
+ "n_gen_tokens": 256
1776
+ }
1777
+ },
1778
+ {
1779
+ "ex_id": "openai_humaneval-test-121",
1780
+ "baseline": {
1781
+ "compile_ok": false,
1782
+ "n_gen_tokens": 256
1783
+ },
1784
+ "ablated": {
1785
+ "compile_ok": false,
1786
+ "n_gen_tokens": 256
1787
+ }
1788
+ },
1789
+ {
1790
+ "ex_id": "openai_humaneval-test-144",
1791
+ "baseline": {
1792
+ "compile_ok": false,
1793
+ "n_gen_tokens": 256
1794
+ },
1795
+ "ablated": {
1796
+ "compile_ok": true,
1797
+ "n_gen_tokens": 256
1798
+ }
1799
+ },
1800
+ {
1801
+ "ex_id": "openai_humaneval-test-73",
1802
+ "baseline": {
1803
+ "compile_ok": true,
1804
+ "n_gen_tokens": 256
1805
+ },
1806
+ "ablated": {
1807
+ "compile_ok": false,
1808
+ "n_gen_tokens": 256
1809
+ }
1810
+ },
1811
+ {
1812
+ "ex_id": "openai_humaneval-test-76",
1813
+ "baseline": {
1814
+ "compile_ok": false,
1815
+ "n_gen_tokens": 256
1816
+ },
1817
+ "ablated": {
1818
+ "compile_ok": false,
1819
+ "n_gen_tokens": 256
1820
+ }
1821
+ },
1822
+ {
1823
+ "ex_id": "openai_humaneval-test-5",
1824
+ "baseline": {
1825
+ "compile_ok": false,
1826
+ "n_gen_tokens": 256
1827
+ },
1828
+ "ablated": {
1829
+ "compile_ok": false,
1830
+ "n_gen_tokens": 256
1831
+ }
1832
+ },
1833
+ {
1834
+ "ex_id": "openai_humaneval-test-46",
1835
+ "baseline": {
1836
+ "compile_ok": true,
1837
+ "n_gen_tokens": 256
1838
+ },
1839
+ "ablated": {
1840
+ "compile_ok": false,
1841
+ "n_gen_tokens": 256
1842
+ }
1843
+ },
1844
+ {
1845
+ "ex_id": "openai_humaneval-test-150",
1846
+ "baseline": {
1847
+ "compile_ok": true,
1848
+ "n_gen_tokens": 256
1849
+ },
1850
+ "ablated": {
1851
+ "compile_ok": false,
1852
+ "n_gen_tokens": 256
1853
+ }
1854
+ },
1855
+ {
1856
+ "ex_id": "openai_humaneval-test-35",
1857
+ "baseline": {
1858
+ "compile_ok": false,
1859
+ "n_gen_tokens": 256
1860
+ },
1861
+ "ablated": {
1862
+ "compile_ok": false,
1863
+ "n_gen_tokens": 256
1864
+ }
1865
+ },
1866
+ {
1867
+ "ex_id": "openai_humaneval-test-153",
1868
+ "baseline": {
1869
+ "compile_ok": false,
1870
+ "n_gen_tokens": 256
1871
+ },
1872
+ "ablated": {
1873
+ "compile_ok": false,
1874
+ "n_gen_tokens": 256
1875
+ }
1876
+ }
1877
+ ],
1878
+ "flip_rows": [
1879
+ {
1880
+ "ex_id": "openai_humaneval-test-74",
1881
+ "baseline": {
1882
+ "compile_ok": true
1883
+ },
1884
+ "ablated": {
1885
+ "compile_ok": false
1886
+ },
1887
+ "patched_self": {
1888
+ "compile_ok": false
1889
+ },
1890
+ "control_time_shuffled": {
1891
+ "compile_ok": false
1892
+ },
1893
+ "control_shared_randvec": {
1894
+ "compile_ok": false
1895
+ },
1896
+ "control_rand_subspace": {
1897
+ "compile_ok": false
1898
+ },
1899
+ "control_patch_nonshared": {
1900
+ "compile_ok": false
1901
+ }
1902
+ },
1903
+ {
1904
+ "ex_id": "openai_humaneval-test-87",
1905
+ "baseline": {
1906
+ "compile_ok": true
1907
+ },
1908
+ "ablated": {
1909
+ "compile_ok": false
1910
+ },
1911
+ "patched_self": {
1912
+ "compile_ok": false
1913
+ },
1914
+ "control_time_shuffled": {
1915
+ "compile_ok": false
1916
+ },
1917
+ "control_shared_randvec": {
1918
+ "compile_ok": false
1919
+ },
1920
+ "control_rand_subspace": {
1921
+ "compile_ok": false
1922
+ },
1923
+ "control_patch_nonshared": {
1924
+ "compile_ok": false
1925
+ }
1926
+ },
1927
+ {
1928
+ "ex_id": "openai_humaneval-test-14",
1929
+ "baseline": {
1930
+ "compile_ok": true
1931
+ },
1932
+ "ablated": {
1933
+ "compile_ok": false
1934
+ },
1935
+ "patched_self": {
1936
+ "compile_ok": false
1937
+ },
1938
+ "control_time_shuffled": {
1939
+ "compile_ok": false
1940
+ },
1941
+ "control_shared_randvec": {
1942
+ "compile_ok": true
1943
+ },
1944
+ "control_rand_subspace": {
1945
+ "compile_ok": false
1946
+ },
1947
+ "control_patch_nonshared": {
1948
+ "compile_ok": false
1949
+ }
1950
+ },
1951
+ {
1952
+ "ex_id": "openai_humaneval-test-58",
1953
+ "baseline": {
1954
+ "compile_ok": true
1955
+ },
1956
+ "ablated": {
1957
+ "compile_ok": false
1958
+ },
1959
+ "patched_self": {
1960
+ "compile_ok": false
1961
+ },
1962
+ "control_time_shuffled": {
1963
+ "compile_ok": false
1964
+ },
1965
+ "control_shared_randvec": {
1966
+ "compile_ok": false
1967
+ },
1968
+ "control_rand_subspace": {
1969
+ "compile_ok": false
1970
+ },
1971
+ "control_patch_nonshared": {
1972
+ "compile_ok": false
1973
+ }
1974
+ },
1975
+ {
1976
+ "ex_id": "openai_humaneval-test-47",
1977
+ "baseline": {
1978
+ "compile_ok": true
1979
+ },
1980
+ "ablated": {
1981
+ "compile_ok": false
1982
+ },
1983
+ "patched_self": {
1984
+ "compile_ok": true
1985
+ },
1986
+ "control_time_shuffled": {
1987
+ "compile_ok": true
1988
+ },
1989
+ "control_shared_randvec": {
1990
+ "compile_ok": false
1991
+ },
1992
+ "control_rand_subspace": {
1993
+ "compile_ok": false
1994
+ },
1995
+ "control_patch_nonshared": {
1996
+ "compile_ok": false
1997
+ }
1998
+ },
1999
+ {
2000
+ "ex_id": "openai_humaneval-test-30",
2001
+ "baseline": {
2002
+ "compile_ok": true
2003
+ },
2004
+ "ablated": {
2005
+ "compile_ok": false
2006
+ },
2007
+ "patched_self": {
2008
+ "compile_ok": false
2009
+ },
2010
+ "control_time_shuffled": {
2011
+ "compile_ok": false
2012
+ },
2013
+ "control_shared_randvec": {
2014
+ "compile_ok": false
2015
+ },
2016
+ "control_rand_subspace": {
2017
+ "compile_ok": false
2018
+ },
2019
+ "control_patch_nonshared": {
2020
+ "compile_ok": false
2021
+ }
2022
+ },
2023
+ {
2024
+ "ex_id": "openai_humaneval-test-112",
2025
+ "baseline": {
2026
+ "compile_ok": true
2027
+ },
2028
+ "ablated": {
2029
+ "compile_ok": false
2030
+ },
2031
+ "patched_self": {
2032
+ "compile_ok": false
2033
+ },
2034
+ "control_time_shuffled": {
2035
+ "compile_ok": false
2036
+ },
2037
+ "control_shared_randvec": {
2038
+ "compile_ok": false
2039
+ },
2040
+ "control_rand_subspace": {
2041
+ "compile_ok": false
2042
+ },
2043
+ "control_patch_nonshared": {
2044
+ "compile_ok": false
2045
+ }
2046
+ },
2047
+ {
2048
+ "ex_id": "openai_humaneval-test-111",
2049
+ "baseline": {
2050
+ "compile_ok": true
2051
+ },
2052
+ "ablated": {
2053
+ "compile_ok": false
2054
+ },
2055
+ "patched_self": {
2056
+ "compile_ok": false
2057
+ },
2058
+ "control_time_shuffled": {
2059
+ "compile_ok": false
2060
+ },
2061
+ "control_shared_randvec": {
2062
+ "compile_ok": false
2063
+ },
2064
+ "control_rand_subspace": {
2065
+ "compile_ok": false
2066
+ },
2067
+ "control_patch_nonshared": {
2068
+ "compile_ok": false
2069
+ }
2070
+ },
2071
+ {
2072
+ "ex_id": "openai_humaneval-test-152",
2073
+ "baseline": {
2074
+ "compile_ok": true
2075
+ },
2076
+ "ablated": {
2077
+ "compile_ok": false
2078
+ },
2079
+ "patched_self": {
2080
+ "compile_ok": false
2081
+ },
2082
+ "control_time_shuffled": {
2083
+ "compile_ok": true
2084
+ },
2085
+ "control_shared_randvec": {
2086
+ "compile_ok": false
2087
+ },
2088
+ "control_rand_subspace": {
2089
+ "compile_ok": false
2090
+ },
2091
+ "control_patch_nonshared": {
2092
+ "compile_ok": false
2093
+ }
2094
+ },
2095
+ {
2096
+ "ex_id": "openai_humaneval-test-24",
2097
+ "baseline": {
2098
+ "compile_ok": true
2099
+ },
2100
+ "ablated": {
2101
+ "compile_ok": false
2102
+ },
2103
+ "patched_self": {
2104
+ "compile_ok": false
2105
+ },
2106
+ "control_time_shuffled": {
2107
+ "compile_ok": false
2108
+ },
2109
+ "control_shared_randvec": {
2110
+ "compile_ok": false
2111
+ },
2112
+ "control_rand_subspace": {
2113
+ "compile_ok": false
2114
+ },
2115
+ "control_patch_nonshared": {
2116
+ "compile_ok": false
2117
+ }
2118
+ },
2119
+ {
2120
+ "ex_id": "openai_humaneval-test-142",
2121
+ "baseline": {
2122
+ "compile_ok": true
2123
+ },
2124
+ "ablated": {
2125
+ "compile_ok": false
2126
+ },
2127
+ "patched_self": {
2128
+ "compile_ok": false
2129
+ },
2130
+ "control_time_shuffled": {
2131
+ "compile_ok": false
2132
+ },
2133
+ "control_shared_randvec": {
2134
+ "compile_ok": false
2135
+ },
2136
+ "control_rand_subspace": {
2137
+ "compile_ok": false
2138
+ },
2139
+ "control_patch_nonshared": {
2140
+ "compile_ok": false
2141
+ }
2142
+ },
2143
+ {
2144
+ "ex_id": "openai_humaneval-test-156",
2145
+ "baseline": {
2146
+ "compile_ok": true
2147
+ },
2148
+ "ablated": {
2149
+ "compile_ok": false
2150
+ },
2151
+ "patched_self": {
2152
+ "compile_ok": false
2153
+ },
2154
+ "control_time_shuffled": {
2155
+ "compile_ok": false
2156
+ },
2157
+ "control_shared_randvec": {
2158
+ "compile_ok": true
2159
+ },
2160
+ "control_rand_subspace": {
2161
+ "compile_ok": false
2162
+ },
2163
+ "control_patch_nonshared": {
2164
+ "compile_ok": false
2165
+ }
2166
+ },
2167
+ {
2168
+ "ex_id": "openai_humaneval-test-141",
2169
+ "baseline": {
2170
+ "compile_ok": true
2171
+ },
2172
+ "ablated": {
2173
+ "compile_ok": false
2174
+ },
2175
+ "patched_self": {
2176
+ "compile_ok": false
2177
+ },
2178
+ "control_time_shuffled": {
2179
+ "compile_ok": false
2180
+ },
2181
+ "control_shared_randvec": {
2182
+ "compile_ok": false
2183
+ },
2184
+ "control_rand_subspace": {
2185
+ "compile_ok": true
2186
+ },
2187
+ "control_patch_nonshared": {
2188
+ "compile_ok": false
2189
+ }
2190
+ },
2191
+ {
2192
+ "ex_id": "openai_humaneval-test-42",
2193
+ "baseline": {
2194
+ "compile_ok": true
2195
+ },
2196
+ "ablated": {
2197
+ "compile_ok": false
2198
+ },
2199
+ "patched_self": {
2200
+ "compile_ok": false
2201
+ },
2202
+ "control_time_shuffled": {
2203
+ "compile_ok": false
2204
+ },
2205
+ "control_shared_randvec": {
2206
+ "compile_ok": false
2207
+ },
2208
+ "control_rand_subspace": {
2209
+ "compile_ok": true
2210
+ },
2211
+ "control_patch_nonshared": {
2212
+ "compile_ok": false
2213
+ }
2214
+ },
2215
+ {
2216
+ "ex_id": "openai_humaneval-test-27",
2217
+ "baseline": {
2218
+ "compile_ok": true
2219
+ },
2220
+ "ablated": {
2221
+ "compile_ok": false
2222
+ },
2223
+ "patched_self": {
2224
+ "compile_ok": false
2225
+ },
2226
+ "control_time_shuffled": {
2227
+ "compile_ok": false
2228
+ },
2229
+ "control_shared_randvec": {
2230
+ "compile_ok": false
2231
+ },
2232
+ "control_rand_subspace": {
2233
+ "compile_ok": true
2234
+ },
2235
+ "control_patch_nonshared": {
2236
+ "compile_ok": false
2237
+ }
2238
+ },
2239
+ {
2240
+ "ex_id": "openai_humaneval-test-45",
2241
+ "baseline": {
2242
+ "compile_ok": true
2243
+ },
2244
+ "ablated": {
2245
+ "compile_ok": false
2246
+ },
2247
+ "patched_self": {
2248
+ "compile_ok": false
2249
+ },
2250
+ "control_time_shuffled": {
2251
+ "compile_ok": false
2252
+ },
2253
+ "control_shared_randvec": {
2254
+ "compile_ok": false
2255
+ },
2256
+ "control_rand_subspace": {
2257
+ "compile_ok": false
2258
+ },
2259
+ "control_patch_nonshared": {
2260
+ "compile_ok": false
2261
+ }
2262
+ },
2263
+ {
2264
+ "ex_id": "openai_humaneval-test-73",
2265
+ "baseline": {
2266
+ "compile_ok": true
2267
+ },
2268
+ "ablated": {
2269
+ "compile_ok": false
2270
+ },
2271
+ "patched_self": {
2272
+ "compile_ok": false
2273
+ },
2274
+ "control_time_shuffled": {
2275
+ "compile_ok": false
2276
+ },
2277
+ "control_shared_randvec": {
2278
+ "compile_ok": false
2279
+ },
2280
+ "control_rand_subspace": {
2281
+ "compile_ok": true
2282
+ },
2283
+ "control_patch_nonshared": {
2284
+ "compile_ok": false
2285
+ }
2286
+ },
2287
+ {
2288
+ "ex_id": "openai_humaneval-test-46",
2289
+ "baseline": {
2290
+ "compile_ok": true
2291
+ },
2292
+ "ablated": {
2293
+ "compile_ok": false
2294
+ },
2295
+ "patched_self": {
2296
+ "compile_ok": false
2297
+ },
2298
+ "control_time_shuffled": {
2299
+ "compile_ok": false
2300
+ },
2301
+ "control_shared_randvec": {
2302
+ "compile_ok": false
2303
+ },
2304
+ "control_rand_subspace": {
2305
+ "compile_ok": false
2306
+ },
2307
+ "control_patch_nonshared": {
2308
+ "compile_ok": false
2309
+ }
2310
+ },
2311
+ {
2312
+ "ex_id": "openai_humaneval-test-150",
2313
+ "baseline": {
2314
+ "compile_ok": true
2315
+ },
2316
+ "ablated": {
2317
+ "compile_ok": false
2318
+ },
2319
+ "patched_self": {
2320
+ "compile_ok": true
2321
+ },
2322
+ "control_time_shuffled": {
2323
+ "compile_ok": false
2324
+ },
2325
+ "control_shared_randvec": {
2326
+ "compile_ok": true
2327
+ },
2328
+ "control_rand_subspace": {
2329
+ "compile_ok": true
2330
+ },
2331
+ "control_patch_nonshared": {
2332
+ "compile_ok": false
2333
+ }
2334
+ }
2335
+ ]
2336
+ }
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/openanswer_seed123/humaneval_pairlogprob.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/aqua.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/arc_challenge.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/commonsenseqa.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/logiqa.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/openbookqa.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/piqa.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer10/subspace_mc_seed123/qasc.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/Q_shared_layer24_seed123.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a9018935fb6662a113283f4c171f976832435c2079df0e63e642751545ed5ae
3
+ size 3197056
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/Q_shared_layer24_seed456.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fad986416ec27af928bd2970b60b8a182807b2aea22f2685073b2b215f8e50e8
3
+ size 3168384
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/alpha_sweep.csv ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ file,task,layer,seed,alpha,n,flip_rate,ablated_acc,pred_change_rate,mean_margin,mean_delta_margin_vs_baseline
2
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.0,61,0.0,1.0,0.0,2.8609182834625244,0.0
3
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.02,61,0.01639344262295082,0.9836065573770492,0.01639344262295082,2.83201265335083,-0.028905684128403664
4
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.05,61,0.01639344262295082,0.9836065573770492,0.01639344262295082,2.7874269485473633,-0.0734911859035492
5
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.1,61,0.03278688524590164,0.9672131147540983,0.03278688524590164,2.709939956665039,-0.15097840130329132
6
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.2,61,0.04918032786885246,0.9508196721311475,0.04918032786885246,2.5366058349609375,-0.3243127763271332
7
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.3,61,0.11475409836065574,0.8852459016393442,0.11475409836065574,2.32297682762146,-0.5379418730735779
8
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.5,61,0.16393442622950818,0.8360655737704918,0.16393442622950818,1.717830777168274,-1.1430877447128296
9
+ aqua_alpha_sweep_seed123.json,aqua,24,123,0.75,61,0.4098360655737705,0.5901639344262295,0.4098360655737705,0.5222399234771729,-2.3386785984039307
10
+ aqua_alpha_sweep_seed123.json,aqua,24,123,1.0,61,1.0,0.0,1.0,-1.7653703689575195,-4.6262898445129395
11
+ aqua_alpha_sweep_seed456.json,aqua,24,456,0.0,45,0.0,1.0,0.0,2.238931179046631,0.0
12
+ aqua_alpha_sweep_seed456.json,aqua,24,456,0.05,45,0.0,1.0,0.0,2.1714553833007812,-0.06747613847255707
13
+ aqua_alpha_sweep_seed456.json,aqua,24,456,0.1,45,0.022222222222222223,0.9777777777777777,0.022222222222222223,2.0972023010253906,-0.14172926545143127
14
+ aqua_alpha_sweep_seed456.json,aqua,24,456,0.2,45,0.022222222222222223,0.9777777777777777,0.022222222222222223,1.9297974109649658,-0.30913400650024414
15
+ aqua_alpha_sweep_seed456.json,aqua,24,456,0.3,45,0.06666666666666667,0.9333333333333333,0.06666666666666667,1.7361918687820435,-0.5027394890785217
16
+ aqua_alpha_sweep_seed456.json,aqua,24,456,0.5,45,0.2222222222222222,0.7777777777777778,0.2222222222222222,1.229968547821045,-1.0089629888534546
17
+ aqua_alpha_sweep_seed456.json,aqua,24,456,0.75,45,0.4444444444444444,0.5555555555555556,0.4444444444444444,0.25290924310684204,-1.9860222339630127
18
+ aqua_alpha_sweep_seed456.json,aqua,24,456,1.0,45,1.0,0.0,1.0,-1.6927522420883179,-3.9316842555999756
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/alpha_sweep.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | task | file | alpha | n | flip_rate | ablated_acc | mean_delta_margin_vs_baseline |
2
+ | --- | --- | --- | --- | --- | --- | --- |
3
+ | aqua | aqua_alpha_sweep_seed123.json | 0.0 | 61 | 0.0 | 100.0 | 0.000 |
4
+ | aqua | aqua_alpha_sweep_seed123.json | 0.02 | 61 | 1.6 | 98.4 | -0.029 |
5
+ | aqua | aqua_alpha_sweep_seed123.json | 0.05 | 61 | 1.6 | 98.4 | -0.073 |
6
+ | aqua | aqua_alpha_sweep_seed123.json | 0.1 | 61 | 3.3 | 96.7 | -0.151 |
7
+ | aqua | aqua_alpha_sweep_seed123.json | 0.2 | 61 | 4.9 | 95.1 | -0.324 |
8
+ | aqua | aqua_alpha_sweep_seed123.json | 0.3 | 61 | 11.5 | 88.5 | -0.538 |
9
+ | aqua | aqua_alpha_sweep_seed123.json | 0.5 | 61 | 16.4 | 83.6 | -1.143 |
10
+ | aqua | aqua_alpha_sweep_seed123.json | 0.75 | 61 | 41.0 | 59.0 | -2.339 |
11
+ | aqua | aqua_alpha_sweep_seed123.json | 1.0 | 61 | 100.0 | 0.0 | -4.626 |
12
+ | aqua | aqua_alpha_sweep_seed456.json | 0.0 | 45 | 0.0 | 100.0 | 0.000 |
13
+ | aqua | aqua_alpha_sweep_seed456.json | 0.05 | 45 | 0.0 | 100.0 | -0.067 |
14
+ | aqua | aqua_alpha_sweep_seed456.json | 0.1 | 45 | 2.2 | 97.8 | -0.142 |
15
+ | aqua | aqua_alpha_sweep_seed456.json | 0.2 | 45 | 2.2 | 97.8 | -0.309 |
16
+ | aqua | aqua_alpha_sweep_seed456.json | 0.3 | 45 | 6.7 | 93.3 | -0.503 |
17
+ | aqua | aqua_alpha_sweep_seed456.json | 0.5 | 45 | 22.2 | 77.8 | -1.009 |
18
+ | aqua | aqua_alpha_sweep_seed456.json | 0.75 | 45 | 44.4 | 55.6 | -1.986 |
19
+ | aqua | aqua_alpha_sweep_seed456.json | 1.0 | 45 | 100.0 | 0.0 | -3.932 |
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/analyze_qwen_results.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import pandas as pd
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+
9
# Input/output locations; each one can be overridden through an environment
# variable of the same name.
SUMMARY_CSV = os.getenv("SUMMARY_CSV", "summary.csv")
ALPHA_CSV = os.getenv("ALPHA_CSV", "alpha_sweep.csv")
OUT_MD = os.getenv("OUT_MD", "qwen_report.md")
OUT_TEX = os.getenv("OUT_TEX", "qwen_tables.tex")
PLOT_DIR = os.getenv("PLOT_DIR", "plots")

# Make sure the plot directory exists before any figure is saved into it.
os.makedirs(PLOT_DIR, exist_ok=True)

# The summary CSV is read unconditionally; the alpha-sweep CSV is optional and
# is replaced by an empty frame when the file does not exist.
df = pd.read_csv(SUMMARY_CSV)
if os.path.exists(ALPHA_CSV):
    alpha_df = pd.read_csv(ALPHA_CSV)
else:
    alpha_df = pd.DataFrame()
19
+
20
+ # Helpers
21
def f(x, nd=3):
    """Format a scalar for display.

    Missing values (as judged by ``pd.isna``) become the empty string,
    integers are printed without decimals, floats are printed with ``nd``
    decimal places, and anything else goes through ``str``.
    """
    if pd.isna(x):
        return ""
    integral = isinstance(x, (int, np.integer))
    if not integral and isinstance(x, (float, np.floating)):
        return format(float(x), f".{nd}f")
    return str(int(x)) if integral else str(x)
26
+
27
def pct(x, nd=1):
    """Format ``x`` as a float with ``nd`` decimals; missing values give ""."""
    return "" if pd.isna(x) else format(float(x), f".{nd}f")
30
+
31
# Partition the summary rows by the value of their "kind" column; each subset
# is an independent copy of the matching rows.
mc, oa, fs = (
    df[df["kind"] == kind_name].copy()
    for kind_name in ("subspace_mc", "openanswer", "flipset")
)
35
+
36
+ # Choose representative columns
37
def select_cols(kind_df, cols):
    """Return a copy of ``kind_df`` restricted to the requested columns.

    Columns are kept in the order given by ``cols``; names that are not
    present in ``kind_df`` are silently skipped.
    """
    available = kind_df.columns
    wanted = []
    for name in cols:
        if name in available:
            wanted.append(name)
    return kind_df[wanted].copy()
40
+
41
# Preferred columns (in display order) for each report table. Names missing
# from the summary CSV are dropped by select_cols.
mc_cols = [
    "task",
    "eval_mode",
    "seed",
    "base_acc_scan",
    "ablt_acc_scan",
    "flips_scan",
    "patched_0_rescued_pct",
    "patched_full_rescued_pct",
    "control_time_shuffled_rescued_pct",
    "control_shared_randvec_rescued_pct",
    "control_rand_subspace_rescued_pct",
    "control_patch_nonshared_rescued_pct",
]
oa_cols = [
    "task",
    "eval_mode",
    "seed",
    "base_acc_scan",
    "ablt_acc_scan",
    "flips_scan",
    "patched_self_rescued_pct",
    "control_time_shuffled_rescued_pct",
    "control_shared_randvec_rescued_pct",
    "control_rand_subspace_rescued_pct",
    "control_patch_nonshared_rescued_pct",
]
fs_cols = [
    "file",
    "seed",
    "task",
    "base_acc_scan",
    "ablt_acc_scan",
    "flips_scan",
    "patched_self_rescued_pct",
    "patched_transfer_rescued_pct",
]

# Build each table in two steps: narrow to the preferred columns, then sort.
mc_tbl = select_cols(mc, mc_cols)
mc_tbl = mc_tbl.sort_values(["task", "eval_mode", "seed"])
oa_tbl = select_cols(oa, oa_cols)
oa_tbl = oa_tbl.sort_values(["task", "eval_mode", "seed"])
fs_tbl = select_cols(fs, fs_cols)
fs_tbl = fs_tbl.sort_values(["seed", "file"])
61
+
62
# ---- Plot 1: MC patched_0 rescue by task ----
# One bar per task showing the mean patched_0 rescue percentage of its rows.
if "patched_0_rescued_pct" in mc_tbl.columns and len(mc_tbl) > 0:
    mc_plot = mc_tbl.groupby("task", as_index=False)["patched_0_rescued_pct"].mean()
    plt.figure()
    plt.bar(x=mc_plot["task"], height=mc_plot["patched_0_rescued_pct"])
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("Rescue% on flips (patched_0)")
    plt.tight_layout()
    plt.savefig(
        os.path.join(PLOT_DIR, "mc_patched0_rescue.pdf"),
        dpi=300,
    )
    plt.close()
72
+
73
# ---- Plot 2: MC controls gap (patched_0 vs rand-in-shared vs nonshared) ----
# Grouped bars per task: mean rescue percentage for patched_0 and for each
# control condition. mc_tbl only contains the columns that exist in the
# summary CSV, so control columns may be absent; only the ones present are
# aggregated and drawn instead of raising KeyError.
if len(mc_tbl) > 0 and "patched_0_rescued_pct" in mc_tbl.columns:
    # (column name, legend label), in left-to-right bar order.
    series = [
        ("patched_0_rescued_pct", "patched_0"),
        ("control_shared_randvec_rescued_pct", "rand vec in shared"),
        ("control_rand_subspace_rescued_pct", "rand subspace"),
        ("control_patch_nonshared_rescued_pct", "nonshared patch"),
    ]
    series = [(col, lab) for col, lab in series if col in mc_tbl.columns]
    tmp = mc_tbl.groupby("task", as_index=False).agg(
        {col: "mean" for col, _lab in series}
    )
    plt.figure()
    x = np.arange(len(tmp))
    w = 0.2
    for i, (col, lab) in enumerate(series):
        # Centre the group of bars on each tick; with all four series present
        # the offsets are -1.5w, -0.5w, +0.5w, +1.5w.
        offset = (i - (len(series) - 1) / 2.0) * w
        plt.bar(x + offset, tmp[col], width=w, label=lab)
    plt.xticks(x, tmp["task"], rotation=45, ha="right")
    plt.ylabel("Rescue% on flips")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(PLOT_DIR, "mc_controls_gap.pdf"), dpi=300)
    plt.close()
94
+
95
# ---- Plot 3: Alpha sweep flip_rate curves ----
# One curve per seed: flip rate (scaled to percent) as a function of alpha.
if {"alpha", "flip_rate", "seed"}.issubset(alpha_df.columns) and len(alpha_df) > 0:
    # Coerce alpha to numeric and discard rows where that fails.
    alpha_df2 = alpha_df.assign(
        alpha=pd.to_numeric(alpha_df["alpha"], errors="coerce")
    )
    alpha_df2 = alpha_df2.dropna(subset=["alpha"])
    plt.figure()
    for seed, sub in alpha_df2.groupby("seed", sort=True):
        sub = sub.sort_values("alpha")
        plt.plot(
            sub["alpha"],
            sub["flip_rate"] * 100.0,
            marker="o",
            label=f"seed={int(seed)}",
        )
    plt.xlabel("alpha")
    plt.ylabel("Flip rate on flip-set (%)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(PLOT_DIR, "alpha_sweep_fliprate.pdf"), dpi=300)
    plt.close()
110
+
111
# ---- Plot 4: Alpha sweep mean delta margin curves ----
# One curve per seed: mean margin change relative to baseline against alpha.
if (
    {"alpha", "mean_delta_margin_vs_baseline", "seed"}.issubset(alpha_df.columns)
    and len(alpha_df) > 0
):
    # Coerce alpha to numeric and discard rows where that fails.
    alpha_df2 = alpha_df.assign(
        alpha=pd.to_numeric(alpha_df["alpha"], errors="coerce")
    )
    alpha_df2 = alpha_df2.dropna(subset=["alpha"])
    plt.figure()
    for seed, sub in alpha_df2.groupby("seed", sort=True):
        sub = sub.sort_values("alpha")
        plt.plot(
            sub["alpha"],
            sub["mean_delta_margin_vs_baseline"],
            marker="o",
            label=f"seed={int(seed)}",
        )
    plt.xlabel("alpha")
    plt.ylabel("Mean Δmargin vs baseline (on flip-set)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(PLOT_DIR, "alpha_sweep_deltam.pdf"), dpi=300)
    plt.close()
126
+
127
# ---- Plot 5: Open-answer patched_self rescue (pair_logprob vs gen) ----
# One bar per open-answer row, labelled "<task>:<eval_mode>".
if "patched_self_rescued_pct" in oa_tbl.columns and len(oa_tbl) > 0:
    oa_plot = oa_tbl.assign(
        label=oa_tbl["task"].astype(str) + ":" + oa_tbl["eval_mode"].astype(str)
    )
    plt.figure()
    plt.bar(x=oa_plot["label"], height=oa_plot["patched_self_rescued_pct"])
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("Rescue% on flips (patched_self)")
    plt.tight_layout()
    plt.savefig(
        os.path.join(PLOT_DIR, "openanswer_patchedself_rescue.pdf"),
        dpi=300,
    )
    plt.close()
138
+
139
+ # ---- Markdown report ----
140
def df_to_md_table(dfx: pd.DataFrame, max_rows: int = 30) -> str:
    """Render a frame as a Markdown table, truncated to ``max_rows`` rows.

    Returns the placeholder ``"_(none)_"`` when ``dfx`` is ``None`` or has no
    rows. The index is never included in the output.
    """
    if dfx is None or len(dfx) == 0:
        return "_(none)_"
    shown = dfx.head(max_rows) if len(dfx) > max_rows else dfx
    return shown.to_markdown(index=False)
147
+
148
# Collect the report as a list of entries that are later joined with newlines.
# Entries ending in "\n" therefore produce a blank line after themselves.
lines = [
    "# Qwen subspace patching + flipset report\n",
    f"Generated from `{os.path.basename(SUMMARY_CSV)}` and `{os.path.basename(ALPHA_CSV)}`.\n",
    "## Overview\n",
    f"- Runs: {len(df)} total JSON summaries\n",
    f"- MC runs: {len(mc)}; Open-answer runs: {len(oa)}; Flipset runs: {len(fs)}\n",
    "## Key plots (PDF, dpi=300)\n",
]

# List only the plot files that were actually written to PLOT_DIR.
for fn in (
    "mc_patched0_rescue.pdf",
    "mc_controls_gap.pdf",
    "alpha_sweep_fliprate.pdf",
    "alpha_sweep_deltam.pdf",
    "openanswer_patchedself_rescue.pdf",
):
    if os.path.exists(os.path.join(PLOT_DIR, fn)):
        lines.append(f"- `{fn}`")
lines.append("\n")

# One section per result kind: heading, table, spacer.
for heading, table in (
    ("## Multiple-choice patchback (subspace_mc)\n", mc_tbl),
    ("## Open-answer patchback (openanswer)\n", oa_tbl),
    ("## Flipset transfer patching (flipset)\n", fs_tbl),
):
    lines.extend([heading, df_to_md_table(table), "\n"])

if len(alpha_df) > 0:
    lines.append("## Alpha sweep (flip-set)\n")
    # Show a compact subset (alpha = 0, 0.5, 0.75, 1.0) when any such rows
    # exist; otherwise fall back to every row with a numeric alpha.
    a = alpha_df.assign(alpha=pd.to_numeric(alpha_df["alpha"], errors="coerce"))
    a = a.dropna(subset=["alpha"])
    keep = a[a["alpha"].isin([0.0, 0.5, 0.75, 1.0])].copy()
    if len(keep) == 0:
        keep = a
    keep = keep.sort_values(["seed", "alpha"])
    preferred = [
        "file",
        "seed",
        "alpha",
        "n",
        "flip_rate",
        "ablated_acc",
        "mean_delta_margin_vs_baseline",
    ]
    cols = [c for c in preferred if c in keep.columns]
    lines.extend([df_to_md_table(keep[cols], max_rows=60), "\n"])
192
+
193
# Write the Markdown report. The handle is intentionally not named `f`: that
# name is the module-level formatting helper, and binding it here would
# replace the helper with a closed file object for the rest of the script.
with open(OUT_MD, "w", encoding="utf-8") as fh:
    fh.write("\n".join(lines))
195
+
196
+ # ---- LaTeX tables (quick export) ----
197
+ # We write two compact tables: MC summary + Open-answer summary
198
def to_latex_table(dfx: pd.DataFrame, caption: str, label: str) -> str:
    """Export a frame as a LaTeX table with the given caption and label.

    When ``dfx`` is ``None`` or has no rows, a two-line LaTeX comment naming
    the caption is returned instead of a table. Cell contents are escaped and
    the index is omitted.
    """
    has_rows = dfx is not None and len(dfx) > 0
    if has_rows:
        return dfx.to_latex(index=False, escape=True, caption=caption, label=label)
    return f"% {caption}\n% (empty)\n"
202
+
203
# Collect LaTeX fragments; they are joined with newlines when written out.
tex_lines = []
tex_lines.append("% Auto-generated LaTeX tables for Qwen results\n")
tex_lines.append("% Requires \\usepackage{booktabs}\n\n")

if len(mc_tbl) > 0:
    tex_lines.append(to_latex_table(
        mc_tbl,
        # Captions are passed through verbatim, so underscores must be written
        # as `\_`. The previous text `subspace\Apatch` produced an undefined
        # control sequence when the table was compiled.
        caption="Qwen: multiple-choice (subspace\\_patch) summary.",
        label="tab:qwen_mc_summary"
    ))
    tex_lines.append("\n")

if len(oa_tbl) > 0:
    tex_lines.append(to_latex_table(
        oa_tbl,
        caption="Qwen: open-answer (openanswer\\_subspace\\_patching) summary.",
        label="tab:qwen_openanswer_summary"
    ))
    tex_lines.append("\n")

if len(alpha_df) > 0:
    # Only the first 40 rows (ordered by seed, then alpha) are exported.
    tex_lines.append(to_latex_table(
        alpha_df.sort_values(["seed","alpha"]).head(40),
        caption="Qwen: alpha sweep (first 40 rows shown).",
        label="tab:qwen_alpha_sweep_head"
    ))
    tex_lines.append("\n")

# The handle is named `fh` rather than `f` so the module-level formatting
# helper `f` is not overwritten by a file object.
with open(OUT_TEX, "w", encoding="utf-8") as fh:
    fh.write("\n".join(tex_lines))
233
+
234
# Report where each output was written, one status line per artifact.
print(
    f"[OK] Wrote report: {OUT_MD}",
    f"[OK] Wrote LaTeX tables: {OUT_TEX}",
    f"[OK] Plots in: {PLOT_DIR}",
    sep="\n",
)
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/paper_table.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | kind | task | eval_mode | base_acc_scan | ablt_acc_scan | flips_scan | Patched@0 (rescue%, Δm) | Patched@full (rescue%, Δm) | Patched(self) (rescue%, Δm) | Patched(transfer) (rescue%, Δm) | Cross-example donor (rescue%, Δm) | Donor mismatch (rescue%, Δm) | Shared coeff permute (rescue%, Δm) | Shared coeff signflip (rescue%, Δm) | Rand vec in shared (rescue%, Δm) | Rand subspace (rescue%, Δm) | Nonshared patch (rescue%, Δm) |
2
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
3
+ | flipset | aqua | | 0.413 | 0.236 | 61 | | | | | | | | | | | |
4
+ | flipset | aqua | | 0.402 | 0.299 | 45 | | | | | | | | | | | |
5
+ | flipset | aqua | | 0.413 | 0.236 | 61 | | | 100.0%, 4.626 | 18.0%, -2.384 | | | | | | | |
6
+ | flipset | aqua | | 0.413 | 0.236 | 61 | | | 100.0%, 4.626 | 31.1%, 0.919 | | | | | | | |
7
+ | openanswer | gsm8k | gen_math | 0.035 | 0.039 | 8 | | | 0.0%, - | | 12.5%, - | | | | 12.5%, - | 0.0%, - | 0.0%, - |
8
+ | openanswer | gsm8k | pair_logprob | 0.801 | 0.762 | 15 | | | 93.3%, 4.365 | | 73.3%, 3.870 | | | | 20.0%, 0.357 | 13.3%, 0.209 | 6.7%, 0.067 |
9
+ | openanswer | humaneval | gen_code_compile | | | 0 | | | 5.3%, - | | 5.3%, - | | | | 5.3%, - | 0.0%, - | 0.0%, - |
10
+ | openanswer | humaneval | pair_logprob | 0.683 | 0.665 | 12 | | | 83.3%, 2.557 | | 50.0%, 1.671 | | | | 0.0%, -0.240 | 0.0%, -0.123 | 0.0%, -0.140 |
11
+ | subspace_mc | aqua | | 0.413 | 0.236 | 61 | 100.0%, 4.626 | 100.0%, 4.626 | | | 27.9%, 0.058 | | | | 32.8%, -0.133 | 31.1%, 0.133 | 0.0%, 0.000 |
12
+ | subspace_mc | arc_challenge | | 0.906 | 0.608 | 83 | 100.0%, 8.554 | 100.0%, 8.554 | | | 59.0%, 3.809 | | | | 27.7%, 0.258 | 33.7%, 0.832 | 0.0%, -0.000 |
13
+ | subspace_mc | commonsenseqa | | 0.867 | 0.645 | 64 | 100.0%, 7.356 | 100.0%, 7.356 | | | 46.9%, 0.967 | | | | 35.9%, 0.356 | 26.6%, -0.094 | 0.0%, -0.000 |
14
+ | subspace_mc | logiqa | | 0.473 | 0.418 | 34 | 100.0%, 6.486 | 100.0%, 6.486 | | | 47.1%, 1.852 | | | | 29.4%, 0.513 | 38.2%, 1.663 | 0.0%, -0.000 |
15
+ | subspace_mc | openbookqa | | 0.859 | 0.613 | 69 | 100.0%, 7.372 | 100.0%, 7.372 | | | 44.9%, 0.974 | | | | 31.9%, 0.118 | 40.6%, 0.760 | 0.0%, 0.000 |
16
+ | subspace_mc | piqa | | 0.871 | 0.707 | 53 | 100.0%, 6.291 | 100.0%, 6.291 | | | 92.5%, 5.667 | | | | 32.1%, -0.274 | 26.4%, -0.256 | 0.0%, -0.000 |
17
+ | subspace_mc | qasc | | 0.809 | 0.492 | 89 | 100.0%, 9.979 | 100.0%, 9.979 | | | 51.7%, 4.258 | | | | 61.8%, 4.160 | 66.3%, 4.673 | 0.0%, 0.000 |
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/alpha_sweep_deltam.pdf ADDED
Binary file (15.1 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/alpha_sweep_fliprate.pdf ADDED
Binary file (13.6 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/mc_controls_gap.pdf ADDED
Binary file (14.3 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/mc_patched0_rescue.pdf ADDED
Binary file (13.8 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/plots/openanswer_patchedself_rescue.pdf ADDED
Binary file (13.4 kB). View file
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/qwen_report.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Qwen subspace patching + flipset report
2
+
3
+ Generated from `summary.csv` and `alpha_sweep.csv`.
4
+
5
+ ## Overview
6
+
7
+ - Runs: 15 total JSON summaries
8
+
9
+ - MC runs: 7; Open-answer runs: 4; Flipset runs: 4
10
+
11
+ ## Key plots (PDF, dpi=300)
12
+
13
+ - `mc_patched0_rescue.pdf`
14
+ - `mc_controls_gap.pdf`
15
+ - `alpha_sweep_fliprate.pdf`
16
+ - `alpha_sweep_deltam.pdf`
17
+ - `openanswer_patchedself_rescue.pdf`
18
+
19
+
20
+ ## Multiple-choice patchback (subspace_mc)
21
+
22
+ | task | eval_mode | seed | base_acc_scan | ablt_acc_scan | flips_scan | patched_0_rescued_pct | patched_full_rescued_pct | control_time_shuffled_rescued_pct | control_shared_randvec_rescued_pct | control_rand_subspace_rescued_pct | control_patch_nonshared_rescued_pct |
23
+ |:--------------|------------:|-------:|----------------:|----------------:|-------------:|------------------------:|---------------------------:|------------------------------------:|-------------------------------------:|------------------------------------:|--------------------------------------:|
24
+ | aqua | nan | 123 | 0.413386 | 0.23622 | 61 | 100 | 100 | 27.8689 | 32.7869 | 31.1475 | 0 |
25
+ | arc_challenge | nan | 123 | 0.905882 | 0.607843 | 83 | 100 | 100 | 59.0361 | 27.7108 | 33.7349 | 0 |
26
+ | commonsenseqa | nan | 123 | 0.867188 | 0.644531 | 64 | 100 | 100 | 46.875 | 35.9375 | 26.5625 | 0 |
27
+ | logiqa | nan | 123 | 0.472656 | 0.417969 | 34 | 100 | 100 | 47.0588 | 29.4118 | 38.2353 | 0 |
28
+ | openbookqa | nan | 123 | 0.859375 | 0.613281 | 69 | 100 | 100 | 44.9275 | 31.8841 | 40.5797 | 0 |
29
+ | piqa | nan | 123 | 0.871094 | 0.707031 | 53 | 100 | 100 | 92.4528 | 32.0755 | 26.4151 | 0 |
30
+ | qasc | nan | 123 | 0.808594 | 0.492188 | 89 | 100 | 100 | 51.6854 | 61.7978 | 66.2921 | 0 |
31
+
32
+
33
+ ## Open-answer patchback (openanswer)
34
+
35
+ | task | eval_mode | seed | base_acc_scan | ablt_acc_scan | flips_scan | patched_self_rescued_pct | control_time_shuffled_rescued_pct | control_shared_randvec_rescued_pct | control_rand_subspace_rescued_pct | control_patch_nonshared_rescued_pct |
36
+ |:----------|:-----------------|-------:|----------------:|----------------:|-------------:|---------------------------:|------------------------------------:|-------------------------------------:|------------------------------------:|--------------------------------------:|
37
+ | gsm8k | gen_math | 123 | 0.0351562 | 0.0390625 | 8 | 0 | 12.5 | 12.5 | 0 | 0 |
38
+ | gsm8k | pair_logprob | 123 | 0.800781 | 0.761719 | 15 | 93.3333 | 73.3333 | 20 | 13.3333 | 6.66667 |
39
+ | humaneval | gen_code_compile | 123 | nan | nan | 0 | 5.26316 | 5.26316 | 5.26316 | 0 | 0 |
40
+ | humaneval | pair_logprob | 123 | 0.682927 | 0.664634 | 12 | 83.3333 | 50 | 0 | 0 | 0 |
41
+
42
+
43
+ ## Flipset transfer patching (flipset)
44
+
45
+ | file | seed | task | base_acc_scan | ablt_acc_scan | flips_scan | patched_self_rescued_pct | patched_transfer_rescued_pct |
46
+ |:----------------------------------------------------|-------:|:-------|----------------:|----------------:|-------------:|---------------------------:|-------------------------------:|
47
+ | aqua_alpha_sweep_seed123.json | 123 | aqua | 0.413386 | 0.23622 | 61 | nan | nan |
48
+ | aqua_transfer_cross_mc_baselinecorrect_seed123.json | 123 | aqua | 0.413386 | 0.23622 | 61 | 100 | 18.0328 |
49
+ | aqua_transfer_same_task_seed123.json | 123 | aqua | 0.413386 | 0.23622 | 61 | 100 | 31.1475 |
50
+ | aqua_alpha_sweep_seed456.json | 456 | aqua | 0.401575 | 0.299213 | 45 | nan | nan |
51
+
52
+
53
+ ## Alpha sweep (flip-set)
54
+
55
+ | file | seed | alpha | n | flip_rate | ablated_acc | mean_delta_margin_vs_baseline |
56
+ |:------------------------------|-------:|--------:|----:|------------:|--------------:|--------------------------------:|
57
+ | aqua_alpha_sweep_seed123.json | 123 | 0 | 61 | 0 | 1 | 0 |
58
+ | aqua_alpha_sweep_seed123.json | 123 | 0.5 | 61 | 0.163934 | 0.836066 | -1.14309 |
59
+ | aqua_alpha_sweep_seed123.json | 123 | 0.75 | 61 | 0.409836 | 0.590164 | -2.33868 |
60
+ | aqua_alpha_sweep_seed123.json | 123 | 1 | 61 | 1 | 0 | -4.62629 |
61
+ | aqua_alpha_sweep_seed456.json | 456 | 0 | 45 | 0 | 1 | 0 |
62
+ | aqua_alpha_sweep_seed456.json | 456 | 0.5 | 45 | 0.222222 | 0.777778 | -1.00896 |
63
+ | aqua_alpha_sweep_seed456.json | 456 | 0.75 | 45 | 0.444444 | 0.555556 | -1.98602 |
64
+ | aqua_alpha_sweep_seed456.json | 456 | 1 | 45 | 1 | 0 | -3.93168 |
65
+
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/qwen_tables.tex ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ % Auto-generated LaTeX tables for Qwen results
2
+
3
+ % Requires \usepackage{booktabs}
4
+
5
+
6
+ \begin{table}
7
+ \caption{Qwen: multiple-choice (subspace\_patch) summary.}
8
+ \label{tab:qwen_mc_summary}
9
+ \begin{tabular}{llrrrrrrrrrr}
10
+ \toprule
11
+ task & eval\_mode & seed & base\_acc\_scan & ablt\_acc\_scan & flips\_scan & patched\_0\_rescued\_pct & patched\_full\_rescued\_pct & control\_time\_shuffled\_rescued\_pct & control\_shared\_randvec\_rescued\_pct & control\_rand\_subspace\_rescued\_pct & control\_patch\_nonshared\_rescued\_pct \\
12
+ \midrule
13
+ aqua & NaN & 123 & 0.413386 & 0.236220 & 61 & 100.000000 & 100.000000 & 27.868852 & 32.786885 & 31.147541 & 0.000000 \\
14
+ arc\_challenge & NaN & 123 & 0.905882 & 0.607843 & 83 & 100.000000 & 100.000000 & 59.036145 & 27.710843 & 33.734940 & 0.000000 \\
15
+ commonsenseqa & NaN & 123 & 0.867188 & 0.644531 & 64 & 100.000000 & 100.000000 & 46.875000 & 35.937500 & 26.562500 & 0.000000 \\
16
+ logiqa & NaN & 123 & 0.472656 & 0.417969 & 34 & 100.000000 & 100.000000 & 47.058824 & 29.411765 & 38.235294 & 0.000000 \\
17
+ openbookqa & NaN & 123 & 0.859375 & 0.613281 & 69 & 100.000000 & 100.000000 & 44.927536 & 31.884058 & 40.579710 & 0.000000 \\
18
+ piqa & NaN & 123 & 0.871094 & 0.707031 & 53 & 100.000000 & 100.000000 & 92.452830 & 32.075472 & 26.415094 & 0.000000 \\
19
+ qasc & NaN & 123 & 0.808594 & 0.492188 & 89 & 100.000000 & 100.000000 & 51.685393 & 61.797753 & 66.292135 & 0.000000 \\
20
+ \bottomrule
21
+ \end{tabular}
22
+ \end{table}
23
+
24
+
25
+
26
+ \begin{table}
27
+ \caption{Qwen: open-answer (openanswer\_subspace\_patching) summary.}
28
+ \label{tab:qwen_openanswer_summary}
29
+ \begin{tabular}{llrrrrrrrrr}
30
+ \toprule
31
+ task & eval\_mode & seed & base\_acc\_scan & ablt\_acc\_scan & flips\_scan & patched\_self\_rescued\_pct & control\_time\_shuffled\_rescued\_pct & control\_shared\_randvec\_rescued\_pct & control\_rand\_subspace\_rescued\_pct & control\_patch\_nonshared\_rescued\_pct \\
32
+ \midrule
33
+ gsm8k & gen\_math & 123 & 0.035156 & 0.039062 & 8 & 0.000000 & 12.500000 & 12.500000 & 0.000000 & 0.000000 \\
34
+ gsm8k & pair\_logprob & 123 & 0.800781 & 0.761719 & 15 & 93.333333 & 73.333333 & 20.000000 & 13.333333 & 6.666667 \\
35
+ humaneval & gen\_code\_compile & 123 & NaN & NaN & 0 & 5.263158 & 5.263158 & 5.263158 & 0.000000 & 0.000000 \\
36
+ humaneval & pair\_logprob & 123 & 0.682927 & 0.664634 & 12 & 83.333333 & 50.000000 & 0.000000 & 0.000000 & 0.000000 \\
37
+ \bottomrule
38
+ \end{tabular}
39
+ \end{table}
40
+
41
+
42
+
43
+ \begin{table}
44
+ \caption{Qwen: alpha sweep (first 40 rows shown).}
45
+ \label{tab:qwen_alpha_sweep_head}
46
+ \begin{tabular}{llrrrrrrrrr}
47
+ \toprule
48
+ file & task & layer & seed & alpha & n & flip\_rate & ablated\_acc & pred\_change\_rate & mean\_margin & mean\_delta\_margin\_vs\_baseline \\
49
+ \midrule
50
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.000000 & 61 & 0.000000 & 1.000000 & 0.000000 & 2.860918 & 0.000000 \\
51
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.020000 & 61 & 0.016393 & 0.983607 & 0.016393 & 2.832013 & -0.028906 \\
52
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.050000 & 61 & 0.016393 & 0.983607 & 0.016393 & 2.787427 & -0.073491 \\
53
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.100000 & 61 & 0.032787 & 0.967213 & 0.032787 & 2.709940 & -0.150978 \\
54
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.200000 & 61 & 0.049180 & 0.950820 & 0.049180 & 2.536606 & -0.324313 \\
55
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.300000 & 61 & 0.114754 & 0.885246 & 0.114754 & 2.322977 & -0.537942 \\
56
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.500000 & 61 & 0.163934 & 0.836066 & 0.163934 & 1.717831 & -1.143088 \\
57
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 0.750000 & 61 & 0.409836 & 0.590164 & 0.409836 & 0.522240 & -2.338679 \\
58
+ aqua\_alpha\_sweep\_seed123.json & aqua & 24 & 123 & 1.000000 & 61 & 1.000000 & 0.000000 & 1.000000 & -1.765370 & -4.626290 \\
59
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 0.000000 & 45 & 0.000000 & 1.000000 & 0.000000 & 2.238931 & 0.000000 \\
60
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 0.050000 & 45 & 0.000000 & 1.000000 & 0.000000 & 2.171455 & -0.067476 \\
61
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 0.100000 & 45 & 0.022222 & 0.977778 & 0.022222 & 2.097202 & -0.141729 \\
62
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 0.200000 & 45 & 0.022222 & 0.977778 & 0.022222 & 1.929797 & -0.309134 \\
63
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 0.300000 & 45 & 0.066667 & 0.933333 & 0.066667 & 1.736192 & -0.502739 \\
64
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 0.500000 & 45 & 0.222222 & 0.777778 & 0.222222 & 1.229969 & -1.008963 \\
65
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 0.750000 & 45 & 0.444444 & 0.555556 & 0.444444 & 0.252909 & -1.986022 \\
66
+ aqua\_alpha\_sweep\_seed456.json & aqua & 24 & 456 & 1.000000 & 45 & 1.000000 & 0.000000 & 1.000000 & -1.692752 & -3.931684 \\
67
+ \bottomrule
68
+ \end{tabular}
69
+ \end{table}
70
+
71
+
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/summary.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ kind,file,task,eval_mode,layer,seed,hf_id,hf_split,candidate_labels,Qs_shape,patch_desc,donor_source,donor_tasks,donor_pick,n_donor_bank,scan_effective,scan_skipped,base_acc_scan,ablt_acc_scan,flips_scan,anti_flips_scan,both_correct_scan,both_wrong_scan,patched_primary_method,patched_primary_rescued_pct,patched_primary_mean_dmargin,diff_time_shuffled_minus_patched_primary_rescued_pct,diff_patched_primary_minus_shared_randvec_rescued_pct,patched_0_rescued,patched_0_n,patched_0_rescued_pct,patched_0_mean_dmargin,patched_01_rescued,patched_01_n,patched_01_rescued_pct,patched_01_mean_dmargin,patched_full_rescued,patched_full_n,patched_full_rescued_pct,patched_full_mean_dmargin,patched_self_rescued,patched_self_n,patched_self_rescued_pct,patched_self_mean_dmargin,patched_transfer_rescued,patched_transfer_n,patched_transfer_rescued_pct,patched_transfer_mean_dmargin,control_time_shuffled_rescued,control_time_shuffled_n,control_time_shuffled_rescued_pct,control_time_shuffled_mean_dmargin,control_shared_mismatch_rescued,control_shared_mismatch_n,control_shared_mismatch_rescued_pct,control_shared_mismatch_mean_dmargin,control_shared_perm_rescued,control_shared_perm_n,control_shared_perm_rescued_pct,control_shared_perm_mean_dmargin,control_shared_signflip_rescued,control_shared_signflip_n,control_shared_signflip_rescued_pct,control_shared_signflip_mean_dmargin,control_shared_randvec_rescued,control_shared_randvec_n,control_shared_randvec_rescued_pct,control_shared_randvec_mean_dmargin,control_rand_subspace_rescued,control_rand_subspace_n,control_rand_subspace_rescued_pct,control_rand_subspace_mean_dmargin,control_patch_nonshared_rescued,control_patch_nonshared_n,control_patch_nonshared_rescued_pct,control_patch_nonshared_mean_dmargin
2
+ flipset,aqua_alpha_sweep_seed123.json,aqua,,24,123,,,ABCDE,3584x223,steps=0,,,,,254,0,0.41338582677165353,0.23622047244094488,61,16,44,133,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3
+ flipset,aqua_alpha_sweep_seed456.json,aqua,,24,456,,,ABCDE,3584x221,steps=0,,,,,254,0,0.4015748031496063,0.2992125984251969,45,19,57,133,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4
+ flipset,aqua_transfer_cross_mc_baselinecorrect_seed123.json,aqua,,24,123,,,ABCDE,3584x223,steps=0,cross_task_eval,"commonsenseqa,openbookqa",random,256,254,0,0.41338582677165353,0.23622047244094488,61,16,44,133,patched_self,100.0,4.6262898445129395,,,,,,,,,,,,,,,61,61,100.0,4.6262898445129395,11,61,18.0327868852459,-2.3844683170318604,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5
+ flipset,aqua_transfer_same_task_seed123.json,aqua,,24,123,,,ABCDE,3584x223,steps=0,same_task_eval,aqua,random,254,254,0,0.41338582677165353,0.23622047244094488,61,16,44,133,patched_self,100.0,4.6262898445129395,,,,,,,,,,,,,,,61,61,100.0,4.6262898445129395,19,61,31.147540983606557,0.9193577766418457,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6
+ openanswer,gsm8k_genmath.json,gsm8k,gen_math,24,123,gsm8k/main,,,3584x223,"steps=0,1,2,3",,,,,256,0,0.03515625,0.0390625,8,9,1,238,patched_self,0.0,,12.5,-12.5,,,,,,,,,,,,,0,8,0.0,,,,,,1,8,12.5,,,,,,,,,,,,,,1,8,12.5,,0,8,0.0,,0,8,0.0,
7
+ openanswer,gsm8k_pairlogprob.json,gsm8k,pair_logprob,24,123,gsm8k/main,,,3584x223,"steps=0,1,2,3",,,,,256,0,0.80078125,0.76171875,15,5,190,46,patched_self,93.33333333333333,4.364808082580566,-20.0,73.33333333333333,,,,,,,,,,,,,14,15,93.33333333333333,4.364808082580566,,,,,11,15,73.33333333333333,3.8701834678649902,,,,,,,,,,,,,3,15,20.0,0.35714292526245117,2,15,13.333333333333334,0.2093137800693512,1,15,6.666666666666667,0.06698532402515411
8
+ openanswer,humaneval_gencode_compile.json,humaneval,gen_code_compile,24,123,openai_humaneval,test,,3584x223,"steps=0,1,2,3",,,,,0,164,,,0,0,0,0,patched_self,5.2631578947368425,,0.0,0.0,,,,,,,,,,,,,1,19,5.2631578947368425,,,,,,1,19,5.2631578947368425,,,,,,,,,,,,,,1,19,5.2631578947368425,,0,19,0.0,,0,19,0.0,
9
+ openanswer,humaneval_pairlogprob.json,humaneval,pair_logprob,24,123,openai_humaneval,test,,3584x223,"steps=0,1,2,3",,,,,164,0,0.6829268292682927,0.6646341463414634,12,9,100,43,patched_self,83.33333333333333,2.5565900802612305,-33.33333333333333,83.33333333333333,,,,,,,,,,,,,10,12,83.33333333333333,2.5565900802612305,,,,,6,12,50.0,1.6708040237426758,,,,,,,,,,,,,0,12,0.0,-0.2395162135362625,0,12,0.0,-0.12262799590826035,0,12,0.0,-0.13950853049755096
10
+ subspace_mc,aqua.json,aqua,,24,123,,,ABCDE,3584x223,,,,,,254,0,0.41338582677165353,0.23622047244094488,61,16,44,133,patched_0,100.0,4.626289450731433,-72.1311475409836,67.21311475409837,61,61,100.0,4.626289450731433,61,61,100.0,4.626289450731433,61,61,100.0,4.626289450731433,,,,,,,,,17,61,27.868852459016395,0.05754191758202725,,,,,,,,,,,,,20,61,32.78688524590164,-0.13294745664127539,19,61,31.147540983606557,0.1326175615435741,0,61,0.0,1.0748378566054047e-07
11
+ subspace_mc,arc_challenge.json,arc_challenge,,24,123,,,ABCD,3584x223,,,,,,255,1,0.9058823529411765,0.6078431372549019,83,7,148,17,patched_0,100.0,8.554246582180621,-40.963855421686745,72.28915662650603,83,83,100.0,8.554246582180621,83,83,100.0,8.554246582180621,83,83,100.0,8.554246582180621,,,,,,,,,49,83,59.036144578313255,3.808587062789733,,,,,,,,,,,,,23,83,27.710843373493976,0.2576807780438159,28,83,33.734939759036145,0.8321906486189509,0,83,0.0,-1.2179455125188252e-06
12
+ subspace_mc,commonsenseqa.json,commonsenseqa,,24,123,,,ABCDE,3584x223,,,,,,256,0,0.8671875,0.64453125,64,7,158,27,patched_0,100.0,7.355989446863532,-53.125,64.0625,64,64,100.0,7.355989446863532,64,64,100.0,7.355989446863532,64,64,100.0,7.355989446863532,,,,,,,,,30,64,46.875,0.9671716894954443,,,,,,,,,,,,,23,64,35.9375,0.356116883456707,17,64,26.5625,-0.09415088221430779,0,64,0.0,-4.6566128730773926e-07
13
+ subspace_mc,logiqa.json,logiqa,,24,123,,,ABCD,3584x223,,,,,,256,0,0.47265625,0.41796875,34,20,87,115,patched_0,100.0,6.486427613917519,-52.94117647058823,70.58823529411765,34,34,100.0,6.486427613917519,34,34,100.0,6.486427613917519,34,34,100.0,6.486427613917519,,,,,,,,,16,34,47.05882352941177,1.8523672047783346,,,,,,,,,,,,,10,34,29.41176470588235,0.5126809863483205,13,34,38.23529411764706,1.6626762362087475,0,34,0.0,-6.731818704044118e-07
14
+ subspace_mc,openbookqa.json,openbookqa,,24,123,,,ABCD,3584x223,,,,,,256,0,0.859375,0.61328125,69,6,151,30,patched_0,100.0,7.371976157893306,-55.072463768115945,68.1159420289855,69,69,100.0,7.371976157893306,69,69,100.0,7.371976157893306,69,69,100.0,7.371976157893306,,,,,,,,,31,69,44.927536231884055,0.9741012505863024,,,,,,,,,,,,,22,69,31.884057971014492,0.11762981483901756,28,69,40.57971014492754,0.7597268353337827,0,69,0.0,1.603278560914855e-06
15
+ subspace_mc,piqa.json,piqa,,24,123,,,AB,3584x223,,,,,,256,0,0.87109375,0.70703125,53,11,170,22,patched_0,100.0,6.2906723629753545,-7.547169811320757,67.9245283018868,53,53,100.0,6.2906723629753545,53,53,100.0,6.2906723629753545,53,53,100.0,6.2906723629753545,,,,,,,,,49,53,92.45283018867924,5.667159054639204,,,,,,,,,,,,,17,53,32.075471698113205,-0.2743093382637456,14,53,26.41509433962264,-0.2557831530301076,0,53,0.0,-1.4934899672022407e-06
16
+ subspace_mc,qasc.json,qasc,,24,123,,,ABCDEFGH,3584x223,,,,,,256,0,0.80859375,0.4921875,89,8,118,41,patched_0,100.0,9.979466939240359,-48.31460674157304,38.20224719101124,89,89,100.0,9.979466939240359,89,89,100.0,9.979466939240359,89,89,100.0,9.979466939240359,,,,,,,,,46,89,51.68539325842696,4.2575247676185,,,,,,,,,,,,,55,89,61.79775280898876,4.159921297866307,59,89,66.29213483146067,4.673449364940772,0,89,0.0,5.598818318227704e-07
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/_summary/summary.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ | kind | task | eval_mode | file | base_acc_scan | ablt_acc_scan | flips_scan | patched_primary_method | patched_primary_rescued_pct | control_time_shuffled_rescued_pct | control_shared_randvec_rescued_pct | control_rand_subspace_rescued_pct | control_patch_nonshared_rescued_pct |
2
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
3
+ | flipset | aqua | | aqua_alpha_sweep_seed123.json | 0.413 | 0.236 | 61 | | | | | | |
4
+ | flipset | aqua | | aqua_alpha_sweep_seed456.json | 0.402 | 0.299 | 45 | | | | | | |
5
+ | flipset | aqua | | aqua_transfer_cross_mc_baselinecorrect_seed123.json | 0.413 | 0.236 | 61 | patched_self | 100.0 | | | | |
6
+ | flipset | aqua | | aqua_transfer_same_task_seed123.json | 0.413 | 0.236 | 61 | patched_self | 100.0 | | | | |
7
+ | openanswer | gsm8k | gen_math | gsm8k_genmath.json | 0.035 | 0.039 | 8 | patched_self | 0.0 | 12.5 | 12.5 | 0.0 | 0.0 |
8
+ | openanswer | gsm8k | pair_logprob | gsm8k_pairlogprob.json | 0.801 | 0.762 | 15 | patched_self | 93.3 | 73.3 | 20.0 | 13.3 | 6.7 |
9
+ | openanswer | humaneval | gen_code_compile | humaneval_gencode_compile.json | | | 0 | patched_self | 5.3 | 5.3 | 5.3 | 0.0 | 0.0 |
10
+ | openanswer | humaneval | pair_logprob | humaneval_pairlogprob.json | 0.683 | 0.665 | 12 | patched_self | 83.3 | 50.0 | 0.0 | 0.0 | 0.0 |
11
+ | subspace_mc | aqua | | aqua.json | 0.413 | 0.236 | 61 | patched_0 | 100.0 | 27.9 | 32.8 | 31.1 | 0.0 |
12
+ | subspace_mc | arc_challenge | | arc_challenge.json | 0.906 | 0.608 | 83 | patched_0 | 100.0 | 59.0 | 27.7 | 33.7 | 0.0 |
13
+ | subspace_mc | commonsenseqa | | commonsenseqa.json | 0.867 | 0.645 | 64 | patched_0 | 100.0 | 46.9 | 35.9 | 26.6 | 0.0 |
14
+ | subspace_mc | logiqa | | logiqa.json | 0.473 | 0.418 | 34 | patched_0 | 100.0 | 47.1 | 29.4 | 38.2 | 0.0 |
15
+ | subspace_mc | openbookqa | | openbookqa.json | 0.859 | 0.613 | 69 | patched_0 | 100.0 | 44.9 | 31.9 | 40.6 | 0.0 |
16
+ | subspace_mc | piqa | | piqa.json | 0.871 | 0.707 | 53 | patched_0 | 100.0 | 92.5 | 32.1 | 26.4 | 0.0 |
17
+ | subspace_mc | qasc | | qasc.json | 0.809 | 0.492 | 89 | patched_0 | 100.0 | 51.7 | 61.8 | 66.3 | 0.0 |
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/compute_Qs_seed123.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/compute_Qs_seed456.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/flipset/aqua_alpha_sweep_seed123.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/flipset/aqua_alpha_sweep_seed456.json ADDED
The diff for this file is too large to render. See raw diff
 
artifacts/patch_back/results/Qwen/Qwen2.5-7B-Instruct/layer24/flipset/aqua_transfer_cross_mc_baselinecorrect_seed123.json ADDED
The diff for this file is too large to render. See raw diff