connaaa commited on
Commit
5f2451e
·
verified ·
1 Parent(s): e65783c

Phase 5 release: 7 TopK SAEs + specificity / null-steering JSON artifacts

Browse files
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.pt filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: sae_lens
4
+ tags:
5
+ - interpretability
6
+ - sparse-autoencoder
7
+ - sae
8
+ - mechanistic-interpretability
9
+ - topk-sae
10
+ ---
11
+
12
+ # InterpGPT — Phase 5 TopK SAEs
13
+
14
+ Seven sparse autoencoders trained on the residual stream
15
+ (`hook_resid_post`) of the two Phase 1 InterpGPT models
16
+ ([`interpgpt-standard-23M`](https://huggingface.co/connaaa/interpgpt-standard-23M),
17
+ [`interpgpt-adhd-23M`](https://huggingface.co/connaaa/interpgpt-adhd-23M)).
18
+
19
+ | Model | Layer | Hook | Subdir |
20
+ |---|---|---|---|
21
+ | standard | 0 | hook_resid_post | `standard_L0_hook_resid_post/` |
22
+ | standard | 1 | hook_resid_post | `standard_L1_hook_resid_post/` |
23
+ | standard | 2 | hook_resid_post | `standard_L2_hook_resid_post/` |
24
+ | standard | 3 | hook_resid_post | `standard_L3_hook_resid_post/` |
25
+ | adhd | 1 | hook_resid_post | `adhd_L1_hook_resid_post/` |
26
+ | adhd | 2 | hook_resid_post | `adhd_L2_hook_resid_post/` |
27
+ | adhd | 3 | hook_resid_post | `adhd_L3_hook_resid_post/` |
28
+
29
+ ## Training setup
30
+
31
+ - Library: [`sae_lens`](https://github.com/jbloomAus/SAELens) TopK training SAE
32
+ - `k = 40`, `d_sae = 4096`
33
+ - All 7 SAEs pass quality gates: FVE 0.87–0.92, dead features < 2% on training data (held-out dead fraction reaches 2.2% for `adhd_L3`; see `feature_diff.json`)
34
+
35
+ ## Phase 5 result artifacts (included)
36
+
37
+ - `feature_diff.json` — 312 ADHD-L2 features firing at step-onset that the
38
+ standard model lacks. Feature 2504 highlighted (2000× cross-model asymmetry).
39
+ - `causal_nulls_per_seed.json` — 4-seed causal ablation nulls for the L3 swap.
40
+ - `deepdive_steering.json` — feature 2504 four-panel steering results (all four
41
+ interventions Δ within ±0.025 of null, below 2 SEM).
42
+ - `three_probes.json` — three-probe causal-check outputs.
43
+
44
+ ## Loading
45
+
46
+ ### Minimal
47
+
48
+ ```python
49
+ from huggingface_hub import snapshot_download
50
+ from sae_lens import SAE
51
+
52
+ repo = "connaaa/interpgpt-sae-phase5"
53
+ local = snapshot_download(repo_id=repo, allow_patterns=["adhd_L2_hook_resid_post/*"])
54
+ sae = SAE.load_from_disk(f"{local}/adhd_L2_hook_resid_post")
55
+ print(sae)
56
+ ```
57
+
58
+ ### Pull everything
59
+
60
+ ```python
61
+ from huggingface_hub import snapshot_download
62
+ local = snapshot_download(repo_id="connaaa/interpgpt-sae-phase5")
63
+ ```
64
+
65
+ ## Reproducibility
66
+
67
+ Training script: `phase5_sae.py` in
68
+ [github.com/cwklurks/interpgpt](https://github.com/cwklurks/interpgpt).
69
+ Production driver: `phase5_production.py`. Four-panel steering harness:
70
+ `phase5_steering_ci.py`.
71
+
72
+ ## License
73
+
74
+ MIT.
adhd_L1_hook_resid_post/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "adhd",
3
+ "layer": 1,
4
+ "d_sae": 4096,
5
+ "k": 40,
6
+ "normalize_activations": "expected_average_only_in",
7
+ "n_tokens": 10000000,
8
+ "batch_size": 4096,
9
+ "lr": 0.0003,
10
+ "final_training": {
11
+ "step": 2441,
12
+ "loss": 3.830482244491577,
13
+ "l0": 40.0,
14
+ "fve": 0.9020028367114843,
15
+ "n_dead": 5,
16
+ "mse": 0.0073381345719099045
17
+ },
18
+ "dead_pct_heldout": 1.3671875
19
+ }
adhd_L1_hook_resid_post/history.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 200,
4
+ "loss": 7.214208602905273,
5
+ "l0": 40.0,
6
+ "fve": 0.8173658289454013,
7
+ "n_dead": 0,
8
+ "mse": 0.014090251177549362
9
+ },
10
+ {
11
+ "step": 400,
12
+ "loss": 11.098737716674805,
13
+ "l0": 40.0,
14
+ "fve": 0.8590923953729561,
15
+ "n_dead": 488,
16
+ "mse": 0.011645233258605003
17
+ },
18
+ {
19
+ "step": 600,
20
+ "loss": 7.969749450683594,
21
+ "l0": 40.0,
22
+ "fve": 0.8684529110409115,
23
+ "n_dead": 148,
24
+ "mse": 0.010252815671265125
25
+ },
26
+ {
27
+ "step": 800,
28
+ "loss": 5.30736780166626,
29
+ "l0": 40.0,
30
+ "fve": 0.8735112871918154,
31
+ "n_dead": 18,
32
+ "mse": 0.00969421211630106
33
+ },
34
+ {
35
+ "step": 1000,
36
+ "loss": 4.6798553466796875,
37
+ "l0": 40.0,
38
+ "fve": 0.8860032616097528,
39
+ "n_dead": 4,
40
+ "mse": 0.0090001430362463
41
+ },
42
+ {
43
+ "step": 1200,
44
+ "loss": 4.3763346672058105,
45
+ "l0": 40.0,
46
+ "fve": 0.8946085205816868,
47
+ "n_dead": 1,
48
+ "mse": 0.008514291606843472
49
+ },
50
+ {
51
+ "step": 1400,
52
+ "loss": 4.231931209564209,
53
+ "l0": 40.0,
54
+ "fve": 0.8952190554998088,
55
+ "n_dead": 0,
56
+ "mse": 0.008265490643680096
57
+ },
58
+ {
59
+ "step": 1600,
60
+ "loss": 4.067999839782715,
61
+ "l0": 40.0,
62
+ "fve": 0.8984853600784243,
63
+ "n_dead": 0,
64
+ "mse": 0.007945312187075615
65
+ },
66
+ {
67
+ "step": 1800,
68
+ "loss": 3.9912521839141846,
69
+ "l0": 40.0,
70
+ "fve": 0.897026286041148,
71
+ "n_dead": 0,
72
+ "mse": 0.007795413956046104
73
+ },
74
+ {
75
+ "step": 2000,
76
+ "loss": 3.987309694290161,
77
+ "l0": 40.0,
78
+ "fve": 0.9006864810283836,
79
+ "n_dead": 1,
80
+ "mse": 0.007757413201034069
81
+ },
82
+ {
83
+ "step": 2200,
84
+ "loss": 3.8046491146087646,
85
+ "l0": 40.0,
86
+ "fve": 0.9039961047080753,
87
+ "n_dead": 1,
88
+ "mse": 0.007402045652270317
89
+ },
90
+ {
91
+ "step": 2400,
92
+ "loss": 3.7656211853027344,
93
+ "l0": 40.0,
94
+ "fve": 0.9085856363510846,
95
+ "n_dead": 4,
96
+ "mse": 0.00724159087985754
97
+ },
98
+ {
99
+ "step": 2441,
100
+ "loss": 3.830482244491577,
101
+ "l0": 40.0,
102
+ "fve": 0.9020028367114843,
103
+ "n_dead": 5,
104
+ "mse": 0.0073381345719099045
105
+ }
106
+ ]
adhd_L1_hook_resid_post/sae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bfe52fb2ee7e5992b148d7aecd779fa71dfe2ebfd701619c93b3ebe01c7db11
3
+ size 16798005
adhd_L2_hook_resid_post/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "adhd",
3
+ "layer": 2,
4
+ "d_sae": 4096,
5
+ "k": 40,
6
+ "normalize_activations": "expected_average_only_in",
7
+ "n_tokens": 10000000,
8
+ "batch_size": 4096,
9
+ "lr": 0.0003,
10
+ "final_training": {
11
+ "step": 2441,
12
+ "loss": 7.990331649780273,
13
+ "l0": 40.0,
14
+ "fve": 0.8937416797380295,
15
+ "n_dead": 1,
16
+ "mse": 0.015545391477644444
17
+ },
18
+ "dead_pct_heldout": 1.46484375
19
+ }
adhd_L2_hook_resid_post/history.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 200,
4
+ "loss": 14.795553207397461,
5
+ "l0": 40.0,
6
+ "fve": 0.8293616575146343,
7
+ "n_dead": 0,
8
+ "mse": 0.028897564858198166
9
+ },
10
+ {
11
+ "step": 400,
12
+ "loss": 22.638526916503906,
13
+ "l0": 40.0,
14
+ "fve": 0.8359017185027313,
15
+ "n_dead": 663,
16
+ "mse": 0.023937705904245377
17
+ },
18
+ {
19
+ "step": 600,
20
+ "loss": 19.389955520629883,
21
+ "l0": 40.0,
22
+ "fve": 0.8645987532704418,
23
+ "n_dead": 222,
24
+ "mse": 0.021326521411538124
25
+ },
26
+ {
27
+ "step": 800,
28
+ "loss": 11.63227653503418,
29
+ "l0": 40.0,
30
+ "fve": 0.8760963178028377,
31
+ "n_dead": 35,
32
+ "mse": 0.02005489356815815
33
+ },
34
+ {
35
+ "step": 1000,
36
+ "loss": 9.950961112976074,
37
+ "l0": 40.0,
38
+ "fve": 0.8784041379676274,
39
+ "n_dead": 8,
40
+ "mse": 0.018850065767765045
41
+ },
42
+ {
43
+ "step": 1200,
44
+ "loss": 9.30925178527832,
45
+ "l0": 40.0,
46
+ "fve": 0.8774098117815495,
47
+ "n_dead": 2,
48
+ "mse": 0.018041376024484634
49
+ },
50
+ {
51
+ "step": 1400,
52
+ "loss": 8.932659149169922,
53
+ "l0": 40.0,
54
+ "fve": 0.8964732688104713,
55
+ "n_dead": 1,
56
+ "mse": 0.01737871766090393
57
+ },
58
+ {
59
+ "step": 1600,
60
+ "loss": 8.559926986694336,
61
+ "l0": 40.0,
62
+ "fve": 0.8907664732630304,
63
+ "n_dead": 0,
64
+ "mse": 0.016718603670597076
65
+ },
66
+ {
67
+ "step": 1800,
68
+ "loss": 8.413354873657227,
69
+ "l0": 40.0,
70
+ "fve": 0.8881362105445537,
71
+ "n_dead": 0,
72
+ "mse": 0.01643233373761177
73
+ },
74
+ {
75
+ "step": 2000,
76
+ "loss": 8.195343017578125,
77
+ "l0": 40.0,
78
+ "fve": 0.8944998626527212,
79
+ "n_dead": 1,
80
+ "mse": 0.015944253653287888
81
+ },
82
+ {
83
+ "step": 2200,
84
+ "loss": 8.069483757019043,
85
+ "l0": 40.0,
86
+ "fve": 0.9000171435610246,
87
+ "n_dead": 1,
88
+ "mse": 0.01569938287138939
89
+ },
90
+ {
91
+ "step": 2400,
92
+ "loss": 7.750567436218262,
93
+ "l0": 40.0,
94
+ "fve": 0.9054223585460653,
95
+ "n_dead": 0,
96
+ "mse": 0.015137827023863792
97
+ },
98
+ {
99
+ "step": 2441,
100
+ "loss": 7.990331649780273,
101
+ "l0": 40.0,
102
+ "fve": 0.8937416797380295,
103
+ "n_dead": 1,
104
+ "mse": 0.015545391477644444
105
+ }
106
+ ]
adhd_L2_hook_resid_post/sae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:725267a3fca5866bcf9c8dcab2fd77cddd7b27ecc42cb631250395d36cbcf62a
3
+ size 16798005
adhd_L3_hook_resid_post/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "adhd",
3
+ "layer": 3,
4
+ "d_sae": 4096,
5
+ "k": 40,
6
+ "normalize_activations": "expected_average_only_in",
7
+ "n_tokens": 10000000,
8
+ "batch_size": 4096,
9
+ "lr": 0.0003,
10
+ "final_training": {
11
+ "step": 2441,
12
+ "loss": 16.80815315246582,
13
+ "l0": 40.0,
14
+ "fve": 0.8691270860953982,
15
+ "n_dead": 0,
16
+ "mse": 0.032828424125909805
17
+ },
18
+ "dead_pct_heldout": 2.2216796875
19
+ }
adhd_L3_hook_resid_post/history.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 200,
4
+ "loss": 29.05353546142578,
5
+ "l0": 40.0,
6
+ "fve": 0.7888412205173247,
7
+ "n_dead": 0,
8
+ "mse": 0.05674518644809723
9
+ },
10
+ {
11
+ "step": 400,
12
+ "loss": 44.51063537597656,
13
+ "l0": 40.0,
14
+ "fve": 0.828121429866145,
15
+ "n_dead": 769,
16
+ "mse": 0.04685475304722786
17
+ },
18
+ {
19
+ "step": 600,
20
+ "loss": 40.60480880737305,
21
+ "l0": 40.0,
22
+ "fve": 0.8353118397308505,
23
+ "n_dead": 239,
24
+ "mse": 0.04323723167181015
25
+ },
26
+ {
27
+ "step": 800,
28
+ "loss": 22.681713104248047,
29
+ "l0": 40.0,
30
+ "fve": 0.8478801929634973,
31
+ "n_dead": 29,
32
+ "mse": 0.03988751769065857
33
+ },
34
+ {
35
+ "step": 1000,
36
+ "loss": 19.90431785583496,
37
+ "l0": 40.0,
38
+ "fve": 0.8681096795485392,
39
+ "n_dead": 4,
40
+ "mse": 0.03827929496765137
41
+ },
42
+ {
43
+ "step": 1200,
44
+ "loss": 19.122100830078125,
45
+ "l0": 40.0,
46
+ "fve": 0.8613335049168376,
47
+ "n_dead": 2,
48
+ "mse": 0.037058740854263306
49
+ },
50
+ {
51
+ "step": 1400,
52
+ "loss": 18.61587142944336,
53
+ "l0": 40.0,
54
+ "fve": 0.854929133645014,
55
+ "n_dead": 2,
56
+ "mse": 0.03607767075300217
57
+ },
58
+ {
59
+ "step": 1600,
60
+ "loss": 17.96778106689453,
61
+ "l0": 40.0,
62
+ "fve": 0.8535506236950299,
63
+ "n_dead": 1,
64
+ "mse": 0.03495676815509796
65
+ },
66
+ {
67
+ "step": 1800,
68
+ "loss": 17.5817928314209,
69
+ "l0": 40.0,
70
+ "fve": 0.8671142899286272,
71
+ "n_dead": 1,
72
+ "mse": 0.03420582413673401
73
+ },
74
+ {
75
+ "step": 2000,
76
+ "loss": 17.37525749206543,
77
+ "l0": 40.0,
78
+ "fve": 0.8686602042934782,
79
+ "n_dead": 2,
80
+ "mse": 0.03367304056882858
81
+ },
82
+ {
83
+ "step": 2200,
84
+ "loss": 16.68439483642578,
85
+ "l0": 40.0,
86
+ "fve": 0.877142112507239,
87
+ "n_dead": 1,
88
+ "mse": 0.03245990723371506
89
+ },
90
+ {
91
+ "step": 2400,
92
+ "loss": 16.43060874938965,
93
+ "l0": 40.0,
94
+ "fve": 0.8754777867952352,
95
+ "n_dead": 0,
96
+ "mse": 0.03209103271365166
97
+ },
98
+ {
99
+ "step": 2441,
100
+ "loss": 16.80815315246582,
101
+ "l0": 40.0,
102
+ "fve": 0.8691270860953982,
103
+ "n_dead": 0,
104
+ "mse": 0.032828424125909805
105
+ }
106
+ ]
adhd_L3_hook_resid_post/sae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f314c6a8d95b68b24e83107e45a2a41003088be5556ec13d5b3ed019f68f7ad5
3
+ size 16798005
causal_nulls_per_seed.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "n_seeds": 4,
3
+ "n_prompts": 30,
4
+ "coefs": [
5
+ 0.0,
6
+ 2.0,
7
+ 5.0,
8
+ 10.0
9
+ ],
10
+ "results": {
11
+ "L2_up_std": {
12
+ "0.0": [
13
+ 0.08018867924528301,
14
+ 0.08653846153846154,
15
+ 0.09090909090909091,
16
+ 0.06862745098039216
17
+ ],
18
+ "2.0": [
19
+ 0.08133971291866028,
20
+ 0.08695652173913043,
21
+ 0.08530805687203792,
22
+ 0.07352941176470588
23
+ ],
24
+ "5.0": [
25
+ 0.07655502392344497,
26
+ 0.08695652173913043,
27
+ 0.09268292682926829,
28
+ 0.07881773399014778
29
+ ],
30
+ "10.0": [
31
+ 0.08018867924528301,
32
+ 0.08490566037735849,
33
+ 0.11682242990654206,
34
+ 0.08866995073891626
35
+ ]
36
+ },
37
+ "L2_down_adhd": {
38
+ "0.0": [
39
+ 0.4488888888888889,
40
+ 0.3949771689497717,
41
+ 0.44794188861985473,
42
+ 0.42403628117913833
43
+ ],
44
+ "2.0": [
45
+ 0.43935926773455375,
46
+ 0.382830626450116,
47
+ 0.44364508393285373,
48
+ 0.432183908045977
49
+ ],
50
+ "5.0": [
51
+ 0.4387990762124711,
52
+ 0.39902676399026765,
53
+ 0.42857142857142855,
54
+ 0.42755344418052255
55
+ ],
56
+ "10.0": [
57
+ 0.39172749391727496,
58
+ 0.3949771689497717,
59
+ 0.40648379052369077,
60
+ 0.4230769230769231
61
+ ]
62
+ },
63
+ "L2_zero_adhd": {
64
+ "baseline": [
65
+ 0.4488888888888889,
66
+ 0.3949771689497717,
67
+ 0.44794188861985473,
68
+ 0.42403628117913833
69
+ ],
70
+ "zero_step_onset": [
71
+ 0.44742729306487694,
72
+ 0.38672768878718533,
73
+ 0.4348894348894349,
74
+ 0.4217687074829932
75
+ ]
76
+ },
77
+ "L1_up_std": {
78
+ "0.0": [
79
+ 0.08018867924528301,
80
+ 0.08653846153846154,
81
+ 0.09090909090909091,
82
+ 0.06862745098039216
83
+ ],
84
+ "2.0": [
85
+ 0.08095238095238096,
86
+ 0.08695652173913043,
87
+ 0.0861244019138756,
88
+ 0.06862745098039216
89
+ ],
90
+ "5.0": [
91
+ 0.0761904761904762,
92
+ 0.07766990291262135,
93
+ 0.08695652173913043,
94
+ 0.07352941176470588
95
+ ],
96
+ "10.0": [
97
+ 0.07692307692307693,
98
+ 0.08163265306122448,
99
+ 0.0784313725490196,
100
+ 0.07881773399014778
101
+ ]
102
+ }
103
+ }
104
+ }
deepdive_steering.json ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feat_2504_deep_dive": {
3
+ "top_contexts": [
4
+ {
5
+ "act": 2.2637319564819336,
6
+ "pos_class": "step_onset",
7
+ "context": "for 2 minutes at easy pace *<|sep|>* sip water <|sep|> fist pump \u2014",
8
+ "variant": "adhd"
9
+ },
10
+ {
11
+ "act": 2.1833174228668213,
12
+ "pos_class": "step_onset",
13
+ "context": "acy resources to prevent recur rence *<|sep|>* deep breath <|sep|> close eyes briefly",
14
+ "variant": "adhd"
15
+ },
16
+ {
17
+ "act": 2.110867500305176,
18
+ "pos_class": "step_onset",
19
+ "context": "excess grout while wet using sponge *<|sep|>* close eyes briefly <|sep|> apply grout",
20
+ "variant": "adhd"
21
+ },
22
+ {
23
+ "act": 2.1062774658203125,
24
+ "pos_class": "step_onset",
25
+ "context": "donate remaining items to charity organizations *<|sep|>* quick stretch <|sep|> pause and breathe",
26
+ "variant": "adhd"
27
+ },
28
+ {
29
+ "act": 2.099095106124878,
30
+ "pos_class": "step_onset",
31
+ "context": "e red details and architectural elements *<|sep|>* quick stretch <|sep|> pause and breathe",
32
+ "variant": "adhd"
33
+ },
34
+ {
35
+ "act": 2.096402645111084,
36
+ "pos_class": "step_onset",
37
+ "context": "new skills to build competen ce *<|sep|>* roll shoulders <|sep|> seek feedback from",
38
+ "variant": "adhd"
39
+ },
40
+ {
41
+ "act": 2.071779251098633,
42
+ "pos_class": "step_onset",
43
+ "context": "up monthly and weekly spread s *<|sep|>* roll shoulders <|sep|> begin adding daily",
44
+ "variant": "adhd"
45
+ },
46
+ {
47
+ "act": 2.044790029525757,
48
+ "pos_class": "step_onset",
49
+ "context": "methodology and divide data collection tasks *<|sep|>* close eyes briefly <|sep|> quick focus",
50
+ "variant": "adhd"
51
+ },
52
+ {
53
+ "act": 2.0143749713897705,
54
+ "pos_class": "step_onset",
55
+ "context": "hydration products on longer training rides *<|sep|>* close eyes briefly <|sep|> complete a",
56
+ "variant": "adhd"
57
+ },
58
+ {
59
+ "act": 2.014024257659912,
60
+ "pos_class": "step_onset",
61
+ "context": "briefly <|sep|> refin ed training intensity *<|sep|>* shake out hands <|sep|> complete weeks",
62
+ "variant": "adhd"
63
+ },
64
+ {
65
+ "act": 2.0133330821990967,
66
+ "pos_class": "step_onset",
67
+ "context": "<|sep|> rinse all scrub bed surfaces *<|sep|>* roll shoulders <|sep|> scrub the lav",
68
+ "variant": "adhd"
69
+ },
70
+ {
71
+ "act": 2.0048322677612305,
72
+ "pos_class": "step_onset",
73
+ "context": "<|sep|> write results and discussion sections *<|sep|>* present findings in final report or",
74
+ "variant": "adhd"
75
+ },
76
+ {
77
+ "act": 1.988505244255066,
78
+ "pos_class": "step_onset",
79
+ "context": "roll shoulders <|sep|> increased cardio sessions *<|sep|>* deep breath <|sep|> 30 - second",
80
+ "variant": "adhd"
81
+ },
82
+ {
83
+ "act": 1.963397741317749,
84
+ "pos_class": "step_onset",
85
+ "context": "increased cardio sessions <|sep|> deep breath *<|sep|>* 30 - second stretch <|sep|> execute",
86
+ "variant": "adhd"
87
+ },
88
+ {
89
+ "act": 1.9471516609191895,
90
+ "pos_class": "step_onset",
91
+ "context": "with proper organization and archi ving *<|sep|>* wiggle fingers <|sep|> research family gen",
92
+ "variant": "adhd"
93
+ },
94
+ {
95
+ "act": 1.9434456825256348,
96
+ "pos_class": "step_onset",
97
+ "context": "until you see no soap residue *<|sep|>* sip water <|sep|> wipe down with",
98
+ "variant": "adhd"
99
+ },
100
+ {
101
+ "act": 1.936253547668457,
102
+ "pos_class": "step_onset",
103
+ "context": "for 5 minutes at easy pace *<|sep|>* quick stretch <|sep|> hydrate and refuel",
104
+ "variant": "adhd"
105
+ },
106
+ {
107
+ "act": 1.9354044198989868,
108
+ "pos_class": "step_onset",
109
+ "context": "online and wait for approval decision *<|sep|>* wiggle fingers <|sep|> fist pump \u2014",
110
+ "variant": "adhd"
111
+ },
112
+ {
113
+ "act": 1.9305871725082397,
114
+ "pos_class": "step_onset",
115
+ "context": "for user interface and user experience *<|sep|>* sip water <|sep|> choose development framework",
116
+ "variant": "adhd"
117
+ },
118
+ {
119
+ "act": 1.9305052757263184,
120
+ "pos_class": "step_onset",
121
+ "context": "<|sep|> add color and shading details *<|sep|>* deep breath <|sep|> pause and breathe",
122
+ "variant": "adhd"
123
+ }
124
+ ],
125
+ "top_cofire_partners": [
126
+ 2418,
127
+ 653,
128
+ 1216,
129
+ 225,
130
+ 1131,
131
+ 2959,
132
+ 1650,
133
+ 3953,
134
+ 702,
135
+ 352
136
+ ]
137
+ },
138
+ "symmetry_control_substantive_count": 5,
139
+ "symmetry_control_top_15": [
140
+ 1406,
141
+ 1258,
142
+ 2097,
143
+ 156,
144
+ 697,
145
+ 120,
146
+ 1539,
147
+ 1385,
148
+ 3795,
149
+ 1065,
150
+ 531,
151
+ 3280,
152
+ 3927,
153
+ 3967,
154
+ 3846
155
+ ],
156
+ "steering_results": {
157
+ "std_baseline": {
158
+ "sep_rate": 0.08425135764158262,
159
+ "mean_step_count": 6.8125,
160
+ "spearman_rho": 0.5305203306664215,
161
+ "spearman_pval": 4.155845216990538e-07,
162
+ "regulation_rate": 0.09357798165137615,
163
+ "n_tokens_total": 6445,
164
+ "n_steps_total": 545,
165
+ "reached_end_rate": 0.975
166
+ },
167
+ "std_coef_+2.0": {
168
+ "sep_rate": 0.08416458852867831,
169
+ "mean_step_count": 6.775,
170
+ "spearman_rho": 0.530739309117657,
171
+ "spearman_pval": 4.102184726171915e-07,
172
+ "regulation_rate": 0.0940959409594096,
173
+ "n_tokens_total": 6416,
174
+ "n_steps_total": 542,
175
+ "reached_end_rate": 0.975
176
+ },
177
+ "std_coef_+5.0": {
178
+ "sep_rate": 0.08429777916796997,
179
+ "mean_step_count": 6.7625,
180
+ "spearman_rho": 0.5327086311005121,
181
+ "spearman_pval": 3.64818647432245e-07,
182
+ "regulation_rate": 0.09057301293900184,
183
+ "n_tokens_total": 6394,
184
+ "n_steps_total": 541,
185
+ "reached_end_rate": 0.975
186
+ },
187
+ "std_coef_+10.0": {
188
+ "sep_rate": 0.08608971454463073,
189
+ "mean_step_count": 7.1375,
190
+ "spearman_rho": 0.4614572824649014,
191
+ "spearman_pval": 1.6459708212616308e-05,
192
+ "regulation_rate": 0.08756567425569177,
193
+ "n_tokens_total": 6621,
194
+ "n_steps_total": 571,
195
+ "reached_end_rate": 0.975
196
+ },
197
+ "adhd_baseline": {
198
+ "sep_rate": 0.12945795615661468,
199
+ "mean_step_count": 15.0625,
200
+ "spearman_rho": 0.8158904455366707,
201
+ "spearman_pval": 3.038475633405349e-20,
202
+ "regulation_rate": 0.41327800829875516,
203
+ "n_tokens_total": 9169,
204
+ "n_steps_total": 1205,
205
+ "reached_end_rate": 0.7625
206
+ },
207
+ "adhd_coef_-2.0": {
208
+ "sep_rate": 0.12845057880676758,
209
+ "mean_step_count": 14.625,
210
+ "spearman_rho": 0.7957172265026485,
211
+ "spearman_pval": 1.1613642559127553e-18,
212
+ "regulation_rate": 0.4008547008547009,
213
+ "n_tokens_total": 8984,
214
+ "n_steps_total": 1170,
215
+ "reached_end_rate": 0.7875
216
+ },
217
+ "adhd_coef_-5.0": {
218
+ "sep_rate": 0.1263146117699709,
219
+ "mean_step_count": 14.3375,
220
+ "spearman_rho": 0.780151209846579,
221
+ "spearman_pval": 1.4775933869589044e-17,
222
+ "regulation_rate": 0.4167393199651264,
223
+ "n_tokens_total": 8938,
224
+ "n_steps_total": 1147,
225
+ "reached_end_rate": 0.7625
226
+ },
227
+ "adhd_coef_-10.0": {
228
+ "sep_rate": 0.12151652624756967,
229
+ "mean_step_count": 14.225,
230
+ "spearman_rho": 0.7471782967641961,
231
+ "spearman_pval": 1.7289858832058174e-15,
232
+ "regulation_rate": 0.3945518453427065,
233
+ "n_tokens_total": 9258,
234
+ "n_steps_total": 1138,
235
+ "reached_end_rate": 0.725
236
+ }
237
+ }
238
+ }
feature_diff.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sae_results": {
3
+ "standard_L1": {
4
+ "final_fve": 0.9156635482984559,
5
+ "final_l0": 40.0,
6
+ "dead_pct_train": 0.0244140625,
7
+ "dead_pct_heldout": 1.5380859375
8
+ },
9
+ "standard_L2": {
10
+ "final_fve": 0.9112300092129357,
11
+ "final_l0": 40.0,
12
+ "dead_pct_train": 0.0244140625,
13
+ "dead_pct_heldout": 1.220703125
14
+ },
15
+ "standard_L3": {
16
+ "final_fve": 0.8956661837197387,
17
+ "final_l0": 40.0,
18
+ "dead_pct_train": 0.0244140625,
19
+ "dead_pct_heldout": 1.025390625
20
+ },
21
+ "adhd_L1": {
22
+ "final_fve": 0.9020028367114843,
23
+ "final_l0": 40.0,
24
+ "dead_pct_train": 0.1220703125,
25
+ "dead_pct_heldout": 1.3671875
26
+ },
27
+ "adhd_L2": {
28
+ "final_fve": 0.8937416797380295,
29
+ "final_l0": 40.0,
30
+ "dead_pct_train": 0.0244140625,
31
+ "dead_pct_heldout": 1.46484375
32
+ },
33
+ "adhd_L3": {
34
+ "final_fve": 0.8691270860953982,
35
+ "final_l0": 40.0,
36
+ "dead_pct_train": 0.0,
37
+ "dead_pct_heldout": 2.2216796875
38
+ }
39
+ },
40
+ "primary_count": 312,
41
+ "symmetry_count": 139,
42
+ "layer_control_count": 194,
43
+ "primary_top_features": [
44
+ {
45
+ "feat_id": 2418,
46
+ "adhd_rate": 0.8974166512489319,
47
+ "std_rate": 0.0010833332780748606
48
+ },
49
+ {
50
+ "feat_id": 653,
51
+ "adhd_rate": 0.8386666774749756,
52
+ "std_rate": 0.0
53
+ },
54
+ {
55
+ "feat_id": 1216,
56
+ "adhd_rate": 0.6754166483879089,
57
+ "std_rate": 0.009583333507180214
58
+ },
59
+ {
60
+ "feat_id": 225,
61
+ "adhd_rate": 0.6380833387374878,
62
+ "std_rate": 0.0
63
+ },
64
+ {
65
+ "feat_id": 1131,
66
+ "adhd_rate": 0.6314166784286499,
67
+ "std_rate": 0.0
68
+ },
69
+ {
70
+ "feat_id": 2504,
71
+ "adhd_rate": 0.546999990940094,
72
+ "std_rate": 0.0005000000237487257
73
+ },
74
+ {
75
+ "feat_id": 1650,
76
+ "adhd_rate": 0.5131666660308838,
77
+ "std_rate": 8.333333244081587e-05
78
+ },
79
+ {
80
+ "feat_id": 2959,
81
+ "adhd_rate": 0.4754999876022339,
82
+ "std_rate": 0.00016666666488163173
83
+ },
84
+ {
85
+ "feat_id": 3953,
86
+ "adhd_rate": 0.4663333296775818,
87
+ "std_rate": 0.00016666666488163173
88
+ },
89
+ {
90
+ "feat_id": 352,
91
+ "adhd_rate": 0.42516666650772095,
92
+ "std_rate": 0.0016666667070239782
93
+ },
94
+ {
95
+ "feat_id": 702,
96
+ "adhd_rate": 0.4244999885559082,
97
+ "std_rate": 0.0
98
+ },
99
+ {
100
+ "feat_id": 2505,
101
+ "adhd_rate": 0.4099166691303253,
102
+ "std_rate": 0.0
103
+ },
104
+ {
105
+ "feat_id": 1156,
106
+ "adhd_rate": 0.3932499885559082,
107
+ "std_rate": 0.0
108
+ },
109
+ {
110
+ "feat_id": 2512,
111
+ "adhd_rate": 0.38883334398269653,
112
+ "std_rate": 0.0
113
+ },
114
+ {
115
+ "feat_id": 1835,
116
+ "adhd_rate": 0.3605000078678131,
117
+ "std_rate": 0.00016666666488163173
118
+ }
119
+ ]
120
+ }
loading_example.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal example: load one of the InterpGPT Phase 5 TopK SAEs from HuggingFace.
3
+ """
4
+ from huggingface_hub import snapshot_download
5
+ from sae_lens import SAE
6
+
7
+ repo_id = "connaaa/interpgpt-sae-phase5"
8
+ local = snapshot_download(repo_id=repo_id, allow_patterns=["adhd_L2_hook_resid_post/*"])
9
+ sae = SAE.load_from_disk(f"{local}/adhd_L2_hook_resid_post")
10
+ print(sae)
standard_L0_hook_resid_post/analysis.json ADDED
@@ -0,0 +1,444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "act_rate_distribution": {
3
+ "mean": 0.009765625,
4
+ "dead_count": 768,
5
+ "high_count": 0
6
+ },
7
+ "sample_features": [
8
+ {
9
+ "feature_id": 159,
10
+ "act_rate": 0.1606599986553192,
11
+ "top_examples": [
12
+ {
13
+ "activation": 0.5935809016227722,
14
+ "position": 10,
15
+ "variant": "standard"
16
+ },
17
+ {
18
+ "activation": 0.5935809016227722,
19
+ "position": 10,
20
+ "variant": "standard"
21
+ },
22
+ {
23
+ "activation": 0.5628541707992554,
24
+ "position": 22,
25
+ "variant": "standard"
26
+ },
27
+ {
28
+ "activation": 0.5628541707992554,
29
+ "position": 22,
30
+ "variant": "standard"
31
+ },
32
+ {
33
+ "activation": 0.5378819108009338,
34
+ "position": 8,
35
+ "variant": "standard"
36
+ }
37
+ ]
38
+ },
39
+ {
40
+ "feature_id": 836,
41
+ "act_rate": 0.12184999883174896,
42
+ "top_examples": [
43
+ {
44
+ "activation": 0.6280968189239502,
45
+ "position": 15,
46
+ "variant": "standard"
47
+ },
48
+ {
49
+ "activation": 0.6280968189239502,
50
+ "position": 15,
51
+ "variant": "standard"
52
+ },
53
+ {
54
+ "activation": 0.5908636450767517,
55
+ "position": 1,
56
+ "variant": "standard"
57
+ },
58
+ {
59
+ "activation": 0.5908636450767517,
60
+ "position": 1,
61
+ "variant": "standard"
62
+ },
63
+ {
64
+ "activation": 0.5908636450767517,
65
+ "position": 1,
66
+ "variant": "standard"
67
+ }
68
+ ]
69
+ },
70
+ {
71
+ "feature_id": 2203,
72
+ "act_rate": 0.1174900010228157,
73
+ "top_examples": [
74
+ {
75
+ "activation": 0.6077227592468262,
76
+ "position": 7,
77
+ "variant": "standard"
78
+ },
79
+ {
80
+ "activation": 0.6077227592468262,
81
+ "position": 7,
82
+ "variant": "standard"
83
+ },
84
+ {
85
+ "activation": 0.6077227592468262,
86
+ "position": 7,
87
+ "variant": "standard"
88
+ },
89
+ {
90
+ "activation": 0.6077227592468262,
91
+ "position": 7,
92
+ "variant": "standard"
93
+ },
94
+ {
95
+ "activation": 0.6077227592468262,
96
+ "position": 7,
97
+ "variant": "standard"
98
+ }
99
+ ]
100
+ },
101
+ {
102
+ "feature_id": 3106,
103
+ "act_rate": 0.11097999662160873,
104
+ "top_examples": [
105
+ {
106
+ "activation": 0.458252489566803,
107
+ "position": 3,
108
+ "variant": "standard"
109
+ },
110
+ {
111
+ "activation": 0.458252489566803,
112
+ "position": 3,
113
+ "variant": "standard"
114
+ },
115
+ {
116
+ "activation": 0.45644882321357727,
117
+ "position": 47,
118
+ "variant": "standard"
119
+ },
120
+ {
121
+ "activation": 0.45644882321357727,
122
+ "position": 47,
123
+ "variant": "standard"
124
+ },
125
+ {
126
+ "activation": 0.4534014165401459,
127
+ "position": 46,
128
+ "variant": "standard"
129
+ }
130
+ ]
131
+ },
132
+ {
133
+ "feature_id": 2939,
134
+ "act_rate": 0.10080999881029129,
135
+ "top_examples": [
136
+ {
137
+ "activation": 0.9360920190811157,
138
+ "position": 12,
139
+ "variant": "standard"
140
+ },
141
+ {
142
+ "activation": 0.9360920190811157,
143
+ "position": 12,
144
+ "variant": "standard"
145
+ },
146
+ {
147
+ "activation": 0.9333191514015198,
148
+ "position": 5,
149
+ "variant": "standard"
150
+ },
151
+ {
152
+ "activation": 0.9333191514015198,
153
+ "position": 5,
154
+ "variant": "standard"
155
+ },
156
+ {
157
+ "activation": 0.8574860692024231,
158
+ "position": 20,
159
+ "variant": "standard"
160
+ }
161
+ ]
162
+ },
163
+ {
164
+ "feature_id": 61,
165
+ "act_rate": 0.0965299978852272,
166
+ "top_examples": [
167
+ {
168
+ "activation": 0.49229905009269714,
169
+ "position": 4,
170
+ "variant": "standard"
171
+ },
172
+ {
173
+ "activation": 0.49229905009269714,
174
+ "position": 4,
175
+ "variant": "standard"
176
+ },
177
+ {
178
+ "activation": 0.4829713702201843,
179
+ "position": 8,
180
+ "variant": "standard"
181
+ },
182
+ {
183
+ "activation": 0.4829713702201843,
184
+ "position": 8,
185
+ "variant": "standard"
186
+ },
187
+ {
188
+ "activation": 0.47879478335380554,
189
+ "position": 10,
190
+ "variant": "standard"
191
+ }
192
+ ]
193
+ },
194
+ {
195
+ "feature_id": 4034,
196
+ "act_rate": 0.09401000291109085,
197
+ "top_examples": [
198
+ {
199
+ "activation": 0.5372604131698608,
200
+ "position": 52,
201
+ "variant": "standard"
202
+ },
203
+ {
204
+ "activation": 0.5372604131698608,
205
+ "position": 52,
206
+ "variant": "standard"
207
+ },
208
+ {
209
+ "activation": 0.5347657203674316,
210
+ "position": 108,
211
+ "variant": "standard"
212
+ },
213
+ {
214
+ "activation": 0.5347657203674316,
215
+ "position": 108,
216
+ "variant": "standard"
217
+ },
218
+ {
219
+ "activation": 0.5321744680404663,
220
+ "position": 60,
221
+ "variant": "standard"
222
+ }
223
+ ]
224
+ },
225
+ {
226
+ "feature_id": 3787,
227
+ "act_rate": 0.091279998421669,
228
+ "top_examples": [
229
+ {
230
+ "activation": 0.5871065258979797,
231
+ "position": 22,
232
+ "variant": "standard"
233
+ },
234
+ {
235
+ "activation": 0.5871065258979797,
236
+ "position": 22,
237
+ "variant": "standard"
238
+ },
239
+ {
240
+ "activation": 0.5592474937438965,
241
+ "position": 55,
242
+ "variant": "standard"
243
+ },
244
+ {
245
+ "activation": 0.5592474937438965,
246
+ "position": 55,
247
+ "variant": "standard"
248
+ },
249
+ {
250
+ "activation": 0.5409792065620422,
251
+ "position": 91,
252
+ "variant": "standard"
253
+ }
254
+ ]
255
+ },
256
+ {
257
+ "feature_id": 280,
258
+ "act_rate": 0.09025000035762787,
259
+ "top_examples": [
260
+ {
261
+ "activation": 0.534440279006958,
262
+ "position": 7,
263
+ "variant": "standard"
264
+ },
265
+ {
266
+ "activation": 0.534440279006958,
267
+ "position": 7,
268
+ "variant": "standard"
269
+ },
270
+ {
271
+ "activation": 0.5156826376914978,
272
+ "position": 9,
273
+ "variant": "standard"
274
+ },
275
+ {
276
+ "activation": 0.5156826376914978,
277
+ "position": 9,
278
+ "variant": "standard"
279
+ },
280
+ {
281
+ "activation": 0.5135127305984497,
282
+ "position": 9,
283
+ "variant": "standard"
284
+ }
285
+ ]
286
+ },
287
+ {
288
+ "feature_id": 2159,
289
+ "act_rate": 0.0877000018954277,
290
+ "top_examples": [
291
+ {
292
+ "activation": 0.603478729724884,
293
+ "position": 9,
294
+ "variant": "standard"
295
+ },
296
+ {
297
+ "activation": 0.603478729724884,
298
+ "position": 9,
299
+ "variant": "standard"
300
+ },
301
+ {
302
+ "activation": 0.603478729724884,
303
+ "position": 9,
304
+ "variant": "standard"
305
+ },
306
+ {
307
+ "activation": 0.5963488817214966,
308
+ "position": 9,
309
+ "variant": "standard"
310
+ },
311
+ {
312
+ "activation": 0.5963488817214966,
313
+ "position": 9,
314
+ "variant": "standard"
315
+ }
316
+ ]
317
+ },
318
+ {
319
+ "feature_id": 3645,
320
+ "act_rate": 0.003819999983534217,
321
+ "top_examples": [
322
+ {
323
+ "activation": 0.21240393817424774,
324
+ "position": 202,
325
+ "variant": "standard"
326
+ },
327
+ {
328
+ "activation": 0.21240393817424774,
329
+ "position": 202,
330
+ "variant": "standard"
331
+ },
332
+ {
333
+ "activation": 0.20894747972488403,
334
+ "position": 217,
335
+ "variant": "standard"
336
+ },
337
+ {
338
+ "activation": 0.20894747972488403,
339
+ "position": 217,
340
+ "variant": "standard"
341
+ },
342
+ {
343
+ "activation": 0.20496299862861633,
344
+ "position": 40,
345
+ "variant": "standard"
346
+ }
347
+ ]
348
+ },
349
+ {
350
+ "feature_id": 2682,
351
+ "act_rate": 0.003809999907389283,
352
+ "top_examples": [
353
+ {
354
+ "activation": 0.2780674993991852,
355
+ "position": 3,
356
+ "variant": "standard"
357
+ },
358
+ {
359
+ "activation": 0.2780674993991852,
360
+ "position": 3,
361
+ "variant": "standard"
362
+ },
363
+ {
364
+ "activation": 0.2724977433681488,
365
+ "position": 3,
366
+ "variant": "standard"
367
+ },
368
+ {
369
+ "activation": 0.2724977433681488,
370
+ "position": 3,
371
+ "variant": "standard"
372
+ },
373
+ {
374
+ "activation": 0.263476699590683,
375
+ "position": 3,
376
+ "variant": "standard"
377
+ }
378
+ ]
379
+ },
380
+ {
381
+ "feature_id": 2786,
382
+ "act_rate": 0.003800000064074993,
383
+ "top_examples": [
384
+ {
385
+ "activation": 0.3066694736480713,
386
+ "position": 2,
387
+ "variant": "standard"
388
+ },
389
+ {
390
+ "activation": 0.3066694736480713,
391
+ "position": 2,
392
+ "variant": "standard"
393
+ },
394
+ {
395
+ "activation": 0.3066694438457489,
396
+ "position": 2,
397
+ "variant": "standard"
398
+ },
399
+ {
400
+ "activation": 0.3066694438457489,
401
+ "position": 2,
402
+ "variant": "standard"
403
+ },
404
+ {
405
+ "activation": 0.3066694438457489,
406
+ "position": 2,
407
+ "variant": "standard"
408
+ }
409
+ ]
410
+ },
411
+ {
412
+ "feature_id": 1681,
413
+ "act_rate": 0.003800000064074993,
414
+ "top_examples": [
415
+ {
416
+ "activation": 0.21353764832019806,
417
+ "position": 40,
418
+ "variant": "standard"
419
+ },
420
+ {
421
+ "activation": 0.21353764832019806,
422
+ "position": 40,
423
+ "variant": "standard"
424
+ },
425
+ {
426
+ "activation": 0.2118273675441742,
427
+ "position": 217,
428
+ "variant": "standard"
429
+ },
430
+ {
431
+ "activation": 0.2118273675441742,
432
+ "position": 217,
433
+ "variant": "standard"
434
+ },
435
+ {
436
+ "activation": 0.21169668436050415,
437
+ "position": 202,
438
+ "variant": "standard"
439
+ }
440
+ ]
441
+ }
442
+ ],
443
+ "n_total_features": 4096
444
+ }
standard_L0_hook_resid_post/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "standard",
3
+ "layer": 0,
4
+ "hook": "hook_resid_post",
5
+ "d_sae": 4096,
6
+ "k": 40,
7
+ "n_tokens": 500000,
8
+ "batch_size": 4096,
9
+ "lr": 0.0003,
10
+ "final": {
11
+ "step": 122,
12
+ "loss": 3.7668752670288086,
13
+ "losses": {
14
+ "mse_loss": 3.7668752670288086,
15
+ "auxiliary_reconstruction_loss": 0.0
16
+ },
17
+ "l0": 40.0,
18
+ "mse": 0.007357178255915642,
19
+ "fve": 0.7880726447111882,
20
+ "n_dead": 0
21
+ }
22
+ }
standard_L0_hook_resid_post/history.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 50,
4
+ "loss": 6.100651741027832,
5
+ "losses": {
6
+ "mse_loss": 6.100651741027832,
7
+ "auxiliary_reconstruction_loss": 0.0
8
+ },
9
+ "l0": 40.0,
10
+ "mse": 0.011915335431694984,
11
+ "fve": 0.6626028674075588,
12
+ "n_dead": 0
13
+ },
14
+ {
15
+ "step": 100,
16
+ "loss": 4.176002502441406,
17
+ "losses": {
18
+ "mse_loss": 4.176002502441406,
19
+ "auxiliary_reconstruction_loss": 0.0
20
+ },
21
+ "l0": 40.0,
22
+ "mse": 0.008156255818903446,
23
+ "fve": 0.7637865209658066,
24
+ "n_dead": 0
25
+ },
26
+ {
27
+ "step": 122,
28
+ "loss": 3.7668752670288086,
29
+ "losses": {
30
+ "mse_loss": 3.7668752670288086,
31
+ "auxiliary_reconstruction_loss": 0.0
32
+ },
33
+ "l0": 40.0,
34
+ "mse": 0.007357178255915642,
35
+ "fve": 0.7880726447111882,
36
+ "n_dead": 0
37
+ }
38
+ ]
standard_L0_hook_resid_post/sae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d41284ab9bceb803122433b0ecdfbdbe5027bacb38b6fa2d54727dd395c26cc1
3
+ size 16798005
standard_L1_hook_resid_post/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "standard",
3
+ "layer": 1,
4
+ "d_sae": 4096,
5
+ "k": 40,
6
+ "normalize_activations": "expected_average_only_in",
7
+ "n_tokens": 10000000,
8
+ "batch_size": 4096,
9
+ "lr": 0.0003,
10
+ "final_training": {
11
+ "step": 2441,
12
+ "loss": 4.313589572906494,
13
+ "l0": 40.0,
14
+ "fve": 0.9156635482984559,
15
+ "n_dead": 1,
16
+ "mse": 0.008392197079956532
17
+ },
18
+ "dead_pct_heldout": 1.5380859375
19
+ }
standard_L1_hook_resid_post/history.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 200,
4
+ "loss": 8.472447395324707,
5
+ "l0": 40.0,
6
+ "fve": 0.8475441618823183,
7
+ "n_dead": 0,
8
+ "mse": 0.016547750681638718
9
+ },
10
+ {
11
+ "step": 400,
12
+ "loss": 13.104448318481445,
13
+ "l0": 40.0,
14
+ "fve": 0.8741473969735887,
15
+ "n_dead": 602,
16
+ "mse": 0.013663655146956444
17
+ },
18
+ {
19
+ "step": 600,
20
+ "loss": 11.666557312011719,
21
+ "l0": 40.0,
22
+ "fve": 0.897244935256183,
23
+ "n_dead": 294,
24
+ "mse": 0.012026979587972164
25
+ },
26
+ {
27
+ "step": 800,
28
+ "loss": 8.557323455810547,
29
+ "l0": 40.0,
30
+ "fve": 0.9050136010213505,
31
+ "n_dead": 141,
32
+ "mse": 0.011152522638440132
33
+ },
34
+ {
35
+ "step": 1000,
36
+ "loss": 5.929265975952148,
37
+ "l0": 40.0,
38
+ "fve": 0.8892356006901958,
39
+ "n_dead": 26,
40
+ "mse": 0.010530667379498482
41
+ },
42
+ {
43
+ "step": 1200,
44
+ "loss": 5.310263633728027,
45
+ "l0": 40.0,
46
+ "fve": 0.9071743648944192,
47
+ "n_dead": 6,
48
+ "mse": 0.010134927928447723
49
+ },
50
+ {
51
+ "step": 1400,
52
+ "loss": 5.067918300628662,
53
+ "l0": 40.0,
54
+ "fve": 0.9030405773914236,
55
+ "n_dead": 2,
56
+ "mse": 0.00982158724218607
57
+ },
58
+ {
59
+ "step": 1600,
60
+ "loss": 4.7904052734375,
61
+ "l0": 40.0,
62
+ "fve": 0.9108113011155183,
63
+ "n_dead": 2,
64
+ "mse": 0.009283771738409996
65
+ },
66
+ {
67
+ "step": 1800,
68
+ "loss": 4.771254062652588,
69
+ "l0": 40.0,
70
+ "fve": 0.9153215139625523,
71
+ "n_dead": 6,
72
+ "mse": 0.009105676785111427
73
+ },
74
+ {
75
+ "step": 2000,
76
+ "loss": 4.466042518615723,
77
+ "l0": 40.0,
78
+ "fve": 0.9176716499875504,
79
+ "n_dead": 1,
80
+ "mse": 0.008688799105584621
81
+ },
82
+ {
83
+ "step": 2200,
84
+ "loss": 4.474887371063232,
85
+ "l0": 40.0,
86
+ "fve": 0.9006545558548696,
87
+ "n_dead": 2,
88
+ "mse": 0.00867227278649807
89
+ },
90
+ {
91
+ "step": 2400,
92
+ "loss": 4.392549991607666,
93
+ "l0": 40.0,
94
+ "fve": 0.9094376632806204,
95
+ "n_dead": 1,
96
+ "mse": 0.00854581780731678
97
+ },
98
+ {
99
+ "step": 2441,
100
+ "loss": 4.313589572906494,
101
+ "l0": 40.0,
102
+ "fve": 0.9156635482984559,
103
+ "n_dead": 1,
104
+ "mse": 0.008392197079956532
105
+ }
106
+ ]
standard_L1_hook_resid_post/sae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1099ad67ddea1fb3b4ef9752f9c57b6dba97c3cbfc88d595316e5c52c54704e1
3
+ size 16798005
standard_L2_hook_resid_post/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "standard",
3
+ "layer": 2,
4
+ "d_sae": 4096,
5
+ "k": 40,
6
+ "normalize_activations": "expected_average_only_in",
7
+ "n_tokens": 10000000,
8
+ "batch_size": 4096,
9
+ "lr": 0.0003,
10
+ "final_training": {
11
+ "step": 2441,
12
+ "loss": 9.37691879272461,
13
+ "l0": 40.0,
14
+ "fve": 0.9112300092129357,
15
+ "n_dead": 1,
16
+ "mse": 0.018243035301566124
17
+ },
18
+ "dead_pct_heldout": 1.220703125
19
+ }
standard_L2_hook_resid_post/history.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 200,
4
+ "loss": 17.65301513671875,
5
+ "l0": 40.0,
6
+ "fve": 0.8277783719921248,
7
+ "n_dead": 0,
8
+ "mse": 0.03447854518890381
9
+ },
10
+ {
11
+ "step": 400,
12
+ "loss": 26.82003402709961,
13
+ "l0": 40.0,
14
+ "fve": 0.8783504287003407,
15
+ "n_dead": 885,
16
+ "mse": 0.02819967456161976
17
+ },
18
+ {
19
+ "step": 600,
20
+ "loss": 24.623018264770508,
21
+ "l0": 40.0,
22
+ "fve": 0.8645155231541486,
23
+ "n_dead": 466,
24
+ "mse": 0.025606488808989525
25
+ },
26
+ {
27
+ "step": 800,
28
+ "loss": 20.887020111083984,
29
+ "l0": 40.0,
30
+ "fve": 0.8902317708880264,
31
+ "n_dead": 211,
32
+ "mse": 0.02367589809000492
33
+ },
34
+ {
35
+ "step": 1000,
36
+ "loss": 13.424010276794434,
37
+ "l0": 40.0,
38
+ "fve": 0.8956536633832358,
39
+ "n_dead": 45,
40
+ "mse": 0.02241015061736107
41
+ },
42
+ {
43
+ "step": 1200,
44
+ "loss": 11.20499324798584,
45
+ "l0": 40.0,
46
+ "fve": 0.9072051860620202,
47
+ "n_dead": 9,
48
+ "mse": 0.021145537495613098
49
+ },
50
+ {
51
+ "step": 1400,
52
+ "loss": 10.573310852050781,
53
+ "l0": 40.0,
54
+ "fve": 0.9040789272639144,
55
+ "n_dead": 3,
56
+ "mse": 0.02041182667016983
57
+ },
58
+ {
59
+ "step": 1600,
60
+ "loss": 10.361445426940918,
61
+ "l0": 40.0,
62
+ "fve": 0.9104271378826995,
63
+ "n_dead": 1,
64
+ "mse": 0.020158452913165092
65
+ },
66
+ {
67
+ "step": 1800,
68
+ "loss": 9.941110610961914,
69
+ "l0": 40.0,
70
+ "fve": 0.9068027304791877,
71
+ "n_dead": 1,
72
+ "mse": 0.01934068836271763
73
+ },
74
+ {
75
+ "step": 2000,
76
+ "loss": 9.595905303955078,
77
+ "l0": 40.0,
78
+ "fve": 0.9189567435127948,
79
+ "n_dead": 1,
80
+ "mse": 0.01866907998919487
81
+ },
82
+ {
83
+ "step": 2200,
84
+ "loss": 9.470767974853516,
85
+ "l0": 40.0,
86
+ "fve": 0.9136617571485228,
87
+ "n_dead": 1,
88
+ "mse": 0.01842562109231949
89
+ },
90
+ {
91
+ "step": 2400,
92
+ "loss": 9.251761436462402,
93
+ "l0": 40.0,
94
+ "fve": 0.9031395392007785,
95
+ "n_dead": 1,
96
+ "mse": 0.01799953728914261
97
+ },
98
+ {
99
+ "step": 2441,
100
+ "loss": 9.37691879272461,
101
+ "l0": 40.0,
102
+ "fve": 0.9112300092129357,
103
+ "n_dead": 1,
104
+ "mse": 0.018243035301566124
105
+ }
106
+ ]
standard_L2_hook_resid_post/sae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c36a71a6c67b951c929ca6a65422b6a1490dd39bcb78f79480fd3576b95e341
3
+ size 16798005
standard_L3_hook_resid_post/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "standard",
3
+ "layer": 3,
4
+ "d_sae": 4096,
5
+ "k": 40,
6
+ "normalize_activations": "expected_average_only_in",
7
+ "n_tokens": 10000000,
8
+ "batch_size": 4096,
9
+ "lr": 0.0003,
10
+ "final_training": {
11
+ "step": 2441,
12
+ "loss": 19.480852127075195,
13
+ "l0": 40.0,
14
+ "fve": 0.8956661837197387,
15
+ "n_dead": 1,
16
+ "mse": 0.03790052607655525
17
+ },
18
+ "dead_pct_heldout": 1.025390625
19
+ }
standard_L3_hook_resid_post/history.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 200,
4
+ "loss": 34.482032775878906,
5
+ "l0": 40.0,
6
+ "fve": 0.8283586159312837,
7
+ "n_dead": 0,
8
+ "mse": 0.06734772026538849
9
+ },
10
+ {
11
+ "step": 400,
12
+ "loss": 53.01472473144531,
13
+ "l0": 40.0,
14
+ "fve": 0.8312126634199706,
15
+ "n_dead": 1054,
16
+ "mse": 0.05626294016838074
17
+ },
18
+ {
19
+ "step": 600,
20
+ "loss": 48.47029495239258,
21
+ "l0": 40.0,
22
+ "fve": 0.884058701037366,
23
+ "n_dead": 593,
24
+ "mse": 0.05088154226541519
25
+ },
26
+ {
27
+ "step": 800,
28
+ "loss": 42.332054138183594,
29
+ "l0": 40.0,
30
+ "fve": 0.8804909955406929,
31
+ "n_dead": 205,
32
+ "mse": 0.04835565760731697
33
+ },
34
+ {
35
+ "step": 1000,
36
+ "loss": 28.461124420166016,
37
+ "l0": 40.0,
38
+ "fve": 0.888121025084119,
39
+ "n_dead": 58,
40
+ "mse": 0.04564930498600006
41
+ },
42
+ {
43
+ "step": 1200,
44
+ "loss": 24.47071647644043,
45
+ "l0": 40.0,
46
+ "fve": 0.8850747737111373,
47
+ "n_dead": 21,
48
+ "mse": 0.044211357831954956
49
+ },
50
+ {
51
+ "step": 1400,
52
+ "loss": 23.19893455505371,
53
+ "l0": 40.0,
54
+ "fve": 0.8942765037624966,
55
+ "n_dead": 15,
56
+ "mse": 0.04282252490520477
57
+ },
58
+ {
59
+ "step": 1600,
60
+ "loss": 21.993581771850586,
61
+ "l0": 40.0,
62
+ "fve": 0.8978253015716876,
63
+ "n_dead": 10,
64
+ "mse": 0.04135049134492874
65
+ },
66
+ {
67
+ "step": 1800,
68
+ "loss": 21.293956756591797,
69
+ "l0": 40.0,
70
+ "fve": 0.8922899854658137,
71
+ "n_dead": 8,
72
+ "mse": 0.040335513651371
73
+ },
74
+ {
75
+ "step": 2000,
76
+ "loss": 20.39199447631836,
77
+ "l0": 40.0,
78
+ "fve": 0.9074694820636785,
79
+ "n_dead": 4,
80
+ "mse": 0.03921680152416229
81
+ },
82
+ {
83
+ "step": 2200,
84
+ "loss": 20.141448974609375,
85
+ "l0": 40.0,
86
+ "fve": 0.8943716286803387,
87
+ "n_dead": 3,
88
+ "mse": 0.038883697241544724
89
+ },
90
+ {
91
+ "step": 2400,
92
+ "loss": 19.7071475982666,
93
+ "l0": 40.0,
94
+ "fve": 0.8941642474910445,
95
+ "n_dead": 1,
96
+ "mse": 0.03834077715873718
97
+ },
98
+ {
99
+ "step": 2441,
100
+ "loss": 19.480852127075195,
101
+ "l0": 40.0,
102
+ "fve": 0.8956661837197387,
103
+ "n_dead": 1,
104
+ "mse": 0.03790052607655525
105
+ }
106
+ ]
standard_L3_hook_resid_post/sae.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9dc2de61616a842a8ac0c32f8d1da82606b2fa17f57a91f7252d3d9dcaf215f
3
+ size 16798005
three_probes.json ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "probe_1_variant_split": {
3
+ "onsets_all": 12000,
4
+ "onsets_adhd": 8770,
5
+ "onsets_std": 3230,
6
+ "primary_mixed": 312,
7
+ "symmetry_mixed": 139,
8
+ "primary_adhd_only": 312,
9
+ "symmetry_std_only": 216,
10
+ "feat2504_rate_adhd_only_in_adhd_L2": 0.5907639861106873,
11
+ "feat2504_rate_adhd_only_in_std_L2": 0.00034207524731755257
12
+ },
13
+ "probe_2_zero_ablation": {
14
+ "adhd_baseline_reg_rate": 0.41327800829875516,
15
+ "adhd_zero_ablate_L2_step_onset_reg_rate": 0.4056761268781302,
16
+ "adhd_zero_ablate_L2_all_positions_reg_rate": 0.404344193817878,
17
+ "delta_step_onset": -0.007601881420624956,
18
+ "delta_all_positions": -0.008933814480877156
19
+ },
20
+ "probe_3_L1_steering": {
21
+ "baseline": {
22
+ "sep_rate": 0.08425135764158262,
23
+ "mean_step_count": 6.8125,
24
+ "spearman_rho": 0.5305203306664215,
25
+ "spearman_pval": 4.155845216990538e-07,
26
+ "regulation_rate": 0.09357798165137615,
27
+ "n_tokens_total": 6445,
28
+ "n_steps_total": 545,
29
+ "reached_end_rate": 0.975
30
+ },
31
+ "coef_+2.0": {
32
+ "sep_rate": 0.08416458852867831,
33
+ "mean_step_count": 6.775,
34
+ "spearman_rho": 0.5369945327780861,
35
+ "spearman_pval": 2.8189998198585957e-07,
36
+ "regulation_rate": 0.0940959409594096,
37
+ "n_tokens_total": 6416,
38
+ "n_steps_total": 542,
39
+ "reached_end_rate": 0.975
40
+ },
41
+ "coef_+5.0": {
42
+ "sep_rate": 0.08443188951663529,
43
+ "mean_step_count": 6.75,
44
+ "spearman_rho": 0.5421974917403188,
45
+ "spearman_pval": 2.051407247246999e-07,
46
+ "regulation_rate": 0.08703703703703704,
47
+ "n_tokens_total": 6372,
48
+ "n_steps_total": 540,
49
+ "reached_end_rate": 0.975
50
+ },
51
+ "coef_+10.0": {
52
+ "sep_rate": 0.08677685950413223,
53
+ "mean_step_count": 6.8125,
54
+ "spearman_rho": 0.33306527760157806,
55
+ "spearman_pval": 0.002537468671163372,
56
+ "regulation_rate": 0.08440366972477065,
57
+ "n_tokens_total": 6292,
58
+ "n_steps_total": 545,
59
+ "reached_end_rate": 0.975
60
+ }
61
+ }
62
+ }