Mattimax commited on
Commit
a5a2325
·
verified ·
1 Parent(s): d86ae0d

Upload 30 files

Browse files
ckpt_step12000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34114e7d2aa584b8950f67e2e6eb754ea58da6ee90b294f0e36f6acd2da2b92e
3
+ size 6553752
ckpt_step12000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bc77055e05d83a8648b455ca79b09a5c47e0d74cc6e11b8960ac458bdb0fe6d
3
+ size 18621877
ckpt_step12000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.581811738334373e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.392214592749543e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015330615793495478,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0013256267383211992,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.4815048141745128e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0016550163198513121,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0011460702099068709,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.448621745094851e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.0017662248591607762,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.0014458430882383331,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.5898866106425138e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0017807006919347208,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0013159806417537298,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.5102813720775883e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.001635105435196884,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0014829091987141286,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.6232451749776298e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0018184544851330134,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0014100814746928778,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.3923730437985064e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.001558237501848476,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.001297427992075522,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.4737556341745387e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.001819704540025606,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0011787184194186353,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3685434271200611e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.0015505155323866596,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0012525476892032773,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4270414227688646e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.001975340480772604,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0010307410888653853,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.3767226276349794e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0016244626034771176,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0013419555950221864,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.426349069844605e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.0015124659452720702,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0008482409357893936,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1492330355526724e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.036966271366553116,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step15000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b074b825f1f62d689ce477e406da6b3ddd50e55619a27e6ba5a6dbd971a5670
3
+ size 6553752
ckpt_step15000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0af66951b4d6ef04647101e0653236fc56fce0696a58f96114bf5811daadb10f
3
+ size 18621877
ckpt_step15000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.582241750710105e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.3958851148514928e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015343368606173897,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0013366482003336628,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.4852239374053823e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0017223362461657786,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0012296341619023,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.452415333213305e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.0018171136152895229,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.0015010006163871945,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.5992701812381174e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0018112181527277429,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.001341285394764397,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.5114929896950427e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.0016495684787852368,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0014644906594866655,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.6378242346764618e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0018133741278217345,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0014244531166704433,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.4009069472894002e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0016187947649524718,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.0013597113259064683,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.4961687126861084e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0018500898850115077,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.001232206548185874,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3731041172674875e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.0015964233113894123,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0012714107992950199,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4444463112646798e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.002053559843842514,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0010909055245073573,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.3806844722933192e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0016286599203226885,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.001324042756619491,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.445963035969241e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.0015793989107234082,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0008934699492999699,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1484186113718572e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.03697070933128537,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step18000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff7a79363df7054783fad6a1fb0174ac603774bf29cc1384ccc8318b294ba9d6
3
+ size 6553752
ckpt_step18000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ada684c71be4d528dd460713843dd92b5159883b21bf71d32644f6e2f4fd22a3
3
+ size 18621877
ckpt_step18000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.5842506579341274e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.396422001006635e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015402755013470386,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.001352025612025824,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.489702204585635e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0017307295892051637,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0012422978022775125,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.4520225451421528e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.0018183867845817626,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.0014860432531759306,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.6026017744147062e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0018476695609712975,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0013701977540996882,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.5177405084982492e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.0016937003209704302,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0014701769191318422,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.6480374350692388e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0018358808651553176,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.00145576925623706,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.404315136982035e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0016675239086959117,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.0013738338720949428,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.5098767889956164e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.001854907301360303,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0012564356016231146,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.380026225424442e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.0016112130072042509,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0012746360207036536,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4544013003965002e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.0020856668009402056,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0011273897857047254,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.3878268487246175e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.001633894099857,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0013039615059331458,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.4475491096530794e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.001573194747727762,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0009173383242371325,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1474316673942336e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.036991442361108975,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step21000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:739bc02c58ebed77db45b560270276dbfd1ec642f109107b856f29a7cb88b6f7
3
+ size 6553752
ckpt_step21000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aded99d0c73ed0e84e348fcf083967bb482a4f6d8c66ad9de0d5942a25bf25b
3
+ size 18621877
ckpt_step21000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.5853660707463055e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.396120162416502e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015464579579267726,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0013568891398419358,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.48971314694507e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0017299298544437678,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.001237895037138631,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.4525889898786262e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.001812825248829098,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.0014997929183386254,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.6042203909597218e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0018547830467957023,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0013789747727068202,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.520067678343831e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.001711166359201266,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0014667620892469151,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.6531287585949872e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0018410505121017817,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0014701665939177867,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.4067345352857128e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0016811375860962155,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.001386186230732775,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.5147292280251253e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0018486293365505549,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0012678662002425305,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.383779454710694e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.0016329259937073114,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.00127607732672023,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4582359578899773e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.002102405615571991,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0011325541527121836,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.3892125493330852e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0016303469195009968,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0012970737668342289,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.4459195507486286e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.001586161808597835,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.000924753881238652,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.14727286107386e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.03700295403612512,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step24000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c7391f01d8b412b76cd55984b39f17d77cdde5e90734057ccd08e9e94eef892
3
+ size 6553752
ckpt_step24000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7316911a561d71de1b1419bbbf8a57c6a8be00285c9d2662327a8a066a34115e
3
+ size 18621877
ckpt_step24000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.586699036350945e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.3956053030887944e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015463161035654863,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.001357492108876386,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.4900598918414562e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0017312376366694893,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0012420365039627194,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.4530813960532073e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.0018099663378554817,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.0015023619019948404,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.6050929375432511e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0018571704005501213,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0013809318700982642,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.5210742333032993e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.0017202362384252472,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0014654186380999106,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.6545653340694e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0018387726290827473,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0014700526645672414,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.4072503893733692e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0016881805554018997,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.001386279509655209,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.5165899975633548e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0018448526548457708,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.00126676105034265,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3851478180743423e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.0016373029458110538,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.001275305047641664,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4592150858965758e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.0021044990527217594,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0011363571167800181,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.388911705502901e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0016319394664370919,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.001292794317604125,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.445249935194624e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.0015836343604617982,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0009252550647822703,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1470058390819299e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.03701671097587045,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step27000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a251e00dcd4f488f8732bfc40679f17be0289dc20d265b0285e1b88628842d
3
+ size 6553752
ckpt_step27000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c30f02797a827a0122b4d19f9b16c7948b06eb61425b7a29b3e832e0a532bc85
3
+ size 18621877
ckpt_step27000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.58738843859461e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.3957144424660175e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015462274005901906,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0013571834084425197,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.4902280062727778e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0017331310227969312,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0012420143282188955,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.45296898817901e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.001811304978391392,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.001503237785209896,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.6053171848573894e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0018566600064462376,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0013811127960081926,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.521516901480448e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.0017214831253544365,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0014642733606635868,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.6548252506331256e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.001838694603317441,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.001470265387443182,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.4074115404850503e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0016881379638939203,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.0013850321533981083,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.516958058744355e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0018397353379614521,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0012651419863795484,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3853096797288439e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.0016353369311892891,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0012745740694193202,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4596549119024427e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.0021032799734713335,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0011359815371187465,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.388664720818508e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0016273016853464682,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0012909373043898335,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.4445254657348794e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.001582141193710988,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0009270950648597656,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1469122605924748e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.03702382598701057,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ac71d1ac78ca8be0cbbd6e2783d0a781173c16c38502b7be8c8d65dae12934c
3
+ size 6553752
ckpt_step3000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f3a1458b4a747fa6c9ddd781e5525917c92fb1a791d5a254fc004f9154149f6
3
+ size 18621877
ckpt_step3000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.5722721575251916e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.3538360433878532e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0013009633219017568,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0009806638006958249,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.3978162281289483e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0012522846309088159,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0008591893586456118,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.3831941095351961e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.001282494329923795,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.0011245226614926556,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.4735743036467565e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.001335447659535746,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0009407425338697058,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.4124889373715176e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.001387487442367734,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0012036952295510599,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.4793962073005056e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0015438962488531877,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0010037684617888083,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.3520645180278738e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0011730166719523753,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.0010373295140886681,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.3207615540723584e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0011169617888365016,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0008502002038224287,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3003884439983394e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.001148089610578582,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0010316128654612143,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.3064186788070484e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.0011656039895843145,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0006925634595222173,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.302203312470367e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0012216476088303093,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0009262990260556739,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.308989564840049e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.0009419608504622752,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0005326317604001864,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1641727667871719e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.036867817634565696,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step30000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f22b068f9b345afd39d4328d0b686852b1e8a6e16a519f9f5b2e0c5f87bd70a
3
+ size 6553752
ckpt_step30000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5fc58e620c5a584a59a95945f098e82896fd15e9215faeaf46d7476f082a638
3
+ size 18621877
ckpt_step30000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.5875506936873147e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.3957357587506314e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015460898875120869,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0013567888210008275,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.490188357983396e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0017328931908777191,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0012420368559586533,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.4529456823744988e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.0018116695288467406,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.001503234147918581,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.6053559804953867e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0018565433784602007,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.00138081676742794,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.521564507849419e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.001721999738053149,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0014643287413571575,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.6548077712797422e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0018386093029695045,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0014701377302512222,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.4073107855131086e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.001687991181589562,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.001384218338799364,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.516973690686405e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0018394041097878283,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0012654396576074916,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3852953267638705e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.001635414487626684,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0012745447364248441,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4596075897505998e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.00210327011758519,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0011355915256241926,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.3887219905698373e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0016268933700633612,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0012903153275749626,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.444489654376728e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.0015825252212746687,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0009272237193735378,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.146917163337936e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.03702550054899922,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step6000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:037ba892a08ad1e84455806a22915daefa26625da772d076cd5ea92ce6e1eba4
3
+ size 6553752
ckpt_step6000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:024c3e07c9bdf3c4197efdde242814e8697b4bc7a7ee4f17a9a8ffb3cc6a8256
3
+ size 18621877
ckpt_step6000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.576527897602447e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.378099091186781e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0014341432127296268,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0012399422454891053,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.444702817222867e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.0015248707513040257,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.0010049688852547473,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.4253803157631853e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.0015096906919986994,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.001277954873030655,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.541712233740787e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0015895651399489275,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0012072392419436597,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.4674046603366728e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.0015400529225849543,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.001324677874615887,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.5549068718082285e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.0017806409699579675,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0012346178376637857,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.3762538114820376e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0013776558439431979,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.0011256408352420836,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.3971424914265849e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0014574954442959883,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0010178668375898346,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3411574015166323e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.0013714578995423804,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0011894033733184694,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.3673849581055776e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.0016128875691929013,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0008645662725310438,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.342961611846353e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0014681386280116884,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0011316316887019,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.3784712735161398e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.0012796107792348186,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0006700524395714406,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1577189063489027e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.03691173921784649,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }
ckpt_step9000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23a7779c7bca4d7e8b8f0d3bbe21ff4d9dc6b8b34dac15e7506a274e411c3d70
3
+ size 6553752
ckpt_step9000/model_fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42ebfba98fcf1b33494c782f205b0f53c1097661012e1037fdf2ac5dfbfbffe5
3
+ size 18621877
ckpt_step9000/model_scales.json ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tok_emb.weight": {
3
+ "scale": 3.578842033577095e-05,
4
+ "nbits": 18,
5
+ "dtype": "int32",
6
+ "emulated": true
7
+ },
8
+ "pos_emb.weight": {
9
+ "scale": 3.314935944296658e-05,
10
+ "nbits": 18,
11
+ "dtype": "int32",
12
+ "emulated": true
13
+ },
14
+ "blocks.0.norm1.weight": {
15
+ "scale": 1.394089999469874e-07,
16
+ "nbits": 24,
17
+ "dtype": "int32",
18
+ "emulated": true
19
+ },
20
+ "blocks.0.attn.mask": {
21
+ "scale": 0.007874015826771653,
22
+ "nbits": 8,
23
+ "dtype": "int8",
24
+ "emulated": false
25
+ },
26
+ "blocks.0.attn.W_qkv.weight": {
27
+ "scale": 0.0015216064583467499,
28
+ "nbits": 8,
29
+ "dtype": "int8",
30
+ "emulated": false
31
+ },
32
+ "blocks.0.attn.W_o.weight": {
33
+ "scale": 0.0012548967927328996,
34
+ "nbits": 8,
35
+ "dtype": "int8",
36
+ "emulated": false
37
+ },
38
+ "blocks.0.norm2.weight": {
39
+ "scale": 1.46743095042103e-07,
40
+ "nbits": 24,
41
+ "dtype": "int32",
42
+ "emulated": true
43
+ },
44
+ "blocks.0.ff.fc1.weight": {
45
+ "scale": 0.001648169412280706,
46
+ "nbits": 8,
47
+ "dtype": "int8",
48
+ "emulated": false
49
+ },
50
+ "blocks.0.ff.fc2.weight": {
51
+ "scale": 0.001054338192269633,
52
+ "nbits": 8,
53
+ "dtype": "int8",
54
+ "emulated": false
55
+ },
56
+ "blocks.1.norm1.weight": {
57
+ "scale": 1.4451383799718111e-07,
58
+ "nbits": 24,
59
+ "dtype": "int32",
60
+ "emulated": true
61
+ },
62
+ "blocks.1.attn.mask": {
63
+ "scale": 0.007874015826771653,
64
+ "nbits": 8,
65
+ "dtype": "int8",
66
+ "emulated": false
67
+ },
68
+ "blocks.1.attn.W_qkv.weight": {
69
+ "scale": 0.0016650045564663503,
70
+ "nbits": 8,
71
+ "dtype": "int8",
72
+ "emulated": false
73
+ },
74
+ "blocks.1.attn.W_o.weight": {
75
+ "scale": 0.0013984300572910458,
76
+ "nbits": 8,
77
+ "dtype": "int8",
78
+ "emulated": false
79
+ },
80
+ "blocks.1.norm2.weight": {
81
+ "scale": 1.5704104898250477e-07,
82
+ "nbits": 24,
83
+ "dtype": "int32",
84
+ "emulated": true
85
+ },
86
+ "blocks.1.ff.fc1.weight": {
87
+ "scale": 0.0017093292924232181,
88
+ "nbits": 8,
89
+ "dtype": "int8",
90
+ "emulated": false
91
+ },
92
+ "blocks.1.ff.fc2.weight": {
93
+ "scale": 0.0012757184081998209,
94
+ "nbits": 8,
95
+ "dtype": "int8",
96
+ "emulated": false
97
+ },
98
+ "blocks.2.norm1.weight": {
99
+ "scale": 1.4974127310561757e-07,
100
+ "nbits": 24,
101
+ "dtype": "int32",
102
+ "emulated": true
103
+ },
104
+ "blocks.2.attn.mask": {
105
+ "scale": 0.007874015826771653,
106
+ "nbits": 8,
107
+ "dtype": "int8",
108
+ "emulated": false
109
+ },
110
+ "blocks.2.attn.W_qkv.weight": {
111
+ "scale": 0.0015654190402200653,
112
+ "nbits": 8,
113
+ "dtype": "int8",
114
+ "emulated": false
115
+ },
116
+ "blocks.2.attn.W_o.weight": {
117
+ "scale": 0.0013663089030240276,
118
+ "nbits": 8,
119
+ "dtype": "int8",
120
+ "emulated": false
121
+ },
122
+ "blocks.2.norm2.weight": {
123
+ "scale": 1.5941392094229872e-07,
124
+ "nbits": 24,
125
+ "dtype": "int32",
126
+ "emulated": true
127
+ },
128
+ "blocks.2.ff.fc1.weight": {
129
+ "scale": 0.001793610729463532,
130
+ "nbits": 8,
131
+ "dtype": "int8",
132
+ "emulated": false
133
+ },
134
+ "blocks.2.ff.fc2.weight": {
135
+ "scale": 0.0013356635503750899,
136
+ "nbits": 8,
137
+ "dtype": "int8",
138
+ "emulated": false
139
+ },
140
+ "blocks.3.norm1.weight": {
141
+ "scale": 1.3939665071276776e-07,
142
+ "nbits": 24,
143
+ "dtype": "int32",
144
+ "emulated": true
145
+ },
146
+ "blocks.3.attn.mask": {
147
+ "scale": 0.007874015826771653,
148
+ "nbits": 8,
149
+ "dtype": "int8",
150
+ "emulated": false
151
+ },
152
+ "blocks.3.attn.W_qkv.weight": {
153
+ "scale": 0.0014945593254642035,
154
+ "nbits": 8,
155
+ "dtype": "int8",
156
+ "emulated": false
157
+ },
158
+ "blocks.3.attn.W_o.weight": {
159
+ "scale": 0.0012709688097342544,
160
+ "nbits": 8,
161
+ "dtype": "int8",
162
+ "emulated": false
163
+ },
164
+ "blocks.3.norm2.weight": {
165
+ "scale": 1.4363823607952405e-07,
166
+ "nbits": 24,
167
+ "dtype": "int32",
168
+ "emulated": true
169
+ },
170
+ "blocks.3.ff.fc1.weight": {
171
+ "scale": 0.0017127885911277711,
172
+ "nbits": 8,
173
+ "dtype": "int8",
174
+ "emulated": false
175
+ },
176
+ "blocks.3.ff.fc2.weight": {
177
+ "scale": 0.0011641868539551863,
178
+ "nbits": 8,
179
+ "dtype": "int8",
180
+ "emulated": false
181
+ },
182
+ "blocks.4.norm1.weight": {
183
+ "scale": 1.3554279435228193e-07,
184
+ "nbits": 24,
185
+ "dtype": "int32",
186
+ "emulated": true
187
+ },
188
+ "blocks.4.attn.mask": {
189
+ "scale": 0.007874015826771653,
190
+ "nbits": 8,
191
+ "dtype": "int8",
192
+ "emulated": false
193
+ },
194
+ "blocks.4.attn.W_qkv.weight": {
195
+ "scale": 0.001422968632485998,
196
+ "nbits": 8,
197
+ "dtype": "int8",
198
+ "emulated": false
199
+ },
200
+ "blocks.4.attn.W_o.weight": {
201
+ "scale": 0.0012153839585218654,
202
+ "nbits": 8,
203
+ "dtype": "int8",
204
+ "emulated": false
205
+ },
206
+ "blocks.4.norm2.weight": {
207
+ "scale": 1.4058388250062366e-07,
208
+ "nbits": 24,
209
+ "dtype": "int32",
210
+ "emulated": true
211
+ },
212
+ "blocks.4.ff.fc1.weight": {
213
+ "scale": 0.001863799539969737,
214
+ "nbits": 8,
215
+ "dtype": "int8",
216
+ "emulated": false
217
+ },
218
+ "blocks.4.ff.fc2.weight": {
219
+ "scale": 0.0009441251561266981,
220
+ "nbits": 8,
221
+ "dtype": "int8",
222
+ "emulated": false
223
+ },
224
+ "blocks.5.norm1.weight": {
225
+ "scale": 1.3613868398323536e-07,
226
+ "nbits": 24,
227
+ "dtype": "int32",
228
+ "emulated": true
229
+ },
230
+ "blocks.5.attn.mask": {
231
+ "scale": 0.007874015826771653,
232
+ "nbits": 8,
233
+ "dtype": "int8",
234
+ "emulated": false
235
+ },
236
+ "blocks.5.attn.W_qkv.weight": {
237
+ "scale": 0.0015208560030160738,
238
+ "nbits": 8,
239
+ "dtype": "int8",
240
+ "emulated": false
241
+ },
242
+ "blocks.5.attn.W_o.weight": {
243
+ "scale": 0.0012394373659881833,
244
+ "nbits": 8,
245
+ "dtype": "int8",
246
+ "emulated": false
247
+ },
248
+ "blocks.5.norm2.weight": {
249
+ "scale": 1.4058437988059799e-07,
250
+ "nbits": 24,
251
+ "dtype": "int32",
252
+ "emulated": true
253
+ },
254
+ "blocks.5.ff.fc1.weight": {
255
+ "scale": 0.0013886798872591755,
256
+ "nbits": 8,
257
+ "dtype": "int8",
258
+ "emulated": false
259
+ },
260
+ "blocks.5.ff.fc2.weight": {
261
+ "scale": 0.0007607480638478496,
262
+ "nbits": 8,
263
+ "dtype": "int8",
264
+ "emulated": false
265
+ },
266
+ "norm_f.weight": {
267
+ "scale": 1.1516335334173259e-07,
268
+ "nbits": 24,
269
+ "dtype": "int32",
270
+ "emulated": true
271
+ },
272
+ "lm_head.weight": {
273
+ "scale": 0.03693562237661287,
274
+ "nbits": 8,
275
+ "dtype": "int8",
276
+ "emulated": false
277
+ }
278
+ }