Upload 30 files
Browse files- ckpt_step12000/model.safetensors +3 -0
- ckpt_step12000/model_fp32.pt +3 -0
- ckpt_step12000/model_scales.json +278 -0
- ckpt_step15000/model.safetensors +3 -0
- ckpt_step15000/model_fp32.pt +3 -0
- ckpt_step15000/model_scales.json +278 -0
- ckpt_step18000/model.safetensors +3 -0
- ckpt_step18000/model_fp32.pt +3 -0
- ckpt_step18000/model_scales.json +278 -0
- ckpt_step21000/model.safetensors +3 -0
- ckpt_step21000/model_fp32.pt +3 -0
- ckpt_step21000/model_scales.json +278 -0
- ckpt_step24000/model.safetensors +3 -0
- ckpt_step24000/model_fp32.pt +3 -0
- ckpt_step24000/model_scales.json +278 -0
- ckpt_step27000/model.safetensors +3 -0
- ckpt_step27000/model_fp32.pt +3 -0
- ckpt_step27000/model_scales.json +278 -0
- ckpt_step3000/model.safetensors +3 -0
- ckpt_step3000/model_fp32.pt +3 -0
- ckpt_step3000/model_scales.json +278 -0
- ckpt_step30000/model.safetensors +3 -0
- ckpt_step30000/model_fp32.pt +3 -0
- ckpt_step30000/model_scales.json +278 -0
- ckpt_step6000/model.safetensors +3 -0
- ckpt_step6000/model_fp32.pt +3 -0
- ckpt_step6000/model_scales.json +278 -0
- ckpt_step9000/model.safetensors +3 -0
- ckpt_step9000/model_fp32.pt +3 -0
- ckpt_step9000/model_scales.json +278 -0
ckpt_step12000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34114e7d2aa584b8950f67e2e6eb754ea58da6ee90b294f0e36f6acd2da2b92e
|
| 3 |
+
size 6553752
|
ckpt_step12000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7bc77055e05d83a8648b455ca79b09a5c47e0d74cc6e11b8960ac458bdb0fe6d
|
| 3 |
+
size 18621877
|
ckpt_step12000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.581811738334373e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.392214592749543e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015330615793495478,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0013256267383211992,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.4815048141745128e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0016550163198513121,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0011460702099068709,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.448621745094851e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.0017662248591607762,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.0014458430882383331,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.5898866106425138e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0017807006919347208,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0013159806417537298,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.5102813720775883e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.001635105435196884,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0014829091987141286,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.6232451749776298e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0018184544851330134,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0014100814746928778,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.3923730437985064e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.001558237501848476,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.001297427992075522,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.4737556341745387e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.001819704540025606,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0011787184194186353,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3685434271200611e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.0015505155323866596,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0012525476892032773,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4270414227688646e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.001975340480772604,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0010307410888653853,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.3767226276349794e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0016244626034771176,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0013419555950221864,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.426349069844605e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.0015124659452720702,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0008482409357893936,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1492330355526724e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.036966271366553116,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step15000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b074b825f1f62d689ce477e406da6b3ddd50e55619a27e6ba5a6dbd971a5670
|
| 3 |
+
size 6553752
|
ckpt_step15000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0af66951b4d6ef04647101e0653236fc56fce0696a58f96114bf5811daadb10f
|
| 3 |
+
size 18621877
|
ckpt_step15000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.582241750710105e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.3958851148514928e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015343368606173897,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0013366482003336628,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.4852239374053823e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0017223362461657786,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0012296341619023,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.452415333213305e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.0018171136152895229,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.0015010006163871945,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.5992701812381174e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0018112181527277429,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.001341285394764397,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.5114929896950427e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.0016495684787852368,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0014644906594866655,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.6378242346764618e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0018133741278217345,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0014244531166704433,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.4009069472894002e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0016187947649524718,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.0013597113259064683,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.4961687126861084e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0018500898850115077,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.001232206548185874,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3731041172674875e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.0015964233113894123,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0012714107992950199,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4444463112646798e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.002053559843842514,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0010909055245073573,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.3806844722933192e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0016286599203226885,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.001324042756619491,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.445963035969241e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.0015793989107234082,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0008934699492999699,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1484186113718572e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.03697070933128537,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step18000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff7a79363df7054783fad6a1fb0174ac603774bf29cc1384ccc8318b294ba9d6
|
| 3 |
+
size 6553752
|
ckpt_step18000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ada684c71be4d528dd460713843dd92b5159883b21bf71d32644f6e2f4fd22a3
|
| 3 |
+
size 18621877
|
ckpt_step18000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.5842506579341274e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.396422001006635e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015402755013470386,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.001352025612025824,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.489702204585635e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0017307295892051637,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0012422978022775125,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.4520225451421528e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.0018183867845817626,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.0014860432531759306,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.6026017744147062e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0018476695609712975,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0013701977540996882,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.5177405084982492e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.0016937003209704302,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0014701769191318422,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.6480374350692388e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0018358808651553176,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.00145576925623706,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.404315136982035e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0016675239086959117,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.0013738338720949428,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.5098767889956164e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.001854907301360303,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0012564356016231146,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.380026225424442e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.0016112130072042509,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0012746360207036536,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4544013003965002e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.0020856668009402056,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0011273897857047254,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.3878268487246175e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.001633894099857,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0013039615059331458,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.4475491096530794e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.001573194747727762,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0009173383242371325,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1474316673942336e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.036991442361108975,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step21000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:739bc02c58ebed77db45b560270276dbfd1ec642f109107b856f29a7cb88b6f7
|
| 3 |
+
size 6553752
|
ckpt_step21000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6aded99d0c73ed0e84e348fcf083967bb482a4f6d8c66ad9de0d5942a25bf25b
|
| 3 |
+
size 18621877
|
ckpt_step21000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.5853660707463055e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.396120162416502e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015464579579267726,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0013568891398419358,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.48971314694507e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0017299298544437678,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.001237895037138631,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.4525889898786262e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.001812825248829098,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.0014997929183386254,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.6042203909597218e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0018547830467957023,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0013789747727068202,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.520067678343831e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.001711166359201266,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0014667620892469151,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.6531287585949872e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0018410505121017817,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0014701665939177867,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.4067345352857128e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0016811375860962155,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.001386186230732775,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.5147292280251253e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0018486293365505549,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0012678662002425305,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.383779454710694e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.0016329259937073114,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.00127607732672023,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4582359578899773e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.002102405615571991,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0011325541527121836,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.3892125493330852e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0016303469195009968,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0012970737668342289,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.4459195507486286e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.001586161808597835,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.000924753881238652,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.14727286107386e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.03700295403612512,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step24000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c7391f01d8b412b76cd55984b39f17d77cdde5e90734057ccd08e9e94eef892
|
| 3 |
+
size 6553752
|
ckpt_step24000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7316911a561d71de1b1419bbbf8a57c6a8be00285c9d2662327a8a066a34115e
|
| 3 |
+
size 18621877
|
ckpt_step24000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.586699036350945e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.3956053030887944e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015463161035654863,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.001357492108876386,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.4900598918414562e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0017312376366694893,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0012420365039627194,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.4530813960532073e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.0018099663378554817,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.0015023619019948404,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.6050929375432511e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0018571704005501213,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0013809318700982642,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.5210742333032993e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.0017202362384252472,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0014654186380999106,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.6545653340694e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0018387726290827473,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0014700526645672414,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.4072503893733692e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0016881805554018997,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.001386279509655209,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.5165899975633548e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0018448526548457708,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.00126676105034265,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3851478180743423e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.0016373029458110538,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.001275305047641664,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4592150858965758e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.0021044990527217594,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0011363571167800181,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.388911705502901e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0016319394664370919,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.001292794317604125,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.445249935194624e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.0015836343604617982,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0009252550647822703,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1470058390819299e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.03701671097587045,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step27000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96a251e00dcd4f488f8732bfc40679f17be0289dc20d265b0285e1b88628842d
|
| 3 |
+
size 6553752
|
ckpt_step27000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c30f02797a827a0122b4d19f9b16c7948b06eb61425b7a29b3e832e0a532bc85
|
| 3 |
+
size 18621877
|
ckpt_step27000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.58738843859461e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.3957144424660175e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015462274005901906,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0013571834084425197,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.4902280062727778e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0017331310227969312,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0012420143282188955,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.45296898817901e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.001811304978391392,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.001503237785209896,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.6053171848573894e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0018566600064462376,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0013811127960081926,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.521516901480448e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.0017214831253544365,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0014642733606635868,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.6548252506331256e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.001838694603317441,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.001470265387443182,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.4074115404850503e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0016881379638939203,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.0013850321533981083,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.516958058744355e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0018397353379614521,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0012651419863795484,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3853096797288439e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.0016353369311892891,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0012745740694193202,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4596549119024427e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.0021032799734713335,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0011359815371187465,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.388664720818508e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0016273016853464682,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0012909373043898335,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.4445254657348794e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.001582141193710988,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0009270950648597656,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1469122605924748e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.03702382598701057,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step3000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ac71d1ac78ca8be0cbbd6e2783d0a781173c16c38502b7be8c8d65dae12934c
|
| 3 |
+
size 6553752
|
ckpt_step3000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f3a1458b4a747fa6c9ddd781e5525917c92fb1a791d5a254fc004f9154149f6
|
| 3 |
+
size 18621877
|
ckpt_step3000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.5722721575251916e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.3538360433878532e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0013009633219017568,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0009806638006958249,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.3978162281289483e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0012522846309088159,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0008591893586456118,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.3831941095351961e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.001282494329923795,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.0011245226614926556,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.4735743036467565e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.001335447659535746,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0009407425338697058,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.4124889373715176e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.001387487442367734,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0012036952295510599,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.4793962073005056e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0015438962488531877,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0010037684617888083,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.3520645180278738e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0011730166719523753,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.0010373295140886681,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.3207615540723584e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0011169617888365016,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0008502002038224287,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3003884439983394e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.001148089610578582,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0010316128654612143,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.3064186788070484e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.0011656039895843145,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0006925634595222173,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.302203312470367e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0012216476088303093,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0009262990260556739,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.308989564840049e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.0009419608504622752,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0005326317604001864,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1641727667871719e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.036867817634565696,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step30000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f22b068f9b345afd39d4328d0b686852b1e8a6e16a519f9f5b2e0c5f87bd70a
|
| 3 |
+
size 6553752
|
ckpt_step30000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5fc58e620c5a584a59a95945f098e82896fd15e9215faeaf46d7476f082a638
|
| 3 |
+
size 18621877
|
ckpt_step30000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.5875506936873147e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.3957357587506314e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015460898875120869,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0013567888210008275,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.490188357983396e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0017328931908777191,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0012420368559586533,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.4529456823744988e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.0018116695288467406,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.001503234147918581,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.6053559804953867e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0018565433784602007,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.00138081676742794,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.521564507849419e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.001721999738053149,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0014643287413571575,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.6548077712797422e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0018386093029695045,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0014701377302512222,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.4073107855131086e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.001687991181589562,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.001384218338799364,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.516973690686405e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0018394041097878283,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0012654396576074916,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3852953267638705e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.001635414487626684,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0012745447364248441,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4596075897505998e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.00210327011758519,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0011355915256241926,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.3887219905698373e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0016268933700633612,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0012903153275749626,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.444489654376728e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.0015825252212746687,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0009272237193735378,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.146917163337936e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.03702550054899922,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step6000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:037ba892a08ad1e84455806a22915daefa26625da772d076cd5ea92ce6e1eba4
|
| 3 |
+
size 6553752
|
ckpt_step6000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:024c3e07c9bdf3c4197efdde242814e8697b4bc7a7ee4f17a9a8ffb3cc6a8256
|
| 3 |
+
size 18621877
|
ckpt_step6000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.576527897602447e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.378099091186781e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0014341432127296268,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0012399422454891053,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.444702817222867e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.0015248707513040257,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.0010049688852547473,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.4253803157631853e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.0015096906919986994,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.001277954873030655,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.541712233740787e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0015895651399489275,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0012072392419436597,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.4674046603366728e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.0015400529225849543,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.001324677874615887,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.5549068718082285e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.0017806409699579675,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0012346178376637857,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.3762538114820376e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0013776558439431979,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.0011256408352420836,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.3971424914265849e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0014574954442959883,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0010178668375898346,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3411574015166323e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.0013714578995423804,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0011894033733184694,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.3673849581055776e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.0016128875691929013,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0008645662725310438,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.342961611846353e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0014681386280116884,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0011316316887019,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.3784712735161398e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.0012796107792348186,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0006700524395714406,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1577189063489027e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.03691173921784649,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|
ckpt_step9000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23a7779c7bca4d7e8b8f0d3bbe21ff4d9dc6b8b34dac15e7506a274e411c3d70
|
| 3 |
+
size 6553752
|
ckpt_step9000/model_fp32.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42ebfba98fcf1b33494c782f205b0f53c1097661012e1037fdf2ac5dfbfbffe5
|
| 3 |
+
size 18621877
|
ckpt_step9000/model_scales.json
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tok_emb.weight": {
|
| 3 |
+
"scale": 3.578842033577095e-05,
|
| 4 |
+
"nbits": 18,
|
| 5 |
+
"dtype": "int32",
|
| 6 |
+
"emulated": true
|
| 7 |
+
},
|
| 8 |
+
"pos_emb.weight": {
|
| 9 |
+
"scale": 3.314935944296658e-05,
|
| 10 |
+
"nbits": 18,
|
| 11 |
+
"dtype": "int32",
|
| 12 |
+
"emulated": true
|
| 13 |
+
},
|
| 14 |
+
"blocks.0.norm1.weight": {
|
| 15 |
+
"scale": 1.394089999469874e-07,
|
| 16 |
+
"nbits": 24,
|
| 17 |
+
"dtype": "int32",
|
| 18 |
+
"emulated": true
|
| 19 |
+
},
|
| 20 |
+
"blocks.0.attn.mask": {
|
| 21 |
+
"scale": 0.007874015826771653,
|
| 22 |
+
"nbits": 8,
|
| 23 |
+
"dtype": "int8",
|
| 24 |
+
"emulated": false
|
| 25 |
+
},
|
| 26 |
+
"blocks.0.attn.W_qkv.weight": {
|
| 27 |
+
"scale": 0.0015216064583467499,
|
| 28 |
+
"nbits": 8,
|
| 29 |
+
"dtype": "int8",
|
| 30 |
+
"emulated": false
|
| 31 |
+
},
|
| 32 |
+
"blocks.0.attn.W_o.weight": {
|
| 33 |
+
"scale": 0.0012548967927328996,
|
| 34 |
+
"nbits": 8,
|
| 35 |
+
"dtype": "int8",
|
| 36 |
+
"emulated": false
|
| 37 |
+
},
|
| 38 |
+
"blocks.0.norm2.weight": {
|
| 39 |
+
"scale": 1.46743095042103e-07,
|
| 40 |
+
"nbits": 24,
|
| 41 |
+
"dtype": "int32",
|
| 42 |
+
"emulated": true
|
| 43 |
+
},
|
| 44 |
+
"blocks.0.ff.fc1.weight": {
|
| 45 |
+
"scale": 0.001648169412280706,
|
| 46 |
+
"nbits": 8,
|
| 47 |
+
"dtype": "int8",
|
| 48 |
+
"emulated": false
|
| 49 |
+
},
|
| 50 |
+
"blocks.0.ff.fc2.weight": {
|
| 51 |
+
"scale": 0.001054338192269633,
|
| 52 |
+
"nbits": 8,
|
| 53 |
+
"dtype": "int8",
|
| 54 |
+
"emulated": false
|
| 55 |
+
},
|
| 56 |
+
"blocks.1.norm1.weight": {
|
| 57 |
+
"scale": 1.4451383799718111e-07,
|
| 58 |
+
"nbits": 24,
|
| 59 |
+
"dtype": "int32",
|
| 60 |
+
"emulated": true
|
| 61 |
+
},
|
| 62 |
+
"blocks.1.attn.mask": {
|
| 63 |
+
"scale": 0.007874015826771653,
|
| 64 |
+
"nbits": 8,
|
| 65 |
+
"dtype": "int8",
|
| 66 |
+
"emulated": false
|
| 67 |
+
},
|
| 68 |
+
"blocks.1.attn.W_qkv.weight": {
|
| 69 |
+
"scale": 0.0016650045564663503,
|
| 70 |
+
"nbits": 8,
|
| 71 |
+
"dtype": "int8",
|
| 72 |
+
"emulated": false
|
| 73 |
+
},
|
| 74 |
+
"blocks.1.attn.W_o.weight": {
|
| 75 |
+
"scale": 0.0013984300572910458,
|
| 76 |
+
"nbits": 8,
|
| 77 |
+
"dtype": "int8",
|
| 78 |
+
"emulated": false
|
| 79 |
+
},
|
| 80 |
+
"blocks.1.norm2.weight": {
|
| 81 |
+
"scale": 1.5704104898250477e-07,
|
| 82 |
+
"nbits": 24,
|
| 83 |
+
"dtype": "int32",
|
| 84 |
+
"emulated": true
|
| 85 |
+
},
|
| 86 |
+
"blocks.1.ff.fc1.weight": {
|
| 87 |
+
"scale": 0.0017093292924232181,
|
| 88 |
+
"nbits": 8,
|
| 89 |
+
"dtype": "int8",
|
| 90 |
+
"emulated": false
|
| 91 |
+
},
|
| 92 |
+
"blocks.1.ff.fc2.weight": {
|
| 93 |
+
"scale": 0.0012757184081998209,
|
| 94 |
+
"nbits": 8,
|
| 95 |
+
"dtype": "int8",
|
| 96 |
+
"emulated": false
|
| 97 |
+
},
|
| 98 |
+
"blocks.2.norm1.weight": {
|
| 99 |
+
"scale": 1.4974127310561757e-07,
|
| 100 |
+
"nbits": 24,
|
| 101 |
+
"dtype": "int32",
|
| 102 |
+
"emulated": true
|
| 103 |
+
},
|
| 104 |
+
"blocks.2.attn.mask": {
|
| 105 |
+
"scale": 0.007874015826771653,
|
| 106 |
+
"nbits": 8,
|
| 107 |
+
"dtype": "int8",
|
| 108 |
+
"emulated": false
|
| 109 |
+
},
|
| 110 |
+
"blocks.2.attn.W_qkv.weight": {
|
| 111 |
+
"scale": 0.0015654190402200653,
|
| 112 |
+
"nbits": 8,
|
| 113 |
+
"dtype": "int8",
|
| 114 |
+
"emulated": false
|
| 115 |
+
},
|
| 116 |
+
"blocks.2.attn.W_o.weight": {
|
| 117 |
+
"scale": 0.0013663089030240276,
|
| 118 |
+
"nbits": 8,
|
| 119 |
+
"dtype": "int8",
|
| 120 |
+
"emulated": false
|
| 121 |
+
},
|
| 122 |
+
"blocks.2.norm2.weight": {
|
| 123 |
+
"scale": 1.5941392094229872e-07,
|
| 124 |
+
"nbits": 24,
|
| 125 |
+
"dtype": "int32",
|
| 126 |
+
"emulated": true
|
| 127 |
+
},
|
| 128 |
+
"blocks.2.ff.fc1.weight": {
|
| 129 |
+
"scale": 0.001793610729463532,
|
| 130 |
+
"nbits": 8,
|
| 131 |
+
"dtype": "int8",
|
| 132 |
+
"emulated": false
|
| 133 |
+
},
|
| 134 |
+
"blocks.2.ff.fc2.weight": {
|
| 135 |
+
"scale": 0.0013356635503750899,
|
| 136 |
+
"nbits": 8,
|
| 137 |
+
"dtype": "int8",
|
| 138 |
+
"emulated": false
|
| 139 |
+
},
|
| 140 |
+
"blocks.3.norm1.weight": {
|
| 141 |
+
"scale": 1.3939665071276776e-07,
|
| 142 |
+
"nbits": 24,
|
| 143 |
+
"dtype": "int32",
|
| 144 |
+
"emulated": true
|
| 145 |
+
},
|
| 146 |
+
"blocks.3.attn.mask": {
|
| 147 |
+
"scale": 0.007874015826771653,
|
| 148 |
+
"nbits": 8,
|
| 149 |
+
"dtype": "int8",
|
| 150 |
+
"emulated": false
|
| 151 |
+
},
|
| 152 |
+
"blocks.3.attn.W_qkv.weight": {
|
| 153 |
+
"scale": 0.0014945593254642035,
|
| 154 |
+
"nbits": 8,
|
| 155 |
+
"dtype": "int8",
|
| 156 |
+
"emulated": false
|
| 157 |
+
},
|
| 158 |
+
"blocks.3.attn.W_o.weight": {
|
| 159 |
+
"scale": 0.0012709688097342544,
|
| 160 |
+
"nbits": 8,
|
| 161 |
+
"dtype": "int8",
|
| 162 |
+
"emulated": false
|
| 163 |
+
},
|
| 164 |
+
"blocks.3.norm2.weight": {
|
| 165 |
+
"scale": 1.4363823607952405e-07,
|
| 166 |
+
"nbits": 24,
|
| 167 |
+
"dtype": "int32",
|
| 168 |
+
"emulated": true
|
| 169 |
+
},
|
| 170 |
+
"blocks.3.ff.fc1.weight": {
|
| 171 |
+
"scale": 0.0017127885911277711,
|
| 172 |
+
"nbits": 8,
|
| 173 |
+
"dtype": "int8",
|
| 174 |
+
"emulated": false
|
| 175 |
+
},
|
| 176 |
+
"blocks.3.ff.fc2.weight": {
|
| 177 |
+
"scale": 0.0011641868539551863,
|
| 178 |
+
"nbits": 8,
|
| 179 |
+
"dtype": "int8",
|
| 180 |
+
"emulated": false
|
| 181 |
+
},
|
| 182 |
+
"blocks.4.norm1.weight": {
|
| 183 |
+
"scale": 1.3554279435228193e-07,
|
| 184 |
+
"nbits": 24,
|
| 185 |
+
"dtype": "int32",
|
| 186 |
+
"emulated": true
|
| 187 |
+
},
|
| 188 |
+
"blocks.4.attn.mask": {
|
| 189 |
+
"scale": 0.007874015826771653,
|
| 190 |
+
"nbits": 8,
|
| 191 |
+
"dtype": "int8",
|
| 192 |
+
"emulated": false
|
| 193 |
+
},
|
| 194 |
+
"blocks.4.attn.W_qkv.weight": {
|
| 195 |
+
"scale": 0.001422968632485998,
|
| 196 |
+
"nbits": 8,
|
| 197 |
+
"dtype": "int8",
|
| 198 |
+
"emulated": false
|
| 199 |
+
},
|
| 200 |
+
"blocks.4.attn.W_o.weight": {
|
| 201 |
+
"scale": 0.0012153839585218654,
|
| 202 |
+
"nbits": 8,
|
| 203 |
+
"dtype": "int8",
|
| 204 |
+
"emulated": false
|
| 205 |
+
},
|
| 206 |
+
"blocks.4.norm2.weight": {
|
| 207 |
+
"scale": 1.4058388250062366e-07,
|
| 208 |
+
"nbits": 24,
|
| 209 |
+
"dtype": "int32",
|
| 210 |
+
"emulated": true
|
| 211 |
+
},
|
| 212 |
+
"blocks.4.ff.fc1.weight": {
|
| 213 |
+
"scale": 0.001863799539969737,
|
| 214 |
+
"nbits": 8,
|
| 215 |
+
"dtype": "int8",
|
| 216 |
+
"emulated": false
|
| 217 |
+
},
|
| 218 |
+
"blocks.4.ff.fc2.weight": {
|
| 219 |
+
"scale": 0.0009441251561266981,
|
| 220 |
+
"nbits": 8,
|
| 221 |
+
"dtype": "int8",
|
| 222 |
+
"emulated": false
|
| 223 |
+
},
|
| 224 |
+
"blocks.5.norm1.weight": {
|
| 225 |
+
"scale": 1.3613868398323536e-07,
|
| 226 |
+
"nbits": 24,
|
| 227 |
+
"dtype": "int32",
|
| 228 |
+
"emulated": true
|
| 229 |
+
},
|
| 230 |
+
"blocks.5.attn.mask": {
|
| 231 |
+
"scale": 0.007874015826771653,
|
| 232 |
+
"nbits": 8,
|
| 233 |
+
"dtype": "int8",
|
| 234 |
+
"emulated": false
|
| 235 |
+
},
|
| 236 |
+
"blocks.5.attn.W_qkv.weight": {
|
| 237 |
+
"scale": 0.0015208560030160738,
|
| 238 |
+
"nbits": 8,
|
| 239 |
+
"dtype": "int8",
|
| 240 |
+
"emulated": false
|
| 241 |
+
},
|
| 242 |
+
"blocks.5.attn.W_o.weight": {
|
| 243 |
+
"scale": 0.0012394373659881833,
|
| 244 |
+
"nbits": 8,
|
| 245 |
+
"dtype": "int8",
|
| 246 |
+
"emulated": false
|
| 247 |
+
},
|
| 248 |
+
"blocks.5.norm2.weight": {
|
| 249 |
+
"scale": 1.4058437988059799e-07,
|
| 250 |
+
"nbits": 24,
|
| 251 |
+
"dtype": "int32",
|
| 252 |
+
"emulated": true
|
| 253 |
+
},
|
| 254 |
+
"blocks.5.ff.fc1.weight": {
|
| 255 |
+
"scale": 0.0013886798872591755,
|
| 256 |
+
"nbits": 8,
|
| 257 |
+
"dtype": "int8",
|
| 258 |
+
"emulated": false
|
| 259 |
+
},
|
| 260 |
+
"blocks.5.ff.fc2.weight": {
|
| 261 |
+
"scale": 0.0007607480638478496,
|
| 262 |
+
"nbits": 8,
|
| 263 |
+
"dtype": "int8",
|
| 264 |
+
"emulated": false
|
| 265 |
+
},
|
| 266 |
+
"norm_f.weight": {
|
| 267 |
+
"scale": 1.1516335334173259e-07,
|
| 268 |
+
"nbits": 24,
|
| 269 |
+
"dtype": "int32",
|
| 270 |
+
"emulated": true
|
| 271 |
+
},
|
| 272 |
+
"lm_head.weight": {
|
| 273 |
+
"scale": 0.03693562237661287,
|
| 274 |
+
"nbits": 8,
|
| 275 |
+
"dtype": "int8",
|
| 276 |
+
"emulated": false
|
| 277 |
+
}
|
| 278 |
+
}
|