prince-canuma commited on
Commit
74ecb8a
·
verified ·
1 Parent(s): 284faf2

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: mlx
4
+ base_model: deepseek-ai/DeepSeek-V4-Flash
5
+ tags:
6
+ - mlx
7
+ pipeline_tag: text-generation
8
+ ---
config.json ADDED
@@ -0,0 +1,1402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeepseekV4ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "compress_ratios": [
9
+ 0,
10
+ 0,
11
+ 4,
12
+ 128,
13
+ 4,
14
+ 128,
15
+ 4,
16
+ 128,
17
+ 4,
18
+ 128,
19
+ 4,
20
+ 128,
21
+ 4,
22
+ 128,
23
+ 4,
24
+ 128,
25
+ 4,
26
+ 128,
27
+ 4,
28
+ 128,
29
+ 4,
30
+ 128,
31
+ 4,
32
+ 128,
33
+ 4,
34
+ 128,
35
+ 4,
36
+ 128,
37
+ 4,
38
+ 128,
39
+ 4,
40
+ 128,
41
+ 4,
42
+ 128,
43
+ 4,
44
+ 128,
45
+ 4,
46
+ 128,
47
+ 4,
48
+ 128,
49
+ 4,
50
+ 128,
51
+ 4,
52
+ 0
53
+ ],
54
+ "compress_rope_theta": 160000,
55
+ "eos_token_id": 1,
56
+ "expert_dtype": "fp4",
57
+ "hc_eps": 1e-06,
58
+ "hc_mult": 4,
59
+ "hc_sinkhorn_iters": 20,
60
+ "head_dim": 512,
61
+ "hidden_act": "silu",
62
+ "hidden_size": 4096,
63
+ "index_head_dim": 128,
64
+ "index_n_heads": 64,
65
+ "index_topk": 512,
66
+ "initializer_range": 0.02,
67
+ "max_position_embeddings": 1048576,
68
+ "model_type": "deepseek_v4",
69
+ "moe_intermediate_size": 2048,
70
+ "n_routed_experts": 256,
71
+ "n_shared_experts": 1,
72
+ "norm_topk_prob": true,
73
+ "num_attention_heads": 64,
74
+ "num_experts_per_tok": 6,
75
+ "num_hash_layers": 3,
76
+ "num_hidden_layers": 43,
77
+ "num_key_value_heads": 1,
78
+ "num_nextn_predict_layers": 1,
79
+ "o_groups": 8,
80
+ "o_lora_rank": 1024,
81
+ "q_lora_rank": 1024,
82
+ "qk_rope_head_dim": 64,
83
+ "quantization": {
84
+ "group_size": 64,
85
+ "bits": 8,
86
+ "mode": "affine",
87
+ "model.layers.0.ffn.switch_mlp.gate_proj": {
88
+ "group_size": 64,
89
+ "bits": 3,
90
+ "mode": "affine"
91
+ },
92
+ "model.layers.0.ffn.switch_mlp.up_proj": {
93
+ "group_size": 64,
94
+ "bits": 3,
95
+ "mode": "affine"
96
+ },
97
+ "model.layers.0.ffn.switch_mlp.down_proj": {
98
+ "group_size": 64,
99
+ "bits": 3,
100
+ "mode": "affine"
101
+ },
102
+ "model.layers.1.ffn.switch_mlp.gate_proj": {
103
+ "group_size": 64,
104
+ "bits": 3,
105
+ "mode": "affine"
106
+ },
107
+ "model.layers.1.ffn.switch_mlp.up_proj": {
108
+ "group_size": 64,
109
+ "bits": 3,
110
+ "mode": "affine"
111
+ },
112
+ "model.layers.1.ffn.switch_mlp.down_proj": {
113
+ "group_size": 64,
114
+ "bits": 3,
115
+ "mode": "affine"
116
+ },
117
+ "model.layers.2.ffn.switch_mlp.gate_proj": {
118
+ "group_size": 64,
119
+ "bits": 3,
120
+ "mode": "affine"
121
+ },
122
+ "model.layers.2.ffn.switch_mlp.up_proj": {
123
+ "group_size": 64,
124
+ "bits": 3,
125
+ "mode": "affine"
126
+ },
127
+ "model.layers.2.ffn.switch_mlp.down_proj": {
128
+ "group_size": 64,
129
+ "bits": 3,
130
+ "mode": "affine"
131
+ },
132
+ "model.layers.3.ffn.switch_mlp.gate_proj": {
133
+ "group_size": 64,
134
+ "bits": 3,
135
+ "mode": "affine"
136
+ },
137
+ "model.layers.3.ffn.switch_mlp.up_proj": {
138
+ "group_size": 64,
139
+ "bits": 3,
140
+ "mode": "affine"
141
+ },
142
+ "model.layers.3.ffn.switch_mlp.down_proj": {
143
+ "group_size": 64,
144
+ "bits": 3,
145
+ "mode": "affine"
146
+ },
147
+ "model.layers.4.ffn.switch_mlp.gate_proj": {
148
+ "group_size": 64,
149
+ "bits": 3,
150
+ "mode": "affine"
151
+ },
152
+ "model.layers.4.ffn.switch_mlp.up_proj": {
153
+ "group_size": 64,
154
+ "bits": 3,
155
+ "mode": "affine"
156
+ },
157
+ "model.layers.4.ffn.switch_mlp.down_proj": {
158
+ "group_size": 64,
159
+ "bits": 3,
160
+ "mode": "affine"
161
+ },
162
+ "model.layers.5.ffn.switch_mlp.gate_proj": {
163
+ "group_size": 64,
164
+ "bits": 3,
165
+ "mode": "affine"
166
+ },
167
+ "model.layers.5.ffn.switch_mlp.up_proj": {
168
+ "group_size": 64,
169
+ "bits": 3,
170
+ "mode": "affine"
171
+ },
172
+ "model.layers.5.ffn.switch_mlp.down_proj": {
173
+ "group_size": 64,
174
+ "bits": 3,
175
+ "mode": "affine"
176
+ },
177
+ "model.layers.6.ffn.switch_mlp.gate_proj": {
178
+ "group_size": 64,
179
+ "bits": 3,
180
+ "mode": "affine"
181
+ },
182
+ "model.layers.6.ffn.switch_mlp.up_proj": {
183
+ "group_size": 64,
184
+ "bits": 3,
185
+ "mode": "affine"
186
+ },
187
+ "model.layers.6.ffn.switch_mlp.down_proj": {
188
+ "group_size": 64,
189
+ "bits": 3,
190
+ "mode": "affine"
191
+ },
192
+ "model.layers.7.ffn.switch_mlp.gate_proj": {
193
+ "group_size": 64,
194
+ "bits": 3,
195
+ "mode": "affine"
196
+ },
197
+ "model.layers.7.ffn.switch_mlp.up_proj": {
198
+ "group_size": 64,
199
+ "bits": 3,
200
+ "mode": "affine"
201
+ },
202
+ "model.layers.7.ffn.switch_mlp.down_proj": {
203
+ "group_size": 64,
204
+ "bits": 3,
205
+ "mode": "affine"
206
+ },
207
+ "model.layers.8.ffn.switch_mlp.gate_proj": {
208
+ "group_size": 64,
209
+ "bits": 3,
210
+ "mode": "affine"
211
+ },
212
+ "model.layers.8.ffn.switch_mlp.up_proj": {
213
+ "group_size": 64,
214
+ "bits": 3,
215
+ "mode": "affine"
216
+ },
217
+ "model.layers.8.ffn.switch_mlp.down_proj": {
218
+ "group_size": 64,
219
+ "bits": 3,
220
+ "mode": "affine"
221
+ },
222
+ "model.layers.9.ffn.switch_mlp.gate_proj": {
223
+ "group_size": 64,
224
+ "bits": 3,
225
+ "mode": "affine"
226
+ },
227
+ "model.layers.9.ffn.switch_mlp.up_proj": {
228
+ "group_size": 64,
229
+ "bits": 3,
230
+ "mode": "affine"
231
+ },
232
+ "model.layers.9.ffn.switch_mlp.down_proj": {
233
+ "group_size": 64,
234
+ "bits": 3,
235
+ "mode": "affine"
236
+ },
237
+ "model.layers.10.ffn.switch_mlp.gate_proj": {
238
+ "group_size": 64,
239
+ "bits": 3,
240
+ "mode": "affine"
241
+ },
242
+ "model.layers.10.ffn.switch_mlp.up_proj": {
243
+ "group_size": 64,
244
+ "bits": 3,
245
+ "mode": "affine"
246
+ },
247
+ "model.layers.10.ffn.switch_mlp.down_proj": {
248
+ "group_size": 64,
249
+ "bits": 3,
250
+ "mode": "affine"
251
+ },
252
+ "model.layers.11.ffn.switch_mlp.gate_proj": {
253
+ "group_size": 64,
254
+ "bits": 3,
255
+ "mode": "affine"
256
+ },
257
+ "model.layers.11.ffn.switch_mlp.up_proj": {
258
+ "group_size": 64,
259
+ "bits": 3,
260
+ "mode": "affine"
261
+ },
262
+ "model.layers.11.ffn.switch_mlp.down_proj": {
263
+ "group_size": 64,
264
+ "bits": 3,
265
+ "mode": "affine"
266
+ },
267
+ "model.layers.12.ffn.switch_mlp.gate_proj": {
268
+ "group_size": 64,
269
+ "bits": 3,
270
+ "mode": "affine"
271
+ },
272
+ "model.layers.12.ffn.switch_mlp.up_proj": {
273
+ "group_size": 64,
274
+ "bits": 3,
275
+ "mode": "affine"
276
+ },
277
+ "model.layers.12.ffn.switch_mlp.down_proj": {
278
+ "group_size": 64,
279
+ "bits": 3,
280
+ "mode": "affine"
281
+ },
282
+ "model.layers.13.ffn.switch_mlp.gate_proj": {
283
+ "group_size": 64,
284
+ "bits": 3,
285
+ "mode": "affine"
286
+ },
287
+ "model.layers.13.ffn.switch_mlp.up_proj": {
288
+ "group_size": 64,
289
+ "bits": 3,
290
+ "mode": "affine"
291
+ },
292
+ "model.layers.13.ffn.switch_mlp.down_proj": {
293
+ "group_size": 64,
294
+ "bits": 3,
295
+ "mode": "affine"
296
+ },
297
+ "model.layers.14.ffn.switch_mlp.gate_proj": {
298
+ "group_size": 64,
299
+ "bits": 3,
300
+ "mode": "affine"
301
+ },
302
+ "model.layers.14.ffn.switch_mlp.up_proj": {
303
+ "group_size": 64,
304
+ "bits": 3,
305
+ "mode": "affine"
306
+ },
307
+ "model.layers.14.ffn.switch_mlp.down_proj": {
308
+ "group_size": 64,
309
+ "bits": 3,
310
+ "mode": "affine"
311
+ },
312
+ "model.layers.15.ffn.switch_mlp.gate_proj": {
313
+ "group_size": 64,
314
+ "bits": 3,
315
+ "mode": "affine"
316
+ },
317
+ "model.layers.15.ffn.switch_mlp.up_proj": {
318
+ "group_size": 64,
319
+ "bits": 3,
320
+ "mode": "affine"
321
+ },
322
+ "model.layers.15.ffn.switch_mlp.down_proj": {
323
+ "group_size": 64,
324
+ "bits": 3,
325
+ "mode": "affine"
326
+ },
327
+ "model.layers.16.ffn.switch_mlp.gate_proj": {
328
+ "group_size": 64,
329
+ "bits": 3,
330
+ "mode": "affine"
331
+ },
332
+ "model.layers.16.ffn.switch_mlp.up_proj": {
333
+ "group_size": 64,
334
+ "bits": 3,
335
+ "mode": "affine"
336
+ },
337
+ "model.layers.16.ffn.switch_mlp.down_proj": {
338
+ "group_size": 64,
339
+ "bits": 3,
340
+ "mode": "affine"
341
+ },
342
+ "model.layers.17.ffn.switch_mlp.gate_proj": {
343
+ "group_size": 64,
344
+ "bits": 3,
345
+ "mode": "affine"
346
+ },
347
+ "model.layers.17.ffn.switch_mlp.up_proj": {
348
+ "group_size": 64,
349
+ "bits": 3,
350
+ "mode": "affine"
351
+ },
352
+ "model.layers.17.ffn.switch_mlp.down_proj": {
353
+ "group_size": 64,
354
+ "bits": 3,
355
+ "mode": "affine"
356
+ },
357
+ "model.layers.18.ffn.switch_mlp.gate_proj": {
358
+ "group_size": 64,
359
+ "bits": 3,
360
+ "mode": "affine"
361
+ },
362
+ "model.layers.18.ffn.switch_mlp.up_proj": {
363
+ "group_size": 64,
364
+ "bits": 3,
365
+ "mode": "affine"
366
+ },
367
+ "model.layers.18.ffn.switch_mlp.down_proj": {
368
+ "group_size": 64,
369
+ "bits": 3,
370
+ "mode": "affine"
371
+ },
372
+ "model.layers.19.ffn.switch_mlp.gate_proj": {
373
+ "group_size": 64,
374
+ "bits": 3,
375
+ "mode": "affine"
376
+ },
377
+ "model.layers.19.ffn.switch_mlp.up_proj": {
378
+ "group_size": 64,
379
+ "bits": 3,
380
+ "mode": "affine"
381
+ },
382
+ "model.layers.19.ffn.switch_mlp.down_proj": {
383
+ "group_size": 64,
384
+ "bits": 3,
385
+ "mode": "affine"
386
+ },
387
+ "model.layers.20.ffn.switch_mlp.gate_proj": {
388
+ "group_size": 64,
389
+ "bits": 3,
390
+ "mode": "affine"
391
+ },
392
+ "model.layers.20.ffn.switch_mlp.up_proj": {
393
+ "group_size": 64,
394
+ "bits": 3,
395
+ "mode": "affine"
396
+ },
397
+ "model.layers.20.ffn.switch_mlp.down_proj": {
398
+ "group_size": 64,
399
+ "bits": 3,
400
+ "mode": "affine"
401
+ },
402
+ "model.layers.21.ffn.switch_mlp.gate_proj": {
403
+ "group_size": 64,
404
+ "bits": 3,
405
+ "mode": "affine"
406
+ },
407
+ "model.layers.21.ffn.switch_mlp.up_proj": {
408
+ "group_size": 64,
409
+ "bits": 3,
410
+ "mode": "affine"
411
+ },
412
+ "model.layers.21.ffn.switch_mlp.down_proj": {
413
+ "group_size": 64,
414
+ "bits": 3,
415
+ "mode": "affine"
416
+ },
417
+ "model.layers.22.ffn.switch_mlp.gate_proj": {
418
+ "group_size": 64,
419
+ "bits": 3,
420
+ "mode": "affine"
421
+ },
422
+ "model.layers.22.ffn.switch_mlp.up_proj": {
423
+ "group_size": 64,
424
+ "bits": 3,
425
+ "mode": "affine"
426
+ },
427
+ "model.layers.22.ffn.switch_mlp.down_proj": {
428
+ "group_size": 64,
429
+ "bits": 3,
430
+ "mode": "affine"
431
+ },
432
+ "model.layers.23.ffn.switch_mlp.gate_proj": {
433
+ "group_size": 64,
434
+ "bits": 3,
435
+ "mode": "affine"
436
+ },
437
+ "model.layers.23.ffn.switch_mlp.up_proj": {
438
+ "group_size": 64,
439
+ "bits": 3,
440
+ "mode": "affine"
441
+ },
442
+ "model.layers.23.ffn.switch_mlp.down_proj": {
443
+ "group_size": 64,
444
+ "bits": 3,
445
+ "mode": "affine"
446
+ },
447
+ "model.layers.24.ffn.switch_mlp.gate_proj": {
448
+ "group_size": 64,
449
+ "bits": 3,
450
+ "mode": "affine"
451
+ },
452
+ "model.layers.24.ffn.switch_mlp.up_proj": {
453
+ "group_size": 64,
454
+ "bits": 3,
455
+ "mode": "affine"
456
+ },
457
+ "model.layers.24.ffn.switch_mlp.down_proj": {
458
+ "group_size": 64,
459
+ "bits": 3,
460
+ "mode": "affine"
461
+ },
462
+ "model.layers.25.ffn.switch_mlp.gate_proj": {
463
+ "group_size": 64,
464
+ "bits": 3,
465
+ "mode": "affine"
466
+ },
467
+ "model.layers.25.ffn.switch_mlp.up_proj": {
468
+ "group_size": 64,
469
+ "bits": 3,
470
+ "mode": "affine"
471
+ },
472
+ "model.layers.25.ffn.switch_mlp.down_proj": {
473
+ "group_size": 64,
474
+ "bits": 3,
475
+ "mode": "affine"
476
+ },
477
+ "model.layers.26.ffn.switch_mlp.gate_proj": {
478
+ "group_size": 64,
479
+ "bits": 3,
480
+ "mode": "affine"
481
+ },
482
+ "model.layers.26.ffn.switch_mlp.up_proj": {
483
+ "group_size": 64,
484
+ "bits": 3,
485
+ "mode": "affine"
486
+ },
487
+ "model.layers.26.ffn.switch_mlp.down_proj": {
488
+ "group_size": 64,
489
+ "bits": 3,
490
+ "mode": "affine"
491
+ },
492
+ "model.layers.27.ffn.switch_mlp.gate_proj": {
493
+ "group_size": 64,
494
+ "bits": 3,
495
+ "mode": "affine"
496
+ },
497
+ "model.layers.27.ffn.switch_mlp.up_proj": {
498
+ "group_size": 64,
499
+ "bits": 3,
500
+ "mode": "affine"
501
+ },
502
+ "model.layers.27.ffn.switch_mlp.down_proj": {
503
+ "group_size": 64,
504
+ "bits": 3,
505
+ "mode": "affine"
506
+ },
507
+ "model.layers.28.ffn.switch_mlp.gate_proj": {
508
+ "group_size": 64,
509
+ "bits": 3,
510
+ "mode": "affine"
511
+ },
512
+ "model.layers.28.ffn.switch_mlp.up_proj": {
513
+ "group_size": 64,
514
+ "bits": 3,
515
+ "mode": "affine"
516
+ },
517
+ "model.layers.28.ffn.switch_mlp.down_proj": {
518
+ "group_size": 64,
519
+ "bits": 3,
520
+ "mode": "affine"
521
+ },
522
+ "model.layers.29.ffn.switch_mlp.gate_proj": {
523
+ "group_size": 64,
524
+ "bits": 3,
525
+ "mode": "affine"
526
+ },
527
+ "model.layers.29.ffn.switch_mlp.up_proj": {
528
+ "group_size": 64,
529
+ "bits": 3,
530
+ "mode": "affine"
531
+ },
532
+ "model.layers.29.ffn.switch_mlp.down_proj": {
533
+ "group_size": 64,
534
+ "bits": 3,
535
+ "mode": "affine"
536
+ },
537
+ "model.layers.30.ffn.switch_mlp.gate_proj": {
538
+ "group_size": 64,
539
+ "bits": 3,
540
+ "mode": "affine"
541
+ },
542
+ "model.layers.30.ffn.switch_mlp.up_proj": {
543
+ "group_size": 64,
544
+ "bits": 3,
545
+ "mode": "affine"
546
+ },
547
+ "model.layers.30.ffn.switch_mlp.down_proj": {
548
+ "group_size": 64,
549
+ "bits": 3,
550
+ "mode": "affine"
551
+ },
552
+ "model.layers.31.ffn.switch_mlp.gate_proj": {
553
+ "group_size": 64,
554
+ "bits": 3,
555
+ "mode": "affine"
556
+ },
557
+ "model.layers.31.ffn.switch_mlp.up_proj": {
558
+ "group_size": 64,
559
+ "bits": 3,
560
+ "mode": "affine"
561
+ },
562
+ "model.layers.31.ffn.switch_mlp.down_proj": {
563
+ "group_size": 64,
564
+ "bits": 3,
565
+ "mode": "affine"
566
+ },
567
+ "model.layers.32.ffn.switch_mlp.gate_proj": {
568
+ "group_size": 64,
569
+ "bits": 3,
570
+ "mode": "affine"
571
+ },
572
+ "model.layers.32.ffn.switch_mlp.up_proj": {
573
+ "group_size": 64,
574
+ "bits": 3,
575
+ "mode": "affine"
576
+ },
577
+ "model.layers.32.ffn.switch_mlp.down_proj": {
578
+ "group_size": 64,
579
+ "bits": 3,
580
+ "mode": "affine"
581
+ },
582
+ "model.layers.33.ffn.switch_mlp.gate_proj": {
583
+ "group_size": 64,
584
+ "bits": 3,
585
+ "mode": "affine"
586
+ },
587
+ "model.layers.33.ffn.switch_mlp.up_proj": {
588
+ "group_size": 64,
589
+ "bits": 3,
590
+ "mode": "affine"
591
+ },
592
+ "model.layers.33.ffn.switch_mlp.down_proj": {
593
+ "group_size": 64,
594
+ "bits": 3,
595
+ "mode": "affine"
596
+ },
597
+ "model.layers.34.ffn.switch_mlp.gate_proj": {
598
+ "group_size": 64,
599
+ "bits": 3,
600
+ "mode": "affine"
601
+ },
602
+ "model.layers.34.ffn.switch_mlp.up_proj": {
603
+ "group_size": 64,
604
+ "bits": 3,
605
+ "mode": "affine"
606
+ },
607
+ "model.layers.34.ffn.switch_mlp.down_proj": {
608
+ "group_size": 64,
609
+ "bits": 3,
610
+ "mode": "affine"
611
+ },
612
+ "model.layers.35.ffn.switch_mlp.gate_proj": {
613
+ "group_size": 64,
614
+ "bits": 3,
615
+ "mode": "affine"
616
+ },
617
+ "model.layers.35.ffn.switch_mlp.up_proj": {
618
+ "group_size": 64,
619
+ "bits": 3,
620
+ "mode": "affine"
621
+ },
622
+ "model.layers.35.ffn.switch_mlp.down_proj": {
623
+ "group_size": 64,
624
+ "bits": 3,
625
+ "mode": "affine"
626
+ },
627
+ "model.layers.36.ffn.switch_mlp.gate_proj": {
628
+ "group_size": 64,
629
+ "bits": 3,
630
+ "mode": "affine"
631
+ },
632
+ "model.layers.36.ffn.switch_mlp.up_proj": {
633
+ "group_size": 64,
634
+ "bits": 3,
635
+ "mode": "affine"
636
+ },
637
+ "model.layers.36.ffn.switch_mlp.down_proj": {
638
+ "group_size": 64,
639
+ "bits": 3,
640
+ "mode": "affine"
641
+ },
642
+ "model.layers.37.ffn.switch_mlp.gate_proj": {
643
+ "group_size": 64,
644
+ "bits": 3,
645
+ "mode": "affine"
646
+ },
647
+ "model.layers.37.ffn.switch_mlp.up_proj": {
648
+ "group_size": 64,
649
+ "bits": 3,
650
+ "mode": "affine"
651
+ },
652
+ "model.layers.37.ffn.switch_mlp.down_proj": {
653
+ "group_size": 64,
654
+ "bits": 3,
655
+ "mode": "affine"
656
+ },
657
+ "model.layers.38.ffn.switch_mlp.gate_proj": {
658
+ "group_size": 64,
659
+ "bits": 3,
660
+ "mode": "affine"
661
+ },
662
+ "model.layers.38.ffn.switch_mlp.up_proj": {
663
+ "group_size": 64,
664
+ "bits": 3,
665
+ "mode": "affine"
666
+ },
667
+ "model.layers.38.ffn.switch_mlp.down_proj": {
668
+ "group_size": 64,
669
+ "bits": 3,
670
+ "mode": "affine"
671
+ },
672
+ "model.layers.39.ffn.switch_mlp.gate_proj": {
673
+ "group_size": 64,
674
+ "bits": 3,
675
+ "mode": "affine"
676
+ },
677
+ "model.layers.39.ffn.switch_mlp.up_proj": {
678
+ "group_size": 64,
679
+ "bits": 3,
680
+ "mode": "affine"
681
+ },
682
+ "model.layers.39.ffn.switch_mlp.down_proj": {
683
+ "group_size": 64,
684
+ "bits": 3,
685
+ "mode": "affine"
686
+ },
687
+ "model.layers.40.ffn.switch_mlp.gate_proj": {
688
+ "group_size": 64,
689
+ "bits": 3,
690
+ "mode": "affine"
691
+ },
692
+ "model.layers.40.ffn.switch_mlp.up_proj": {
693
+ "group_size": 64,
694
+ "bits": 3,
695
+ "mode": "affine"
696
+ },
697
+ "model.layers.40.ffn.switch_mlp.down_proj": {
698
+ "group_size": 64,
699
+ "bits": 3,
700
+ "mode": "affine"
701
+ },
702
+ "model.layers.41.ffn.switch_mlp.gate_proj": {
703
+ "group_size": 64,
704
+ "bits": 3,
705
+ "mode": "affine"
706
+ },
707
+ "model.layers.41.ffn.switch_mlp.up_proj": {
708
+ "group_size": 64,
709
+ "bits": 3,
710
+ "mode": "affine"
711
+ },
712
+ "model.layers.41.ffn.switch_mlp.down_proj": {
713
+ "group_size": 64,
714
+ "bits": 3,
715
+ "mode": "affine"
716
+ },
717
+ "model.layers.42.ffn.switch_mlp.gate_proj": {
718
+ "group_size": 64,
719
+ "bits": 3,
720
+ "mode": "affine"
721
+ },
722
+ "model.layers.42.ffn.switch_mlp.up_proj": {
723
+ "group_size": 64,
724
+ "bits": 3,
725
+ "mode": "affine"
726
+ },
727
+ "model.layers.42.ffn.switch_mlp.down_proj": {
728
+ "group_size": 64,
729
+ "bits": 3,
730
+ "mode": "affine"
731
+ }
732
+ },
733
+ "quantization_config": {
734
+ "group_size": 64,
735
+ "bits": 8,
736
+ "mode": "affine",
737
+ "model.layers.0.ffn.switch_mlp.gate_proj": {
738
+ "group_size": 64,
739
+ "bits": 3,
740
+ "mode": "affine"
741
+ },
742
+ "model.layers.0.ffn.switch_mlp.up_proj": {
743
+ "group_size": 64,
744
+ "bits": 3,
745
+ "mode": "affine"
746
+ },
747
+ "model.layers.0.ffn.switch_mlp.down_proj": {
748
+ "group_size": 64,
749
+ "bits": 3,
750
+ "mode": "affine"
751
+ },
752
+ "model.layers.1.ffn.switch_mlp.gate_proj": {
753
+ "group_size": 64,
754
+ "bits": 3,
755
+ "mode": "affine"
756
+ },
757
+ "model.layers.1.ffn.switch_mlp.up_proj": {
758
+ "group_size": 64,
759
+ "bits": 3,
760
+ "mode": "affine"
761
+ },
762
+ "model.layers.1.ffn.switch_mlp.down_proj": {
763
+ "group_size": 64,
764
+ "bits": 3,
765
+ "mode": "affine"
766
+ },
767
+ "model.layers.2.ffn.switch_mlp.gate_proj": {
768
+ "group_size": 64,
769
+ "bits": 3,
770
+ "mode": "affine"
771
+ },
772
+ "model.layers.2.ffn.switch_mlp.up_proj": {
773
+ "group_size": 64,
774
+ "bits": 3,
775
+ "mode": "affine"
776
+ },
777
+ "model.layers.2.ffn.switch_mlp.down_proj": {
778
+ "group_size": 64,
779
+ "bits": 3,
780
+ "mode": "affine"
781
+ },
782
+ "model.layers.3.ffn.switch_mlp.gate_proj": {
783
+ "group_size": 64,
784
+ "bits": 3,
785
+ "mode": "affine"
786
+ },
787
+ "model.layers.3.ffn.switch_mlp.up_proj": {
788
+ "group_size": 64,
789
+ "bits": 3,
790
+ "mode": "affine"
791
+ },
792
+ "model.layers.3.ffn.switch_mlp.down_proj": {
793
+ "group_size": 64,
794
+ "bits": 3,
795
+ "mode": "affine"
796
+ },
797
+ "model.layers.4.ffn.switch_mlp.gate_proj": {
798
+ "group_size": 64,
799
+ "bits": 3,
800
+ "mode": "affine"
801
+ },
802
+ "model.layers.4.ffn.switch_mlp.up_proj": {
803
+ "group_size": 64,
804
+ "bits": 3,
805
+ "mode": "affine"
806
+ },
807
+ "model.layers.4.ffn.switch_mlp.down_proj": {
808
+ "group_size": 64,
809
+ "bits": 3,
810
+ "mode": "affine"
811
+ },
812
+ "model.layers.5.ffn.switch_mlp.gate_proj": {
813
+ "group_size": 64,
814
+ "bits": 3,
815
+ "mode": "affine"
816
+ },
817
+ "model.layers.5.ffn.switch_mlp.up_proj": {
818
+ "group_size": 64,
819
+ "bits": 3,
820
+ "mode": "affine"
821
+ },
822
+ "model.layers.5.ffn.switch_mlp.down_proj": {
823
+ "group_size": 64,
824
+ "bits": 3,
825
+ "mode": "affine"
826
+ },
827
+ "model.layers.6.ffn.switch_mlp.gate_proj": {
828
+ "group_size": 64,
829
+ "bits": 3,
830
+ "mode": "affine"
831
+ },
832
+ "model.layers.6.ffn.switch_mlp.up_proj": {
833
+ "group_size": 64,
834
+ "bits": 3,
835
+ "mode": "affine"
836
+ },
837
+ "model.layers.6.ffn.switch_mlp.down_proj": {
838
+ "group_size": 64,
839
+ "bits": 3,
840
+ "mode": "affine"
841
+ },
842
+ "model.layers.7.ffn.switch_mlp.gate_proj": {
843
+ "group_size": 64,
844
+ "bits": 3,
845
+ "mode": "affine"
846
+ },
847
+ "model.layers.7.ffn.switch_mlp.up_proj": {
848
+ "group_size": 64,
849
+ "bits": 3,
850
+ "mode": "affine"
851
+ },
852
+ "model.layers.7.ffn.switch_mlp.down_proj": {
853
+ "group_size": 64,
854
+ "bits": 3,
855
+ "mode": "affine"
856
+ },
857
+ "model.layers.8.ffn.switch_mlp.gate_proj": {
858
+ "group_size": 64,
859
+ "bits": 3,
860
+ "mode": "affine"
861
+ },
862
+ "model.layers.8.ffn.switch_mlp.up_proj": {
863
+ "group_size": 64,
864
+ "bits": 3,
865
+ "mode": "affine"
866
+ },
867
+ "model.layers.8.ffn.switch_mlp.down_proj": {
868
+ "group_size": 64,
869
+ "bits": 3,
870
+ "mode": "affine"
871
+ },
872
+ "model.layers.9.ffn.switch_mlp.gate_proj": {
873
+ "group_size": 64,
874
+ "bits": 3,
875
+ "mode": "affine"
876
+ },
877
+ "model.layers.9.ffn.switch_mlp.up_proj": {
878
+ "group_size": 64,
879
+ "bits": 3,
880
+ "mode": "affine"
881
+ },
882
+ "model.layers.9.ffn.switch_mlp.down_proj": {
883
+ "group_size": 64,
884
+ "bits": 3,
885
+ "mode": "affine"
886
+ },
887
+ "model.layers.10.ffn.switch_mlp.gate_proj": {
888
+ "group_size": 64,
889
+ "bits": 3,
890
+ "mode": "affine"
891
+ },
892
+ "model.layers.10.ffn.switch_mlp.up_proj": {
893
+ "group_size": 64,
894
+ "bits": 3,
895
+ "mode": "affine"
896
+ },
897
+ "model.layers.10.ffn.switch_mlp.down_proj": {
898
+ "group_size": 64,
899
+ "bits": 3,
900
+ "mode": "affine"
901
+ },
902
+ "model.layers.11.ffn.switch_mlp.gate_proj": {
903
+ "group_size": 64,
904
+ "bits": 3,
905
+ "mode": "affine"
906
+ },
907
+ "model.layers.11.ffn.switch_mlp.up_proj": {
908
+ "group_size": 64,
909
+ "bits": 3,
910
+ "mode": "affine"
911
+ },
912
+ "model.layers.11.ffn.switch_mlp.down_proj": {
913
+ "group_size": 64,
914
+ "bits": 3,
915
+ "mode": "affine"
916
+ },
917
+ "model.layers.12.ffn.switch_mlp.gate_proj": {
918
+ "group_size": 64,
919
+ "bits": 3,
920
+ "mode": "affine"
921
+ },
922
+ "model.layers.12.ffn.switch_mlp.up_proj": {
923
+ "group_size": 64,
924
+ "bits": 3,
925
+ "mode": "affine"
926
+ },
927
+ "model.layers.12.ffn.switch_mlp.down_proj": {
928
+ "group_size": 64,
929
+ "bits": 3,
930
+ "mode": "affine"
931
+ },
932
+ "model.layers.13.ffn.switch_mlp.gate_proj": {
933
+ "group_size": 64,
934
+ "bits": 3,
935
+ "mode": "affine"
936
+ },
937
+ "model.layers.13.ffn.switch_mlp.up_proj": {
938
+ "group_size": 64,
939
+ "bits": 3,
940
+ "mode": "affine"
941
+ },
942
+ "model.layers.13.ffn.switch_mlp.down_proj": {
943
+ "group_size": 64,
944
+ "bits": 3,
945
+ "mode": "affine"
946
+ },
947
+ "model.layers.14.ffn.switch_mlp.gate_proj": {
948
+ "group_size": 64,
949
+ "bits": 3,
950
+ "mode": "affine"
951
+ },
952
+ "model.layers.14.ffn.switch_mlp.up_proj": {
953
+ "group_size": 64,
954
+ "bits": 3,
955
+ "mode": "affine"
956
+ },
957
+ "model.layers.14.ffn.switch_mlp.down_proj": {
958
+ "group_size": 64,
959
+ "bits": 3,
960
+ "mode": "affine"
961
+ },
962
+ "model.layers.15.ffn.switch_mlp.gate_proj": {
963
+ "group_size": 64,
964
+ "bits": 3,
965
+ "mode": "affine"
966
+ },
967
+ "model.layers.15.ffn.switch_mlp.up_proj": {
968
+ "group_size": 64,
969
+ "bits": 3,
970
+ "mode": "affine"
971
+ },
972
+ "model.layers.15.ffn.switch_mlp.down_proj": {
973
+ "group_size": 64,
974
+ "bits": 3,
975
+ "mode": "affine"
976
+ },
977
+ "model.layers.16.ffn.switch_mlp.gate_proj": {
978
+ "group_size": 64,
979
+ "bits": 3,
980
+ "mode": "affine"
981
+ },
982
+ "model.layers.16.ffn.switch_mlp.up_proj": {
983
+ "group_size": 64,
984
+ "bits": 3,
985
+ "mode": "affine"
986
+ },
987
+ "model.layers.16.ffn.switch_mlp.down_proj": {
988
+ "group_size": 64,
989
+ "bits": 3,
990
+ "mode": "affine"
991
+ },
992
+ "model.layers.17.ffn.switch_mlp.gate_proj": {
993
+ "group_size": 64,
994
+ "bits": 3,
995
+ "mode": "affine"
996
+ },
997
+ "model.layers.17.ffn.switch_mlp.up_proj": {
998
+ "group_size": 64,
999
+ "bits": 3,
1000
+ "mode": "affine"
1001
+ },
1002
+ "model.layers.17.ffn.switch_mlp.down_proj": {
1003
+ "group_size": 64,
1004
+ "bits": 3,
1005
+ "mode": "affine"
1006
+ },
1007
+ "model.layers.18.ffn.switch_mlp.gate_proj": {
1008
+ "group_size": 64,
1009
+ "bits": 3,
1010
+ "mode": "affine"
1011
+ },
1012
+ "model.layers.18.ffn.switch_mlp.up_proj": {
1013
+ "group_size": 64,
1014
+ "bits": 3,
1015
+ "mode": "affine"
1016
+ },
1017
+ "model.layers.18.ffn.switch_mlp.down_proj": {
1018
+ "group_size": 64,
1019
+ "bits": 3,
1020
+ "mode": "affine"
1021
+ },
1022
+ "model.layers.19.ffn.switch_mlp.gate_proj": {
1023
+ "group_size": 64,
1024
+ "bits": 3,
1025
+ "mode": "affine"
1026
+ },
1027
+ "model.layers.19.ffn.switch_mlp.up_proj": {
1028
+ "group_size": 64,
1029
+ "bits": 3,
1030
+ "mode": "affine"
1031
+ },
1032
+ "model.layers.19.ffn.switch_mlp.down_proj": {
1033
+ "group_size": 64,
1034
+ "bits": 3,
1035
+ "mode": "affine"
1036
+ },
1037
+ "model.layers.20.ffn.switch_mlp.gate_proj": {
1038
+ "group_size": 64,
1039
+ "bits": 3,
1040
+ "mode": "affine"
1041
+ },
1042
+ "model.layers.20.ffn.switch_mlp.up_proj": {
1043
+ "group_size": 64,
1044
+ "bits": 3,
1045
+ "mode": "affine"
1046
+ },
1047
+ "model.layers.20.ffn.switch_mlp.down_proj": {
1048
+ "group_size": 64,
1049
+ "bits": 3,
1050
+ "mode": "affine"
1051
+ },
1052
+ "model.layers.21.ffn.switch_mlp.gate_proj": {
1053
+ "group_size": 64,
1054
+ "bits": 3,
1055
+ "mode": "affine"
1056
+ },
1057
+ "model.layers.21.ffn.switch_mlp.up_proj": {
1058
+ "group_size": 64,
1059
+ "bits": 3,
1060
+ "mode": "affine"
1061
+ },
1062
+ "model.layers.21.ffn.switch_mlp.down_proj": {
1063
+ "group_size": 64,
1064
+ "bits": 3,
1065
+ "mode": "affine"
1066
+ },
1067
+ "model.layers.22.ffn.switch_mlp.gate_proj": {
1068
+ "group_size": 64,
1069
+ "bits": 3,
1070
+ "mode": "affine"
1071
+ },
1072
+ "model.layers.22.ffn.switch_mlp.up_proj": {
1073
+ "group_size": 64,
1074
+ "bits": 3,
1075
+ "mode": "affine"
1076
+ },
1077
+ "model.layers.22.ffn.switch_mlp.down_proj": {
1078
+ "group_size": 64,
1079
+ "bits": 3,
1080
+ "mode": "affine"
1081
+ },
1082
+ "model.layers.23.ffn.switch_mlp.gate_proj": {
1083
+ "group_size": 64,
1084
+ "bits": 3,
1085
+ "mode": "affine"
1086
+ },
1087
+ "model.layers.23.ffn.switch_mlp.up_proj": {
1088
+ "group_size": 64,
1089
+ "bits": 3,
1090
+ "mode": "affine"
1091
+ },
1092
+ "model.layers.23.ffn.switch_mlp.down_proj": {
1093
+ "group_size": 64,
1094
+ "bits": 3,
1095
+ "mode": "affine"
1096
+ },
1097
+ "model.layers.24.ffn.switch_mlp.gate_proj": {
1098
+ "group_size": 64,
1099
+ "bits": 3,
1100
+ "mode": "affine"
1101
+ },
1102
+ "model.layers.24.ffn.switch_mlp.up_proj": {
1103
+ "group_size": 64,
1104
+ "bits": 3,
1105
+ "mode": "affine"
1106
+ },
1107
+ "model.layers.24.ffn.switch_mlp.down_proj": {
1108
+ "group_size": 64,
1109
+ "bits": 3,
1110
+ "mode": "affine"
1111
+ },
1112
+ "model.layers.25.ffn.switch_mlp.gate_proj": {
1113
+ "group_size": 64,
1114
+ "bits": 3,
1115
+ "mode": "affine"
1116
+ },
1117
+ "model.layers.25.ffn.switch_mlp.up_proj": {
1118
+ "group_size": 64,
1119
+ "bits": 3,
1120
+ "mode": "affine"
1121
+ },
1122
+ "model.layers.25.ffn.switch_mlp.down_proj": {
1123
+ "group_size": 64,
1124
+ "bits": 3,
1125
+ "mode": "affine"
1126
+ },
1127
+ "model.layers.26.ffn.switch_mlp.gate_proj": {
1128
+ "group_size": 64,
1129
+ "bits": 3,
1130
+ "mode": "affine"
1131
+ },
1132
+ "model.layers.26.ffn.switch_mlp.up_proj": {
1133
+ "group_size": 64,
1134
+ "bits": 3,
1135
+ "mode": "affine"
1136
+ },
1137
+ "model.layers.26.ffn.switch_mlp.down_proj": {
1138
+ "group_size": 64,
1139
+ "bits": 3,
1140
+ "mode": "affine"
1141
+ },
1142
+ "model.layers.27.ffn.switch_mlp.gate_proj": {
1143
+ "group_size": 64,
1144
+ "bits": 3,
1145
+ "mode": "affine"
1146
+ },
1147
+ "model.layers.27.ffn.switch_mlp.up_proj": {
1148
+ "group_size": 64,
1149
+ "bits": 3,
1150
+ "mode": "affine"
1151
+ },
1152
+ "model.layers.27.ffn.switch_mlp.down_proj": {
1153
+ "group_size": 64,
1154
+ "bits": 3,
1155
+ "mode": "affine"
1156
+ },
1157
+ "model.layers.28.ffn.switch_mlp.gate_proj": {
1158
+ "group_size": 64,
1159
+ "bits": 3,
1160
+ "mode": "affine"
1161
+ },
1162
+ "model.layers.28.ffn.switch_mlp.up_proj": {
1163
+ "group_size": 64,
1164
+ "bits": 3,
1165
+ "mode": "affine"
1166
+ },
1167
+ "model.layers.28.ffn.switch_mlp.down_proj": {
1168
+ "group_size": 64,
1169
+ "bits": 3,
1170
+ "mode": "affine"
1171
+ },
1172
+ "model.layers.29.ffn.switch_mlp.gate_proj": {
1173
+ "group_size": 64,
1174
+ "bits": 3,
1175
+ "mode": "affine"
1176
+ },
1177
+ "model.layers.29.ffn.switch_mlp.up_proj": {
1178
+ "group_size": 64,
1179
+ "bits": 3,
1180
+ "mode": "affine"
1181
+ },
1182
+ "model.layers.29.ffn.switch_mlp.down_proj": {
1183
+ "group_size": 64,
1184
+ "bits": 3,
1185
+ "mode": "affine"
1186
+ },
1187
+ "model.layers.30.ffn.switch_mlp.gate_proj": {
1188
+ "group_size": 64,
1189
+ "bits": 3,
1190
+ "mode": "affine"
1191
+ },
1192
+ "model.layers.30.ffn.switch_mlp.up_proj": {
1193
+ "group_size": 64,
1194
+ "bits": 3,
1195
+ "mode": "affine"
1196
+ },
1197
+ "model.layers.30.ffn.switch_mlp.down_proj": {
1198
+ "group_size": 64,
1199
+ "bits": 3,
1200
+ "mode": "affine"
1201
+ },
1202
+ "model.layers.31.ffn.switch_mlp.gate_proj": {
1203
+ "group_size": 64,
1204
+ "bits": 3,
1205
+ "mode": "affine"
1206
+ },
1207
+ "model.layers.31.ffn.switch_mlp.up_proj": {
1208
+ "group_size": 64,
1209
+ "bits": 3,
1210
+ "mode": "affine"
1211
+ },
1212
+ "model.layers.31.ffn.switch_mlp.down_proj": {
1213
+ "group_size": 64,
1214
+ "bits": 3,
1215
+ "mode": "affine"
1216
+ },
1217
+ "model.layers.32.ffn.switch_mlp.gate_proj": {
1218
+ "group_size": 64,
1219
+ "bits": 3,
1220
+ "mode": "affine"
1221
+ },
1222
+ "model.layers.32.ffn.switch_mlp.up_proj": {
1223
+ "group_size": 64,
1224
+ "bits": 3,
1225
+ "mode": "affine"
1226
+ },
1227
+ "model.layers.32.ffn.switch_mlp.down_proj": {
1228
+ "group_size": 64,
1229
+ "bits": 3,
1230
+ "mode": "affine"
1231
+ },
1232
+ "model.layers.33.ffn.switch_mlp.gate_proj": {
1233
+ "group_size": 64,
1234
+ "bits": 3,
1235
+ "mode": "affine"
1236
+ },
1237
+ "model.layers.33.ffn.switch_mlp.up_proj": {
1238
+ "group_size": 64,
1239
+ "bits": 3,
1240
+ "mode": "affine"
1241
+ },
1242
+ "model.layers.33.ffn.switch_mlp.down_proj": {
1243
+ "group_size": 64,
1244
+ "bits": 3,
1245
+ "mode": "affine"
1246
+ },
1247
+ "model.layers.34.ffn.switch_mlp.gate_proj": {
1248
+ "group_size": 64,
1249
+ "bits": 3,
1250
+ "mode": "affine"
1251
+ },
1252
+ "model.layers.34.ffn.switch_mlp.up_proj": {
1253
+ "group_size": 64,
1254
+ "bits": 3,
1255
+ "mode": "affine"
1256
+ },
1257
+ "model.layers.34.ffn.switch_mlp.down_proj": {
1258
+ "group_size": 64,
1259
+ "bits": 3,
1260
+ "mode": "affine"
1261
+ },
1262
+ "model.layers.35.ffn.switch_mlp.gate_proj": {
1263
+ "group_size": 64,
1264
+ "bits": 3,
1265
+ "mode": "affine"
1266
+ },
1267
+ "model.layers.35.ffn.switch_mlp.up_proj": {
1268
+ "group_size": 64,
1269
+ "bits": 3,
1270
+ "mode": "affine"
1271
+ },
1272
+ "model.layers.35.ffn.switch_mlp.down_proj": {
1273
+ "group_size": 64,
1274
+ "bits": 3,
1275
+ "mode": "affine"
1276
+ },
1277
+ "model.layers.36.ffn.switch_mlp.gate_proj": {
1278
+ "group_size": 64,
1279
+ "bits": 3,
1280
+ "mode": "affine"
1281
+ },
1282
+ "model.layers.36.ffn.switch_mlp.up_proj": {
1283
+ "group_size": 64,
1284
+ "bits": 3,
1285
+ "mode": "affine"
1286
+ },
1287
+ "model.layers.36.ffn.switch_mlp.down_proj": {
1288
+ "group_size": 64,
1289
+ "bits": 3,
1290
+ "mode": "affine"
1291
+ },
1292
+ "model.layers.37.ffn.switch_mlp.gate_proj": {
1293
+ "group_size": 64,
1294
+ "bits": 3,
1295
+ "mode": "affine"
1296
+ },
1297
+ "model.layers.37.ffn.switch_mlp.up_proj": {
1298
+ "group_size": 64,
1299
+ "bits": 3,
1300
+ "mode": "affine"
1301
+ },
1302
+ "model.layers.37.ffn.switch_mlp.down_proj": {
1303
+ "group_size": 64,
1304
+ "bits": 3,
1305
+ "mode": "affine"
1306
+ },
1307
+ "model.layers.38.ffn.switch_mlp.gate_proj": {
1308
+ "group_size": 64,
1309
+ "bits": 3,
1310
+ "mode": "affine"
1311
+ },
1312
+ "model.layers.38.ffn.switch_mlp.up_proj": {
1313
+ "group_size": 64,
1314
+ "bits": 3,
1315
+ "mode": "affine"
1316
+ },
1317
+ "model.layers.38.ffn.switch_mlp.down_proj": {
1318
+ "group_size": 64,
1319
+ "bits": 3,
1320
+ "mode": "affine"
1321
+ },
1322
+ "model.layers.39.ffn.switch_mlp.gate_proj": {
1323
+ "group_size": 64,
1324
+ "bits": 3,
1325
+ "mode": "affine"
1326
+ },
1327
+ "model.layers.39.ffn.switch_mlp.up_proj": {
1328
+ "group_size": 64,
1329
+ "bits": 3,
1330
+ "mode": "affine"
1331
+ },
1332
+ "model.layers.39.ffn.switch_mlp.down_proj": {
1333
+ "group_size": 64,
1334
+ "bits": 3,
1335
+ "mode": "affine"
1336
+ },
1337
+ "model.layers.40.ffn.switch_mlp.gate_proj": {
1338
+ "group_size": 64,
1339
+ "bits": 3,
1340
+ "mode": "affine"
1341
+ },
1342
+ "model.layers.40.ffn.switch_mlp.up_proj": {
1343
+ "group_size": 64,
1344
+ "bits": 3,
1345
+ "mode": "affine"
1346
+ },
1347
+ "model.layers.40.ffn.switch_mlp.down_proj": {
1348
+ "group_size": 64,
1349
+ "bits": 3,
1350
+ "mode": "affine"
1351
+ },
1352
+ "model.layers.41.ffn.switch_mlp.gate_proj": {
1353
+ "group_size": 64,
1354
+ "bits": 3,
1355
+ "mode": "affine"
1356
+ },
1357
+ "model.layers.41.ffn.switch_mlp.up_proj": {
1358
+ "group_size": 64,
1359
+ "bits": 3,
1360
+ "mode": "affine"
1361
+ },
1362
+ "model.layers.41.ffn.switch_mlp.down_proj": {
1363
+ "group_size": 64,
1364
+ "bits": 3,
1365
+ "mode": "affine"
1366
+ },
1367
+ "model.layers.42.ffn.switch_mlp.gate_proj": {
1368
+ "group_size": 64,
1369
+ "bits": 3,
1370
+ "mode": "affine"
1371
+ },
1372
+ "model.layers.42.ffn.switch_mlp.up_proj": {
1373
+ "group_size": 64,
1374
+ "bits": 3,
1375
+ "mode": "affine"
1376
+ },
1377
+ "model.layers.42.ffn.switch_mlp.down_proj": {
1378
+ "group_size": 64,
1379
+ "bits": 3,
1380
+ "mode": "affine"
1381
+ }
1382
+ },
1383
+ "rms_norm_eps": 1e-06,
1384
+ "rope_scaling": {
1385
+ "beta_fast": 32,
1386
+ "beta_slow": 1,
1387
+ "factor": 16,
1388
+ "original_max_position_embeddings": 65536,
1389
+ "type": "yarn"
1390
+ },
1391
+ "rope_theta": 10000,
1392
+ "routed_scaling_factor": 1.5,
1393
+ "scoring_func": "sqrtsoftplus",
1394
+ "sliding_window": 128,
1395
+ "swiglu_limit": 10.0,
1396
+ "tie_word_embeddings": false,
1397
+ "topk_method": "noaux_tc",
1398
+ "torch_dtype": "bfloat16",
1399
+ "transformers_version": "4.57.1",
1400
+ "use_cache": true,
1401
+ "vocab_size": 129280
1402
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "do_sample": true,
6
+ "temperature": 1.0,
7
+ "top_p": 1.0,
8
+ "transformers_version": "4.46.3"
9
+ }
model-00001-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2eb52ca11ba88720a5e679b7ebafe505a6c59374dd3be4b3f0303d2ddaf1f27
3
+ size 4594522718
model-00002-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:605ed3f6653e159b0aaa47656df2aa0fea2ee882149f75031b5f60d651533b47
3
+ size 5020054954
model-00003-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e09925294aa08dd60936a2002a8fb0f237111586b2f8bb6803bdd23f67351ab2
3
+ size 4863615152
model-00004-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3799f36dac9a324c56939c921ab9a1368c59b38a220cd84bee1724abecbb4842
3
+ size 5013850643
model-00005-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cce0eb0911ffe7ce1d72863d8d3e345914ed38fe6c2131cdbcaa599e5fa361e7
3
+ size 5013850515
model-00006-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0058ae1b52e59f664140ae2a7a0be1c8108c901fcc421590594fde51c4d9936f
3
+ size 4847857926
model-00007-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc7ccce639b9ff732cbbeb9d58b23af96e390348e1b89e2a14be0684e880b2d9
3
+ size 5013850690
model-00008-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4b413b5816171bab518604218ef2219d40811c46fd8c733796028d4a9fe5790
3
+ size 5013850710
model-00009-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1b046b7f742a76eafa36adca463193122e60e10a580e47ab58a57ceddf5aa2
3
+ size 4863615242
model-00010-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d6e6589dfbaecdf35102a6aedc9b10504bc998a8c39b6eba626c7e71760109d
3
+ size 5013850772
model-00011-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5fdba55c53cd30e5afa35f899981834ec90bbcf776721d729761d3b0073925
3
+ size 5013850652
model-00012-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25530b53e0caeff46ac0ff1df9012b4a4f6c4c64e5b0dbd9559c6ee645a82466
3
+ size 4847857940
model-00013-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05b16cdc1a68d418595f4568d2b0dcb7863da7e76435c454a1a400c9b1d1e40b
3
+ size 5013850732
model-00014-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8688b4b35770ba9e5a62e9f7311ddffe49b432f95457c892cfe54c77238ff9e9
3
+ size 5013850706
model-00015-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7830f4db970b7d971ac41b1018563062317c943ab4934a57db71d895a11d4f69
3
+ size 4863615238
model-00016-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8add16e29f9c04f979c90bff37eb793927a21e7f2882d3790d50dee5ea25d700
3
+ size 5013850670
model-00017-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:009c4520b050e4d8bd962f3b7061973186fed484143a82ee97e457efddc7749d
3
+ size 5013850698
model-00018-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41611a67b34d312adb181c630b5bce32186285893fc08c1d73dcf518532eb198
3
+ size 4847857984
model-00019-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29afc6596d88bf527b5fd80b5f4e0e4195bf83464993cc0a2c9d635385d6fe69
3
+ size 5013850762
model-00020-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95197b0929646dc6bb5a0936c0535f108e30e0bb0ab01d871396b1e084f37a56
3
+ size 5013850682
model-00021-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6dbc06f30d283286da9ec753336307f26f564ad4f91f9a4478e50c870753ec5
3
+ size 4863615222
model-00022-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a2b96c2e9936c0593c2d3e051ee7cb12da508923c9e02f477bde92839a774a
3
+ size 5013850652
model-00023-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fdb8a2d8b0b76391d25cfcba0987b3150a14c1cb6469953998be03e0d7cb4f8
3
+ size 5013850604
model-00024-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9507fc6568bc6573dddd5ee1ed5dd911fa5ef6105d93a6e1a39cb5795e137222
3
+ size 4847857958
model-00025-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96bb3ae8b05adc9de9cc3c84033b8e051461e7f71b938b742d1326a2f502933f
3
+ size 5013850750
model-00026-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b1be9fc0f382def809a47daeb5d62fd7e3b95c60f25d893a8b9e74f1088d32
3
+ size 4893788840
model-00027-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b349e53c2759068ea930dcce19e1c1c12a84cb8a63140d36728721f12812f210
3
+ size 562626863
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin▁of▁sentence|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|end▁of▁sentence|>",
6
+ "is_local": true,
7
+ "legacy": true,
8
+ "model_max_length": 1048576,
9
+ "pad_token": "<|end▁of▁sentence|>",
10
+ "sp_model_kwargs": {},
11
+ "tokenizer_class": "TokenizersBackend",
12
+ "unk_token": null
13
+ }