pastapaul committed on
Commit
e910552
·
verified ·
1 Parent(s): 9eb6086

Phase 2 full GPTQ + BF16 MTP: 89.1% MTP acceptance

Browse files
config.json ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeepseekV4ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "compress_rates": {
9
+ "compressed_sparse_attention": 4,
10
+ "heavily_compressed_attention": 128
11
+ },
12
+ "compress_rope_theta": 160000,
13
+ "dtype": "bfloat16",
14
+ "eos_token_id": 1,
15
+ "expert_dtype": "bf16",
16
+ "hc_eps": 1e-06,
17
+ "hc_mult": 4,
18
+ "hc_sinkhorn_iters": 20,
19
+ "head_dim": 512,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 4096,
22
+ "index_head_dim": 128,
23
+ "index_n_heads": 64,
24
+ "index_topk": 512,
25
+ "initializer_range": 0.02,
26
+ "layer_types": [
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "compressed_sparse_attention",
30
+ "heavily_compressed_attention",
31
+ "compressed_sparse_attention",
32
+ "heavily_compressed_attention",
33
+ "compressed_sparse_attention",
34
+ "heavily_compressed_attention",
35
+ "compressed_sparse_attention",
36
+ "heavily_compressed_attention",
37
+ "compressed_sparse_attention",
38
+ "heavily_compressed_attention",
39
+ "compressed_sparse_attention",
40
+ "heavily_compressed_attention",
41
+ "compressed_sparse_attention",
42
+ "heavily_compressed_attention",
43
+ "compressed_sparse_attention",
44
+ "heavily_compressed_attention",
45
+ "compressed_sparse_attention",
46
+ "heavily_compressed_attention",
47
+ "compressed_sparse_attention",
48
+ "heavily_compressed_attention",
49
+ "compressed_sparse_attention",
50
+ "heavily_compressed_attention",
51
+ "compressed_sparse_attention",
52
+ "heavily_compressed_attention",
53
+ "compressed_sparse_attention",
54
+ "heavily_compressed_attention",
55
+ "compressed_sparse_attention",
56
+ "heavily_compressed_attention",
57
+ "compressed_sparse_attention",
58
+ "heavily_compressed_attention",
59
+ "compressed_sparse_attention",
60
+ "heavily_compressed_attention",
61
+ "compressed_sparse_attention",
62
+ "heavily_compressed_attention",
63
+ "compressed_sparse_attention",
64
+ "heavily_compressed_attention",
65
+ "compressed_sparse_attention",
66
+ "heavily_compressed_attention",
67
+ "compressed_sparse_attention",
68
+ "heavily_compressed_attention",
69
+ "compressed_sparse_attention"
70
+ ],
71
+ "max_position_embeddings": 1048576,
72
+ "mlp_bias": false,
73
+ "mlp_layer_types": [
74
+ "hash_moe",
75
+ "hash_moe",
76
+ "hash_moe",
77
+ "moe",
78
+ "moe",
79
+ "moe",
80
+ "moe",
81
+ "moe",
82
+ "moe",
83
+ "moe",
84
+ "moe",
85
+ "moe",
86
+ "moe",
87
+ "moe",
88
+ "moe",
89
+ "moe",
90
+ "moe",
91
+ "moe",
92
+ "moe",
93
+ "moe",
94
+ "moe",
95
+ "moe",
96
+ "moe",
97
+ "moe",
98
+ "moe",
99
+ "moe",
100
+ "moe",
101
+ "moe",
102
+ "moe",
103
+ "moe",
104
+ "moe",
105
+ "moe",
106
+ "moe",
107
+ "moe",
108
+ "moe",
109
+ "moe",
110
+ "moe",
111
+ "moe",
112
+ "moe",
113
+ "moe",
114
+ "moe",
115
+ "moe",
116
+ "moe"
117
+ ],
118
+ "model_type": "deepseek_v4",
119
+ "moe_intermediate_size": 2048,
120
+ "n_routed_experts": 256,
121
+ "n_shared_experts": 1,
122
+ "norm_topk_prob": true,
123
+ "num_attention_heads": 64,
124
+ "num_experts_per_tok": 6,
125
+ "num_hidden_layers": 43,
126
+ "num_key_value_heads": 1,
127
+ "num_nextn_predict_layers": 1,
128
+ "o_groups": 8,
129
+ "o_lora_rank": 1024,
130
+ "output_router_logits": false,
131
+ "pad_token_id": null,
132
+ "partial_rotary_factor": 0.125,
133
+ "q_lora_rank": 1024,
134
+ "qk_rope_head_dim": 64,
135
+ "quantization_config": {
136
+ "config_groups": {
137
+ "group_0": {
138
+ "format": "float-quantized",
139
+ "input_activations": {
140
+ "actorder": null,
141
+ "block_structure": null,
142
+ "dynamic": true,
143
+ "group_size": 128,
144
+ "num_bits": 8,
145
+ "observer": null,
146
+ "observer_kwargs": {},
147
+ "scale_dtype": null,
148
+ "strategy": "group",
149
+ "symmetric": true,
150
+ "type": "float",
151
+ "zp_dtype": null
152
+ },
153
+ "output_activations": null,
154
+ "targets": [
155
+ "re:.*attn\\.(wq_a|wq_b|wkv|wo_a|wo_b|fused_wqa_wkv|q_a_proj|q_b_proj|kv_proj|o_a_proj|o_b_proj)$",
156
+ "re:.*attn\\.compressor\\.(wgate|wkv|fused_wkv_wgate|gate_proj|kv_proj)$",
157
+ "re:.*attn\\.indexer\\.(weights_proj|wq_b|q_b_proj)$",
158
+ "re:.*attn\\.indexer\\.compressor\\.(wgate|wkv|gate_proj|kv_proj)$"
159
+ ],
160
+ "weights": {
161
+ "actorder": null,
162
+ "block_structure": [
163
+ 128,
164
+ 128
165
+ ],
166
+ "dynamic": false,
167
+ "group_size": null,
168
+ "num_bits": 8,
169
+ "observer": "memoryless_minmax",
170
+ "observer_kwargs": {},
171
+ "scale_dtype": null,
172
+ "strategy": "block",
173
+ "symmetric": true,
174
+ "type": "float",
175
+ "zp_dtype": null
176
+ }
177
+ },
178
+ "group_1": {
179
+ "format": "pack-quantized",
180
+ "input_activations": null,
181
+ "output_activations": null,
182
+ "targets": [
183
+ "re:.*experts\\.\\d+\\.(w1|w2|w3|gate_proj|up_proj|down_proj|gate_up_proj)$"
184
+ ],
185
+ "weights": {
186
+ "actorder": "static",
187
+ "block_structure": null,
188
+ "dynamic": false,
189
+ "group_size": 128,
190
+ "num_bits": 4,
191
+ "observer": "memoryless_minmax",
192
+ "observer_kwargs": {},
193
+ "scale_dtype": null,
194
+ "strategy": "group",
195
+ "symmetric": true,
196
+ "type": "int",
197
+ "zp_dtype": null
198
+ }
199
+ }
200
+ },
201
+ "format": "mixed-precision",
202
+ "global_compression_ratio": null,
203
+ "ignore": [
204
+ "layers.0.ffn.shared_experts.w1",
205
+ "layers.0.ffn.shared_experts.w2",
206
+ "layers.0.ffn.shared_experts.w3",
207
+ "layers.1.ffn.shared_experts.w1",
208
+ "layers.1.ffn.shared_experts.w2",
209
+ "layers.1.ffn.shared_experts.w3",
210
+ "layers.2.ffn.shared_experts.w1",
211
+ "layers.2.ffn.shared_experts.w2",
212
+ "layers.2.ffn.shared_experts.w3",
213
+ "layers.3.ffn.shared_experts.w1",
214
+ "layers.3.ffn.shared_experts.w2",
215
+ "layers.3.ffn.shared_experts.w3",
216
+ "layers.4.ffn.shared_experts.w1",
217
+ "layers.4.ffn.shared_experts.w2",
218
+ "layers.4.ffn.shared_experts.w3",
219
+ "layers.5.ffn.shared_experts.w1",
220
+ "layers.5.ffn.shared_experts.w2",
221
+ "layers.5.ffn.shared_experts.w3",
222
+ "layers.6.ffn.shared_experts.w1",
223
+ "layers.6.ffn.shared_experts.w2",
224
+ "layers.6.ffn.shared_experts.w3",
225
+ "layers.7.ffn.shared_experts.w1",
226
+ "layers.7.ffn.shared_experts.w2",
227
+ "layers.7.ffn.shared_experts.w3",
228
+ "layers.8.ffn.shared_experts.w1",
229
+ "layers.8.ffn.shared_experts.w2",
230
+ "layers.8.ffn.shared_experts.w3",
231
+ "layers.9.ffn.shared_experts.w1",
232
+ "layers.9.ffn.shared_experts.w2",
233
+ "layers.9.ffn.shared_experts.w3",
234
+ "layers.10.ffn.shared_experts.w1",
235
+ "layers.10.ffn.shared_experts.w2",
236
+ "layers.10.ffn.shared_experts.w3",
237
+ "layers.11.ffn.shared_experts.w1",
238
+ "layers.11.ffn.shared_experts.w2",
239
+ "layers.11.ffn.shared_experts.w3",
240
+ "layers.12.ffn.shared_experts.w1",
241
+ "layers.12.ffn.shared_experts.w2",
242
+ "layers.12.ffn.shared_experts.w3",
243
+ "layers.13.ffn.shared_experts.w1",
244
+ "layers.13.ffn.shared_experts.w2",
245
+ "layers.13.ffn.shared_experts.w3",
246
+ "layers.14.ffn.shared_experts.w1",
247
+ "layers.14.ffn.shared_experts.w2",
248
+ "layers.14.ffn.shared_experts.w3",
249
+ "layers.15.ffn.shared_experts.w1",
250
+ "layers.15.ffn.shared_experts.w2",
251
+ "layers.15.ffn.shared_experts.w3",
252
+ "layers.16.ffn.shared_experts.w1",
253
+ "layers.16.ffn.shared_experts.w2",
254
+ "layers.16.ffn.shared_experts.w3",
255
+ "layers.17.ffn.shared_experts.w1",
256
+ "layers.17.ffn.shared_experts.w2",
257
+ "layers.17.ffn.shared_experts.w3",
258
+ "layers.18.ffn.shared_experts.w1",
259
+ "layers.18.ffn.shared_experts.w2",
260
+ "layers.18.ffn.shared_experts.w3",
261
+ "layers.19.ffn.shared_experts.w1",
262
+ "layers.19.ffn.shared_experts.w2",
263
+ "layers.19.ffn.shared_experts.w3",
264
+ "layers.20.ffn.shared_experts.w1",
265
+ "layers.20.ffn.shared_experts.w2",
266
+ "layers.20.ffn.shared_experts.w3",
267
+ "layers.21.ffn.shared_experts.w1",
268
+ "layers.21.ffn.shared_experts.w2",
269
+ "layers.21.ffn.shared_experts.w3",
270
+ "layers.22.ffn.shared_experts.w1",
271
+ "layers.22.ffn.shared_experts.w2",
272
+ "layers.22.ffn.shared_experts.w3",
273
+ "layers.23.ffn.shared_experts.w1",
274
+ "layers.23.ffn.shared_experts.w2",
275
+ "layers.23.ffn.shared_experts.w3",
276
+ "layers.24.ffn.shared_experts.w1",
277
+ "layers.24.ffn.shared_experts.w2",
278
+ "layers.24.ffn.shared_experts.w3",
279
+ "layers.25.ffn.shared_experts.w1",
280
+ "layers.25.ffn.shared_experts.w2",
281
+ "layers.25.ffn.shared_experts.w3",
282
+ "layers.26.ffn.shared_experts.w1",
283
+ "layers.26.ffn.shared_experts.w2",
284
+ "layers.26.ffn.shared_experts.w3",
285
+ "layers.27.ffn.shared_experts.w1",
286
+ "layers.27.ffn.shared_experts.w2",
287
+ "layers.27.ffn.shared_experts.w3",
288
+ "layers.28.ffn.shared_experts.w1",
289
+ "layers.28.ffn.shared_experts.w2",
290
+ "layers.28.ffn.shared_experts.w3",
291
+ "layers.29.ffn.shared_experts.w1",
292
+ "layers.29.ffn.shared_experts.w2",
293
+ "layers.29.ffn.shared_experts.w3",
294
+ "layers.30.ffn.shared_experts.w1",
295
+ "layers.30.ffn.shared_experts.w2",
296
+ "layers.30.ffn.shared_experts.w3",
297
+ "layers.31.ffn.shared_experts.w1",
298
+ "layers.31.ffn.shared_experts.w2",
299
+ "layers.31.ffn.shared_experts.w3",
300
+ "layers.32.ffn.shared_experts.w1",
301
+ "layers.32.ffn.shared_experts.w2",
302
+ "layers.32.ffn.shared_experts.w3",
303
+ "layers.33.ffn.shared_experts.w1",
304
+ "layers.33.ffn.shared_experts.w2",
305
+ "layers.33.ffn.shared_experts.w3",
306
+ "layers.34.ffn.shared_experts.w1",
307
+ "layers.34.ffn.shared_experts.w2",
308
+ "layers.34.ffn.shared_experts.w3",
309
+ "layers.35.ffn.shared_experts.w1",
310
+ "layers.35.ffn.shared_experts.w2",
311
+ "layers.35.ffn.shared_experts.w3",
312
+ "layers.36.ffn.shared_experts.w1",
313
+ "layers.36.ffn.shared_experts.w2",
314
+ "layers.36.ffn.shared_experts.w3",
315
+ "layers.37.ffn.shared_experts.w1",
316
+ "layers.37.ffn.shared_experts.w2",
317
+ "layers.37.ffn.shared_experts.w3",
318
+ "layers.38.ffn.shared_experts.w1",
319
+ "layers.38.ffn.shared_experts.w2",
320
+ "layers.38.ffn.shared_experts.w3",
321
+ "layers.39.ffn.shared_experts.w1",
322
+ "layers.39.ffn.shared_experts.w2",
323
+ "layers.39.ffn.shared_experts.w3",
324
+ "layers.40.ffn.shared_experts.w1",
325
+ "layers.40.ffn.shared_experts.w2",
326
+ "layers.40.ffn.shared_experts.w3",
327
+ "layers.41.ffn.shared_experts.w1",
328
+ "layers.41.ffn.shared_experts.w2",
329
+ "layers.41.ffn.shared_experts.w3",
330
+ "layers.42.ffn.shared_experts.w1",
331
+ "layers.42.ffn.shared_experts.w2",
332
+ "layers.42.ffn.shared_experts.w3",
333
+ "re:^layers\\.43\\.",
334
+ "re:^model\\.layers\\.43\\.",
335
+ "lm_head"
336
+ ],
337
+ "kv_cache_scheme": null,
338
+ "quant_method": "compressed-tensors",
339
+ "quantization_status": "compressed",
340
+ "sparsity_config": {},
341
+ "transform_config": {},
342
+ "version": "0.15.1.a20260515",
343
+ "scale_fmt": "ue8m0"
344
+ },
345
+ "rms_norm_eps": 1e-06,
346
+ "rope_parameters": {
347
+ "compress": {
348
+ "attention_factor": 1.0,
349
+ "beta_fast": 32,
350
+ "beta_slow": 1,
351
+ "factor": 16,
352
+ "original_max_position_embeddings": 65536,
353
+ "partial_rotary_factor": 0.125,
354
+ "rope_theta": 160000,
355
+ "rope_type": "yarn",
356
+ "type": "yarn"
357
+ },
358
+ "main": {
359
+ "partial_rotary_factor": 0.125,
360
+ "rope_theta": 10000,
361
+ "rope_type": "default"
362
+ },
363
+ "partial_rotary_factor": 0.125,
364
+ "rope_theta": 10000,
365
+ "rope_type": "default"
366
+ },
367
+ "rope_theta": 10000,
368
+ "routed_scaling_factor": 1.5,
369
+ "router_aux_loss_coef": 0.001,
370
+ "router_jitter_noise": 0.0,
371
+ "scoring_func": "sqrtsoftplus",
372
+ "sliding_window": 128,
373
+ "swiglu_limit": 10.0,
374
+ "tie_word_embeddings": false,
375
+ "topk_method": "noaux_tc",
376
+ "transformers_version": "5.8.1",
377
+ "use_cache": true,
378
+ "vocab_size": 129280,
379
+ "compress_ratios": [
380
+ 0,
381
+ 0,
382
+ 4,
383
+ 128,
384
+ 4,
385
+ 128,
386
+ 4,
387
+ 128,
388
+ 4,
389
+ 128,
390
+ 4,
391
+ 128,
392
+ 4,
393
+ 128,
394
+ 4,
395
+ 128,
396
+ 4,
397
+ 128,
398
+ 4,
399
+ 128,
400
+ 4,
401
+ 128,
402
+ 4,
403
+ 128,
404
+ 4,
405
+ 128,
406
+ 4,
407
+ 128,
408
+ 4,
409
+ 128,
410
+ 4,
411
+ 128,
412
+ 4,
413
+ 128,
414
+ 4,
415
+ 128,
416
+ 4,
417
+ 128,
418
+ 4,
419
+ 128,
420
+ 4,
421
+ 128,
422
+ 4,
423
+ 0
424
+ ],
425
+ "num_hash_layers": 3,
426
+ "rope_scaling": {
427
+ "beta_fast": 32,
428
+ "beta_slow": 1,
429
+ "factor": 16,
430
+ "original_max_position_embeddings": 65536,
431
+ "type": "yarn"
432
+ },
433
+ "torch_dtype": "bfloat16"
434
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "do_sample": true,
5
+ "eos_token_id": 1,
6
+ "temperature": 1.0,
7
+ "top_p": 1.0,
8
+ "transformers_version": "5.8.1"
9
+ }
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d986f996385b30f897e047fc54b755f32c6998133add79bafc02215714de3e3a
3
+ size 51059154690
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdf611410ef972e28db7d91d591a31d846751bb41c765f6f679d473db6a3903a
3
+ size 50002265884
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b8e17840571d8fc6ce0703c7d81d50f113d60313c455d1c0795e8b0673e6562
3
+ size 50001322552
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcd520418011b6c24a01786e3c57176d3ed00fca736ff90d3d35c5760f9c749d
3
+ size 18795041598
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
recipe.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_stage:
2
+ default_modifiers:
3
+ GPTQModifier:
4
+ config_groups:
5
+ attention:
6
+ targets: ['re:^model\.layers\.\d+\.self_attn\.(q_a_proj|q_b_proj|kv_proj|o_a_proj|o_b_proj)$',
7
+ 're:^model\.layers\.\d+\.self_attn\.compressor\.(gate_proj|kv_proj)$', 're:^model\.layers\.\d+\.self_attn\.compressor\.indexer\.(gate_proj|kv_proj|q_b_proj|weights_proj)$']
8
+ weights:
9
+ num_bits: 8
10
+ type: float
11
+ symmetric: true
12
+ group_size: null
13
+ strategy: block
14
+ block_structure: [128, 128]
15
+ dynamic: false
16
+ actorder: null
17
+ scale_dtype: null
18
+ zp_dtype: null
19
+ observer: memoryless_minmax
20
+ observer_kwargs: {}
21
+ input_activations:
22
+ num_bits: 8
23
+ type: float
24
+ symmetric: true
25
+ group_size: 128
26
+ strategy: group
27
+ block_structure: null
28
+ dynamic: true
29
+ actorder: null
30
+ scale_dtype: null
31
+ zp_dtype: null
32
+ observer: null
33
+ observer_kwargs: {}
34
+ output_activations: null
35
+ format: null
36
+ experts:
37
+ targets: ['re:^model\.layers\.\d+\.mlp\.experts\.\d+\.(gate_proj|up_proj|down_proj)$']
38
+ weights:
39
+ num_bits: 4
40
+ type: int
41
+ symmetric: true
42
+ group_size: 128
43
+ strategy: group
44
+ block_structure: null
45
+ dynamic: false
46
+ actorder: !!python/object/apply:compressed_tensors.quantization.quant_args.ActivationOrdering [
47
+ static]
48
+ scale_dtype: null
49
+ zp_dtype: null
50
+ observer: memoryless_minmax
51
+ observer_kwargs: {}
52
+ input_activations: null
53
+ output_activations: null
54
+ format: null
55
+ targets: [Linear]
56
+ ignore: [lm_head, 're:.*mtp\..*']
57
+ bypass_divisibility_checks: false
58
+ block_size: 128
59
+ dampening_frac: 0.1
60
+ actorder: static
61
+ offload_hessians: true
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin▁of▁sentence|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|end▁of▁sentence|>",
6
+ "is_local": true,
7
+ "legacy": true,
8
+ "local_files_only": false,
9
+ "model_max_length": 1048576,
10
+ "pad_token": "<|end▁of▁sentence|>",
11
+ "sp_model_kwargs": {},
12
+ "tokenizer_class": "TokenizersBackend",
13
+ "unk_token": null
14
+ }