VoyagerXHF committed on
Commit
a1bdd8d
·
verified ·
1 Parent(s): b7897d5

Update tokenizer_config.json

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +67 -2
tokenizer_config.json CHANGED
@@ -208,6 +208,62 @@
208
  "rstrip": false,
209
  "single_word": false,
210
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  }
212
  },
213
  "additional_special_tokens": [
@@ -230,11 +286,20 @@
230
  "clean_up_tokenization_spaces": false,
231
  "eos_token": "<|im_end|>",
232
  "errors": "replace",
233
- "model_max_length": 131072,
234
  "pad_token": "<|endoftext|>",
235
  "split_special_tokens": false,
236
  "tokenizer_class": "Qwen2Tokenizer",
237
  "unk_token": null,
238
  "add_bos_token": false,
239
- "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
 
 
 
 
 
 
 
 
 
240
  }
 
208
  "rstrip": false,
209
  "single_word": false,
210
  "special": false
211
+ },
212
+ "248070": {
213
+ "content": "<|audio_start|>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "248071": {
221
+ "content": "<|audio_end|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "248072": {
229
+ "content": "<tts_pad>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "248073": {
237
+ "content": "<tts_text_bos>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "248074": {
245
+ "content": "<tts_text_eod>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "248075": {
253
+ "content": "<tts_text_bos_single>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "248076": {
261
+ "content": "<|audio_pad|>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
  }
268
  },
269
  "additional_special_tokens": [
 
286
  "clean_up_tokenization_spaces": false,
287
  "eos_token": "<|im_end|>",
288
  "errors": "replace",
289
+ "model_max_length": 262144,
290
  "pad_token": "<|endoftext|>",
291
  "split_special_tokens": false,
292
  "tokenizer_class": "Qwen2Tokenizer",
293
  "unk_token": null,
294
  "add_bos_token": false,
295
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
296
+ "extra_special_tokens": {
297
+ "audio_bos_token": "<|audio_start|>",
298
+ "audio_eos_token": "<|audio_end|>",
299
+ "audio_token": "<|audio_pad|>",
300
+ "image_token": "<|image_pad|>",
301
+ "video_token": "<|video_pad|>",
302
+ "vision_bos_token": "<|vision_start|>",
303
+ "vision_eos_token": "<|vision_end|>"
304
+ }
305
  }