rnjema101 committed on
Commit
d046404
·
verified ·
1 Parent(s): 7cd7bd2

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +1 -61
  2. tokenizer_config.json +2 -482
  3. vocab.json +25 -27
added_tokens.json CHANGED
@@ -1,63 +1,3 @@
1
  {
2
- "\n": 31,
3
- "!": 32,
4
- "\"": 33,
5
- "%": 34,
6
- "(": 35,
7
- ")": 36,
8
- ",": 37,
9
- ".": 38,
10
- "/": 39,
11
- "0": 40,
12
- "1": 41,
13
- "4": 42,
14
- "5": 43,
15
- "6": 44,
16
- "7": 45,
17
- "8": 46,
18
- "9": 47,
19
- ":": 48,
20
- "<unk>": 30,
21
- "?": 49,
22
- "[": 50,
23
- "]": 51,
24
- "c": 52,
25
- "j": 53,
26
- "q": 54,
27
- "v": 55,
28
- "x": 56,
29
- "z": 57,
30
- "«": 58,
31
- "°": 59,
32
- "»": 60,
33
- "â": 61,
34
- "ã": 62,
35
- "ç": 63,
36
- "è": 64,
37
- "é": 65,
38
- "ê": 66,
39
- "ë": 67,
40
- "í": 68,
41
- "î": 69,
42
- "ï": 70,
43
- "ó": 71,
44
- "ô": 72,
45
- "ö": 73,
46
- "ú": 74,
47
- "û": 75,
48
- "ń": 76,
49
- "ɡ": 77,
50
- "ɲ": 78,
51
- "ʁ": 79,
52
- "ʃ": 80,
53
- "́": 81,
54
- "​": 82,
55
- "–": 83,
56
- "‘": 84,
57
- "’": 85,
58
- "“": 86,
59
- "”": 87,
60
- "•": 88,
61
- "→": 89,
62
- "🕒": 90
63
  }
 
1
  {
2
+ "<unk>": 28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  }
tokenizer_config.json CHANGED
@@ -9,500 +9,20 @@
9
  "single_word": false,
10
  "special": true
11
  },
12
- "30": {
13
  "content": "<unk>",
14
  "lstrip": false,
15
  "normalized": false,
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
19
- },
20
- "31": {
21
- "content": "\n",
22
- "lstrip": false,
23
- "normalized": true,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": false
27
- },
28
- "32": {
29
- "content": "!",
30
- "lstrip": false,
31
- "normalized": true,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": false
35
- },
36
- "33": {
37
- "content": "\"",
38
- "lstrip": false,
39
- "normalized": true,
40
- "rstrip": false,
41
- "single_word": false,
42
- "special": false
43
- },
44
- "34": {
45
- "content": "%",
46
- "lstrip": false,
47
- "normalized": true,
48
- "rstrip": false,
49
- "single_word": false,
50
- "special": false
51
- },
52
- "35": {
53
- "content": "(",
54
- "lstrip": false,
55
- "normalized": true,
56
- "rstrip": false,
57
- "single_word": false,
58
- "special": false
59
- },
60
- "36": {
61
- "content": ")",
62
- "lstrip": false,
63
- "normalized": true,
64
- "rstrip": false,
65
- "single_word": false,
66
- "special": false
67
- },
68
- "37": {
69
- "content": ",",
70
- "lstrip": false,
71
- "normalized": true,
72
- "rstrip": false,
73
- "single_word": false,
74
- "special": false
75
- },
76
- "38": {
77
- "content": ".",
78
- "lstrip": false,
79
- "normalized": true,
80
- "rstrip": false,
81
- "single_word": false,
82
- "special": false
83
- },
84
- "39": {
85
- "content": "/",
86
- "lstrip": false,
87
- "normalized": true,
88
- "rstrip": false,
89
- "single_word": false,
90
- "special": false
91
- },
92
- "40": {
93
- "content": "0",
94
- "lstrip": false,
95
- "normalized": true,
96
- "rstrip": false,
97
- "single_word": false,
98
- "special": false
99
- },
100
- "41": {
101
- "content": "1",
102
- "lstrip": false,
103
- "normalized": true,
104
- "rstrip": false,
105
- "single_word": false,
106
- "special": false
107
- },
108
- "42": {
109
- "content": "4",
110
- "lstrip": false,
111
- "normalized": true,
112
- "rstrip": false,
113
- "single_word": false,
114
- "special": false
115
- },
116
- "43": {
117
- "content": "5",
118
- "lstrip": false,
119
- "normalized": true,
120
- "rstrip": false,
121
- "single_word": false,
122
- "special": false
123
- },
124
- "44": {
125
- "content": "6",
126
- "lstrip": false,
127
- "normalized": true,
128
- "rstrip": false,
129
- "single_word": false,
130
- "special": false
131
- },
132
- "45": {
133
- "content": "7",
134
- "lstrip": false,
135
- "normalized": true,
136
- "rstrip": false,
137
- "single_word": false,
138
- "special": false
139
- },
140
- "46": {
141
- "content": "8",
142
- "lstrip": false,
143
- "normalized": true,
144
- "rstrip": false,
145
- "single_word": false,
146
- "special": false
147
- },
148
- "47": {
149
- "content": "9",
150
- "lstrip": false,
151
- "normalized": true,
152
- "rstrip": false,
153
- "single_word": false,
154
- "special": false
155
- },
156
- "48": {
157
- "content": ":",
158
- "lstrip": false,
159
- "normalized": true,
160
- "rstrip": false,
161
- "single_word": false,
162
- "special": false
163
- },
164
- "49": {
165
- "content": "?",
166
- "lstrip": false,
167
- "normalized": true,
168
- "rstrip": false,
169
- "single_word": false,
170
- "special": false
171
- },
172
- "50": {
173
- "content": "[",
174
- "lstrip": false,
175
- "normalized": true,
176
- "rstrip": false,
177
- "single_word": false,
178
- "special": false
179
- },
180
- "51": {
181
- "content": "]",
182
- "lstrip": false,
183
- "normalized": true,
184
- "rstrip": false,
185
- "single_word": false,
186
- "special": false
187
- },
188
- "52": {
189
- "content": "c",
190
- "lstrip": false,
191
- "normalized": true,
192
- "rstrip": false,
193
- "single_word": false,
194
- "special": false
195
- },
196
- "53": {
197
- "content": "j",
198
- "lstrip": false,
199
- "normalized": true,
200
- "rstrip": false,
201
- "single_word": false,
202
- "special": false
203
- },
204
- "54": {
205
- "content": "q",
206
- "lstrip": false,
207
- "normalized": true,
208
- "rstrip": false,
209
- "single_word": false,
210
- "special": false
211
- },
212
- "55": {
213
- "content": "v",
214
- "lstrip": false,
215
- "normalized": true,
216
- "rstrip": false,
217
- "single_word": false,
218
- "special": false
219
- },
220
- "56": {
221
- "content": "x",
222
- "lstrip": false,
223
- "normalized": true,
224
- "rstrip": false,
225
- "single_word": false,
226
- "special": false
227
- },
228
- "57": {
229
- "content": "z",
230
- "lstrip": false,
231
- "normalized": true,
232
- "rstrip": false,
233
- "single_word": false,
234
- "special": false
235
- },
236
- "58": {
237
- "content": "«",
238
- "lstrip": false,
239
- "normalized": true,
240
- "rstrip": false,
241
- "single_word": false,
242
- "special": false
243
- },
244
- "59": {
245
- "content": "°",
246
- "lstrip": false,
247
- "normalized": true,
248
- "rstrip": false,
249
- "single_word": false,
250
- "special": false
251
- },
252
- "60": {
253
- "content": "»",
254
- "lstrip": false,
255
- "normalized": true,
256
- "rstrip": false,
257
- "single_word": false,
258
- "special": false
259
- },
260
- "61": {
261
- "content": "â",
262
- "lstrip": false,
263
- "normalized": true,
264
- "rstrip": false,
265
- "single_word": false,
266
- "special": false
267
- },
268
- "62": {
269
- "content": "ã",
270
- "lstrip": false,
271
- "normalized": true,
272
- "rstrip": false,
273
- "single_word": false,
274
- "special": false
275
- },
276
- "63": {
277
- "content": "ç",
278
- "lstrip": false,
279
- "normalized": true,
280
- "rstrip": false,
281
- "single_word": false,
282
- "special": false
283
- },
284
- "64": {
285
- "content": "è",
286
- "lstrip": false,
287
- "normalized": true,
288
- "rstrip": false,
289
- "single_word": false,
290
- "special": false
291
- },
292
- "65": {
293
- "content": "é",
294
- "lstrip": false,
295
- "normalized": true,
296
- "rstrip": false,
297
- "single_word": false,
298
- "special": false
299
- },
300
- "66": {
301
- "content": "ê",
302
- "lstrip": false,
303
- "normalized": true,
304
- "rstrip": false,
305
- "single_word": false,
306
- "special": false
307
- },
308
- "67": {
309
- "content": "ë",
310
- "lstrip": false,
311
- "normalized": true,
312
- "rstrip": false,
313
- "single_word": false,
314
- "special": false
315
- },
316
- "68": {
317
- "content": "í",
318
- "lstrip": false,
319
- "normalized": true,
320
- "rstrip": false,
321
- "single_word": false,
322
- "special": false
323
- },
324
- "69": {
325
- "content": "î",
326
- "lstrip": false,
327
- "normalized": true,
328
- "rstrip": false,
329
- "single_word": false,
330
- "special": false
331
- },
332
- "70": {
333
- "content": "ï",
334
- "lstrip": false,
335
- "normalized": true,
336
- "rstrip": false,
337
- "single_word": false,
338
- "special": false
339
- },
340
- "71": {
341
- "content": "ó",
342
- "lstrip": false,
343
- "normalized": true,
344
- "rstrip": false,
345
- "single_word": false,
346
- "special": false
347
- },
348
- "72": {
349
- "content": "ô",
350
- "lstrip": false,
351
- "normalized": true,
352
- "rstrip": false,
353
- "single_word": false,
354
- "special": false
355
- },
356
- "73": {
357
- "content": "ö",
358
- "lstrip": false,
359
- "normalized": true,
360
- "rstrip": false,
361
- "single_word": false,
362
- "special": false
363
- },
364
- "74": {
365
- "content": "ú",
366
- "lstrip": false,
367
- "normalized": true,
368
- "rstrip": false,
369
- "single_word": false,
370
- "special": false
371
- },
372
- "75": {
373
- "content": "û",
374
- "lstrip": false,
375
- "normalized": true,
376
- "rstrip": false,
377
- "single_word": false,
378
- "special": false
379
- },
380
- "76": {
381
- "content": "ń",
382
- "lstrip": false,
383
- "normalized": true,
384
- "rstrip": false,
385
- "single_word": false,
386
- "special": false
387
- },
388
- "77": {
389
- "content": "ɡ",
390
- "lstrip": false,
391
- "normalized": true,
392
- "rstrip": false,
393
- "single_word": false,
394
- "special": false
395
- },
396
- "78": {
397
- "content": "ɲ",
398
- "lstrip": false,
399
- "normalized": true,
400
- "rstrip": false,
401
- "single_word": false,
402
- "special": false
403
- },
404
- "79": {
405
- "content": "ʁ",
406
- "lstrip": false,
407
- "normalized": true,
408
- "rstrip": false,
409
- "single_word": false,
410
- "special": false
411
- },
412
- "80": {
413
- "content": "ʃ",
414
- "lstrip": false,
415
- "normalized": true,
416
- "rstrip": false,
417
- "single_word": false,
418
- "special": false
419
- },
420
- "81": {
421
- "content": "́",
422
- "lstrip": false,
423
- "normalized": true,
424
- "rstrip": false,
425
- "single_word": false,
426
- "special": false
427
- },
428
- "82": {
429
- "content": "​",
430
- "lstrip": false,
431
- "normalized": true,
432
- "rstrip": false,
433
- "single_word": false,
434
- "special": false
435
- },
436
- "83": {
437
- "content": "–",
438
- "lstrip": false,
439
- "normalized": true,
440
- "rstrip": false,
441
- "single_word": false,
442
- "special": false
443
- },
444
- "84": {
445
- "content": "‘",
446
- "lstrip": false,
447
- "normalized": true,
448
- "rstrip": false,
449
- "single_word": false,
450
- "special": false
451
- },
452
- "85": {
453
- "content": "’",
454
- "lstrip": false,
455
- "normalized": true,
456
- "rstrip": false,
457
- "single_word": false,
458
- "special": false
459
- },
460
- "86": {
461
- "content": "“",
462
- "lstrip": false,
463
- "normalized": true,
464
- "rstrip": false,
465
- "single_word": false,
466
- "special": false
467
- },
468
- "87": {
469
- "content": "”",
470
- "lstrip": false,
471
- "normalized": true,
472
- "rstrip": false,
473
- "single_word": false,
474
- "special": false
475
- },
476
- "88": {
477
- "content": "•",
478
- "lstrip": false,
479
- "normalized": true,
480
- "rstrip": false,
481
- "single_word": false,
482
- "special": false
483
- },
484
- "89": {
485
- "content": "→",
486
- "lstrip": false,
487
- "normalized": true,
488
- "rstrip": false,
489
- "single_word": false,
490
- "special": false
491
- },
492
- "90": {
493
- "content": "🕒",
494
- "lstrip": false,
495
- "normalized": true,
496
- "rstrip": false,
497
- "single_word": false,
498
- "special": false
499
  }
500
  },
501
  "backend": "custom",
502
  "clean_up_tokenization_spaces": true,
503
  "is_local": true,
504
  "is_uroman": false,
505
- "language": "aka",
506
  "model_max_length": 1000000000000000019884624838656,
507
  "normalize": true,
508
  "pad_token": "a",
 
9
  "single_word": false,
10
  "special": true
11
  },
12
+ "28": {
13
  "content": "<unk>",
14
  "lstrip": false,
15
  "normalized": false,
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  }
20
  },
21
  "backend": "custom",
22
  "clean_up_tokenization_spaces": true,
23
  "is_local": true,
24
  "is_uroman": false,
25
+ "language": "nyn",
26
  "model_max_length": 1000000000000000019884624838656,
27
  "normalize": true,
28
  "pad_token": "a",
vocab.json CHANGED
@@ -1,32 +1,30 @@
1
  {
2
- " ": 8,
3
- "'": 5,
4
- "-": 3,
5
- "2": 23,
6
- "3": 7,
7
- "_": 6,
8
  "a": 0,
9
- "b": 29,
10
- "d": 28,
11
- "e": 24,
12
- "f": 15,
13
- "g": 17,
14
- "h": 20,
15
- "i": 26,
16
- "k": 19,
 
 
17
  "l": 21,
18
  "m": 10,
19
- "n": 4,
20
- "o": 16,
21
- "p": 9,
22
- "r": 25,
23
- "s": 22,
24
- "t": 2,
25
- "u": 18,
26
- "w": 12,
27
- "y": 13,
28
- "á": 11,
29
- "ɔ": 27,
30
- "ɛ": 14,
31
- "ʼ": 1
32
  }
 
1
  {
2
+ " ": 27,
3
+ "'": 17,
4
+ "-": 25,
 
 
 
5
  "a": 0,
6
+ "b": 9,
7
+ "c": 24,
8
+ "d": 19,
9
+ "e": 3,
10
+ "f": 23,
11
+ "g": 13,
12
+ "h": 14,
13
+ "i": 2,
14
+ "j": 20,
15
+ "k": 8,
16
  "l": 21,
17
  "m": 10,
18
+ "n": 6,
19
+ "o": 7,
20
+ "p": 22,
21
+ "r": 5,
22
+ "s": 16,
23
+ "t": 15,
24
+ "u": 4,
25
+ "v": 26,
26
+ "w": 11,
27
+ "y": 12,
28
+ "z": 18,
29
+ "|": 1
 
30
  }