areshx committed on
Commit
d40dc5f
·
verified ·
1 Parent(s): b567a83

lang model & tokenizer

Browse files
bpe_3gram.arpa ADDED
The diff for this file is too large to render. See raw diff
 
bpe_tokenizer/bpe-tokenizer.json ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "[BLANK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "[SOS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "[EOS]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "Sequence",
45
+ "normalizers": [
46
+ {
47
+ "type": "NFD"
48
+ },
49
+ {
50
+ "type": "Lowercase"
51
+ },
52
+ {
53
+ "type": "StripAccents"
54
+ },
55
+ {
56
+ "type": "Replace",
57
+ "pattern": {
58
+ "String": " ' s"
59
+ },
60
+ "content": "'s"
61
+ }
62
+ ]
63
+ },
64
+ "pre_tokenizer": {
65
+ "type": "BertPreTokenizer"
66
+ },
67
+ "post_processor": {
68
+ "type": "TemplateProcessing",
69
+ "single": [
70
+ {
71
+ "SpecialToken": {
72
+ "id": "[SOS]",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "Sequence": {
78
+ "id": "A",
79
+ "type_id": 0
80
+ }
81
+ },
82
+ {
83
+ "SpecialToken": {
84
+ "id": "[EOS]",
85
+ "type_id": 0
86
+ }
87
+ }
88
+ ],
89
+ "pair": [
90
+ {
91
+ "Sequence": {
92
+ "id": "A",
93
+ "type_id": 0
94
+ }
95
+ },
96
+ {
97
+ "Sequence": {
98
+ "id": "B",
99
+ "type_id": 1
100
+ }
101
+ }
102
+ ],
103
+ "special_tokens": {
104
+ "[EOS]": {
105
+ "id": "[EOS]",
106
+ "ids": [
107
+ 3
108
+ ],
109
+ "tokens": [
110
+ "[EOS]"
111
+ ]
112
+ },
113
+ "[SOS]": {
114
+ "id": "[SOS]",
115
+ "ids": [
116
+ 2
117
+ ],
118
+ "tokens": [
119
+ "[SOS]"
120
+ ]
121
+ }
122
+ }
123
+ },
124
+ "decoder": {
125
+ "type": "WordPiece",
126
+ "prefix": "##",
127
+ "cleanup": true
128
+ },
129
+ "model": {
130
+ "type": "BPE",
131
+ "dropout": null,
132
+ "unk_token": "[UNK]",
133
+ "continuing_subword_prefix": "##",
134
+ "end_of_word_suffix": null,
135
+ "fuse_unk": false,
136
+ "byte_fallback": false,
137
+ "ignore_merges": false,
138
+ "vocab": {
139
+ "[PAD]": 0,
140
+ "[BLANK]": 1,
141
+ "[SOS]": 2,
142
+ "[EOS]": 3,
143
+ "'": 4,
144
+ "a": 5,
145
+ "b": 6,
146
+ "c": 7,
147
+ "d": 8,
148
+ "e": 9,
149
+ "f": 10,
150
+ "g": 11,
151
+ "h": 12,
152
+ "i": 13,
153
+ "j": 14,
154
+ "k": 15,
155
+ "l": 16,
156
+ "m": 17,
157
+ "n": 18,
158
+ "o": 19,
159
+ "p": 20,
160
+ "q": 21,
161
+ "r": 22,
162
+ "s": 23,
163
+ "t": 24,
164
+ "u": 25,
165
+ "v": 26,
166
+ "w": 27,
167
+ "x": 28,
168
+ "y": 29,
169
+ "z": 30,
170
+ "##r": 31,
171
+ "##u": 32,
172
+ "##c": 33,
173
+ "##h": 34,
174
+ "##t": 35,
175
+ "##e": 36,
176
+ "##o": 37,
177
+ "##s": 38,
178
+ "##v": 39,
179
+ "##i": 40,
180
+ "##d": 41,
181
+ "##a": 42,
182
+ "##l": 43,
183
+ "##b": 44,
184
+ "##n": 45,
185
+ "##p": 46,
186
+ "##y": 47,
187
+ "##g": 48,
188
+ "##f": 49,
189
+ "##k": 50,
190
+ "##w": 51,
191
+ "##z": 52,
192
+ "##m": 53,
193
+ "##j": 54,
194
+ "##x": 55,
195
+ "##q": 56,
196
+ "th": 57,
197
+ "the": 58,
198
+ "##er": 59,
199
+ "##nd": 60,
200
+ "##in": 61,
201
+ "##ed": 62,
202
+ "##ou": 63,
203
+ "##at": 64,
204
+ "##en": 65,
205
+ "and": 66,
206
+ "##or": 67,
207
+ "of": 68,
208
+ "##es": 69,
209
+ "##on": 70,
210
+ "to": 71,
211
+ "##is": 72,
212
+ "##ing": 73,
213
+ "##ar": 74,
214
+ "##an": 75,
215
+ "##it": 76,
216
+ "##as": 77,
217
+ "in": 78,
218
+ "##ll": 79,
219
+ "##re": 80,
220
+ "wh": 81,
221
+ "##om": 82,
222
+ "he": 83,
223
+ "ha": 84,
224
+ "be": 85,
225
+ "##le": 86,
226
+ "##ic": 87,
227
+ "##ot": 88,
228
+ "##ow": 89,
229
+ "##ut": 90,
230
+ "it": 91,
231
+ "##ly": 92,
232
+ "##ld": 93,
233
+ "that": 94,
234
+ "##gh": 95,
235
+ "sh": 96,
236
+ "was": 97,
237
+ "##ve": 98,
238
+ "on": 99
239
+ },
240
+ "merges": [
241
+ [
242
+ "t",
243
+ "##h"
244
+ ],
245
+ [
246
+ "th",
247
+ "##e"
248
+ ],
249
+ [
250
+ "##e",
251
+ "##r"
252
+ ],
253
+ [
254
+ "##n",
255
+ "##d"
256
+ ],
257
+ [
258
+ "##i",
259
+ "##n"
260
+ ],
261
+ [
262
+ "##e",
263
+ "##d"
264
+ ],
265
+ [
266
+ "##o",
267
+ "##u"
268
+ ],
269
+ [
270
+ "##a",
271
+ "##t"
272
+ ],
273
+ [
274
+ "##e",
275
+ "##n"
276
+ ],
277
+ [
278
+ "a",
279
+ "##nd"
280
+ ],
281
+ [
282
+ "##o",
283
+ "##r"
284
+ ],
285
+ [
286
+ "o",
287
+ "##f"
288
+ ],
289
+ [
290
+ "##e",
291
+ "##s"
292
+ ],
293
+ [
294
+ "##o",
295
+ "##n"
296
+ ],
297
+ [
298
+ "t",
299
+ "##o"
300
+ ],
301
+ [
302
+ "##i",
303
+ "##s"
304
+ ],
305
+ [
306
+ "##in",
307
+ "##g"
308
+ ],
309
+ [
310
+ "##a",
311
+ "##r"
312
+ ],
313
+ [
314
+ "##a",
315
+ "##n"
316
+ ],
317
+ [
318
+ "##i",
319
+ "##t"
320
+ ],
321
+ [
322
+ "##a",
323
+ "##s"
324
+ ],
325
+ [
326
+ "i",
327
+ "##n"
328
+ ],
329
+ [
330
+ "##l",
331
+ "##l"
332
+ ],
333
+ [
334
+ "##r",
335
+ "##e"
336
+ ],
337
+ [
338
+ "w",
339
+ "##h"
340
+ ],
341
+ [
342
+ "##o",
343
+ "##m"
344
+ ],
345
+ [
346
+ "h",
347
+ "##e"
348
+ ],
349
+ [
350
+ "h",
351
+ "##a"
352
+ ],
353
+ [
354
+ "b",
355
+ "##e"
356
+ ],
357
+ [
358
+ "##l",
359
+ "##e"
360
+ ],
361
+ [
362
+ "##i",
363
+ "##c"
364
+ ],
365
+ [
366
+ "##o",
367
+ "##t"
368
+ ],
369
+ [
370
+ "##o",
371
+ "##w"
372
+ ],
373
+ [
374
+ "##u",
375
+ "##t"
376
+ ],
377
+ [
378
+ "i",
379
+ "##t"
380
+ ],
381
+ [
382
+ "##l",
383
+ "##y"
384
+ ],
385
+ [
386
+ "##l",
387
+ "##d"
388
+ ],
389
+ [
390
+ "th",
391
+ "##at"
392
+ ],
393
+ [
394
+ "##g",
395
+ "##h"
396
+ ],
397
+ [
398
+ "s",
399
+ "##h"
400
+ ],
401
+ [
402
+ "w",
403
+ "##as"
404
+ ],
405
+ [
406
+ "##v",
407
+ "##e"
408
+ ],
409
+ [
410
+ "o",
411
+ "##n"
412
+ ]
413
+ ]
414
+ }
415
+ }
bpe_tokenizer/bpe-tokenizer.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"name": "bpe-tokenizer", "min_frequency": 0, "vocab_size": 100, "data_dir": "/media/atem/Data/HSE_videos/4_DLA/hw_1_ASR/data/text_datasets", "dataset_name": "librispeech-lm-norm"}