everythingchalna committed on
Commit
35547c6
·
verified ·
1 Parent(s): c44f29d

Upload folder using huggingface_hub

Browse files
tokenizer_v3/tokenizer.json ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|bos|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<|cond|>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<|sep|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<|eos|>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "[UNK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "Sequence",
54
+ "normalizers": [
55
+ {
56
+ "type": "Replace",
57
+ "pattern": {
58
+ "Regex": "="
59
+ },
60
+ "content": " = "
61
+ },
62
+ {
63
+ "type": "Replace",
64
+ "pattern": {
65
+ "Regex": "(?<=[A-Za-z0-9])(?=[A-Z])"
66
+ },
67
+ "content": " "
68
+ }
69
+ ]
70
+ },
71
+ "pre_tokenizer": {
72
+ "type": "Sequence",
73
+ "pretokenizers": [
74
+ {
75
+ "type": "Split",
76
+ "pattern": {
77
+ "String": "\n"
78
+ },
79
+ "behavior": "Isolated",
80
+ "invert": false
81
+ },
82
+ {
83
+ "type": "Split",
84
+ "pattern": {
85
+ "String": " "
86
+ },
87
+ "behavior": "Removed",
88
+ "invert": false
89
+ },
90
+ {
91
+ "type": "Digits",
92
+ "individual_digits": false
93
+ },
94
+ {
95
+ "type": "Split",
96
+ "pattern": {
97
+ "Regex": "\\d{2}|\\d"
98
+ },
99
+ "behavior": "Isolated",
100
+ "invert": false
101
+ }
102
+ ]
103
+ },
104
+ "post_processor": null,
105
+ "decoder": {
106
+ "type": "Fuse"
107
+ },
108
+ "model": {
109
+ "type": "WordLevel",
110
+ "vocab": {
111
+ "[UNK]": 4,
112
+ ".": 5,
113
+ "\n": 6,
114
+ "00": 7,
115
+ "2": 8,
116
+ "0": 9,
117
+ "4": 10,
118
+ "3": 11,
119
+ "1": 12,
120
+ "10": 13,
121
+ "6": 14,
122
+ "5": 15,
123
+ "11": 16,
124
+ "12": 17,
125
+ "13": 18,
126
+ "14": 19,
127
+ "7": 20,
128
+ "15": 21,
129
+ "16": 22,
130
+ "17": 23,
131
+ "18": 24,
132
+ "8": 25,
133
+ "19": 26,
134
+ "20": 27,
135
+ "9": 28,
136
+ "21": 29,
137
+ "22": 30,
138
+ "23": 31,
139
+ "24": 32,
140
+ "25": 33,
141
+ "26": 34,
142
+ "90": 35,
143
+ "27": 36,
144
+ "28": 37,
145
+ "29": 38,
146
+ "H": 39,
147
+ "30": 40,
148
+ "31": 41,
149
+ "32": 42,
150
+ "33": 43,
151
+ "01": 44,
152
+ "34": 45,
153
+ "S": 46,
154
+ "02": 47,
155
+ "03": 48,
156
+ "35": 49,
157
+ "04": 50,
158
+ "36": 51,
159
+ "37": 52,
160
+ "05": 53,
161
+ "38": 54,
162
+ "06": 55,
163
+ "39": 56,
164
+ "09": 57,
165
+ "95": 58,
166
+ "08": 59,
167
+ "42": 60,
168
+ "07": 61,
169
+ "98": 62,
170
+ "40": 63,
171
+ "93": 64,
172
+ "56": 65,
173
+ "94": 66,
174
+ "71": 67,
175
+ "97": 68,
176
+ "85": 69,
177
+ "91": 70,
178
+ "84": 71,
179
+ "41": 72,
180
+ "60": 73,
181
+ "92": 74,
182
+ "46": 75,
183
+ "55": 76,
184
+ "86": 77,
185
+ "49": 78,
186
+ "53": 79,
187
+ "79": 80,
188
+ "51": 81,
189
+ "43": 82,
190
+ "88": 83,
191
+ "89": 84,
192
+ "48": 85,
193
+ "87": 86,
194
+ "45": 87,
195
+ "54": 88,
196
+ "78": 89,
197
+ "63": 90,
198
+ "66": 91,
199
+ "69": 92,
200
+ "99": 93,
201
+ "57": 94,
202
+ "82": 95,
203
+ "62": 96,
204
+ "96": 97,
205
+ "74": 98,
206
+ "83": 99,
207
+ "47": 100,
208
+ "58": 101,
209
+ "80": 102,
210
+ "65": 103,
211
+ "77": 104,
212
+ "73": 105,
213
+ "68": 106,
214
+ "81": 107,
215
+ "75": 108,
216
+ "59": 109,
217
+ "72": 110,
218
+ "64": 111,
219
+ "61": 112,
220
+ "76": 113,
221
+ "52": 114,
222
+ "70": 115,
223
+ "44": 116,
224
+ "50": 117,
225
+ "67": 118,
226
+ "=": 119,
227
+ "-": 120,
228
+ "Se": 121,
229
+ "Al": 122,
230
+ "C": 123,
231
+ "Te": 124,
232
+ "Si": 125,
233
+ "Ti": 126,
234
+ "P": 127,
235
+ "Ga": 128,
236
+ "N": 129,
237
+ "Pd": 130,
238
+ "O": 131,
239
+ "Cl": 132,
240
+ "Ca": 133,
241
+ "Hf": 134,
242
+ "As": 135,
243
+ "In": 136,
244
+ "Pt": 137,
245
+ "Ni": 138,
246
+ "Na": 139,
247
+ "Ge": 140,
248
+ "Zn": 141,
249
+ "Sn": 142,
250
+ "Cu": 143,
251
+ "Zr": 144,
252
+ "Rh": 145,
253
+ "Au": 146,
254
+ "Sb": 147,
255
+ "Ag": 148,
256
+ "V": 149,
257
+ "Y": 150,
258
+ "K": 151,
259
+ "Sc": 152,
260
+ "Ta": 153,
261
+ "Nb": 154,
262
+ "<|bos|>": 155,
263
+ "<|cond|>": 156,
264
+ "<|eos|>": 157,
265
+ "<|sep|>": 158,
266
+ "ads": 159,
267
+ "bin": 160,
268
+ "composition": 161,
269
+ "relax": 162,
270
+ "target_bin": 163,
271
+ "task": 164,
272
+ "Sr": 165,
273
+ "Mo": 166,
274
+ "Co": 167,
275
+ "Pb": 168,
276
+ "Hg": 169,
277
+ "Ru": 170,
278
+ "Ir": 171,
279
+ "Bi": 172,
280
+ "Mn": 173,
281
+ "Fe": 174,
282
+ "Tl": 175,
283
+ "Cd": 176,
284
+ "Cr": 177,
285
+ "Rb": 178,
286
+ "W": 179,
287
+ "Re": 180,
288
+ "Tc": 181,
289
+ "Cs": 182,
290
+ "Os": 183,
291
+ "B": 184
292
+ },
293
+ "unk_token": "[UNK]"
294
+ }
295
+ }
tokenizer_v3/tokenizer_stats.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 181,
3
+ "n_sampled": 10000,
4
+ "chars_per_token": {
5
+ "mean": 1.824003565805524,
6
+ "median": 1.8262004175365345
7
+ },
8
+ "token_length": {
9
+ "mean": 1083.7929,
10
+ "std": 435.14883937520733,
11
+ "min": 176,
12
+ "max": 3214,
13
+ "p50": 1020.0,
14
+ "p95": 1929.0,
15
+ "p99": 2462.0
16
+ },
17
+ "coverage_by_seq_len": {
18
+ "512": 4.98,
19
+ "768": 22.01,
20
+ "1024": 51.19,
21
+ "1536": 87.63,
22
+ "2048": 96.26,
23
+ "3072": 99.93,
24
+ "4096": 100.0
25
+ }
26
+ }