{
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [],
  "normalizer": {
    "type": "Sequence",
    "normalizers": [
      {
        "type": "NFKD"
      },
      {
        "type": "Replace",
        "pattern": {
          "Regex": "\\s+"
        },
        "content": " "
      },
      {
        "type": "Replace",
        "pattern": {
          "Regex": "[–—]"
        },
        "content": "-"
      },
      {
        "type": "Replace",
        "pattern": {
          "Regex": "[^a-zA-Z0-9\\s\\-!\"$%()*+,.\\/:;?@_ÀÁÂÉÈÊËÌÍÎÏÒÓÔÙÚÛŸŃŊŋƆɔƐɛʉǎǐǒǔḿẅ’ʼ£̀́̂̌]"
        },
        "content": ""
      }
    ]
  },
  "pre_tokenizer": {
    "type": "FixedLength",
    "length": 1
  },
  "post_processor": null,
  "decoder": {
    "type": "Fuse"
  },
  "model": {
    "type": "WordLevel",
    "vocab": {
      " ": 0,
      "!": 1,
      "\"": 2,
      "$": 3,
      "%": 4,
      "&": 5,
      "'": 6,
      "(": 7,
      ")": 8,
      "*": 9,
      "+": 10,
      ",": 11,
      "-": 12,
      ".": 13,
      "/": 14,
      "0": 15,
      "1": 16,
      "2": 17,
      "3": 18,
      "4": 19,
      "5": 20,
      "6": 21,
      "7": 22,
      "8": 23,
      "9": 24,
      ":": 25,
      ";": 26,
      "?": 27,
      "@": 28,
      "A": 29,
      "B": 30,
      "C": 31,
      "D": 32,
      "E": 33,
      "F": 34,
      "G": 35,
      "H": 36,
      "I": 37,
      "J": 38,
      "K": 39,
      "L": 40,
      "M": 41,
      "N": 42,
      "O": 43,
      "P": 44,
      "Q": 45,
      "R": 46,
      "S": 47,
      "T": 48,
      "U": 49,
      "V": 50,
      "W": 51,
      "X": 52,
      "Y": 53,
      "Z": 54,
      "a": 55,
      "b": 56,
      "c": 57,
      "d": 58,
      "e": 59,
      "f": 60,
      "g": 61,
      "h": 62,
      "i": 63,
      "j": 64,
      "k": 65,
      "l": 66,
      "m": 67,
      "n": 68,
      "o": 69,
      "p": 70,
      "q": 71,
      "r": 72,
      "s": 73,
      "t": 74,
      "u": 75,
      "v": 76,
      "w": 77,
      "x": 78,
      "y": 79,
      "z": 80,
      "À": 81,
      "Á": 82,
      "Â": 83,
      "É": 84,
      "È": 85,
      "Ê": 86,
      "Ë": 87,
      "Ì": 88,
      "Í": 89,
      "Î": 90,
      "Ï": 91,
      "Ò": 92,
      "Ó": 93,
      "Ô": 94,
      "Ù": 95,
      "Ú": 96,
      "Û": 97,
      "Ÿ": 98,
      "Ń": 99,
      "Ŋ": 100,
      "ŋ": 101,
      "Ɔ": 102,
      "ɔ": 103,
      "Ɛ": 104,
      "ɛ": 105,
      "ʉ": 106,
      "ǎ": 107,
      "ǐ": 108,
      "ǒ": 109,
      "ǔ": 110,
      "ḿ": 111,
      "ẅ": 112,
      "’": 113,
      "ʼ": 114,
      "£": 115,
      "_": 116,
      "̀": 117,
      "́": 118,
      "̂": 119,
      "̌": 120,
      "�": 121
    },
    "unk_token": "�"
  }
}