gijl committed on
Commit
e85a44b
·
verified ·
1 Parent(s): 46bb8ea

Create tokenizer_config.json

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +9 -0
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "name": "Arabic Character Tokenizer",
4
+ "vocab_size": 115
5
+ },
6
+ "stoi": {
7
+ " ": 0, "!": 1, "(": 2, ")": 3, "*": 4, "+": 5, "-": 6, ".": 7, "/": 8, "0": 9, "1": 10, "2": 11, "3": 12, "4": 13, "5": 14, "6": 15, "7": 16, "8": 17, "9": 18, "=": 19, "A": 20, "B": 21, "C": 22, "D": 23, "E": 24, "F": 25, "G": 26, "H": 27, "I": 28, "J": 29, "K": 30, "L": 31, "M": 32, "N": 33, "O": 34, "P": 35, "Q": 36, "R": 37, "S": 38, "T": 39, "U": 40, "V": 41, "W": 42, "X": 43, "Y": 44, "Z": 45, "[": 46, "]": 47, "a": 48, "b": 49, "c": 50, "d": 51, "e": 52, "f": 53, "g": 54, "h": 55, "i": 56, "j": 57, "k": 58, "l": 59, "m": 60, "n": 61, "o": 62, "p": 63, "q": 64, "r": 65, "s": 66, "t": 67, "u": 68, "v": 69, "w": 70, "x": 71, "y": 72, "z": 73, "{": 74, "}": 75, "،": 76, "؟": 77, "ء": 78, "آ": 79, "أ": 80, "ؤ": 81, "إ": 82, "ئ": 83, "ب": 84, "ة": 85, "ت": 86, "ث": 87, "ج": 88, "ح": 89, "خ": 90, "د": 91, "ذ": 92, "ر": 93, "ز": 94, "س": 95, "ش": 96, "ص": 97, "ض": 98, "ط": 99, "ظ": 100, "ع": 101, "غ": 102, "ـ": 103, "ف": 104, "ق": 105, "ك": 106, "ل": 107, "م": 108, "ن": 109, "ه": 110, "و": 111, "ي": 112, "<|endoftext|>": 113, "<|pad|>": 114
8
+ }
9
+ }