AksaraLLM
/

aksara-tokenizer-v3

Ezekiel999 committed 25 days ago

Commit

faf515e

verified ·

1 Parent(s): 859fcfd

Upload tokenizer_config.json with huggingface_hub

Files changed (1) hide show

tokenizer_config.json ADDED Viewed

+{
+  "version": "3.0.0",
+  "vocab_size": 32773,
+  "pre_tokenizer": "Whitespace",
+  "normalizer": "NFKC (NO lowercase)",
+  "model": "BPE",
+  "brand_tokens": [
+    "AksaraLLM",
+    "aksarallm",
+    "AKSARALLM",
+    "Indonesia",
+    "indonesia",
+    "INDONESIA",
+    "Pancasila",
+    "pancasila",
+    "Nusantara",
+    "nusantara"
+  ],
+  "special_tokens": {
+    "[PAD]": 0,
+    "[EOS]": 1,
+    "[BOS]": 2,
+    "[UNK]": 3,
+    "[SEP]": 4,
+    "[MASK]": 5,
+    "[INST]": 6,
+    "[/INST]": 7,
+    "[SYS]": 8,
+    "[USER]": 9,
+    "[ASST]": 10,
+    "[TURN]": 11,
+    "[LANG_ID]": 12,
+    "[LANG_JV]": 13,
+    "[LANG_SU]": 14,
+    "[LANG_EN]": 15
+  }
+}