aksara-tokenizer-v3 / tokenizer_config.json
Ezekiel999's picture
Upload tokenizer_config.json with huggingface_hub
faf515e verified
{
"version": "3.0.0",
"vocab_size": 32773,
"pre_tokenizer": "Whitespace",
"normalizer": "NFKC (NO lowercase)",
"model": "BPE",
"brand_tokens": [
"AksaraLLM",
"aksarallm",
"AKSARALLM",
"Indonesia",
"indonesia",
"INDONESIA",
"Pancasila",
"pancasila",
"Nusantara",
"nusantara"
],
"special_tokens": {
"[PAD]": 0,
"[EOS]": 1,
"[BOS]": 2,
"[UNK]": 3,
"[SEP]": 4,
"[MASK]": 5,
"[INST]": 6,
"[/INST]": 7,
"[SYS]": 8,
"[USER]": 9,
"[ASST]": 10,
"[TURN]": 11,
"[LANG_ID]": 12,
"[LANG_JV]": 13,
"[LANG_SU]": 14,
"[LANG_EN]": 15
}
}