braza-embedding-ptbr-v1 / tokenizer_config.json
calneymgp's picture
feat: braza-embedding-ptbr-v1 — 474K B2B PT-BR, 36 autoresearch iters, ASSIN2 0.8082, SICK-BR 0.8513
bb33fec verified
{
"backend": "tokenizers",
"bos_token": "<|startoftext|>",
"clean_up_tokenization_spaces": false,
"cls_token": "<|startoftext|>",
"eos_token": "<|return|>",
"is_local": true,
"local_files_only": false,
"mask_token": "[MASK]",
"max_length": 32768,
"model_input_names": [
"input_ids",
"attention_mask"
],
"model_max_length": 128,
"pad_to_multiple_of": null,
"pad_token": "<|endoftext|>",
"pad_token_type_id": 0,
"padding_side": "right",
"sep_token": "<|return|>",
"stride": 0,
"tokenizer_class": "TokenizersBackend",
"truncation_side": "right",
"truncation_strategy": "longest_first"
}