| from transformers import PreTrainedTokenizerFast |
| from tokenizers import Tokenizer, normalizers, pre_tokenizers, trainers, models |
| from tokenizers.normalizers import Lowercase, NFD, StripAccents |
| from tokenizers.pre_tokenizers import Whitespace |
| from typing import Optional, List, Union |
|
|
class OctagonTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer using a trainable BPE model with BERT-style special tokens.

    Instances are normally created either from a saved tokenizer JSON file
    (``tokenizer_file=...``) or via the :meth:`train_tokenizer` class method,
    which trains a fresh BPE vocabulary from raw text.
    """

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # NOTE(review): vocab_file/merges_file are accepted for API
        # compatibility but are NOT forwarded to the base class — only
        # tokenizer_file (or a tokenizer_object passed through **kwargs)
        # is actually used. Confirm whether slow-tokenizer loading from
        # vocab/merges files should be supported.
        super().__init__(
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

    @classmethod
    def train_tokenizer(
        cls,
        texts: List[str],
        vocab_size: int = 30522,
        save_path: Optional[str] = None,
    ) -> "OctagonTokenizer":
        """Train a BPE tokenizer on ``texts`` and return a ready instance.

        Args:
            texts: Training corpus, one string per sample.
            vocab_size: Target vocabulary size (the 5 special tokens count
                toward this total).
            save_path: If given, the trained tokenizer JSON is written here
                and the returned instance is loaded from that file.

        Returns:
            A new :class:`OctagonTokenizer` wrapping the trained model.
        """
        # BUGFIX: the BPE model must be told its unk token, otherwise
        # encoding out-of-vocabulary input after training raises an error
        # instead of producing [UNK]. The trainer's special_tokens list
        # alone only reserves the vocab entry; it does not wire it into
        # the model.
        tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

        # Unicode-decompose, lowercase, then strip accent marks so that
        # visually-equivalent text normalizes to the same token stream.
        tokenizer.normalizer = normalizers.Sequence(
            [NFD(), Lowercase(), StripAccents()]
        )
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            # Reserve the special tokens first so they receive stable,
            # low token ids.
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        )
        tokenizer.train_from_iterator(texts, trainer=trainer)

        if save_path:
            tokenizer.save(save_path)
            return cls(tokenizer_file=save_path)
        return cls(tokenizer_object=tokenizer)