| import os |
| from tokenizers import Tokenizer |
| from tokenizers.models import BPE |
| from tokenizers.trainers import BpeTrainer |
| from tokenizers.pre_tokenizers import Whitespace, ByteLevel |
| from tokenizers.processors import TemplateProcessing |
|
|
def train_sovereign_tokenizer(corpus_path, vocab_size=50257):
    """
    Train a custom BPE tokenizer optimized for Indic and Ecological technical terms.

    Args:
        corpus_path: Directory containing ``.txt`` training files.
        vocab_size: Target vocabulary size. Default 50,257 to match
            model_config.yaml.

    Raises:
        FileNotFoundError: If ``corpus_path`` contains no ``.txt`` files.

    Side effects:
        Writes the trained tokenizer to
        ``data/processed/aravalli_tokenizer.json`` (directory created if
        missing) and prints progress messages.
    """
    tokenizer = Tokenizer(BPE(unk_token="<|unk|>"))

    # Byte-level pre-tokenization covers arbitrary Unicode input (Indic
    # scripts included) without needing an explicit character vocabulary.
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[
            "<|endoftext|>",
            "<|unk|>",
            "<|pad|>",
            "CATEGORY_SN",
            "CATEGORY_NE",
            "CATEGORY_IPN",
        ],
        show_progress=True,
        # Seed the vocab with the full byte-level alphabet so every input
        # byte is representable.
        initial_alphabet=ByteLevel.alphabet(),
    )

    files = [
        os.path.join(corpus_path, f)
        for f in os.listdir(corpus_path)
        if f.endswith(".txt")
    ]
    # Fail fast with a clear message instead of training on an empty corpus.
    if not files:
        raise FileNotFoundError(f"No .txt training files found in {corpus_path}")

    print(f"Commencing Tokenizer Training on {corpus_path}...")
    tokenizer.train(files, trainer)

    # Append <|endoftext|> to every encoded sequence. Look up the real id
    # rather than hard-coding 0, which would silently break if the
    # special-token ordering ever changed.
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    tokenizer.post_processor = TemplateProcessing(
        single="$A <|endoftext|>",
        special_tokens=[("<|endoftext|>", eot_id)],
    )

    # Tokenizer.save does not create missing directories — ensure it exists.
    os.makedirs("data/processed", exist_ok=True)
    tokenizer.save("data/processed/aravalli_tokenizer.json")
    print("Sovereign Tokenizer Enacted and Saved to data/processed/")
|
|
if __name__ == "__main__":
    # exist_ok avoids the check-then-create race of the
    # os.path.exists()/os.makedirs() pattern.
    os.makedirs("data/raw/", exist_ok=True)
    train_sovereign_tokenizer("data/raw/")
|
|