import os
import shutil

import sentencepiece as spm
from transformers import AutoTokenizer, PreTrainedTokenizerFast
|
|
class TokenizerSetup:
    """Train, load, save, and exercise a tokenizer.

    Supports two back ends:
      * a custom SentencePiece model trained from a text corpus, or
      * a pretrained Hugging Face tokenizer (when ``hf_model`` is given).

    ``self.tokenizer`` holds either a ``spm.SentencePieceProcessor`` (custom
    branch) or a Hugging Face tokenizer; the other methods dispatch on which
    one was loaded.
    """

    def __init__(self, model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None):
        """Initialize tokenizer setup for custom or pretrained use.

        Args:
            model_path: Prefix for the SentencePiece ``.model``/``.vocab`` files.
            model_type: SentencePiece algorithm: bpe, unigram, char, or word.
            vocab_size: Target vocabulary size for training.
            hf_model: Optional Hugging Face model name/path; when set,
                ``load_tokenizer`` loads it instead of a SentencePiece model.
        """
        self.model_path = model_path
        self.model_type = model_type.lower()
        self.vocab_size = vocab_size
        self.hf_model = hf_model
        self.tokenizer = None

        # Fall back to BPE rather than failing later inside SentencePiece.
        valid_types = ["bpe", "unigram", "char", "word"]
        if self.model_type not in valid_types:
            print(f"⚠️ Invalid model_type '{self.model_type}'. Choose from {valid_types}")
            self.model_type = "bpe"

    def train_sentencepiece(self, input_file):
        """Train a SentencePiece tokenizer with specified settings.

        Args:
            input_file: Path to a plain-text corpus (one sentence per line).
        """
        if not os.path.exists(input_file):
            print(f"⚠️ Input file {input_file} not found! Provide a valid text corpus.")
            return

        try:
            # Use the keyword-argument API instead of a hand-built flag string:
            # it survives paths containing spaces and is type-checked by spm.
            # NOTE: the reserved control tokens must be renamed via *_piece
            # options; listing them in user_defined_symbols (as the previous
            # version did) makes SentencePiece abort with a "redefined symbol"
            # error, because <unk>/<pad>/<bos>/<eos> are already control pieces.
            spm.SentencePieceTrainer.Train(
                input=input_file,
                model_prefix=self.model_path,
                vocab_size=self.vocab_size,
                model_type=self.model_type,
                pad_id=0,
                unk_id=1,
                bos_id=2,
                eos_id=3,
                pad_piece="<pad>",
                unk_piece="<unk>",
                bos_piece="<bos>",
                eos_piece="<eos>",
            )
            print(f"✅ Trained SentencePiece tokenizer. Saved as {self.model_path}.model")
        except Exception as e:
            print(f"⚠️ Error training SentencePiece: {e}")

    def load_tokenizer(self):
        """Load either a SentencePiece or Hugging Face tokenizer."""
        try:
            if self.hf_model:
                self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
                print(f"✅ Loaded Hugging Face tokenizer from {self.hf_model}")
            else:
                sp_model = f"{self.model_path}.model"
                if not os.path.exists(sp_model):
                    print(f"⚠️ {sp_model} not found! Train it first.")
                    return

                # PreTrainedTokenizerFast(tokenizer_object=...) requires a
                # `tokenizers.Tokenizer`; passing a SentencePieceProcessor (as
                # the previous version did) raises at runtime.  Keep the raw
                # processor instead — tokenize_text/save_tokenizer dispatch on
                # the tokenizer type.
                self.tokenizer = spm.SentencePieceProcessor(model_file=sp_model)
                print(f"✅ Loaded SentencePiece tokenizer from {sp_model}")
        except Exception as e:
            print(f"⚠️ Error loading tokenizer: {e}")

    def save_tokenizer(self, save_dir="tokenizer/"):
        """Save tokenizer files to a directory.

        Args:
            save_dir: Destination directory; created if missing.
        """
        if self.tokenizer is None:
            print("⚠️ No tokenizer loaded to save!")
            return

        try:
            os.makedirs(save_dir, exist_ok=True)
            # Hugging Face tokenizers know how to serialize themselves; a raw
            # SentencePieceProcessor does not, so its files are copied below.
            if hasattr(self.tokenizer, "save_pretrained"):
                self.tokenizer.save_pretrained(save_dir)
            if not self.hf_model:
                for ext in (".model", ".vocab"):
                    src = f"{self.model_path}{ext}"
                    if os.path.exists(src):
                        # shutil.copy is portable and, unlike the previous
                        # os.system("cp ..."), immune to shell injection via paths.
                        shutil.copy(src, save_dir)
            print(f"✅ Tokenizer saved to {save_dir}")
        except Exception as e:
            print(f"⚠️ Error saving tokenizer: {e}")

    def tokenize_text(self, text, return_tensors=True):
        """Tokenize text and show both IDs and decoded output.

        Args:
            text: The string to tokenize.
            return_tensors: For Hugging Face tokenizers, return PyTorch
                tensors when True, plain Python lists when False.

        Returns:
            The tokenizer output (BatchEncoding or dict with "input_ids"),
            or None when no tokenizer is loaded / tokenization fails.
        """
        if self.tokenizer is None:
            print("⚠️ No tokenizer initialized! Load or train one first.")
            return None

        try:
            if isinstance(self.tokenizer, spm.SentencePieceProcessor):
                # Raw SentencePiece branch: encode/decode plain id lists.
                ids = self.tokenizer.encode(text, out_type=int)
                decoded = self.tokenizer.decode(ids)
                tokens = {"input_ids": ids}
            else:
                tokens = self.tokenizer(text, return_tensors="pt" if return_tensors else None)
                # "input_ids" holds the id sequence in both the tensor and the
                # plain-list case (the previous version decoded the whole
                # BatchEncoding when return_tensors was False, which fails).
                ids = tokens["input_ids"]
                decoded = self.tokenizer.decode(
                    ids[0] if return_tensors else ids, skip_special_tokens=True
                )
            print(f"🔹 Token IDs: {ids}")
            print(f"🔹 Decoded: {decoded}")
            return tokens
        except Exception as e:
            print(f"⚠️ Error tokenizing text: {e}")
            return None
|
|
if __name__ == "__main__":
    # Configure a fresh BPE setup with the default on-disk prefix.
    setup = TokenizerSetup(model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None)

    # Train only when no model file exists yet, so reruns reuse the old one.
    corpus_path = "../datasets/eclipse_corpuz_1.1.txt"
    if not os.path.exists(f"{setup.model_path}.model"):
        setup.train_sentencepiece(corpus_path)

    # Load whatever tokenizer is available, then persist it.
    setup.load_tokenizer()
    setup.save_tokenizer("../finetuned_charm15/")

    # Smoke-test the round trip on a sample sentence.
    setup.tokenize_text("Charm 15 is an AI model optimized for deep learning and security.")