""" HuggingFace-compatible tokenizer wrapper for tiktoken cl100k_base. Wraps tiktoken so it works with HF's generate(), lm-evaluation-harness, and the Hub (tokenizer.json / tokenizer_config.json). Usage: from hf_tokenizer import SentinelBrainTokenizer tok = SentinelBrainTokenizer() ids = tok("Hello world", return_tensors="pt") """ import json import os from typing import Optional, List, Dict, Union import tiktoken from transformers import PreTrainedTokenizer class SentinelBrainTokenizer(PreTrainedTokenizer): """HuggingFace PreTrainedTokenizer wrapping tiktoken cl100k_base.""" vocab_files_names = {"vocab_file": "tiktoken_vocab.json"} model_input_names = ["input_ids", "attention_mask"] def __init__( self, vocab_file: Optional[str] = None, eos_token: str = "<|endoftext|>", pad_token: str = "<|endoftext|>", model_max_length: int = 1024, **kwargs, ): self._enc = tiktoken.get_encoding("cl100k_base") self._vocab_size = self._enc.n_vocab # 100277 # Build token-to-id mapping for special tokens self._special_tokens = { "<|endoftext|>": self._enc.eot_token, # 100257 } super().__init__( eos_token=eos_token, pad_token=pad_token, model_max_length=model_max_length, **kwargs, ) @property def vocab_size(self) -> int: return self._vocab_size def get_vocab(self) -> Dict[str, int]: """Return vocab dict. tiktoken doesn't expose full vocab easily, so we return a partial mapping for special tokens.""" vocab = {} # Add special tokens for tok, idx in self._special_tokens.items(): vocab[tok] = idx return vocab def _tokenize(self, text: str, **kwargs) -> List[str]: """Tokenize into string tokens (HF convention). We return token IDs as strings since tiktoken uses bytes.""" token_ids = self._enc.encode(text, allowed_special={"<|endoftext|>"}) return [str(tid) for tid in token_ids] def _convert_token_to_id(self, token: str) -> int: """Convert string token → ID.""" if token in self._special_tokens: return self._special_tokens[token] try: return int(token) except ValueError: return self._enc.eot_token # fallback def _convert_id_to_token(self, index: int) -> str: """Convert ID → string token.""" try: return self._enc.decode([index]) except Exception: return "<|unk|>" def convert_tokens_to_string(self, tokens: List[str]) -> str: """Convert token strings back to text.""" ids = [] for t in tokens: try: ids.append(int(t)) except ValueError: if t in self._special_tokens: ids.append(self._special_tokens[t]) try: return self._enc.decode(ids) except Exception: return "" def encode(self, text: Union[str, List[str]], add_special_tokens: bool = True, **kwargs) -> Union[List[int], List[List[int]]]: """Fast-path encode using tiktoken directly.""" if isinstance(text, str): ids = self._enc.encode(text, allowed_special={"<|endoftext|>"}) return ids return [self._enc.encode(t, allowed_special={"<|endoftext|>"}) for t in text] def decode(self, token_ids: Union[List[int], int], skip_special_tokens: bool = False, **kwargs) -> str: """Fast-path decode using tiktoken directly.""" if isinstance(token_ids, int): token_ids = [token_ids] if skip_special_tokens: token_ids = [t for t in token_ids if t != self._enc.eot_token] try: return self._enc.decode(token_ids) except Exception: return "" def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple: """Save a minimal vocab file so from_pretrained works.""" if not os.path.isdir(save_directory): os.makedirs(save_directory, exist_ok=True) prefix = filename_prefix + "-" if filename_prefix else "" vocab_file = os.path.join(save_directory, prefix + "tiktoken_vocab.json") vocab_data = { "encoding": "cl100k_base", "vocab_size": self._vocab_size, "eos_token_id": self._enc.eot_token, "special_tokens": self._special_tokens, } with open(vocab_file, "w", encoding="utf-8") as f: json.dump(vocab_data, f, indent=2) return (vocab_file,) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): """Load from directory. Falls back to creating fresh tokenizer.""" try: return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) except Exception: return cls(**kwargs)