| """
|
| HuggingFace-compatible tokenizer wrapper for tiktoken cl100k_base.
|
|
|
| Wraps tiktoken so it works with HF's generate(), lm-evaluation-harness,
|
| and the Hub (tokenizer.json / tokenizer_config.json).
|
|
|
| Usage:
|
| from hf_tokenizer import SentinelBrainTokenizer
|
| tok = SentinelBrainTokenizer()
|
| ids = tok("Hello world", return_tensors="pt")
|
| """

import json
import os
from typing import Optional, List, Dict, Union

import tiktoken
from transformers import PreTrainedTokenizer


class SentinelBrainTokenizer(PreTrainedTokenizer):
    """HuggingFace PreTrainedTokenizer wrapping tiktoken cl100k_base."""

    vocab_files_names = {"vocab_file": "tiktoken_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        eos_token: str = "<|endoftext|>",
        pad_token: str = "<|endoftext|>",
        model_max_length: int = 1024,
        **kwargs,
    ):
        # The encoding must be available before super().__init__(), since the
        # base class may call get_vocab()/convert_tokens_to_ids while it
        # registers the special tokens.
        self._enc = tiktoken.get_encoding("cl100k_base")
        self._vocab_size = self._enc.n_vocab

        self._special_tokens = {
            "<|endoftext|>": self._enc.eot_token,
        }

        super().__init__(
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Return vocab dict. tiktoken doesn't expose the full vocab easily,
        so we return a partial mapping for the special tokens."""
        vocab = {}
        for tok, idx in self._special_tokens.items():
            vocab[tok] = idx
        return vocab

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize into string tokens (HF convention).
        We return token IDs as decimal strings, since tiktoken's native
        tokens are byte sequences rather than printable strings."""
        token_ids = self._enc.encode(text, allowed_special={"<|endoftext|>"})
        return [str(tid) for tid in token_ids]
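    # Illustration of the round-trip convention used by the methods below
    # (the token IDs shown are indicative, not verified against cl100k_base):
    #   _tokenize("Hello world")                     -> ["9906", "1917"]
    #   _convert_token_to_id("9906")                 -> 9906
    #   convert_tokens_to_string(["9906", "1917"])   -> "Hello world"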

    def _convert_token_to_id(self, token: str) -> int:
        """Convert string token → ID."""
        if token in self._special_tokens:
            return self._special_tokens[token]
        try:
            return int(token)
        except ValueError:
            return self._enc.eot_token

    def _convert_id_to_token(self, index: int) -> str:
        """Convert ID → string token."""
        try:
            return self._enc.decode([index])
        except Exception:
            return "<|unk|>"

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert token strings back to text."""
        ids = []
        for t in tokens:
            try:
                ids.append(int(t))
            except ValueError:
                if t in self._special_tokens:
                    ids.append(self._special_tokens[t])
        try:
            return self._enc.decode(ids)
        except Exception:
            return ""

    def encode(self, text: Union[str, List[str]], add_special_tokens: bool = True,
               **kwargs) -> Union[List[int], List[List[int]]]:
        """Fast-path encode using tiktoken directly."""
        if isinstance(text, str):
            ids = self._enc.encode(text, allowed_special={"<|endoftext|>"})
            return ids
        return [self._enc.encode(t, allowed_special={"<|endoftext|>"}) for t in text]

    def decode(self, token_ids: Union[List[int], int], skip_special_tokens: bool = False,
               **kwargs) -> str:
        """Fast-path decode using tiktoken directly."""
        if hasattr(token_ids, "tolist"):
            # Accept torch/numpy tensors coming out of generate().
            token_ids = token_ids.tolist()
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [t for t in token_ids if t != self._enc.eot_token]
        try:
            return self._enc.decode(token_ids)
        except Exception:
            return ""

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Save a minimal vocab file so from_pretrained works."""
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, prefix + "tiktoken_vocab.json")
        vocab_data = {
            "encoding": "cl100k_base",
            "vocab_size": self._vocab_size,
            "eos_token_id": self._enc.eot_token,
            "special_tokens": self._special_tokens,
        }
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(vocab_data, f, indent=2)
        return (vocab_file,)
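    # For reference, the JSON written above looks roughly like this (values
    # taken from tiktoken's cl100k_base; treat them as indicative):
    #   {"encoding": "cl100k_base", "vocab_size": 100277,
    #    "eos_token_id": 100257, "special_tokens": {"<|endoftext|>": 100257}}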

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load from a directory. Falls back to creating a fresh tokenizer."""
        try:
            return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        except Exception:
            return cls(**kwargs)
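

if __name__ == "__main__":
    # Minimal smoke test (illustrative only; assumes tiktoken and transformers
    # are installed, and avoids torch by not requesting tensors).
    import tempfile

    tok = SentinelBrainTokenizer()

    text = "Hello world<|endoftext|>"
    ids = tok.encode(text)
    assert tok.decode(ids) == text
    assert tok.decode(ids, skip_special_tokens=True) == "Hello world"

    batch = tok("Hello world")
    print(batch["input_ids"], batch["attention_mask"])

    with tempfile.TemporaryDirectory() as d:
        tok.save_pretrained(d)
        reloaded = SentinelBrainTokenizer.from_pretrained(d)
        assert reloaded.vocab_size == tok.vocab_size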