# sentinel-prime-350m / hf_tokenizer.py
"""
HuggingFace-compatible tokenizer wrapper for tiktoken cl100k_base.
Wraps tiktoken so it works with HF's generate(), lm-evaluation-harness,
and the Hub (tokenizer.json / tokenizer_config.json).
Usage:
from hf_tokenizer import SentinelBrainTokenizer
tok = SentinelBrainTokenizer()
ids = tok("Hello world", return_tensors="pt")
"""
import json
import os
from typing import Optional, List, Dict, Union
import tiktoken
from transformers import PreTrainedTokenizer


class SentinelBrainTokenizer(PreTrainedTokenizer):
    """HuggingFace PreTrainedTokenizer wrapping tiktoken's cl100k_base."""

    vocab_files_names = {"vocab_file": "tiktoken_vocab.json"}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        eos_token: str = "<|endoftext|>",
        pad_token: str = "<|endoftext|>",
        model_max_length: int = 1024,
        **kwargs,
    ):
        # Set up tiktoken before calling super().__init__, which may invoke
        # tokenizer methods (e.g. _convert_token_to_id) during initialization.
        # vocab_file is accepted for from_pretrained compatibility; the
        # encoding itself is always rebuilt from tiktoken.
        self._enc = tiktoken.get_encoding("cl100k_base")
        self._vocab_size = self._enc.n_vocab  # 100277 for cl100k_base

        # Token-to-id mapping for special tokens
        self._special_tokens = {
            "<|endoftext|>": self._enc.eot_token,  # 100257
        }

        super().__init__(
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Return a vocab dict. tiktoken's vocab is byte-level and is not
        exposed as a string-to-id mapping, so we return only the
        special-token entries."""
        return dict(self._special_tokens)
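
    # If a full string vocab were ever needed, it could (hypothetically) be
    # materialized via tiktoken's public decode_single_token_bytes(), e.g.
    #   {self._enc.decode_single_token_bytes(i).decode("utf-8", "replace"): i
    #    for i in range(self._enc.n_vocab)}
    # wrapped in a try/except, since some ids in cl100k_base (roughly the
    # 100261-100275 range) are unassigned and raise on lookup.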

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize into string tokens (HF convention). tiktoken operates on
        bytes, so we represent each token by its integer id as a string."""
        token_ids = self._enc.encode(text, allowed_special={"<|endoftext|>"})
        return [str(tid) for tid in token_ids]

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a string token to its id."""
        if token in self._special_tokens:
            return self._special_tokens[token]
        try:
            return int(token)
        except ValueError:
            return self._enc.eot_token  # fallback for unrecognized tokens

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an id to its decoded string form. Ids that map to partial
        UTF-8 sequences decode with replacement characters."""
        try:
            return self._enc.decode([index])
        except Exception:
            return "<|unk|>"

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Convert token strings (stringified ids or special tokens) back to text."""
        ids = []
        for t in tokens:
            try:
                ids.append(int(t))
            except ValueError:
                if t in self._special_tokens:
                    ids.append(self._special_tokens[t])
        try:
            return self._enc.decode(ids)
        except Exception:
            return ""

    def encode(self, text: Union[str, List[str]], add_special_tokens: bool = True,
               **kwargs) -> Union[List[int], List[List[int]]]:
        """Fast-path encode using tiktoken directly. add_special_tokens is
        accepted for API compatibility; no BOS/EOS tokens are added."""
        if isinstance(text, str):
            return self._enc.encode(text, allowed_special={"<|endoftext|>"})
        return [self._enc.encode(t, allowed_special={"<|endoftext|>"}) for t in text]

    def decode(self, token_ids: Union[List[int], int], skip_special_tokens: bool = False,
               **kwargs) -> str:
        """Fast-path decode using tiktoken directly."""
        # generate() may hand us a torch/numpy array; convert to a plain list.
        if hasattr(token_ids, "tolist"):
            token_ids = token_ids.tolist()
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [t for t in token_ids if t != self._enc.eot_token]
        try:
            return self._enc.decode(token_ids)
        except Exception:
            return ""

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
        """Save a minimal vocab file so from_pretrained works."""
        os.makedirs(save_directory, exist_ok=True)
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, prefix + "tiktoken_vocab.json")
        vocab_data = {
            "encoding": "cl100k_base",
            "vocab_size": self._vocab_size,
            "eos_token_id": self._enc.eot_token,
            "special_tokens": self._special_tokens,
        }
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(vocab_data, f, indent=2)
        return (vocab_file,)
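
    # The saved file is informational only; for cl100k_base it looks like:
    #   {
    #     "encoding": "cl100k_base",
    #     "vocab_size": 100277,
    #     "eos_token_id": 100257,
    #     "special_tokens": {"<|endoftext|>": 100257}
    #   }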

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load from a directory, falling back to a fresh tokenizer if the
        standard HF loading path fails (the encoding always comes from
        tiktoken, so nothing essential is lost)."""
        try:
            return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
        except Exception:
            return cls(**kwargs)
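

if __name__ == "__main__":
    # Minimal round-trip smoke test (an illustrative sketch, not part of the
    # published API; assumes tiktoken and transformers are installed).
    tok = SentinelBrainTokenizer()
    ids = tok.encode("Hello world")
    assert tok.decode(ids) == "Hello world"
    # The HF __call__ path goes through _tokenize/_convert_token_to_id rather
    # than the fast-path encode above; both should yield the same ids.
    assert tok("Hello world")["input_ids"] == ids
    print("round-trip OK:", ids)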