| """ |
| Chimera 5.1 — Splintr (Rust) Tokenizer Wrapper — o200k_base (OpenAI o1/o3) |
| Wraps splintr's high-performance Rust tokenizer for transformers-compatible API. |
| Vocab: o200k_base (200,073 tokens) — OpenAI's o1/o3 tokenizer. |
| |
| Optimizations: |
| - __slots__ for reduced memory footprint |
| - Cached special token set for fast skip_special_tokens filtering |
| - Batch encode uses list comprehension (minimizes Python overhead) |
| """ |
|
|
| import torch |
| from typing import List, Union, Optional |
|
|
| try: |
| from splintr import Tokenizer as _SplintrTokenizer, O200K_AGENT_TOKENS |
| HAS_SPLINTR = True |
| except ImportError: |
| HAS_SPLINTR = False |
|
|
| __all__ = ["ChimeraTokenizer"] |
|
|
|
|
class ChimeraTokenizer:
    """
    High-performance Rust-backed tokenizer (splintr) with a HuggingFace-like
    interface (``encode`` / ``decode`` / ``__call__`` / chat templates).

    When splintr is not installed, falls back to a trivial byte-level codec:
    each UTF-8 byte ``b`` encodes to token id ``b + 3`` (ids 0-2 reserved),
    keeping encode/decode round-trippable without the Rust backend.
    """

    # Advertised in the module docstring: drop the per-instance __dict__
    # to shrink memory footprint and speed attribute access.
    __slots__ = (
        "_tok", "vocab_size",
        "eos_token_id", "pad_token_id", "sep_token_id", "stop_token_id",
        "user_token_id", "assistant_token_id", "system_token_id",
        "endofprompt_token_id", "bos_token_id",
        "eos_token", "pad_token", "model_max_length",
        "_special_ids", "_byte_offset",
    )

    def __init__(self, pretrained: str = "o200k_base", vocab_size: int = 200073):
        """
        Load the splintr tokenizer, or configure the byte-level fallback.

        Args:
            pretrained: splintr pretrained vocab name (default ``o200k_base``).
            vocab_size: used only by the fallback path; the canonical o200k
                special-token ids are clamped into ``[0, vocab_size)``.
        """
        if HAS_SPLINTR:
            self._tok = _SplintrTokenizer.from_pretrained(pretrained)
            self.vocab_size = self._tok.vocab_size
            self.eos_token_id = 199999
            self.pad_token_id = O200K_AGENT_TOKENS.PAD
            self.sep_token_id = O200K_AGENT_TOKENS.SEP
            self.stop_token_id = O200K_AGENT_TOKENS.STOP
            self.user_token_id = O200K_AGENT_TOKENS.USER
            self.assistant_token_id = O200K_AGENT_TOKENS.ASSISTANT
            self.system_token_id = 200019
            self.endofprompt_token_id = 200018
        else:
            self._tok = None
            self.vocab_size = int(vocab_size)
            top = self.vocab_size - 1
            # Clamp the canonical o200k special ids into a possibly smaller
            # fallback vocab so they are always valid token ids.
            self.eos_token_id = min(top, 199999)
            self.pad_token_id = min(top, 200058)
            self.sep_token_id = min(top, 200060)
            self.stop_token_id = min(top, 200059)
            self.user_token_id = min(top, 200020)
            self.assistant_token_id = min(top, 200021)
            self.system_token_id = min(top, 200019)
            self.endofprompt_token_id = min(top, 200018)
            self._byte_offset = 3  # byte b -> token id b + 3
        # o200k defines no distinct BOS; mirror EOS (matches original code).
        self.bos_token_id = self.eos_token_id
        self.eos_token = "<|endoftext|>"
        self.pad_token = "<|pad|>"
        self.model_max_length = 4194304
        # Cached frozenset for O(1) membership tests in decode().
        self._special_ids = frozenset({
            self.eos_token_id, self.pad_token_id, self.sep_token_id,
            self.stop_token_id, self.user_token_id,
            self.assistant_token_id, self.system_token_id,
            self.endofprompt_token_id,
        })

    def __len__(self) -> int:
        """Vocabulary size, for ``len(tokenizer)`` parity with HF."""
        return self.vocab_size

    def encode(self, text: str, add_special_tokens: bool = True,
               max_length: Optional[int] = None) -> List[int]:
        """
        Encode ``text`` to a list of token ids.

        Args:
            text: input string.
            add_special_tokens: append EOS after the content tokens.
            max_length: hard cap on the returned length; note truncation may
                drop the appended EOS (HF-style truncation semantics).
        """
        if self._tok is None:
            ids = [self._byte_offset + b
                   for b in text.encode("utf-8", errors="replace")]
        else:
            ids = self._tok.encode(text)
        if add_special_tokens:
            ids = ids + [self.eos_token_id]
        if max_length is not None and len(ids) > max_length:
            ids = ids[:max_length]
        return ids

    def encode_batch(self, texts: List[str], add_special_tokens: bool = True,
                     max_length: Optional[int] = None,
                     padding: bool = False,
                     truncation: bool = False,
                     return_tensors: Optional[str] = None):
        """
        Encode many strings at once.

        Note: ``truncation`` is accepted only for HF API compatibility;
        truncation is driven by ``max_length`` alone, exactly as in
        ``encode`` (existing callers rely on this).

        Returns:
            A list of id lists, or ``{"input_ids": LongTensor}`` when
            ``return_tensors == "pt"``.

        Raises:
            ValueError: if ``return_tensors="pt"`` is requested for ragged
                (unequal-length) sequences without ``padding=True``.
        """
        all_ids = [self.encode(t, add_special_tokens=add_special_tokens,
                               max_length=max_length)
                   for t in texts]
        if padding and all_ids:  # guard: max() on an empty batch would raise
            max_len = max(len(ids) for ids in all_ids)
            all_ids = [ids + [self.pad_token_id] * (max_len - len(ids))
                       for ids in all_ids]
        if return_tensors == "pt":
            # torch.tensor on ragged lists fails with a cryptic message;
            # fail early with an actionable one instead.
            if len({len(ids) for ids in all_ids}) > 1:
                raise ValueError(
                    "return_tensors='pt' requires equal-length sequences; "
                    "pass padding=True")
            return {"input_ids": torch.tensor(all_ids, dtype=torch.long)}
        return all_ids

    def decode(self, token_ids, skip_special_tokens: bool = True) -> str:
        """Decode a list or 1-D tensor of token ids back to text."""
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()
        if skip_special_tokens:
            token_ids = [t for t in token_ids if t not in self._special_ids]
        if self._tok is None:
            # Byte fallback: undo the +offset shift, clamping into byte range.
            data = bytes(max(0, min(255, int(t) - self._byte_offset))
                         for t in token_ids if int(t) >= self._byte_offset)
            return data.decode("utf-8", errors="replace")
        return self._tok.decode(token_ids)

    def decode_batch(self, token_ids_list, skip_special_tokens: bool = True) -> List[str]:
        """Decode a batch of id sequences; see ``decode``."""
        return [self.decode(ids, skip_special_tokens=skip_special_tokens)
                for ids in token_ids_list]

    def __call__(self, text, **kwargs) -> dict:
        """
        HF-style entry point. Accepts a string or list of strings and always
        returns ``{"input_ids": LongTensor}`` regardless of
        ``return_tensors`` (kept for backward compatibility).

        Raises:
            ValueError: if the batch is ragged and ``padding`` is not set.
        """
        return_tensors = kwargs.get("return_tensors", "pt")
        padding = kwargs.get("padding", False)
        max_length = kwargs.get("max_length", None)
        add_special_tokens = kwargs.get("add_special_tokens", True)
        if isinstance(text, str):
            text = [text]
        result = self.encode_batch(
            text, add_special_tokens=add_special_tokens,
            max_length=max_length, padding=padding,
            return_tensors=return_tensors
        )
        if isinstance(result, list):
            # Caller asked for non-"pt" tensors; this wrapper still emits
            # torch tensors, so guard against ragged input here too.
            if len({len(ids) for ids in result}) > 1:
                raise ValueError(
                    "ragged batch cannot be tensorized; pass padding=True")
            return {"input_ids": torch.tensor(result, dtype=torch.long)}
        return result

    def get_vocab(self) -> dict:
        """
        Partial id -> token map covering only the known special tokens.

        The full o200k vocab lives in the Rust backend; this is a
        lightweight view for debugging/inspection.
        """
        return {
            self.eos_token_id: self.eos_token,
            self.pad_token_id: self.pad_token,
            self.user_token_id: "<|user|>",
            self.assistant_token_id: "<|assistant|>",
            self.system_token_id: "<|system|>",
        }

    def apply_chat_template(self, messages: List[dict],
                            add_generation_prompt: bool = False) -> str:
        """
        Render chat messages into the o200k-style prompt format.

        Messages whose role is not system/user/assistant are silently
        dropped (matches original behavior).
        """
        parts = []
        for msg in messages:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            if role == "system":
                parts.append(f"<|system|>\n{content}\n<|endofprompt|>")
            elif role == "user":
                parts.append(f"<|user|>\n{content}\n<|endofprompt|>")
            elif role == "assistant":
                parts.append(f"<|assistant|>\n{content}\n<|endofprompt|>")
        text = "\n".join(parts)
        if add_generation_prompt:
            text += "\n<|assistant|>\n"
        return text
|
|