""" SeqCond tokenizer — tiktoken cl100k_base with 4 additional special tokens. Special tokens (assigned in order after the base vocab): <|im_start|> — marks the start of a chat turn <|im_end|> — marks the end of a chat turn (also used as EOS) <|think_start|> — marks the start of chain-of-thought reasoning <|think_end|> — marks the end of chain-of-thought reasoning Chat template: <|im_start|>user {prompt} <|im_end|><|im_start|>assistant <|think_start|>{thinking}<|think_end|> {answer} <|im_end|> """ import os from typing import Dict, List, Optional, Tuple from transformers import PreTrainedTokenizer _SPECIAL_TOKENS = ["<|im_start|>", "<|im_end|>", "<|think_start|>", "<|think_end|>"] _SPECIAL_TOKEN_IDS = { "<|im_start|>": 100278, "<|im_end|>": 100279, "<|think_start|>": 100280, "<|think_end|>": 100281, "<|endoftext|>": 100282, "<|fim_prefix|>": 100283, "<|fim_middle|>": 100284, "<|fim_suffix|>": 100285, "<|endofprompt|>": 100286, } _BASE_VOCAB_SIZE = 100256 _VOCAB_SIZE = max(_SPECIAL_TOKEN_IDS.values()) + 1 def _build_tiktoken_enc(): """Build tiktoken encoding with SeqCond special tokens.""" try: import tiktoken except ImportError as e: raise ImportError("tiktoken is required: pip install tiktoken") from e base = tiktoken.get_encoding("cl100k_base") return tiktoken.Encoding( name="seqcond", pat_str=base._pat_str, mergeable_ranks=base._mergeable_ranks, special_tokens=_SPECIAL_TOKEN_IDS, ) class SeqCondTokenizer(PreTrainedTokenizer): """ Tokenizer for SeqCond models, backed by tiktoken cl100k_base. This is a slow tokenizer that wraps tiktoken. Tokens are represented internally as their stringified integer IDs (e.g. "42", "100256"). This avoids building a full vocab dict while remaining compatible with HuggingFace's PreTrainedTokenizer interface. Requires: pip install tiktoken """ vocab_files_names: Dict[str, str] = {} model_input_names = ["input_ids", "attention_mask"] def __init__( self, eos_token: str = "<|im_end|>", bos_token: Optional[str] = None, unk_token: Optional[str] = None, pad_token: str = "<|im_end|>", add_bos_token: bool = False, **kwargs, ): self._enc = _build_tiktoken_enc() self._id_to_special: Dict[int, str] = {idx: tok for tok, idx in _SPECIAL_TOKEN_IDS.items()} self._special_to_id: Dict[str, int] = {v: k for k, v in self._id_to_special.items()} # Register special tokens before calling super().__init__ kwargs.setdefault("additional_special_tokens", [t for t in _SPECIAL_TOKENS if t not in (eos_token, bos_token, unk_token, pad_token)]) super().__init__( eos_token=eos_token, bos_token=bos_token, unk_token=unk_token, pad_token=pad_token, add_bos_token=add_bos_token, **kwargs, ) @property def vocab_size(self) -> int: return _VOCAB_SIZE # ------------------------------------------------------------------ # Core token ↔ id mappings # ------------------------------------------------------------------ def _tokenize(self, text: str, **kwargs) -> List[str]: """Encode text into a list of token-id strings.""" ids = self._enc.encode(text, allowed_special="all") # Shift non-special BPE IDs by +1 to match convectors.Tiktokenize # offset used during training (ID 0 reserved). 
        shifted = [i if i in self._id_to_special else i + 1 for i in ids]
        return [str(i) for i in shifted]

    def _convert_token_to_id(self, token: str) -> int:
        """Convert a token string (or id-string) to an integer id."""
        if token in self._special_to_id:
            return self._special_to_id[token]
        try:
            return int(token)
        except ValueError:
            # ID 0 is reserved by the +1 shift, so it doubles as a fallback
            # for unknown strings.
            return 0

    def _convert_id_to_token(self, index: int) -> str:
        """Convert an integer id to its token string."""
        if index in self._id_to_special:
            return self._id_to_special[index]
        return str(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Decode a list of token strings back to text."""
        ids = []
        for t in tokens:
            if t in self._special_to_id:
                ids.append(self._special_to_id[t])
            else:
                try:
                    ids.append(int(t))
                except ValueError:
                    pass
        # Reverse the +1 BPE shift before decoding; skip invalid/ID-0 tokens.
        real_ids = []
        for i in ids:
            if i in self._id_to_special:
                real_ids.append(i)
            elif i >= 1:
                real_ids.append(i - 1)
        return self._enc.decode(real_ids)

    def get_vocab(self) -> Dict[str, int]:
        """
        Return a vocab dict. Special tokens appear under their names;
        regular BPE tokens appear under their id-string representation.
        (Building a full 100k-entry reverse BPE map is expensive and
        rarely needed.)
        """
        vocab = {str(i): i for i in range(self.vocab_size)}
        for tok, idx in self._special_to_id.items():
            vocab[tok] = idx
        return vocab

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str, ...]:
        """
        No vocabulary file is needed — the tiktoken encoding is fetched
        from the tiktoken package at runtime. Returns an empty tuple.
        """
        return ()

    # ------------------------------------------------------------------
    # Convenience helpers
    # ------------------------------------------------------------------

    @property
    def im_start_id(self) -> int:
        return self._special_to_id["<|im_start|>"]

    @property
    def im_end_id(self) -> int:
        return self._special_to_id["<|im_end|>"]

    @property
    def think_start_id(self) -> int:
        return self._special_to_id["<|think_start|>"]

    @property
    def think_end_id(self) -> int:
        return self._special_to_id["<|think_end|>"]

    def encode_chat(self, prompt: str, add_think_start: bool = True) -> List[int]:
        """
        Format and encode a user prompt using the standard chat template.

        Args:
            prompt: The user's message (plain text).
            add_think_start: If True (default), append <|think_start|> so the
                model begins generating its chain-of-thought immediately.

        Returns:
            List of token ids (prompt already encoded, ready for prefill).
        """
        text = f"<|im_start|>user\n{prompt}\n<|im_end|><|im_start|>assistant\n"
        if add_think_start:
            text += "<|think_start|>"
        ids = self._enc.encode(text, allowed_special="all")
        return [i if i in self._id_to_special else i + 1 for i in ids]

    def apply_chat_template(
        self, conversation, add_generation_prompt: bool = True, **kwargs
    ) -> List[int]:
        """
        Minimal chat template support for HF pipeline compatibility.

        Expects `conversation` as a list of {"role": ..., "content": ...}
        dicts. Every turn is wrapped in <|im_start|>{role} ... <|im_end|>,
        with roles passed through verbatim; token ids are always returned
        (the string-output path is not implemented).
        """
        text = ""
        for msg in conversation:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            text += f"<|im_start|>{role}\n{content}\n<|im_end|>"
        if add_generation_prompt:
            text += "<|im_start|>assistant\n<|think_start|>"
        ids = self._enc.encode(text, allowed_special="all")
        return [i if i in self._id_to_special else i + 1 for i in ids]
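

# ----------------------------------------------------------------------
# Usage sketch (illustrative, not part of the public API). Round-trips a
# prompt through the chat template; assumes only that tiktoken is
# installed, no model checkpoint required.
if __name__ == "__main__":
    tok = SeqCondTokenizer()

    # encode_chat appends <|think_start|> by default, so generation would
    # begin inside the chain-of-thought span.
    ids = tok.encode_chat("What is 2 + 2?")
    assert ids[-1] == tok.think_start_id

    # decode() reverses the +1 BPE shift and renders special tokens by name.
    print(tok.decode(ids))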