# nautile-370m/tokenization_seqcond.py
"""
SeqCond tokenizer — tiktoken cl100k_base with 4 additional special tokens.
Special tokens (assigned in order after the base vocab):
<|im_start|> — marks the start of a chat turn
<|im_end|> — marks the end of a chat turn (also used as EOS)
<|think_start|> — marks the start of chain-of-thought reasoning
<|think_end|> — marks the end of chain-of-thought reasoning
Chat template:
<|im_start|>user
{prompt}
<|im_end|><|im_start|>assistant
<|think_start|>{thinking}<|think_end|>
{answer}
<|im_end|>
"""
from typing import Dict, List, Optional, Tuple
from transformers import PreTrainedTokenizer
_SPECIAL_TOKENS = ["<|im_start|>", "<|im_end|>", "<|think_start|>", "<|think_end|>"]
_SPECIAL_TOKEN_IDS = {
"<|im_start|>": 100278,
"<|im_end|>": 100279,
"<|think_start|>": 100280,
"<|think_end|>": 100281,
"<|endoftext|>": 100282,
"<|fim_prefix|>": 100283,
"<|fim_middle|>": 100284,
"<|fim_suffix|>": 100285,
"<|endofprompt|>": 100286,
}
_BASE_VOCAB_SIZE = 100256
_VOCAB_SIZE = max(_SPECIAL_TOKEN_IDS.values()) + 1
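# Resulting ID layout, derived from the constants above:
#   0                reserved (fallback for unparseable tokens)
#   1..100256        cl100k_base BPE ranks 0..100255, shifted by +1
#   100278..100281   the four SeqCond specials
#   100282..100286   the stock cl100k specials, remapped here
# IDs 100257..100277 are never produced, and _VOCAB_SIZE works out to 100287.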
def _build_tiktoken_enc():
"""Build tiktoken encoding with SeqCond special tokens."""
try:
import tiktoken
except ImportError as e:
raise ImportError("tiktoken is required: pip install tiktoken") from e
base = tiktoken.get_encoding("cl100k_base")
return tiktoken.Encoding(
name="seqcond",
pat_str=base._pat_str,
mergeable_ranks=base._mergeable_ranks,
special_tokens=_SPECIAL_TOKEN_IDS,
)
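
# Quick illustration (requires tiktoken): the custom Encoding resolves the
# named specials to their raw IDs, while plain text still uses base ranks.
#
#     enc = _build_tiktoken_enc()
#     enc.encode("<|im_start|>", allowed_special="all")  # -> [100278]
#     enc.decode([100278])                               # -> "<|im_start|>"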
class SeqCondTokenizer(PreTrainedTokenizer):
"""
Tokenizer for SeqCond models, backed by tiktoken cl100k_base.
This is a slow tokenizer that wraps tiktoken. Tokens are represented
internally as their stringified integer IDs (e.g. "42", "100256").
This avoids building a full vocab dict while remaining compatible with
HuggingFace's PreTrainedTokenizer interface.
Requires: pip install tiktoken
"""
vocab_files_names: Dict[str, str] = {}
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
eos_token: str = "<|im_end|>",
bos_token: Optional[str] = None,
unk_token: Optional[str] = None,
pad_token: str = "<|im_end|>",
add_bos_token: bool = False,
**kwargs,
):
self._enc = _build_tiktoken_enc()
        self._id_to_special: Dict[int, str] = {idx: tok for tok, idx in _SPECIAL_TOKEN_IDS.items()}
        self._special_to_id: Dict[str, int] = dict(_SPECIAL_TOKEN_IDS)
        # Register special tokens before calling super().__init__
        kwargs.setdefault(
            "additional_special_tokens",
            [t for t in _SPECIAL_TOKENS if t not in (eos_token, bos_token, unk_token, pad_token)],
        )
super().__init__(
eos_token=eos_token,
bos_token=bos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
**kwargs,
)
@property
def vocab_size(self) -> int:
return _VOCAB_SIZE
# ------------------------------------------------------------------
# Core token ↔ id mappings
# ------------------------------------------------------------------
def _tokenize(self, text: str, **kwargs) -> List[str]:
"""Encode text into a list of token-id strings."""
ids = self._enc.encode(text, allowed_special="all")
# Shift non-special BPE IDs by +1 to match convectors.Tiktokenize
# offset used during training (ID 0 reserved).
shifted = [i if i in self._id_to_special else i + 1 for i in ids]
return [str(i) for i in shifted]
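    # Illustration of the shift: a base token with cl100k rank r becomes the
    # string str(r + 1), while specials keep their raw IDs, so
    # _tokenize("<|im_end|>") returns ["100279"] rather than the token name.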
def _convert_token_to_id(self, token: str) -> int:
"""Convert a token string (or id-string) to an integer id."""
if token in self._special_to_id:
return self._special_to_id[token]
        try:
            return int(token)
        except ValueError:
            # Unknown token string: fall back to the reserved ID 0.
            return 0
def _convert_id_to_token(self, index: int) -> str:
"""Convert an integer id to its token string."""
if index in self._id_to_special:
return self._id_to_special[index]
return str(index)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Decode a list of token strings back to text."""
ids = []
for t in tokens:
if t in self._special_to_id:
ids.append(self._special_to_id[t])
else:
try:
ids.append(int(t))
except ValueError:
pass
# Reverse the +1 BPE shift before decoding; skip invalid/ID 0 tokens.
real_ids = []
for i in ids:
if i in self._id_to_special:
real_ids.append(i)
elif i >= 1:
real_ids.append(i - 1)
return self._enc.decode(real_ids)
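    # Round-trip invariant (illustrative, not enforced anywhere):
    #
    #     convert_tokens_to_string(self.tokenize(text)) == text
    #
    # since the +1 shift applied in _tokenize is undone here before the
    # tiktoken decode.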
def get_vocab(self) -> Dict[str, int]:
"""
Return a vocab dict. Only special tokens are included with their names;
regular BPE tokens are included as their id-string representation.
(Building a full 100k-entry reverse BPE map is expensive and rarely needed.)
"""
vocab = {str(i): i for i in range(self.vocab_size)}
for tok, idx in self._special_to_id.items():
vocab[tok] = idx
return vocab
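    # Note: each special token appears under two keys here ("<|im_start|>"
    # and "100278" both map to 100278), so the dict is slightly larger than
    # vocab_size. Listing the named specials also appears to be what lets
    # the HF base class skip re-adding them with fresh IDs at init time.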
def save_vocabulary(
self, save_directory: str, filename_prefix: Optional[str] = None
) -> Tuple[str, ...]:
"""
No vocabulary file is needed — the tiktoken encoding is fetched from
the tiktoken package at runtime. Returns an empty tuple.
"""
return ()
# ------------------------------------------------------------------
# Convenience helpers
# ------------------------------------------------------------------
@property
def im_start_id(self) -> int:
return self._special_to_id["<|im_start|>"]
@property
def im_end_id(self) -> int:
return self._special_to_id["<|im_end|>"]
@property
def think_start_id(self) -> int:
return self._special_to_id["<|think_start|>"]
@property
def think_end_id(self) -> int:
return self._special_to_id["<|think_end|>"]
def encode_chat(self, prompt: str, add_think_start: bool = True) -> List[int]:
"""
Format and encode a user prompt using the standard chat template.
Args:
prompt: The user's message (plain text).
add_think_start: If True (default), append <|think_start|> so the
model begins generating its chain-of-thought immediately.
Returns:
List of token ids (prompt already encoded, ready for prefill).
"""
text = f"<|im_start|>user\n{prompt}\n<|im_end|><|im_start|>assistant\n"
if add_think_start:
text += "<|think_start|>"
ids = self._enc.encode(text, allowed_special="all")
return [i if i in self._id_to_special else i + 1 for i in ids]
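    # Usage sketch:
    #
    #     tok = SeqCondTokenizer()
    #     ids = tok.encode_chat("What is 2 + 2?")
    #     # ids ends with tok.think_start_id, so generation begins inside
    #     # the <|think_start|> ... <|think_end|> reasoning span.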
    def apply_chat_template(
        self, conversation, add_generation_prompt: bool = True, **kwargs
    ) -> List[int]:
        """
        Minimal chat template support for HF pipeline compatibility.
        Expects conversation as a list of {"role": ..., "content": ...} dicts.
        Every turn is wrapped in the <|im_start|>{role} ... <|im_end|> template;
        roles are passed through verbatim, with no special handling for system
        or tool messages.
        """
text = ""
for msg in conversation:
role = msg.get("role", "user")
content = msg.get("content", "")
text += f"<|im_start|>{role}\n{content}\n<|im_end|>"
if add_generation_prompt:
text += "<|im_start|>assistant\n<|think_start|>"
ids = self._enc.encode(text, allowed_special="all")
return [i if i in self._id_to_special else i + 1 for i in ids]
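

# Minimal smoke test (a sketch for local sanity checking, not part of the
# HF interface). Requires tiktoken; the sample strings are arbitrary.
if __name__ == "__main__":
    tok = SeqCondTokenizer()
    sample = "SeqCond round-trip check: 2 + 2 = 4."
    tokens = tok.tokenize(sample)
    assert tok.convert_tokens_to_string(tokens) == sample, "round trip failed"
    chat_ids = tok.encode_chat("Hello")
    assert chat_ids[-1] == tok.think_start_id
    assert tok.vocab_size == 100287 and tok.im_end_id == 100279
    print(f"ok: {len(tokens)} text tokens, {len(chat_ids)} chat ids")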