# nautile-370m/tokenization_seqcond.py
"""
SeqCond tokenizer — tiktoken cl100k_base with 4 additional special tokens.
Special tokens (assigned in order after the base vocab):
<|im_start|> — marks the start of a chat turn
<|im_end|> — marks the end of a chat turn (also used as EOS)
<|think_start|> — marks the start of chain-of-thought reasoning
<|think_end|> — marks the end of chain-of-thought reasoning
Chat template:
<|im_start|>user
{prompt}
<|im_end|><|im_start|>assistant
<|think_start|>{thinking}<|think_end|>
{answer}
<|im_end|>
"""
from typing import Dict, List, Optional, Tuple
from transformers import PreTrainedTokenizer
_SPECIAL_TOKENS = ["<|im_start|>", "<|im_end|>", "<|think_start|>", "<|think_end|>"]
_SPECIAL_TOKEN_IDS = {
"<|im_start|>": 100278,
"<|im_end|>": 100279,
"<|think_start|>": 100280,
"<|think_end|>": 100281,
"<|endoftext|>": 100282,
"<|fim_prefix|>": 100283,
"<|fim_middle|>": 100284,
"<|fim_suffix|>": 100285,
"<|endofprompt|>": 100286,
}
_BASE_VOCAB_SIZE = 100256
_VOCAB_SIZE = max(_SPECIAL_TOKEN_IDS.values()) + 1
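# Resulting ID layout, derived from the constants above:
#   0                reserved (fallback for unparseable tokens)
#   1..100256        cl100k_base BPE ranks 0..100255, shifted by +1
#   100278..100281   the four SeqCond specials
#   100282..100286   the stock cl100k specials, remapped here
# IDs 100257..100277 are never produced, and _VOCAB_SIZE works out to 100287.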
def _build_tiktoken_enc():
"""Build tiktoken encoding with SeqCond special tokens."""
try:
import tiktoken
except ImportError as e:
raise ImportError("tiktoken is required: pip install tiktoken") from e
base = tiktoken.get_encoding("cl100k_base")
return tiktoken.Encoding(
name="seqcond",
pat_str=base._pat_str,
mergeable_ranks=base._mergeable_ranks,
special_tokens=_SPECIAL_TOKEN_IDS,
)
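
# Quick illustration (requires tiktoken): the custom Encoding resolves the
# named specials to their raw IDs, while plain text still uses base ranks.
#
#     enc = _build_tiktoken_enc()
#     enc.encode("<|im_start|>", allowed_special="all")  # -> [100278]
#     enc.decode([100278])                               # -> "<|im_start|>"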
class SeqCondTokenizer(PreTrainedTokenizer):
"""
Tokenizer for SeqCond models, backed by tiktoken cl100k_base.
This is a slow tokenizer that wraps tiktoken. Tokens are represented
internally as their stringified integer IDs (e.g. "42", "100256").
This avoids building a full vocab dict while remaining compatible with
HuggingFace's PreTrainedTokenizer interface.
Requires: pip install tiktoken
"""
vocab_files_names: Dict[str, str] = {}
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
eos_token: str = "<|im_end|>",
bos_token: Optional[str] = None,
unk_token: Optional[str] = None,
pad_token: str = "<|im_end|>",
add_bos_token: bool = False,
**kwargs,
):
self._enc = _build_tiktoken_enc()
        self._id_to_special: Dict[int, str] = {idx: tok for tok, idx in _SPECIAL_TOKEN_IDS.items()}
        self._special_to_id: Dict[str, int] = dict(_SPECIAL_TOKEN_IDS)
        # Register special tokens before calling super().__init__
        kwargs.setdefault(
            "additional_special_tokens",
            [t for t in _SPECIAL_TOKENS if t not in (eos_token, bos_token, unk_token, pad_token)],
        )
super().__init__(
eos_token=eos_token,
bos_token=bos_token,
unk_token=unk_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
**kwargs,
)
@property
def vocab_size(self) -> int:
return _VOCAB_SIZE
# ------------------------------------------------------------------
# Core token ↔ id mappings
# ------------------------------------------------------------------
def _tokenize(self, text: str, **kwargs) -> List[str]:
"""Encode text into a list of token-id strings."""
ids = self._enc.encode(text, allowed_special="all")
# Shift non-special BPE IDs by +1 to match convectors.Tiktokenize
# offset used during training (ID 0 reserved).
shifted = [i if i in self._id_to_special else i + 1 for i in ids]
return [str(i) for i in shifted]
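    # Illustration of the shift: a base token with cl100k rank r becomes the
    # string str(r + 1), while specials keep their raw IDs, so
    # _tokenize("<|im_end|>") returns ["100279"] rather than the token name.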
def _convert_token_to_id(self, token: str) -> int:
"""Convert a token string (or id-string) to an integer id."""
if token in self._special_to_id:
return self._special_to_id[token]
        try:
            return int(token)
        except ValueError:
            # Unknown token string: fall back to the reserved ID 0.
            return 0
def _convert_id_to_token(self, index: int) -> str:
"""Convert an integer id to its token string."""
if index in self._id_to_special:
return self._id_to_special[index]
return str(index)
def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Decode a list of token strings back to text."""
ids = []
for t in tokens:
if t in self._special_to_id:
ids.append(self._special_to_id[t])
else:
try:
ids.append(int(t))
except ValueError:
pass
# Reverse the +1 BPE shift before decoding; skip invalid/ID 0 tokens.
real_ids = []
for i in ids:
if i in self._id_to_special:
real_ids.append(i)
elif i >= 1:
real_ids.append(i - 1)
return self._enc.decode(real_ids)
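    # Round-trip invariant (illustrative, not enforced anywhere):
    #
    #     convert_tokens_to_string(self.tokenize(text)) == text
    #
    # since the +1 shift applied in _tokenize is undone here before the
    # tiktoken decode.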
def get_vocab(self) -> Dict[str, int]:
"""
Return a vocab dict. Only special tokens are included with their names;
regular BPE tokens are included as their id-string representation.
(Building a full 100k-entry reverse BPE map is expensive and rarely needed.)
"""
vocab = {str(i): i for i in range(self.vocab_size)}
for tok, idx in self._special_to_id.items():
vocab[tok] = idx
return vocab
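    # Note: each special token appears under two keys here ("<|im_start|>"
    # and "100278" both map to 100278), so the dict is slightly larger than
    # vocab_size. Listing the named specials also appears to be what lets
    # the HF base class skip re-adding them with fresh IDs at init time.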
def save_vocabulary(
self, save_directory: str, filename_prefix: Optional[str] = None
) -> Tuple[str, ...]:
"""
No vocabulary file is needed — the tiktoken encoding is fetched from
the tiktoken package at runtime. Returns an empty tuple.
"""
return ()
# ------------------------------------------------------------------
# Convenience helpers
# ------------------------------------------------------------------
@property
def im_start_id(self) -> int:
return self._special_to_id["<|im_start|>"]
@property
def im_end_id(self) -> int:
return self._special_to_id["<|im_end|>"]
@property
def think_start_id(self) -> int:
return self._special_to_id["<|think_start|>"]
@property
def think_end_id(self) -> int:
return self._special_to_id["<|think_end|>"]
def encode_chat(self, prompt: str, add_think_start: bool = True) -> List[int]:
"""
Format and encode a user prompt using the standard chat template.
Args:
prompt: The user's message (plain text).
add_think_start: If True (default), append <|think_start|> so the
model begins generating its chain-of-thought immediately.
Returns:
List of token ids (prompt already encoded, ready for prefill).
"""
text = f"<|im_start|>user\n{prompt}\n<|im_end|><|im_start|>assistant\n"
if add_think_start:
text += "<|think_start|>"
ids = self._enc.encode(text, allowed_special="all")
return [i if i in self._id_to_special else i + 1 for i in ids]
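    # Usage sketch:
    #
    #     tok = SeqCondTokenizer()
    #     ids = tok.encode_chat("What is 2 + 2?")
    #     # ids ends with tok.think_start_id, so generation begins inside
    #     # the <|think_start|> ... <|think_end|> reasoning span.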
    def apply_chat_template(
        self, conversation, add_generation_prompt: bool = True, **kwargs
    ) -> List[int]:
        """
        Minimal chat template support for HF pipeline compatibility.
        Expects conversation as a list of {"role": ..., "content": ...} dicts.
        Every turn is wrapped in the <|im_start|>{role} ... <|im_end|> template;
        roles are passed through verbatim, with no special handling for system
        or tool messages.
        """
text = ""
for msg in conversation:
role = msg.get("role", "user")
content = msg.get("content", "")
text += f"<|im_start|>{role}\n{content}\n<|im_end|>"
if add_generation_prompt:
text += "<|im_start|>assistant\n<|think_start|>"
ids = self._enc.encode(text, allowed_special="all")
return [i if i in self._id_to_special else i + 1 for i in ids]
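

# Minimal smoke test (a sketch for local sanity checking, not part of the
# HF interface). Requires tiktoken; the sample strings are arbitrary.
if __name__ == "__main__":
    tok = SeqCondTokenizer()
    sample = "SeqCond round-trip check: 2 + 2 = 4."
    tokens = tok.tokenize(sample)
    assert tok.convert_tokens_to_string(tokens) == sample, "round trip failed"
    chat_ids = tok.encode_chat("Hello")
    assert chat_ids[-1] == tok.think_start_id
    assert tok.vocab_size == 100287 and tok.im_end_id == 100279
    print(f"ok: {len(tokens)} text tokens, {len(chat_ids)} chat ids")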