onnx-community
/

needle-onnx

function-calling

Model card Files Files and versions

needle-onnx / dump_tokenizer.py

shreyask's picture

Upload dump_tokenizer.py with huggingface_hub

1267a61 verified 9 days ago

history blame contribute delete

3.03 kB

	"""Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens.

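	Output layout:
	web/public/models-dev/needle.model — SentencePiece model file (browser fetches this)
	web/public/models-dev/tokenizer-specials.json — special token IDs (pad, eos, bos, tool_call, tools)
	web/test/tokenizer-goldens.json — golden (string, ids) pairs for vitest parity tests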

	The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece
	binding and must reproduce the goldens byte-for-byte.
	"""
	import json
	import shutil
	import sys
	from pathlib import Path

	# Prepend <repo>/external/needle (two directories up from this file, then
	# external/needle) to sys.path so that the `needle` package imported on the
	# next line is looked up in that directory first. The insert must run before
	# the import, which is why this import sits below the stdlib ones.
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "external" / "needle"))
	from needle.dataset.tokenizer import get_tokenizer

	# Repository root: the directory two levels above this script.
	REPO = Path(__file__).resolve().parent.parent
	# SentencePiece model file that gets copied into the web tree.
	SP_MODEL_SRC = REPO.joinpath("external", "needle", "needle", "tokenizer", "needle.model")
	# Destination directory for the copied model and the special-token JSON.
	WEB_MODELS = REPO.joinpath("web", "public", "models-dev")
	# Destination directory for the golden (text, ids) JSON fixture.
	WEB_TESTS = REPO.joinpath("web", "test")


	def main():
	    """Copy the SentencePiece model into the web tree and write tokenizer fixtures.

	    Steps, in order:
	      1. Copy ``SP_MODEL_SRC`` to ``WEB_MODELS / "needle.model"``.
	      2. Encode a fixed list of sample strings with ``get_tokenizer()`` and
	         write the ``{"text", "ids"}`` pairs to
	         ``WEB_TESTS / "tokenizer-goldens.json"``. A sample whose
	         ``decode(encode(s))`` differs from ``s`` is reported on stdout and
	         left out of the goldens.
	      3. Write the tokenizer's special token IDs to
	         ``WEB_MODELS / "tokenizer-specials.json"``.

	    Both JSON files are written as UTF-8 explicitly. The goldens are
	    serialized with ``ensure_ascii=False`` and include non-ASCII samples, so
	    falling back to the platform default encoding could raise
	    ``UnicodeEncodeError`` or produce a file that is not UTF-8.

	    Raises:
	        Whatever ``shutil.copy`` / ``Path.write_text`` raise on I/O failure
	        (e.g. ``FileNotFoundError`` when ``SP_MODEL_SRC`` is missing).
	    """
	    WEB_MODELS.mkdir(parents=True, exist_ok=True)
	    WEB_TESTS.mkdir(parents=True, exist_ok=True)

	    # 1. Copy the SentencePiece model file
	    sp_dst = WEB_MODELS / "needle.model"
	    shutil.copy(SP_MODEL_SRC, sp_dst)
	    print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)")

	    # 2. Generate goldens
	    tok = get_tokenizer()
	    goldens_src = [
	        "set a 5 min timer",
	        "send email to alice@example.com saying hi",
	        "create a note: buy milk",
	        '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]',
	        "",
	        "a",
	        " leading and trailing ",
	        "café naïve",
	        "🌵",
	        "newline\nhere",
	        "tab\there",
	        '{"deeply":{"nested":{"json":true}}}',
	        "schedule meeting at 2pm",
	        "what's the weather?",
	        "Repeat: aaaaaaaaaa",
	        "Mixed CASE Words",
	        "1234567890",
	        "<tool_call>",
	        "<tools>",
	        "encoder.onnx and decoder_step.onnx",
	    ]
	    goldens = []
	    for s in goldens_src:
	        ids = tok.encode(s)
	        # sanity: round-trip should match (with byte_fallback this should always work)
	        decoded = tok.decode(ids)
	        if decoded != s:
	            # Only samples that round-trip exactly are recorded as goldens.
	            print(f" WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping")
	            continue
	        goldens.append({"text": s, "ids": ids})

	    goldens_path = WEB_TESTS / "tokenizer-goldens.json"
	    # ensure_ascii=False keeps "café naïve" / "🌵" as literal characters, so the
	    # file encoding must be pinned rather than left to the locale default.
	    goldens_path.write_text(
	        json.dumps(goldens, ensure_ascii=False, indent=2),
	        encoding="utf-8",
	    )
	    print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}")

	    # 3. Also dump special token IDs for the TS wrapper to consume
	    specials = {
	        "pad": tok.pad_token_id,
	        "eos": tok.eos_token_id,
	        "bos": tok.bos_token_id,
	        "tool_call": tok.tool_call_token_id,
	        "tools": tok.tools_token_id,
	    }
	    specials_path = WEB_MODELS / "tokenizer-specials.json"
	    specials_path.write_text(json.dumps(specials, indent=2), encoding="utf-8")
	    print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}")


	# Run the dump only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()