"""Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens. Output layout: web/public/models-dev/needle.model — SentencePiece model file (browser fetches this) web/test/tokenizer-goldens.json — golden (string, ids) pairs for vitest parity tests The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece binding and must reproduce the goldens byte-for-byte. """ import json import shutil import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "external" / "needle")) from needle.dataset.tokenizer import get_tokenizer REPO = Path(__file__).resolve().parent.parent SP_MODEL_SRC = REPO / "external" / "needle" / "needle" / "tokenizer" / "needle.model" WEB_MODELS = REPO / "web" / "public" / "models-dev" WEB_TESTS = REPO / "web" / "test" def main(): WEB_MODELS.mkdir(parents=True, exist_ok=True) WEB_TESTS.mkdir(parents=True, exist_ok=True) # 1. Copy the SentencePiece model file sp_dst = WEB_MODELS / "needle.model" shutil.copy(SP_MODEL_SRC, sp_dst) print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)") # 2. Generate goldens tok = get_tokenizer() goldens_src = [ "set a 5 min timer", "send email to alice@example.com saying hi", "create a note: buy milk", '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]', "", "a", " leading and trailing ", "café naïve", "🌵", "newline\nhere", "tab\there", '{"deeply":{"nested":{"json":true}}}', "schedule meeting at 2pm", "what's the weather?", "Repeat: aaaaaaaaaa", "Mixed CASE Words", "1234567890", "", "", "encoder.onnx and decoder_step.onnx", ] goldens = [] for s in goldens_src: ids = tok.encode(s) # sanity: round-trip should match (with byte_fallback this should always work) decoded = tok.decode(ids) if decoded != s: print(f" WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping") continue goldens.append({"text": s, "ids": ids}) goldens_path = WEB_TESTS / "tokenizer-goldens.json" goldens_path.write_text(json.dumps(goldens, ensure_ascii=False, indent=2)) print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}") # 3. Also dump special token IDs for the TS wrapper to consume specials = { "pad": tok.pad_token_id, "eos": tok.eos_token_id, "bos": tok.bos_token_id, "tool_call": tok.tool_call_token_id, "tools": tok.tools_token_id, } specials_path = WEB_MODELS / "tokenizer-specials.json" specials_path.write_text(json.dumps(specials, indent=2)) print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}") if __name__ == "__main__": main()