| """Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens. |
| |
Output layout:
  web/public/models-dev/needle.model — SentencePiece model file (browser fetches this)
  web/public/models-dev/tokenizer-specials.json — special token IDs (pad, eos, bos, tool_call, tools)
  web/test/tokenizer-goldens.json — golden (string, ids) pairs for vitest parity tests
| |
| The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece |
| binding and must reproduce the goldens byte-for-byte. |
| """ |
| import json |
| import shutil |
| import sys |
| from pathlib import Path |
|
|
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "external" / "needle")) |
| from needle.dataset.tokenizer import get_tokenizer |
|
|
# Parent of the directory that contains this script.
REPO = Path(__file__).resolve().parent.parent

# Source location of the SentencePiece model under external/needle.
SP_MODEL_SRC = REPO.joinpath("external", "needle", "needle", "tokenizer", "needle.model")

# Output directories under web/ that main() creates and writes into.
WEB_MODELS = REPO.joinpath("web", "public", "models-dev")
WEB_TESTS = REPO.joinpath("web", "test")
|
|
|
|
def main():
    """Export the tokenizer model, parity goldens, and special-token IDs.

    Steps, in order:
      1. Create ``WEB_MODELS`` and ``WEB_TESTS`` if they do not exist.
      2. Copy ``SP_MODEL_SRC`` to ``WEB_MODELS / "needle.model"``.
      3. Encode a fixed list of sample strings with the tokenizer returned by
         ``get_tokenizer()``; keep only the samples whose ids decode back to
         the exact input, and write them as ``{"text", "ids"}`` records to
         ``WEB_TESTS / "tokenizer-goldens.json"``.
      4. Write the tokenizer's pad/eos/bos/tool_call/tools token IDs to
         ``WEB_MODELS / "tokenizer-specials.json"``.

    Progress and round-trip warnings are printed to stdout.

    Both JSON files are written as UTF-8 explicitly. The goldens are
    serialized with ``ensure_ascii=False`` and contain non-ASCII text, so
    relying on the platform's default encoding could raise
    ``UnicodeEncodeError`` or produce a file that is not UTF-8.
    """
    WEB_MODELS.mkdir(parents=True, exist_ok=True)
    WEB_TESTS.mkdir(parents=True, exist_ok=True)

    # 1. Copy the SentencePiece model next to the other web model assets.
    sp_dst = WEB_MODELS / "needle.model"
    shutil.copy(SP_MODEL_SRC, sp_dst)
    print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)")

    # 2. Build the golden (text, ids) pairs from a fixed set of samples.
    tok = get_tokenizer()
    goldens_src = [
        "set a 5 min timer",
        "send email to alice@example.com saying hi",
        "create a note: buy milk",
        '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]',
        "",
        "a",
        " leading and trailing ",
        "café naïve",
        "🌵",
        "newline\nhere",
        "tab\there",
        '{"deeply":{"nested":{"json":true}}}',
        "schedule meeting at 2pm",
        "what's the weather?",
        "Repeat: aaaaaaaaaa",
        "Mixed CASE Words",
        "1234567890",
        "<tool_call>",
        "<tools>",
        "encoder.onnx and decoder_step.onnx",
    ]
    goldens = []
    for s in goldens_src:
        ids = tok.encode(s)
        # Only keep samples that survive encode -> decode unchanged; anything
        # else is reported and left out of the goldens file.
        decoded = tok.decode(ids)
        if decoded != s:
            print(f" WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping")
            continue
        goldens.append({"text": s, "ids": ids})

    goldens_path = WEB_TESTS / "tokenizer-goldens.json"
    # ensure_ascii=False keeps the non-ASCII samples literal, so the file
    # encoding must be pinned rather than left to the locale default.
    goldens_path.write_text(
        json.dumps(goldens, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}")

    # 3. Export the special token IDs read from the tokenizer object.
    specials = {
        "pad": tok.pad_token_id,
        "eos": tok.eos_token_id,
        "bos": tok.bos_token_id,
        "tool_call": tok.tool_call_token_id,
        "tools": tok.tools_token_id,
    }
    specials_path = WEB_MODELS / "tokenizer-specials.json"
    specials_path.write_text(json.dumps(specials, indent=2), encoding="utf-8")
    print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}")
|
|
|
|
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|