onnx-community
/

needle-onnx

+"""Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens.
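+Output layout:
+    web/public/models-dev/needle.model             — SentencePiece model file (browser fetches this)
+    web/public/models-dev/tokenizer-specials.json  — special token IDs (pad/eos/bos/tool_call/tools) for the TS wrapper
+    web/test/tokenizer-goldens.json                — golden (text, ids) pairs for vitest parity tests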
+The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece
+binding and must reproduce the goldens byte-for-byte.
+"""
+import json
+import shutil
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "external" / "needle"))
+from needle.dataset.tokenizer import get_tokenizer
+REPO = Path(__file__).resolve().parent.parent
+SP_MODEL_SRC = REPO / "external" / "needle" / "needle" / "tokenizer" / "needle.model"
+WEB_MODELS = REPO / "web" / "public" / "models-dev"
+WEB_TESTS = REPO / "web" / "test"
+def main():
+    WEB_MODELS.mkdir(parents=True, exist_ok=True)
+    WEB_TESTS.mkdir(parents=True, exist_ok=True)
+    # 1. Copy the SentencePiece model file
+    sp_dst = WEB_MODELS / "needle.model"
+    shutil.copy(SP_MODEL_SRC, sp_dst)
+    print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)")
+    # 2. Generate goldens
+    tok = get_tokenizer()
+    goldens_src = [
+        "set a 5 min timer",
+        "send email to alice@example.com saying hi",
+        "create a note: buy milk",
+        '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]',
+        "",
+        "a",
+        "  leading and trailing  ",
+        "café naïve",
+        "🌵",
+        "newline\nhere",
+        "tab\there",
+        '{"deeply":{"nested":{"json":true}}}',
+        "schedule meeting at 2pm",
+        "what's the weather?",
+        "Repeat: aaaaaaaaaa",
+        "Mixed CASE Words",
+        "1234567890",
+        "<tool_call>",
+        "<tools>",
+        "encoder.onnx and decoder_step.onnx",
+    ]
+    goldens = []
+    for s in goldens_src:
+        ids = tok.encode(s)
+        # sanity: round-trip should match (with byte_fallback this should always work)
+        decoded = tok.decode(ids)
+        if decoded != s:
+            print(f"  WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping")
+            continue
+        goldens.append({"text": s, "ids": ids})
+    goldens_path = WEB_TESTS / "tokenizer-goldens.json"
+    goldens_path.write_text(json.dumps(goldens, ensure_ascii=False, indent=2))
+    print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}")
+    # 3. Also dump special token IDs for the TS wrapper to consume
+    specials = {
+        "pad": tok.pad_token_id,
+        "eos": tok.eos_token_id,
+        "bos": tok.bos_token_id,
+        "tool_call": tok.tool_call_token_id,
+        "tools": tok.tools_token_id,
+    }
+    specials_path = WEB_MODELS / "tokenizer-specials.json"
+    specials_path.write_text(json.dumps(specials, indent=2))
+    print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}")
+# Run the export only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()