Upload dump_tokenizer.py with huggingface_hub
Browse files- dump_tokenizer.py +85 -0
dump_tokenizer.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens.
|
| 2 |
+
|
| 3 |
+
Output layout:
|
| 4 |
+
web/public/models-dev/needle.model — SentencePiece model file (browser fetches this)
|
| 5 |
+
web/test/tokenizer-goldens.json — golden (string, ids) pairs for vitest parity tests
|
| 6 |
+
|
| 7 |
+
The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece
|
| 8 |
+
binding and must reproduce the goldens byte-for-byte.
|
| 9 |
+
"""
|
| 10 |
+
import json
|
| 11 |
+
import shutil
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
# Repository root, taken as the directory one level above this script's folder.
REPO = Path(__file__).resolve().parent.parent

# Put <repo>/external/needle at the front of sys.path so the `needle` package
# import below resolves; the import must therefore come after this line.
sys.path.insert(0, str(REPO / "external" / "needle"))
from needle.dataset.tokenizer import get_tokenizer  # noqa: E402

# Source SentencePiece model and the two output directories in the web tree.
SP_MODEL_SRC = REPO.joinpath("external", "needle", "needle", "tokenizer", "needle.model")
WEB_MODELS = REPO.joinpath("web", "public", "models-dev")
WEB_TESTS = REPO.joinpath("web", "test")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def main():
    """Copy the SentencePiece model into the web tree and emit tokenizer fixtures.

    Steps:
      1. Copy ``SP_MODEL_SRC`` to ``WEB_MODELS / "needle.model"``.
      2. Encode a fixed list of sample strings with ``get_tokenizer()`` and
         write the (text, ids) pairs whose decode round-trips exactly to
         ``WEB_TESTS / "tokenizer-goldens.json"``. Samples that do not
         round-trip are reported on stdout and left out of the file.
      3. Write the tokenizer's special token IDs to
         ``WEB_MODELS / "tokenizer-specials.json"``.

    Both JSON files are written as UTF-8 regardless of the platform locale.
    Progress is printed to stdout; nothing is returned.
    """
    WEB_MODELS.mkdir(parents=True, exist_ok=True)
    WEB_TESTS.mkdir(parents=True, exist_ok=True)

    # 1. Copy the SentencePiece model file
    sp_dst = WEB_MODELS / "needle.model"
    shutil.copy(SP_MODEL_SRC, sp_dst)
    print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)")

    # 2. Generate goldens
    tok = get_tokenizer()
    goldens_src = [
        "set a 5 min timer",
        "send email to alice@example.com saying hi",
        "create a note: buy milk",
        '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]',
        "",
        "a",
        " leading and trailing ",
        "café naïve",
        "🌵",
        "newline\nhere",
        "tab\there",
        '{"deeply":{"nested":{"json":true}}}',
        "schedule meeting at 2pm",
        "what's the weather?",
        "Repeat: aaaaaaaaaa",
        "Mixed CASE Words",
        "1234567890",
        "<tool_call>",
        "<tools>",
        "encoder.onnx and decoder_step.onnx",
    ]
    goldens = []
    for s in goldens_src:
        ids = tok.encode(s)
        # sanity: round-trip should match (with byte_fallback this should always work)
        decoded = tok.decode(ids)
        if decoded != s:
            # Best-effort: a sample that does not round-trip is reported and
            # dropped rather than aborting the whole dump.
            print(f" WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping")
            continue
        goldens.append({"text": s, "ids": ids})

    goldens_path = WEB_TESTS / "tokenizer-goldens.json"
    # ensure_ascii=False keeps non-ASCII samples (accents, emoji) literal in the
    # output, so the encoding must be pinned: the locale default is not UTF-8 on
    # every platform and would fail or produce a file the web tests misread.
    goldens_path.write_text(
        json.dumps(goldens, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}")

    # 3. Also dump special token IDs for the TS wrapper to consume
    specials = {
        "pad": tok.pad_token_id,
        "eos": tok.eos_token_id,
        "bos": tok.bos_token_id,
        "tool_call": tok.tool_call_token_id,
        "tools": tok.tools_token_id,
    }
    specials_path = WEB_MODELS / "tokenizer-specials.json"
    # Output is ASCII-only here (default ensure_ascii), but pin the encoding so
    # both JSON artifacts are written the same way.
    specials_path.write_text(json.dumps(specials, indent=2), encoding="utf-8")
    print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}")


if __name__ == "__main__":
    main()
|