shreyask committed on
Commit
1267a61
·
verified ·
1 Parent(s): f6077fc

Upload dump_tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dump_tokenizer.py +85 -0
dump_tokenizer.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens.
2
+
3
+ Output layout:
4
+ web/public/models-dev/needle.model — SentencePiece model file (browser fetches this)
5
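+ web/test/tokenizer-goldens.json — golden (string, ids) pairs for vitest parity tests
+ web/public/models-dev/tokenizer-specials.json — special token IDs (pad, eos, bos, tool_call, tools) for the TS wrapper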
6
+
7
+ The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece
8
+ binding and must reproduce the goldens byte-for-byte.
9
+ """
10
+ import json
11
+ import shutil
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "external" / "needle"))
16
+ from needle.dataset.tokenizer import get_tokenizer
17
+
18
# Repository root: the grandparent directory of this script's resolved path.
REPO = Path(__file__).resolve().parent.parent

# Location of the SentencePiece model file under external/needle (copy source).
SP_MODEL_SRC = REPO.joinpath("external", "needle", "needle", "tokenizer", "needle.model")

# Destination directory for the files the browser fetches.
WEB_MODELS = REPO.joinpath("web", "public", "models-dev")

# Destination directory for the web test fixtures (tokenizer goldens).
WEB_TESTS = REPO.joinpath("web", "test")
22
+
23
+
24
def main():
    """Copy the SentencePiece model into the web app and write tokenizer fixtures.

    Steps:
      1. Copy ``SP_MODEL_SRC`` to ``WEB_MODELS / "needle.model"``.
      2. Encode a fixed list of sample strings with ``get_tokenizer()`` and
         write the ``{"text", "ids"}`` pairs to
         ``WEB_TESTS / "tokenizer-goldens.json"``. A sample whose
         encode/decode round-trip does not reproduce the input exactly is
         reported on stdout and left out of the file.
      3. Write the tokenizer's special token IDs to
         ``WEB_MODELS / "tokenizer-specials.json"``.

    Both output directories are created when missing. Progress is printed to
    stdout. JSON files are always written as UTF-8.
    """
    WEB_MODELS.mkdir(parents=True, exist_ok=True)
    WEB_TESTS.mkdir(parents=True, exist_ok=True)

    # 1. Copy the SentencePiece model file
    sp_dst = WEB_MODELS / "needle.model"
    shutil.copy(SP_MODEL_SRC, sp_dst)
    print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)")

    # 2. Generate goldens
    tok = get_tokenizer()
    goldens_src = [
        "set a 5 min timer",
        "send email to alice@example.com saying hi",
        "create a note: buy milk",
        '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]',
        "",
        "a",
        " leading and trailing ",
        "café naïve",
        "🌵",
        "newline\nhere",
        "tab\there",
        '{"deeply":{"nested":{"json":true}}}',
        "schedule meeting at 2pm",
        "what's the weather?",
        "Repeat: aaaaaaaaaa",
        "Mixed CASE Words",
        "1234567890",
        "<tool_call>",
        "<tools>",
        "encoder.onnx and decoder_step.onnx",
    ]
    goldens = []
    for s in goldens_src:
        ids = tok.encode(s)
        # sanity: round-trip should match (with byte_fallback this should always work)
        decoded = tok.decode(ids)
        if decoded != s:
            print(f" WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping")
            continue
        goldens.append({"text": s, "ids": ids})

    goldens_path = WEB_TESTS / "tokenizer-goldens.json"
    # ensure_ascii=False leaves "café naïve" and "🌵" as raw characters, so the
    # file must be written as UTF-8 explicitly; the locale default encoding
    # (e.g. cp1252) could fail to encode them or yield a non-UTF-8 file.
    goldens_path.write_text(
        json.dumps(goldens, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}")

    # 3. Also dump special token IDs for the TS wrapper to consume
    specials = {
        "pad": tok.pad_token_id,
        "eos": tok.eos_token_id,
        "bos": tok.bos_token_id,
        "tool_call": tok.tool_call_token_id,
        "tools": tok.tools_token_id,
    }
    specials_path = WEB_MODELS / "tokenizer-specials.json"
    # Same explicit encoding as the goldens file, for consistency.
    specials_path.write_text(json.dumps(specials, indent=2), encoding="utf-8")
    print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}")


if __name__ == "__main__":
    main()