File size: 3,028 Bytes
"""Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens.

Output layout:
  web/public/models-dev/needle.model             — SentencePiece model file (browser fetches this)
  web/public/models-dev/tokenizer-specials.json  — special token IDs for the TS wrapper
  web/test/tokenizer-goldens.json                — golden (text, ids) pairs for vitest parity tests

The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece
binding and must reproduce the golden ids exactly.
"""
import json
import shutil
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "external" / "needle"))
from needle.dataset.tokenizer import get_tokenizer
# Repository root: two directory levels up from this file's resolved path.
_THIS_FILE = Path(__file__).resolve()
REPO = _THIS_FILE.parent.parent

# SentencePiece model file that gets copied into the web app.
SP_MODEL_SRC = REPO.joinpath("external", "needle", "needle", "tokenizer", "needle.model")

# Destination directories inside the web app (created on demand by main()).
WEB_MODELS = REPO.joinpath("web", "public", "models-dev")
WEB_TESTS = REPO.joinpath("web", "test")
# Strings tokenized to build the parity goldens. They mix plain commands,
# JSON payloads, the empty string, surrounding whitespace, non-ASCII text,
# control characters and tool-marker strings.
_GOLDEN_INPUTS = (
    "set a 5 min timer",
    "send email to alice@example.com saying hi",
    "create a note: buy milk",
    '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]',
    "",
    "a",
    " leading and trailing ",
    "café naïve",
    "🌵",
    "newline\nhere",
    "tab\there",
    '{"deeply":{"nested":{"json":true}}}',
    "schedule meeting at 2pm",
    "what's the weather?",
    "Repeat: aaaaaaaaaa",
    "Mixed CASE Words",
    "1234567890",
    "<tool_call>",
    "<tools>",
    "encoder.onnx and decoder_step.onnx",
)


def _copy_model():
    """Copy the SentencePiece model into WEB_MODELS and print its size."""
    sp_dst = WEB_MODELS / "needle.model"
    shutil.copy(SP_MODEL_SRC, sp_dst)
    print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)")


def _build_goldens(tok, texts):
    """Encode each text with *tok*, keeping only texts that decode back unchanged."""
    goldens = []
    for s in texts:
        ids = tok.encode(s)
        # Sanity check: a text that does not survive encode -> decode is
        # reported and left out of the goldens rather than aborting the run.
        decoded = tok.decode(ids)
        if decoded != s:
            print(f" WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping")
            continue
        goldens.append({"text": s, "ids": ids})
    return goldens


def _write_goldens(tok):
    """Write the golden (text, ids) pairs to WEB_TESTS as UTF-8 JSON."""
    goldens = _build_goldens(tok, _GOLDEN_INPUTS)
    goldens_path = WEB_TESTS / "tokenizer-goldens.json"
    # ensure_ascii=False keeps non-ASCII text literal in the JSON, so the file
    # encoding must be pinned: the locale default (e.g. cp1252) cannot encode
    # every golden string.
    goldens_path.write_text(
        json.dumps(goldens, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}")


def _write_specials(tok):
    """Write the tokenizer's special token IDs to WEB_MODELS as JSON."""
    specials = {
        "pad": tok.pad_token_id,
        "eos": tok.eos_token_id,
        "bos": tok.bos_token_id,
        "tool_call": tok.tool_call_token_id,
        "tools": tok.tools_token_id,
    }
    specials_path = WEB_MODELS / "tokenizer-specials.json"
    specials_path.write_text(json.dumps(specials, indent=2), encoding="utf-8")
    print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}")


def main() -> None:
    """Export the tokenizer assets consumed by the web app and its tests.

    Steps, in order:
      1. Create WEB_MODELS and WEB_TESTS if they do not exist.
      2. Copy SP_MODEL_SRC to WEB_MODELS / "needle.model".
      3. Tokenize the golden inputs and write the round-trip-clean ones to
         WEB_TESTS / "tokenizer-goldens.json".
      4. Write the special token IDs to WEB_MODELS / "tokenizer-specials.json".

    Both JSON files are written as UTF-8 regardless of the platform locale.
    Progress is reported on stdout.
    """
    WEB_MODELS.mkdir(parents=True, exist_ok=True)
    WEB_TESTS.mkdir(parents=True, exist_ok=True)

    # 1. Copy the SentencePiece model file
    _copy_model()

    # 2. Generate goldens
    tok = get_tokenizer()
    _write_goldens(tok)

    # 3. Also dump special token IDs for the TS wrapper to consume
    _write_specials(tok)


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|