File size: 3,028 Bytes

1267a61

"""Copy Cactus's SentencePiece model file to web/public/ and emit parity goldens.

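Output layout:
    web/public/models-dev/needle.model              — SentencePiece model file (browser fetches this)
    web/public/models-dev/tokenizer-specials.json   — special token IDs (pad, eos, bos, tool_call, tools) for the TS wrapper
    web/test/tokenizer-goldens.json                 — golden (text, ids) pairs for vitest parity tests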

The browser-side TS tokenizer (Task 6) loads needle.model via a JS SentencePiece
binding and must reproduce the goldens byte-for-byte.
"""
import json
import shutil
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "external" / "needle"))
from needle.dataset.tokenizer import get_tokenizer

# Repository root: two directory levels above this script's resolved location.
REPO = Path(__file__).resolve().parent.parent
# SentencePiece model file that gets copied into the web app (under external/needle).
SP_MODEL_SRC = REPO.joinpath("external", "needle", "needle", "tokenizer", "needle.model")
# Output directories under web/ for the model assets and the test fixtures.
WEB_MODELS = REPO.joinpath("web", "public", "models-dev")
WEB_TESTS = REPO.joinpath("web", "test")


def main():
    """Export the tokenizer assets consumed by the web app and its tests.

    Performs three steps, printing a one-line summary after each:

    1. Copies ``SP_MODEL_SRC`` to ``WEB_MODELS / "needle.model"``.
    2. Encodes a fixed list of sample strings with the tokenizer returned by
       ``get_tokenizer()`` and writes the ``{"text", "ids"}`` pairs to
       ``WEB_TESTS / "tokenizer-goldens.json"``. A sample whose decode does
       not reproduce the input exactly is reported with a warning and left
       out of the file.
    3. Writes the tokenizer's special token IDs to
       ``WEB_MODELS / "tokenizer-specials.json"``.

    Both output directories are created if they do not exist.
    """
    WEB_MODELS.mkdir(parents=True, exist_ok=True)
    WEB_TESTS.mkdir(parents=True, exist_ok=True)

    # 1. Copy the SentencePiece model file
    sp_dst = WEB_MODELS / "needle.model"
    shutil.copy(SP_MODEL_SRC, sp_dst)
    print(f"copied {SP_MODEL_SRC.relative_to(REPO)} -> {sp_dst.relative_to(REPO)} ({sp_dst.stat().st_size} bytes)")

    # 2. Generate goldens
    tok = get_tokenizer()
    goldens_src = [
        "set a 5 min timer",
        "send email to alice@example.com saying hi",
        "create a note: buy milk",
        '[{"name":"set_timer","arguments":{"time_human":"5 minutes"}}]',
        "",
        "a",
        "  leading and trailing  ",
        "café naïve",
        "🌵",
        "newline\nhere",
        "tab\there",
        '{"deeply":{"nested":{"json":true}}}',
        "schedule meeting at 2pm",
        "what's the weather?",
        "Repeat: aaaaaaaaaa",
        "Mixed CASE Words",
        "1234567890",
        "<tool_call>",
        "<tools>",
        "encoder.onnx and decoder_step.onnx",
    ]
    goldens = []
    for s in goldens_src:
        ids = tok.encode(s)
        # sanity: round-trip should match (with byte_fallback this should always work)
        decoded = tok.decode(ids)
        if decoded != s:
            print(f"  WARN: round-trip mismatch — {s!r} -> {ids} -> {decoded!r}; skipping")
            continue
        goldens.append({"text": s, "ids": ids})

    goldens_path = WEB_TESTS / "tokenizer-goldens.json"
    # ensure_ascii=False leaves non-ASCII samples (accents, emoji) unescaped,
    # so the file must be written as UTF-8 explicitly: the platform's default
    # text encoding may be unable to represent them.
    goldens_path.write_text(
        json.dumps(goldens, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"wrote {len(goldens)} golden pairs -> {goldens_path.relative_to(REPO)}")

    # 3. Also dump special token IDs for the TS wrapper to consume
    specials = {
        "pad": tok.pad_token_id,
        "eos": tok.eos_token_id,
        "bos": tok.bos_token_id,
        "tool_call": tok.tool_call_token_id,
        "tools": tok.tools_token_id,
    }
    specials_path = WEB_MODELS / "tokenizer-specials.json"
    # Explicit encoding keeps both JSON outputs consistent across platforms.
    specials_path.write_text(json.dumps(specials, indent=2), encoding="utf-8")
    print(f"wrote special token IDs -> {specials_path.relative_to(REPO)}: {specials}")


# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()