// Multilingual Tokenizer Tax Calculator (v0.8.7 anti-bullshit pack #13)
//
// Pain: "I bought 1M tokens of API credit for our English chatbot. Then
// we added Chinese support and the bill 3x'd overnight." The tokenizer
// tax is real and silently asymmetric across languages.
// tiktokenizer.vercel.app shows OpenAI's tokenizer; nothing public compares
// Llama vs Qwen vs Phi vs Gemma vs GPT for the SAME text in the SAME interface.
//
// This module lazily loads HuggingFace's transformers.js (browser-side BPE
// runtime) and tokenizes user-pasted text against a preset list
// of open-weight tokenizers. The output is REAL per-tokenizer token
// counts plus the cost asymmetry ratio (vs the user's chosen baseline).
//
// Pure logic + lazy CDN import. Codes/params only; main.js renders i18n.

// =============================================================================
// transformers.js lazy loader
// =============================================================================
//
// Pinned to a 3.x release (currently 3.0.2) because the API surface
// (AutoTokenizer.from_pretrained, .encode) is stable. Loaded from the
// jsDelivr CDN — same pattern used across HF Spaces. ~3 MB compressed
// bundle, cached aggressively after first load.
const TRANSFORMERS_CDN_URL =
  "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/transformers.min.js";

let _autoTokenizer = null;
let _loadPromise = null;

/**
 * Lazily import transformers.js from the CDN and return its `AutoTokenizer`
 * export. Concurrent callers share one in-flight import.
 *
 * A failed import is NOT cached: the pending promise is cleared on rejection
 * so a later call can retry instead of replaying the same failure forever.
 *
 * @returns {Promise<object>} the module's `AutoTokenizer` export
 * @throws whatever the dynamic `import()` rejects with
 */
async function loadTransformersJs() {
  if (_autoTokenizer) return _autoTokenizer;
  if (_loadPromise) return _loadPromise;
  _loadPromise = (async () => {
    try {
      const mod = await import(TRANSFORMERS_CDN_URL);
      _autoTokenizer = mod.AutoTokenizer;
      return _autoTokenizer;
    } catch (err) {
      // Drop the rejected promise so the next caller triggers a fresh import.
      _loadPromise = null;
      throw err;
    }
  })();
  return _loadPromise;
}

// =============================================================================
// Per-tokenizer cache (avoid re-downloading tokenizer.json on every encode)
// =============================================================================

// modelId -> loaded tokenizer instance.
const _tokenizerCache = new Map();
// modelId -> promise of a tokenizer that is still loading. Kept separate so
// _tokenizerCache only ever holds fully loaded tokenizers.
const _tokenizerInflight = new Map();

/**
 * Load (or reuse) the tokenizer for `modelId`.
 *
 * Successful loads are memoized in `_tokenizerCache`. Loads still in progress
 * are shared through `_tokenizerInflight`, so parallel requests for the same
 * id do not call `from_pretrained` twice. Failures are not memoized.
 *
 * @param {string} modelId - model id passed to `AutoTokenizer.from_pretrained`
 * @returns {Promise<object>} the tokenizer instance
 * @throws whatever the library import or `from_pretrained` rejects with
 */
async function loadTokenizer(modelId) {
  if (_tokenizerCache.has(modelId)) return _tokenizerCache.get(modelId);
  if (_tokenizerInflight.has(modelId)) return _tokenizerInflight.get(modelId);

  const pending = (async () => {
    try {
      const AT = await loadTransformersJs();
      const tok = await AT.from_pretrained(modelId);
      _tokenizerCache.set(modelId, tok);
      return tok;
    } finally {
      // Runs after the first await, i.e. after the set() below has happened.
      _tokenizerInflight.delete(modelId);
    }
  })();
  _tokenizerInflight.set(modelId, pending);
  return pending;
}

// =============================================================================
// Public: tokenize one model
// =============================================================================

/**
 * Tokenize `text` with one tokenizer and report the token count.
 *
 * Never rejects: every failure is converted into an `{ ok: false }` result.
 *
 * @param {string} modelId - non-empty tokenizer/model id
 * @param {string} text - text to tokenize
 * @returns {Promise<
 *   {ok: true, modelId: string, token_count: number} |
 *   {ok: false, modelId: *, error: string, raw?: string}
 * >} `error` is `"invalid_input"` or a code from `classifyTokenizerError`;
 *   `raw` is the first 200 characters of the underlying error message.
 */
export async function tokenizeWithModel(modelId, text) {
  if (typeof text !== "string" || typeof modelId !== "string" || modelId === "") {
    return { ok: false, modelId, error: "invalid_input" };
  }
  try {
    const tok = await loadTokenizer(modelId);
    // transformers.js returns Int32Array | number[]. Use .length for count.
    const ids = await tok.encode(text);
    return { ok: true, modelId, token_count: ids.length };
  } catch (e) {
    return {
      ok: false,
      modelId,
      error: classifyTokenizerError(e),
      raw: String(e?.message || e).slice(0, 200),
    };
  }
}

/**
 * Map an error (or any thrown value) to a coarse error code by keyword
 * matching on its lower-cased message.
 *
 * Checks run in order; the first match wins.
 *
 * NOTE(review): besides numeric status codes, this also matches the wording
 * transformers.js appears to use for HTTP failures ("Unauthorized access to
 * file", "Forbidden access to file", "Could not locate file") — confirm
 * against the pinned release.
 *
 * @param {*} e - caught value
 * @returns {"gated"|"not_found"|"timeout"|"network"|"fetch_failed"}
 */
function classifyTokenizerError(e) {
  const msg = String(e?.message || e).toLowerCase();
  const mentions = (...needles) => needles.some((needle) => msg.includes(needle));

  if (mentions("401", "403", "gated", "unauthorized", "forbidden")) return "gated";
  if (mentions("404", "not found", "could not locate")) return "not_found";
  if (mentions("timeout", "aborted")) return "timeout";
  if (mentions("network", "failed to fetch")) return "network";
  return "fetch_failed";
}

// =============================================================================
// Public: tokenize many models in parallel + compute ratios
// =============================================================================

/**
 * Tokenize `text` with every model in `modelIds` in parallel and stamp each
 * successful result with `chars_per_token`, `bytes_per_token` and
 * `ratio_vs_baseline`.
 *
 * @param {string[]} modelIds - tokenizer ids; must be a non-empty array
 * @param {string} text - text to tokenize
 * @param {number} [baseline_idx=0] - index into `modelIds` of the preferred
 *   baseline. If it is not an integer, is out of range, or that tokenizer
 *   failed, the first successful tokenizer is used instead.
 * @returns {Promise<object>} one of:
 *   - `{ code: "empty_input", results: [], baseline: null }`
 *   - `{ code: "all_failed", results, baseline: null }`
 *   - `{ code: "ok", results, baseline_id, baseline_count, chars, bytes }`
 *   where `chars` is `text.length` (UTF-16 code units) and `bytes` is the
 *   UTF-8 encoded length.
 */
export async function tokenizeAll(modelIds, text, baseline_idx = 0) {
  if (!Array.isArray(modelIds) || modelIds.length === 0 || typeof text !== "string") {
    return { code: "empty_input", results: [], baseline: null };
  }

  const results = await Promise.all(modelIds.map((id) => tokenizeWithModel(id, text)));
  const okResults = results.filter((r) => r.ok);
  if (okResults.length === 0) {
    return { code: "all_failed", results, baseline: null };
  }

  // Baseline: the user-specified index if it's OK, otherwise the first OK
  // tokenizer. Non-integer indices are rejected up front; indexing with them
  // would yield undefined and crash on `.ok`.
  const requested = Number.isInteger(baseline_idx) ? results[baseline_idx] : undefined;
  const baseline = requested?.ok ? requested : okResults[0];

  // Stamp ratio vs baseline + chars-per-token for each.
  const charCount = text.length;
  const byteCount = new TextEncoder().encode(text).length;
  const safeDivide = (numerator, denominator) =>
    denominator > 0 ? numerator / denominator : null;

  for (const r of results) {
    if (!r.ok) continue;
    r.chars_per_token = safeDivide(charCount, r.token_count);
    r.bytes_per_token = safeDivide(byteCount, r.token_count);
    r.ratio_vs_baseline = safeDivide(r.token_count, baseline.token_count);
  }

  return {
    code: "ok",
    results,
    baseline_id: baseline.modelId,
    baseline_count: baseline.token_count,
    chars: charCount,
    bytes: byteCount,
  };
}

// =============================================================================
// Language detection — Unicode block analysis (no external deps)
// =============================================================================
//
// Surfaced as context next to the token counts so users see "this text
// is 60% CJK, 40% Latin" — explains why one tokenizer is 3× another.

const UNICODE_BLOCKS = [
  // [name, regex_class]
  // "latin" is ASCII letters only. The class must be spelled A-Za-z: the
  // shorthand A-z also spans the punctuation between "Z" and "a"
  // ([ \ ] ^ _ `), which would be miscounted as Latin text.
  ["latin", /[A-Za-z]/g],
  // The "cjk" class also includes the Japanese kana ranges.
  ["cjk", /[぀-ゟ゠-ヿ一-鿿ヲ-ン]/g],
  ["korean", /[가-힯ᄀ-ᇿ]/g],
  ["arabic", /[؀-ۿݐ-ݿ]/g],
  ["cyrillic", /[Ѐ-ӿ]/g],
  ["devanagari", /[ऀ-ॿ]/g],
  ["thai", /[฀-๿]/g],
  ["greek", /[Ͱ-Ͽ]/g],
  ["hebrew", /[֐-׿]/g],
];

/**
 * Count how many characters of `text` fall into each script class listed in
 * `UNICODE_BLOCKS`.
 *
 * Counts need not sum to `total_chars`: whitespace, digits, punctuation and
 * any script without a class are not counted anywhere.
 *
 * @param {string} text
 * @returns {{total_chars: number, blocks: Object<string, number>, dominant: (string|null)}}
 *   `total_chars` is `text.length`; `dominant` is the class with the highest
 *   non-zero count (ties go to the class listed first), or `null` if nothing
 *   matched. Non-string or empty input yields
 *   `{ total_chars: 0, blocks: {}, dominant: null }`.
 */
export function detectLanguageBlocks(text) {
  if (typeof text !== "string" || !text) {
    return { total_chars: 0, blocks: {}, dominant: null };
  }

  const blocks = {};
  let dominant = null;
  let best = 0;
  for (const [name, re] of UNICODE_BLOCKS) {
    // String.prototype.match with a /g regex returns every match (or null).
    const count = text.match(re)?.length ?? 0;
    blocks[name] = count;
    // Strict ">" keeps the earliest class on ties.
    if (count > best) {
      best = count;
      dominant = name;
    }
  }

  return { total_chars: text.length, blocks, dominant };
}

// =============================================================================
// Preset tokenizer list — all open-weight (no HF auth required)
// =============================================================================
//
// Curated for breadth: one per major tokenizer family. For gated
// originals (Llama, Mistral, Gemma) the unsloth open-mirror is used —
// tokenizer.json is byte-identical to the original because quantization
// touches weights, not tokens (see spec-decode docs for the same
// argument).
/**
 * Tokenizer presets offered for comparison.
 *
 * Each entry is `{ id, label, family }`: `id` is the model id string, `label`
 * a short display name, and `family` a free-text description of the tokenizer
 * family. Rows below are written as [id, label, family] tuples and expanded
 * into objects.
 */
export const PRESET_TOKENIZERS = [
  ["Qwen/Qwen2.5-7B-Instruct", "Qwen2.5", "Qwen-BPE (152k vocab, CJK-aware)"],
  ["microsoft/Phi-3.5-mini-instruct", "Phi-3.5", "tiktoken-style BPE (32k)"],
  ["unsloth/Meta-Llama-3.1-8B-Instruct", "Llama-3.1", "Llama-3 BPE (128k)"],
  ["unsloth/gemma-2-9b-it", "Gemma-2", "SentencePiece (256k)"],
  ["Xenova/gpt-4", "GPT-4 (cl100k)", "OpenAI tiktoken cl100k_base"],
  ["Xenova/claude-tokenizer", "Claude (approx)", "Anthropic open approx (community port)"],
].map(([id, label, family]) => ({ id, label, family }));

/**
 * Sample passages keyed by language/kind, used to demonstrate how token
 * counts differ across scripts. Each value is a single string assembled from
 * the fragments listed for it.
 */
export const SAMPLE_TEXTS = {
  english: [
    "The quick brown fox jumps over the lazy dog. ",
    "She sells seashells by the seashore. Pack my box with five dozen liquor jugs.",
  ].join(""),
  chinese: [
    "敏捷的棕色狐狸跳过了懒狗。她在海边卖海贝壳。请用五打酒壶装满我的箱子。",
    "中文用字符表示词义,所以一段文字所需的字符数远少于英文。",
  ].join(""),
  arabic: [
    "الثعلب البني السريع يقفز فوق الكلب الكسول. ",
    "تبيع أصدافًا بحرية على شاطئ البحر. عبئ صندوقي بخمسين إبريقًا من الخمر.",
  ].join(""),
  mixed: "Hello world! 你好世界 مرحبا بالعالم Привет мир नमस्ते दुनिया",
  code: [
    "def quick_brown_fox(jumps_over: int) -> str:\n",
    " return f'The fox jumped {jumps_over} times'\n\n",
    "for i in range(10):\n print(quick_brown_fox(i))",
  ].join(""),
};