Spaces:
Running
Running
| // Multilingual Tokenizer Tax Calculator (v0.8.7 anti-bullshit pack #13) | |
| // | |
| // Pain: "I bought 1M tokens of API credit for our English chatbot. Then | |
| // we added Chinese support and the bill 3x'd overnight." The tokenizer | |
// tax is real and silently asymmetric across languages.
// tiktokenizer.vercel.app shows OpenAI's tokenizer; nothing public compares Llama vs
| // Qwen vs Phi vs Gemma vs GPT for the SAME text in the SAME interface. | |
| // | |
| // This module loads HuggingFace's transformers.js (browser-side BPE | |
| // runtime) lazily and tokenizes user-pasted text against a preset list | |
| // of open-weight tokenizers. The output is REAL per-tokenizer token | |
| // counts plus the cost asymmetry ratio (vs the user's chosen baseline). | |
| // | |
| // Pure logic + lazy CDN import. Codes/params only; main.js renders i18n. | |
| // ============================================================================= | |
| // transformers.js lazy loader | |
| // ============================================================================= | |
| // | |
| // Pinned 3.x major because the API surface (AutoTokenizer.from_pretrained, | |
| // .encode) is stable. Loaded from jsdelivr CDN — same pattern used | |
| // across HF Spaces. ~3 MB compressed bundle, cached aggressively after | |
| // first load. | |
const TRANSFORMERS_CDN_URL = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/transformers.min.js";

// AutoTokenizer class from the loaded bundle; null until the import succeeds.
let _autoTokenizer = null;
// Pending (or successfully settled) import promise; reset to null on failure
// so that a later call can retry.
let _loadPromise = null;

/**
 * Lazily import transformers.js from the CDN and return its AutoTokenizer.
 *
 * Concurrent callers share one import. A failed import is NOT cached: the
 * shared promise is cleared on rejection, so a transient CDN/network error
 * does not permanently disable tokenization for the rest of the session.
 *
 * @returns {Promise<object>} The bundle's `AutoTokenizer` export.
 * @throws {Error} If the import fails or the bundle has no `AutoTokenizer`.
 */
async function loadTransformersJs() {
  if (_autoTokenizer) return _autoTokenizer;
  if (_loadPromise) return _loadPromise;

  const importAutoTokenizer = async () => {
    const mod = await import(TRANSFORMERS_CDN_URL);
    if (!mod.AutoTokenizer) {
      throw new Error("transformers.js bundle does not export AutoTokenizer");
    }
    _autoTokenizer = mod.AutoTokenizer;
    return _autoTokenizer;
  };

  // The catch handler always runs after the assignment below, so clearing
  // the slot here cannot be overwritten by the assignment itself.
  _loadPromise = importAutoTokenizer().catch((err) => {
    _loadPromise = null;
    throw err;
  });
  return _loadPromise;
}
| // ============================================================================= | |
| // Per-tokenizer cache (avoid re-downloading tokenizer.json on every encode) | |
| // ============================================================================= | |
// modelId -> loaded tokenizer instance.
const _tokenizerCache = new Map();
// modelId -> promise for a load that is still in progress.
const _tokenizerInflight = new Map();

/**
 * Load (and memoize) the tokenizer for a model id.
 *
 * Concurrent calls for the same id share a single `from_pretrained` call
 * instead of each starting their own download. Failed loads are not cached,
 * so a later call retries.
 *
 * @param {string} modelId - Model identifier passed to `from_pretrained`.
 * @returns {Promise<object>} The tokenizer returned by `from_pretrained`.
 * @throws Whatever `loadTransformersJs` or `from_pretrained` rejects with.
 */
async function loadTokenizer(modelId) {
  if (_tokenizerCache.has(modelId)) return _tokenizerCache.get(modelId);

  let pending = _tokenizerInflight.get(modelId);
  if (!pending) {
    pending = (async () => {
      const AT = await loadTransformersJs();
      const tok = await AT.from_pretrained(modelId);
      _tokenizerCache.set(modelId, tok);
      return tok;
    })().finally(() => {
      // Settled either way: success lives in _tokenizerCache, failure is retried.
      _tokenizerInflight.delete(modelId);
    });
    _tokenizerInflight.set(modelId, pending);
  }
  return pending;
}
| // ============================================================================= | |
| // Public: tokenize one model | |
| // ============================================================================= | |
/**
 * Count the tokens one tokenizer produces for a piece of text.
 *
 * Never rejects: every failure is reported as a result object.
 *
 * @param {string} modelId - Tokenizer/model identifier handed to loadTokenizer.
 * @param {string} text - Text to encode.
 * @returns {Promise<object>} On success `{ ok: true, modelId, token_count }`.
 *   On failure `{ ok: false, modelId, error }`, where `error` is
 *   "invalid_input" for non-string text, otherwise the code from
 *   classifyTokenizerError plus `raw` (first 200 chars of the message).
 */
export async function tokenizeWithModel(modelId, text) {
  if (typeof text !== "string") {
    return { ok: false, modelId, error: "invalid_input" };
  }

  let tokenCount;
  try {
    const tokenizer = await loadTokenizer(modelId);
    const encoded = await tokenizer.encode(text);
    // Only the number of ids matters; read .length inside the try so a
    // malformed encode() result is reported like any other failure.
    tokenCount = encoded.length;
  } catch (err) {
    const error = classifyTokenizerError(err);
    const raw = String(err?.message || err).slice(0, 200);
    return { ok: false, modelId, error, raw };
  }

  return { ok: true, modelId, token_count: tokenCount };
}
/**
 * Map a tokenizer load/encode failure to a short error code.
 *
 * Matching is a case-insensitive substring search on the error message.
 * Besides numeric status codes it recognises the worded forms
 * ("unauthorized", "forbidden", "could not locate"), because an error
 * message may describe the HTTP failure without including the status number.
 *
 * @param {unknown} e - Thrown value; its `message` is used when present,
 *   otherwise the value itself is stringified.
 * @returns {"gated"|"not_found"|"timeout"|"network"|"fetch_failed"}
 *   First matching code in that priority order; "fetch_failed" if none match.
 */
function classifyTokenizerError(e) {
  // Order matters: the first rule with a matching keyword wins.
  const rules = [
    ["gated", ["401", "403", "gated", "unauthorized", "forbidden"]],
    ["not_found", ["404", "not found", "could not locate"]],
    ["timeout", ["timeout", "aborted"]],
    ["network", ["network", "failed to fetch"]],
  ];
  const msg = String(e?.message || e).toLowerCase();
  for (const [code, keywords] of rules) {
    if (keywords.some((keyword) => msg.includes(keyword))) return code;
  }
  return "fetch_failed";
}
| // ============================================================================= | |
| // Public: tokenize many models in parallel + compute ratios | |
| // ============================================================================= | |
/**
 * Tokenize one text with several tokenizers in parallel and stamp each
 * successful result with per-token density and a ratio against a baseline.
 *
 * @param {string[]} modelIds - Tokenizer ids; each is passed to tokenizeWithModel.
 * @param {string} text - Text to tokenize.
 * @param {number|string} [baseline_idx=0] - Index into `modelIds` of the
 *   preferred baseline. Numeric strings are accepted. Anything that is not a
 *   valid index of a successful result (null, fractions, out of range, a
 *   failed tokenizer) falls back to the first successful result.
 * @returns {Promise<object>} One of:
 *   - `{ code: "empty_input", results: [], baseline: null }` for bad arguments;
 *   - `{ code: "all_failed", results, baseline: null }` if nothing succeeded;
 *   - `{ code: "ok", results, baseline_id, baseline_count, chars, bytes }`,
 *     where every ok result gains `chars_per_token`, `bytes_per_token` and
 *     `ratio_vs_baseline` (each null when its divisor is 0).
 */
export async function tokenizeAll(modelIds, text, baseline_idx = 0) {
  if (!Array.isArray(modelIds) || modelIds.length === 0 || typeof text !== "string") {
    return { code: "empty_input", results: [], baseline: null };
  }

  const results = await Promise.all(
    modelIds.map((id) => tokenizeWithModel(id, text))
  );
  const okResults = results.filter((r) => r.ok);
  if (okResults.length === 0) {
    return { code: "all_failed", results, baseline: null };
  }

  // Normalise the requested index. Numeric strings (e.g. a <select> value)
  // keep working; everything else must already be an integer. Without this,
  // values such as null or 0.5 pass a plain range check and then crash on
  // `results[idx].ok`.
  const requestedIdx =
    typeof baseline_idx === "string" && baseline_idx.trim() !== ""
      ? Number(baseline_idx)
      : baseline_idx;
  const requested =
    Number.isInteger(requestedIdx) && requestedIdx >= 0
      ? results[requestedIdx]
      : undefined;
  const baseline = requested?.ok ? requested : okResults[0];

  // `chars` is the UTF-16 code unit count (String.prototype.length);
  // `bytes` is the UTF-8 encoded size.
  const charCount = text.length;
  const byteCount = new TextEncoder().encode(text).length;

  for (const r of results) {
    if (!r.ok) continue;
    r.chars_per_token = r.token_count > 0 ? charCount / r.token_count : null;
    r.bytes_per_token = r.token_count > 0 ? byteCount / r.token_count : null;
    r.ratio_vs_baseline = baseline.token_count > 0
      ? r.token_count / baseline.token_count
      : null;
  }

  return {
    code: "ok",
    results,
    baseline_id: baseline.modelId,
    baseline_count: baseline.token_count,
    chars: charCount,
    bytes: byteCount,
  };
}
| // ============================================================================= | |
| // Language detection — Unicode block analysis (no external deps) | |
| // ============================================================================= | |
| // | |
| // Surfaced as context next to the token counts so users see "this text | |
| // is 60% CJK, 40% Latin" — explains why one tokenizer is 3× another. | |
// [block name, global regex matching a single UTF-16 code unit in that block].
// Ranges are spelled with \u escapes on purpose: several block boundaries are
// unassigned or invisible code points, which do not survive copy/paste or
// re-encoding when written as literal characters.
const UNICODE_BLOCKS = [
  // A-Z and a-z only. A single "A-z" range would also match [ \ ] ^ _ `.
  ["latin", /[A-Za-z]/g],
  // Hiragana, Katakana, CJK Unified Ideographs, half-width Katakana.
  ["cjk", /[\u3040-\u309f\u30a0-\u30ff\u4e00-\u9fff\uff66-\uff9d]/g],
  // Hangul Syllables, Hangul Jamo.
  ["korean", /[\uac00-\ud7af\u1100-\u11ff]/g],
  // Arabic, Arabic Supplement.
  ["arabic", /[\u0600-\u06ff\u0750-\u077f]/g],
  ["cyrillic", /[\u0400-\u04ff]/g],
  ["devanagari", /[\u0900-\u097f]/g],
  ["thai", /[\u0e00-\u0e7f]/g],
  ["greek", /[\u0370-\u03ff]/g],
  ["hebrew", /[\u0590-\u05ff]/g],
];

/**
 * Count how many UTF-16 code units of `text` fall in each known script block.
 *
 * @param {string} text - Text to analyse.
 * @returns {{ total_chars: number, blocks: Object<string, number>, dominant: (string|null) }}
 *   `total_chars` is `text.length`; `blocks` maps every block name to its
 *   count; `dominant` is the block with the highest non-zero count (the
 *   earliest table entry wins ties), or null if nothing matched. Empty or
 *   non-string input yields `{ total_chars: 0, blocks: {}, dominant: null }`.
 */
export function detectLanguageBlocks(text) {
  if (typeof text !== "string" || !text) {
    return { total_chars: 0, blocks: {}, dominant: null };
  }

  const blocks = {};
  let dominant = null;
  let dominantCount = 0;
  for (const [name, re] of UNICODE_BLOCKS) {
    // String.prototype.match with a /g regex always scans from index 0,
    // so sharing the regex objects between calls is safe.
    const count = text.match(re)?.length ?? 0;
    blocks[name] = count;
    // Strict ">" keeps the earliest block on ties.
    if (count > dominantCount) {
      dominant = name;
      dominantCount = count;
    }
  }

  return { total_chars: text.length, blocks, dominant };
}
| // ============================================================================= | |
| // Preset tokenizer list — all open-weight (no HF auth required) | |
| // ============================================================================= | |
| // | |
| // Curated for breadth: one per major tokenizer family. For gated | |
| // originals (Llama, Mistral, Gemma) the unsloth open-mirror is used — | |
| // tokenizer.json is byte-identical to the original because quantization | |
| // touches weights, not tokens (see spec-decode docs for the same | |
| // argument). | |
// Preset tokenizers offered in the UI, one entry per tokenizer family.
// Each row is [id, label, family]; the rows are expanded into
// { id, label, family } objects in the same order.
export const PRESET_TOKENIZERS = [
  ["Qwen/Qwen2.5-7B-Instruct", "Qwen2.5", "Qwen-BPE (152k vocab, CJK-aware)"],
  ["microsoft/Phi-3.5-mini-instruct", "Phi-3.5", "tiktoken-style BPE (32k)"],
  ["unsloth/Meta-Llama-3.1-8B-Instruct", "Llama-3.1", "Llama-3 BPE (128k)"],
  ["unsloth/gemma-2-9b-it", "Gemma-2", "SentencePiece (256k)"],
  ["Xenova/gpt-4", "GPT-4 (cl100k)", "OpenAI tiktoken cl100k_base"],
  ["Xenova/claude-tokenizer", "Claude (approx)", "Anthropic open approx (community port)"],
].map(([id, label, family]) => ({ id, label, family }));
// Sample texts that demonstrate cost asymmetry — roughly parallel content
// across languages (not exact translations of one another) so the user sees
// the per-language tax directly.
// Built-in sample inputs keyed by name. Each value is assembled from its
// fragments with join("") so long strings stay readable in source.
export const SAMPLE_TEXTS = {
  english: [
    "The quick brown fox jumps over the lazy dog. ",
    "She sells seashells by the seashore. Pack my box with five dozen liquor jugs.",
  ].join(""),
  chinese: [
    "敏捷的棕色狐狸跳过了懒狗。她在海边卖海贝壳。请用五打酒壶装满我的箱子。",
    "中文用字符表示词义,所以一段文字所需的字符数远少于英文。",
  ].join(""),
  arabic: [
    "الثعلب البني السريع يقفز فوق الكلب الكسول. ",
    "تبيع أصدافًا بحرية على شاطئ البحر. عبئ صندوقي بخمسين إبريقًا من الخمر.",
  ].join(""),
  mixed: "Hello world! 你好世界 مرحبا بالعالم Привет мир नमस्ते दुनिया",
  code: [
    "def quick_brown_fox(jumps_over: int) -> str:\n",
    " return f'The fox jumped {jumps_over} times'\n\n",
    "for i in range(10):\n print(quick_brown_fox(i))",
  ].join(""),
};