// taf-agent/js/tokenizer_tax.js
// Commit cd27f27 (karlexmarin): v0.8.7 Multilingual Tokenizer Tax Calculator — anti-bullshit pack #13
// Multilingual Tokenizer Tax Calculator (v0.8.7 anti-bullshit pack #13)
//
// Pain: "I bought 1M tokens of API credit for our English chatbot. Then
// we added Chinese support and the bill 3x'd overnight." The tokenizer
// tax is real and silently asymmetric across languages.
// tiktokenizer.vercel.app shows OpenAI's tokenizer; nothing public compares
// Llama vs Qwen vs Phi vs Gemma vs GPT for the SAME text in the SAME interface.
//
// This module loads HuggingFace's transformers.js (browser-side BPE
// runtime) lazily and tokenizes user-pasted text against a preset list
// of open-weight tokenizers. The output is REAL per-tokenizer token
// counts plus the cost asymmetry ratio (vs the user's chosen baseline).
//
// Pure logic + lazy CDN import. Codes/params only; main.js renders i18n.
// =============================================================================
// transformers.js lazy loader
// =============================================================================
//
// Pinned to an exact 3.x release (3.0.2) because the API surface used here
// (AutoTokenizer.from_pretrained, .encode) is stable within that major.
// Loaded from the jsdelivr CDN — same pattern used across HF Spaces.
// ~3 MB compressed bundle, cached aggressively after first load.
const TRANSFORMERS_CDN_URL = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/transformers.min.js";
// Memoized AutoTokenizer export once the CDN module has loaded.
let _autoTokenizer = null;
// In-flight import shared by concurrent callers; cleared again if it fails.
let _loadPromise = null;

/**
 * Lazily import transformers.js from the CDN and return its `AutoTokenizer`.
 *
 * The module is imported at most once per successful load: concurrent callers
 * share the same in-flight promise, and later callers get the memoized export.
 * If the import fails, the memo is cleared so the next call retries instead of
 * replaying the same rejection forever.
 *
 * @returns {Promise<object>} the `AutoTokenizer` export of the loaded module
 * @throws whatever the dynamic import rejects with, or an Error when the
 *   loaded module has no `AutoTokenizer` export
 */
async function loadTransformersJs() {
  if (_autoTokenizer) return _autoTokenizer;
  if (_loadPromise) return _loadPromise;
  _loadPromise = (async () => {
    try {
      const mod = await import(TRANSFORMERS_CDN_URL);
      if (!mod.AutoTokenizer) {
        throw new Error("transformers.js module has no AutoTokenizer export");
      }
      _autoTokenizer = mod.AutoTokenizer;
      return _autoTokenizer;
    } catch (err) {
      // Drop the rejected promise so a transient CDN/network failure does
      // not disable the loader until the page is reloaded.
      _loadPromise = null;
      throw err;
    }
  })();
  return _loadPromise;
}
// =============================================================================
// Per-tokenizer cache (avoid re-downloading tokenizer.json on every encode)
// =============================================================================
// modelId -> Promise resolving to the loaded tokenizer. Storing the promise
// (not the resolved value) lets concurrent requests share one download.
const _tokenizerCache = new Map();

/**
 * Load (or reuse) the tokenizer for `modelId`.
 *
 * Calls for the same model share one `from_pretrained` request, including
 * calls that arrive while it is still in flight. A failed load is evicted
 * from the cache so a later call can retry.
 *
 * @param {string} modelId - model repo id handed to `AutoTokenizer.from_pretrained`
 * @returns {Promise<object>} the tokenizer instance
 * @throws whatever loading transformers.js or `from_pretrained` rejects with
 */
async function loadTokenizer(modelId) {
  const cached = _tokenizerCache.get(modelId);
  if (cached) return cached;

  const pending = (async () => {
    const AT = await loadTransformersJs();
    return AT.from_pretrained(modelId);
  })();
  _tokenizerCache.set(modelId, pending);

  try {
    return await pending;
  } catch (err) {
    // Only evict our own entry; a newer attempt may already have replaced it.
    if (_tokenizerCache.get(modelId) === pending) {
      _tokenizerCache.delete(modelId);
    }
    throw err;
  }
}
// =============================================================================
// Public: tokenize one model
// =============================================================================
/**
 * Count the tokens a single tokenizer produces for `text`.
 *
 * Failures while loading the tokenizer or encoding are not thrown; they are
 * reported as an `ok: false` result carrying an error code and a truncated
 * raw message.
 *
 * @param {string} modelId - tokenizer/model id passed to `loadTokenizer`
 * @param {string} text - text to tokenize
 * @returns {Promise<object>} `{ ok: true, modelId, token_count }` on success,
 *   `{ ok: false, modelId, error: "invalid_input" }` for non-string text, or
 *   `{ ok: false, modelId, error, raw }` when loading/encoding fails
 */
export async function tokenizeWithModel(modelId, text) {
  if (typeof text !== "string") {
    return { ok: false, modelId, error: "invalid_input" };
  }

  let tokenCount;
  try {
    const tokenizer = await loadTokenizer(modelId);
    // Only the number of ids matters here, so any array-like result works.
    const encoded = await tokenizer.encode(text);
    tokenCount = encoded.length;
  } catch (err) {
    const error = classifyTokenizerError(err);
    // Keep the raw message short; it is diagnostic context only.
    const raw = String(err?.message || err).slice(0, 200);
    return { ok: false, modelId, error, raw };
  }

  return { ok: true, modelId, token_count: tokenCount };
}
/**
 * Map a tokenizer load/encode failure onto a coarse error code.
 *
 * Classification is by case-insensitive substring match on the error message.
 * Besides numeric status codes it recognises the status phrases
 * ("unauthorized", "forbidden", "could not locate") that transformers.js is
 * understood to use in its hub error messages — NOTE(review): confirm the
 * exact wording against the pinned transformers.js version.
 *
 * Checks run in order, so a message matching several groups gets the first.
 *
 * @param {unknown} e - thrown value (Error, string, or anything else)
 * @returns {"gated"|"not_found"|"timeout"|"network"|"fetch_failed"} error code
 */
function classifyTokenizerError(e) {
  const msg = String(e?.message || e).toLowerCase();
  const mentions = (...needles) => needles.some((needle) => msg.includes(needle));

  if (mentions("401", "403", "gated", "unauthorized", "forbidden")) return "gated";
  if (mentions("404", "not found", "could not locate")) return "not_found";
  if (mentions("timeout", "timed out", "aborted")) return "timeout";
  if (mentions("network", "failed to fetch")) return "network";
  return "fetch_failed";
}
// =============================================================================
// Public: tokenize many models in parallel + compute ratios
// =============================================================================
/**
 * Tokenize `text` with every model in `modelIds` in parallel and annotate
 * each successful result with density and baseline-relative metrics.
 *
 * Each OK result gains `chars_per_token`, `bytes_per_token` and
 * `ratio_vs_baseline` (each `null` when its divisor is zero). Failed results
 * are passed through untouched.
 *
 * @param {string[]} modelIds - tokenizer/model ids to compare
 * @param {string} text - text to tokenize
 * @param {number} [baseline_idx=0] - index into `modelIds` of the preferred
 *   baseline; when it is out of range, not a usable index, or points at a
 *   failed result, the first OK result is used instead
 * @returns {Promise<object>}
 *   `{ code: "empty_input", results: [], baseline: null }` for bad arguments,
 *   `{ code: "all_failed", results, baseline: null }` when nothing tokenized,
 *   otherwise `{ code: "ok", results, baseline_id, baseline_count, chars, bytes }`
 */
export async function tokenizeAll(modelIds, text, baseline_idx = 0) {
  if (!Array.isArray(modelIds) || modelIds.length === 0 || typeof text !== "string") {
    return { code: "empty_input", results: [], baseline: null };
  }

  const results = await Promise.all(
    modelIds.map((id) => tokenizeWithModel(id, text)),
  );

  const firstOk = results.find((r) => r.ok);
  if (!firstOk) {
    return { code: "all_failed", results, baseline: null };
  }

  // Baseline: the user-specified index if it names an OK result, else the
  // first OK tokenizer. The lookup is guarded because values such as `null`
  // or `0.5` pass the range test yet index nothing in the array.
  const inRange = baseline_idx >= 0 && baseline_idx < results.length;
  const requested = inRange ? results[baseline_idx] : undefined;
  const baseline = requested?.ok ? requested : firstOk;

  // `chars` counts UTF-16 code units (String.prototype.length); `bytes` is
  // the UTF-8 encoded size.
  const charCount = text.length;
  const byteCount = new TextEncoder().encode(text).length;

  for (const r of results) {
    if (!r.ok) continue;
    const hasTokens = r.token_count > 0;
    r.chars_per_token = hasTokens ? charCount / r.token_count : null;
    r.bytes_per_token = hasTokens ? byteCount / r.token_count : null;
    r.ratio_vs_baseline = baseline.token_count > 0
      ? r.token_count / baseline.token_count
      : null;
  }

  return {
    code: "ok",
    results,
    baseline_id: baseline.modelId,
    baseline_count: baseline.token_count,
    chars: charCount,
    bytes: byteCount,
  };
}
// =============================================================================
// Language detection — Unicode block analysis (no external deps)
// =============================================================================
//
// Surfaced as context next to the token counts so users see "this text
// is 60% CJK, 40% Latin" — explains why one tokenizer is 3× another.
// Script buckets used by detectLanguageBlocks: [name, regex_class].
// Every pattern is global so String.prototype.match returns all matches.
const UNICODE_BLOCKS = [
  // Latin uses explicit A-Z / a-z ranges: a single A-z range would also match
  // the punctuation between them ([ \ ] ^ _ `). Accented letters from Latin-1
  // Supplement and Latin Extended-A/B are included; U+00D7 (×) and
  // U+00F7 (÷) are skipped because they are not letters.
  ["latin", /[A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F]/g],
  ["cjk", /[぀-ゟ゠-ヿ一-鿿ヲ-ン]/g],
  ["korean", /[가-힯ᄀ-ᇿ]/g],
  ["arabic", /[؀-ۿݐ-ݿ]/g],
  ["cyrillic", /[Ѐ-ӿ]/g],
  ["devanagari", /[ऀ-ॿ]/g],
  ["thai", /[฀-๿]/g],
  ["greek", /[Ͱ-Ͽ]/g],
  ["hebrew", /[֐-׿]/g],
];
/**
 * Count how many characters of `text` fall into each script bucket listed in
 * UNICODE_BLOCKS and report the bucket with the most matches.
 *
 * @param {string} text - text to analyse
 * @returns {{ total_chars: number, blocks: Object<string, number>, dominant: (string|null) }}
 *   `total_chars` is `text.length`; `blocks` holds one count per bucket (in
 *   table order); `dominant` is the bucket with the highest non-zero count
 *   (the earliest one in table order on ties), or `null` if nothing matched.
 *   Non-string or empty input yields the all-empty result.
 */
export function detectLanguageBlocks(text) {
  const usable = typeof text === "string" && text;
  if (!usable) {
    return { total_chars: 0, blocks: {}, dominant: null };
  }

  const blocks = {};
  let dominant = null;
  let bestCount = 0;

  for (const [name, re] of UNICODE_BLOCKS) {
    // The patterns are shared module-level regexes; start each scan from 0.
    re.lastIndex = 0;
    const hits = text.match(re)?.length ?? 0;
    blocks[name] = hits;
    // Strictly greater keeps the earliest bucket when counts tie.
    if (hits > bestCount) {
      bestCount = hits;
      dominant = name;
    }
  }

  return { total_chars: text.length, blocks, dominant };
}
// =============================================================================
// Preset tokenizer list — all open-weight (no HF auth required)
// =============================================================================
//
// Curated for breadth: one per major tokenizer family. For gated
// originals (e.g. Llama, Gemma) the unsloth open mirror is used — its
// tokenizer.json is expected to match the original, because re-hosting
// or quantizing a model touches weights, not tokens (see spec-decode
// docs for the same argument).
/**
 * Preset tokenizers offered for comparison.
 *
 * Each entry is `{ id, label, family }`: `id` is the model repo id to load
 * the tokenizer from, `label` a short display name, and `family` a
 * human-readable description of the tokenizer family.
 */
export const PRESET_TOKENIZERS = [
  // [id, label, family]
  ["Qwen/Qwen2.5-7B-Instruct", "Qwen2.5", "Qwen-BPE (152k vocab, CJK-aware)"],
  ["microsoft/Phi-3.5-mini-instruct", "Phi-3.5", "tiktoken-style BPE (32k)"],
  ["unsloth/Meta-Llama-3.1-8B-Instruct", "Llama-3.1", "Llama-3 BPE (128k)"],
  ["unsloth/gemma-2-9b-it", "Gemma-2", "SentencePiece (256k)"],
  ["Xenova/gpt-4", "GPT-4 (cl100k)", "OpenAI tiktoken cl100k_base"],
  ["Xenova/claude-tokenizer", "Claude (approx)", "Anthropic open approx (community port)"],
].map(([id, label, family]) => ({ id, label, family }));
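// Sample texts that demonstrate cost asymmetry. The English, Chinese and
// Arabic samples are rough translations of the same sentences (the Chinese
// one carries an extra closing sentence), so the user sees the per-language
// tax directly; `mixed` and `code` cover multi-script text and source code.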
/**
 * Built-in sample inputs, keyed by sample name. Each value is a single
 * string assembled from its fragments.
 */
export const SAMPLE_TEXTS = {
  // English pangram-style sentences.
  english: [
    "The quick brown fox jumps over the lazy dog. ",
    "She sells seashells by the seashore. Pack my box with five dozen liquor jugs.",
  ].join(""),
  // Chinese sample.
  chinese: [
    "敏捷的棕色狐狸跳过了懒狗。她在海边卖海贝壳。请用五打酒壶装满我的箱子。",
    "中文用字符表示词义,所以一段文字所需的字符数远少于英文。",
  ].join(""),
  // Arabic sample.
  arabic: [
    "الثعلب البني السريع يقفز فوق الكلب الكسول. ",
    "تبيع أصدافًا بحرية على شاطئ البحر. عبئ صندوقي بخمسين إبريقًا من الخمر.",
  ].join(""),
  // Several scripts in one line.
  mixed: "Hello world! 你好世界 مرحبا بالعالم Привет мир नमस्ते दुनिया",
  // A small Python snippet.
  code: [
    "def quick_brown_fox(jumps_over: int) -> str:\n",
    " return f'The fox jumped {jumps_over} times'\n\n",
    "for i in range(10):\n print(quick_brown_fox(i))",
  ].join(""),
};