Spaces:

karlexmarin
/

taf-agent

Running

App Files Files Community

taf-agent / js /tokenizer_tax.js

karlexmarin's picture

v0.8.7 Multilingual Tokenizer Tax Calculator — anti-bullshit pack #13

cd27f27 18 days ago

history blame contribute delete

8.54 kB

	// Multilingual Tokenizer Tax Calculator (v0.8.7 anti-bullshit pack #13)
	//
	// Pain: "I bought 1M tokens of API credit for our English chatbot. Then
	// we added Chinese support and the bill 3x'd overnight." The tokenizer
	// tax is real and silently asymmetric across languages.
	// tiktokenizer.vercel.app shows OpenAI's tokenizer; nothing public
	// compares Llama vs Qwen vs Phi vs Gemma vs GPT for the SAME text in the
	// SAME interface.
	//
	// This module loads HuggingFace's transformers.js (browser-side BPE
	// runtime) lazily and tokenizes user-pasted text against a preset list
	// of open-weight tokenizers. The output is REAL per-tokenizer token
	// counts plus the cost asymmetry ratio (vs the user's chosen baseline).
	//
	// Pure logic + lazy CDN import. Codes/params only; main.js renders i18n.

	// =============================================================================
	// transformers.js lazy loader
	// =============================================================================
	//
	// Pinned to an exact 3.x release (3.0.2) because the API surface
	// (AutoTokenizer.from_pretrained, .encode) is stable within that major.
	// Loaded from the jsDelivr CDN — same pattern used across HF Spaces.
	// ~3 MB compressed bundle, cached aggressively after first load.

	const TRANSFORMERS_CDN_URL = "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.2/dist/transformers.min.js";

	// AutoTokenizer export of the bundle once it has loaded; null until then.
	let _autoTokenizer = null;
	// In-flight import shared by concurrent callers; null when no load is pending.
	let _loadPromise = null;

	/**
	 * Lazily import the transformers.js bundle and return its `AutoTokenizer`.
	 *
	 * Concurrent callers share a single in-flight import. A failed import is
	 * NOT cached: the pending promise is cleared so a later call can retry
	 * (e.g. after a transient network error) instead of receiving the same
	 * rejection forever.
	 *
	 * @returns {Promise<object>} the bundle's `AutoTokenizer` export.
	 * @throws rejects with the import error, or with an Error when the bundle
	 *   has no `AutoTokenizer` export.
	 */
	async function loadTransformersJs() {
	  if (_autoTokenizer) return _autoTokenizer;
	  if (_loadPromise) return _loadPromise;
	  _loadPromise = (async () => {
	    try {
	      const mod = await import(TRANSFORMERS_CDN_URL);
	      if (!mod.AutoTokenizer) {
	        throw new Error("transformers.js bundle has no AutoTokenizer export");
	      }
	      _autoTokenizer = mod.AutoTokenizer;
	      return _autoTokenizer;
	    } catch (err) {
	      // Drop the rejected promise so the next call starts a fresh import.
	      _loadPromise = null;
	      throw err;
	    }
	  })();
	  return _loadPromise;
	}

	// =============================================================================
	// Per-tokenizer cache (avoid re-downloading tokenizer.json on every encode)
	// =============================================================================

	// modelId -> Promise of the loaded tokenizer. Promises (not resolved values)
	// are stored so that concurrent requests for one id share a single download.
	const _tokenizerCache = new Map();

	/**
	 * Load (or reuse) the tokenizer for `modelId`.
	 *
	 * The in-flight promise is cached immediately, so parallel calls for the
	 * same id trigger only one `from_pretrained` call. A failed load is evicted
	 * so a later call can retry.
	 *
	 * @param {string} modelId - id handed to `AutoTokenizer.from_pretrained`.
	 * @returns {Promise<object>} the tokenizer instance.
	 * @throws rejects with whatever the loader or `from_pretrained` rejects with.
	 */
	async function loadTokenizer(modelId) {
	  const cached = _tokenizerCache.get(modelId);
	  if (cached !== undefined) return cached;

	  const pending = (async () => {
	    const AT = await loadTransformersJs();
	    return AT.from_pretrained(modelId);
	  })();
	  _tokenizerCache.set(modelId, pending);

	  try {
	    return await pending;
	  } catch (err) {
	    // Evict only our own entry; a newer attempt may already have replaced it.
	    if (_tokenizerCache.get(modelId) === pending) {
	      _tokenizerCache.delete(modelId);
	    }
	    throw err;
	  }
	}

	// =============================================================================
	// Public: tokenize one model
	// =============================================================================

	/**
	 * Count the tokens `text` produces under one tokenizer.
	 *
	 * Load and encode failures do not reject: they are reported as an
	 * `{ ok: false, ... }` result carrying an error code, so callers can show
	 * partial results.
	 *
	 * @param {string} modelId - tokenizer id passed to `loadTokenizer`.
	 * @param {string} text - text to tokenize.
	 * @returns {Promise<object>} one of
	 *   `{ ok: true, modelId, token_count }`,
	 *   `{ ok: false, modelId, error: "invalid_input" }` when `text` is not a string, or
	 *   `{ ok: false, modelId, error, raw }` where `error` comes from
	 *   `classifyTokenizerError` and `raw` is the message cut to 200 characters.
	 */
	export async function tokenizeWithModel(modelId, text) {
	  if (typeof text !== "string") {
	    return { ok: false, modelId, error: "invalid_input" };
	  }
	  try {
	    const tok = await loadTokenizer(modelId);
	    // transformers.js returns Int32Array | number[]. Use .length for count.
	    const ids = await tok.encode(text);
	    return { ok: true, modelId, token_count: ids.length };
	  } catch (e) {
	    return {
	      ok: false,
	      modelId,
	      error: classifyTokenizerError(e),
	      // Non-Error throwables (strings, null) are stringified as-is.
	      raw: String(e?.message || e).slice(0, 200),
	    };
	  }
	}

	/**
	 * Map a tokenizer load/encode failure to a short error code by looking for
	 * known substrings in its lower-cased message.
	 *
	 * @param {*} e - thrown value; an Error's `message` is used when present,
	 *   otherwise the value itself is stringified.
	 * @returns {string} "gated", "not_found", "timeout", "network", or
	 *   "fetch_failed" when nothing matches.
	 */
	function classifyTokenizerError(e) {
	  const msg = String(e?.message || e).toLowerCase();
	  // Order matters: the first rule with a matching needle wins.
	  const rules = [
	    ["gated", ["401", "403", "gated"]],
	    ["not_found", ["404", "not found"]],
	    ["timeout", ["timeout", "aborted"]],
	    ["network", ["network", "failed to fetch"]],
	  ];
	  for (const [code, needles] of rules) {
	    if (needles.some((needle) => msg.includes(needle))) return code;
	  }
	  return "fetch_failed";
	}

	// =============================================================================
	// Public: tokenize many models in parallel + compute ratios
	// =============================================================================

	/**
	 * Tokenize `text` with every model in `modelIds` in parallel and stamp each
	 * successful result with density and cost-ratio figures.
	 *
	 * @param {string[]} modelIds - tokenizer ids; each is passed to `tokenizeWithModel`.
	 * @param {string} text - text to tokenize.
	 * @param {number|string} [baseline_idx=0] - index into `modelIds` of the
	 *   preferred baseline. If it does not point at a successful result
	 *   (out of range, fractional, not numeric, or that tokenizer failed) the
	 *   first successful result is used instead.
	 * @returns {Promise<object>}
	 *   `{ code: "empty_input", results: [], baseline: null }` for an empty or
	 *   non-array `modelIds` or a non-string `text`;
	 *   `{ code: "all_failed", results, baseline: null }` when no tokenizer succeeded;
	 *   otherwise `{ code: "ok", results, baseline_id, baseline_count, chars, bytes }`.
	 *   Each successful entry of `results` gains `chars_per_token`,
	 *   `bytes_per_token` and `ratio_vs_baseline` (each null when its
	 *   denominator is zero).
	 */
	export async function tokenizeAll(modelIds, text, baseline_idx = 0) {
	  if (!Array.isArray(modelIds) || modelIds.length === 0 || typeof text !== "string") {
	    return { code: "empty_input", results: [], baseline: null };
	  }
	  const results = await Promise.all(
	    modelIds.map((id) => tokenizeWithModel(id, text))
	  );
	  const firstOk = results.find((r) => r.ok);
	  if (!firstOk) {
	    return { code: "all_failed", results, baseline: null };
	  }
	  // Baseline: the user-specified index if it names an OK result, else the
	  // first OK tokenizer. Indexing with a coerced number yields undefined for
	  // negative, fractional, NaN or out-of-range values instead of throwing.
	  const requested = results[Number(baseline_idx)];
	  const baseline = requested?.ok ? requested : firstOk;
	  const baselineCount = baseline.token_count;

	  // chars = UTF-16 code units (String.length); bytes = UTF-8 encoded length.
	  const charCount = text.length;
	  const byteCount = new TextEncoder().encode(text).length;
	  for (const r of results) {
	    if (!r.ok) continue;
	    const hasTokens = r.token_count > 0;
	    r.chars_per_token = hasTokens ? charCount / r.token_count : null;
	    r.bytes_per_token = hasTokens ? byteCount / r.token_count : null;
	    r.ratio_vs_baseline = baselineCount > 0 ? r.token_count / baselineCount : null;
	  }
	  return {
	    code: "ok",
	    results,
	    baseline_id: baseline.modelId,
	    baseline_count: baselineCount,
	    chars: charCount,
	    bytes: byteCount,
	  };
	}

	// =============================================================================
	// Language detection — Unicode block analysis (no external deps)
	// =============================================================================
	//
	// Surfaced as context next to the token counts so users see "this text
	// is 60% CJK, 40% Latin" — explains why one tokenizer is 3× another.

	// Ordered [name, regex_class] pairs. Every regex is global so that
	// String.prototype.match returns all matching characters. Ranges are
	// written as \u escapes so the table survives re-encoding of this file.
	const UNICODE_BLOCKS = [
	  // ASCII letters plus accented Latin-1 Supplement and Latin Extended-A/B
	  // letters. [A-Za-z] rather than [A-z]: the latter also matches
	  // [ \ ] ^ _ and backtick. U+00D7 (multiplication sign) and U+00F7
	  // (division sign) are skipped.
	  ["latin", /[A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u024F]/g],
	  // Hiragana, Katakana, CJK Unified Ideographs, halfwidth Katakana.
	  ["cjk", /[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\uFF66-\uFF9D]/g],
	  // Hangul Syllables, Hangul Jamo.
	  ["korean", /[\uAC00-\uD7AF\u1100-\u11FF]/g],
	  // Arabic, Arabic Supplement.
	  ["arabic", /[\u0600-\u06FF\u0750-\u077F]/g],
	  ["cyrillic", /[\u0400-\u04FF]/g],
	  ["devanagari", /[\u0900-\u097F]/g],
	  ["thai", /[\u0E00-\u0E7F]/g],
	  // Greek and Coptic.
	  ["greek", /[\u0370-\u03FF]/g],
	  ["hebrew", /[\u0590-\u05FF]/g],
	];

	/**
	 * Count how many characters of `text` fall into each entry of
	 * `UNICODE_BLOCKS` and report the block with the most matches.
	 *
	 * @param {string} text - text to analyse.
	 * @returns {{ total_chars: number, blocks: Object<string, number>, dominant: (string|null) }}
	 *   `total_chars` is `text.length` (UTF-16 code units). `blocks` maps every
	 *   block name to its match count. `dominant` is the block with the highest
	 *   non-zero count (the earliest-listed block wins ties) or null when
	 *   nothing matched. A non-string or empty `text` yields
	 *   `{ total_chars: 0, blocks: {}, dominant: null }`.
	 */
	export function detectLanguageBlocks(text) {
	  if (typeof text !== "string" || text.length === 0) {
	    return { total_chars: 0, blocks: {}, dominant: null };
	  }
	  const blocks = {};
	  let dominant = null;
	  let dominantCount = 0;
	  for (const [name, re] of UNICODE_BLOCKS) {
	    // match() with a global regex starts from index 0 on every call, so no
	    // manual lastIndex reset is needed.
	    const count = text.match(re)?.length ?? 0;
	    blocks[name] = count;
	    // Strict > keeps the earliest-listed block on a tie.
	    if (count > dominantCount) {
	      dominant = name;
	      dominantCount = count;
	    }
	  }
	  return { total_chars: text.length, blocks, dominant };
	}

	// =============================================================================
	// Preset tokenizer list — all ungated repos (no HF auth required)
	// =============================================================================
	//
	// Curated for breadth: one per major tokenizer family. For gated
	// originals (Llama, Mistral, Gemma) the unsloth open-mirror is used —
	// tokenizer.json is byte-identical to the original because quantization
	// touches weights, not tokens (see spec-decode docs for the same
	// argument).

	// Preset tokenizers offered for comparison, in display order. Each row is
	// [id, label, family] and is expanded into an { id, label, family } record:
	// `id` is the repo id handed to the tokenizer loader, `label` the short
	// display name, `family` a one-line description of the tokenizer family.
	// NOTE(review): confirm "tiktoken-style" is the right family wording for
	// the 32k-vocab Phi-3.5 entry.
	export const PRESET_TOKENIZERS = [
	  ["Qwen/Qwen2.5-7B-Instruct", "Qwen2.5", "Qwen-BPE (152k vocab, CJK-aware)"],
	  ["microsoft/Phi-3.5-mini-instruct", "Phi-3.5", "tiktoken-style BPE (32k)"],
	  ["unsloth/Meta-Llama-3.1-8B-Instruct", "Llama-3.1", "Llama-3 BPE (128k)"],
	  ["unsloth/gemma-2-9b-it", "Gemma-2", "SentencePiece (256k)"],
	  ["Xenova/gpt-4", "GPT-4 (cl100k)", "OpenAI tiktoken cl100k_base"],
	  ["Xenova/claude-tokenizer", "Claude (approx)", "Anthropic open approx (community port)"],
	].map(([id, label, family]) => ({ id, label, family }));

	// Sample texts that demonstrate cost asymmetry — the English, Chinese and
	// Arabic samples carry roughly parallel pangram content (the Chinese one
	// adds an explanatory sentence), so the user sees per-language tax directly.
	// Ready-made inputs keyed by sample name. Multi-part samples are assembled
	// with join() so each source line stays short; the resulting strings are
	// what gets tokenized.
	export const SAMPLE_TEXTS = {
	  english: [
	    "The quick brown fox jumps over the lazy dog. ",
	    "She sells seashells by the seashore. Pack my box with five dozen liquor jugs.",
	  ].join(""),
	  chinese: [
	    "敏捷的棕色狐狸跳过了懒狗。她在海边卖海贝壳。请用五打酒壶装满我的箱子。",
	    "中文用字符表示词义,所以一段文字所需的字符数远少于英文。",
	  ].join(""),
	  arabic: [
	    "الثعلب البني السريع يقفز فوق الكلب الكسول. ",
	    "تبيع أصدافًا بحرية على شاطئ البحر. عبئ صندوقي بخمسين إبريقًا من الخمر.",
	  ].join(""),
	  mixed: "Hello world! 你好世界 مرحبا بالعالم Привет мир नमस्ते दुनिया",
	  // A small Python snippet, one array element per source line.
	  code: [
	    "def quick_brown_fox(jumps_over: int) -> str:",
	    " return f'The fox jumped {jumps_over} times'",
	    "",
	    "for i in range(10):",
	    " print(quick_brown_fox(i))",
	  ].join("\n"),
	};