// js/engine.js — wraps the transformers.js text encoder + hyparquet
// embedding loader behind a small promise-based API.
//
// References (canonical 2025/26 loaders):
//   transformers.js v4 release notes (Feb 2026):
//     https://huggingface.co/blog/transformersjs-v4
//   CLIPTextModelWithProjection API (added in transformers.js PR #227,
//   unchanged in v4):
//     https://github.com/huggingface/transformers.js/pull/227
//   hyparquet browser example:
//     https://github.com/hyparam/hyparquet#browser-example
//
// We deliberately skip the high-level `pipeline()` API because we want
// just the text-side projection and direct control over the output tensor.

import {
  MODELS,
  DEFAULT_MODEL_ID,
  DATASET_REPO,
  DATASET_REVISION,
  DEVICE,
  imagesUrl,
  TRANSFORMERS_JS_CDN,
  HYPARQUET_CDN,
} from './config.js';

// Lazy-loaded once. A failed import clears its memo so that a transient CDN
// error does not poison every later call until the page is reloaded.
let transformersJsPromise = null;
let hyparquetPromise = null;

/** Dynamically import transformers.js from its CDN URL (memoised). */
function loadTransformersJs() {
  if (!transformersJsPromise) {
    transformersJsPromise = import(TRANSFORMERS_JS_CDN).catch((err) => {
      transformersJsPromise = null;
      throw err;
    });
  }
  return transformersJsPromise;
}

/** Dynamically import hyparquet from its CDN URL (memoised). */
function loadHyparquet() {
  if (!hyparquetPromise) {
    hyparquetPromise = import(HYPARQUET_CDN).catch((err) => {
      hyparquetPromise = null;
      throw err;
    });
  }
  return hyparquetPromise;
}

/**
 * Build the per-model embeddings URL from the dataset repo + the model's
 * relative path. Centralised so the model registry only knows the path,
 * not the full URL.
 *
 * @param {object} model — entry from MODELS (uses `embeddingPath`)
 * @returns {string}
 */
function embeddingsUrlFor(model) {
  return `https://huggingface.co/datasets/${DATASET_REPO}/resolve/${DATASET_REVISION}/${model.embeddingPath}`;
}

/**
 * Load a text encoder + tokenizer for the given model entry. The model
 * arg is one of the MODELS registry rows; we use its `repo`, `revision`,
 * `dtype`, `modelClass` and `outputKey` fields. Caller is expected to
 * filter MODELS to only `available: true` entries.
 *
 * @param {object} model — entry from MODELS in config.js
 * @param {(stage: string, progress: number | null) => void} [onProgress]
 *   stage ∈ {'tokenizer', 'model'}, progress in [0, 1] or null when unknown.
 * @returns {Promise<{ tokenizer: any, model: any, encode: (text: string) => Promise<ArrayLike<number>> }>}
 *   `encode` resolves to the flat `data` array of the selected output tensor.
 * @throws {Error} when `model.modelClass` is not exported by transformers.js,
 *   or (from `encode`) when no known output tensor is present.
 */
export async function loadEncoder(model, onProgress = () => {}) {
  const tfjs = await loadTransformersJs();
  const { AutoTokenizer, env } = tfjs;

  // Force remote loading from the Hub (no local proxy on a static Space).
  env.allowLocalModels = false;
  env.allowRemoteModels = true;

  const revision = model.revision || 'main';

  onProgress('tokenizer', null);
  const tokenizer = await AutoTokenizer.from_pretrained(model.repo, { revision });

  // Pick the right model loader. `AutoModel` works for encoder-only ONNX
  // exports (like IconClip) but fails on full multimodal CLIPs (Xenova's
  // ports) because they demand pixel_values. `CLIPTextModelWithProjection`
  // loads only the text tower of a CLIPModel and exposes `text_embeds`.
  const ModelClass = tfjs[model.modelClass || 'AutoModel'];
  if (!ModelClass) {
    throw new Error(`Unknown modelClass '${model.modelClass}' for ${model.repo}`);
  }

  onProgress('model', null);
  const onnxModel = await ModelClass.from_pretrained(model.repo, {
    revision,
    dtype: model.dtype || 'q8',
    device: DEVICE,
    progress_callback: (p) => {
      // The callback's `progress` is divided by 100 to land in [0, 1];
      // events without a numeric `progress` are ignored.
      if (p && typeof p.progress === 'number') {
        onProgress('model', p.progress / 100);
      }
    },
  });

  /**
   * Encode a single string with the loaded tokenizer + model.
   *
   * @param {string} text
   */
  async function encode(text) {
    // CLIP text encoders use a fixed 77-token context window — that's
    // the size of the learned positional-embedding table the model was
    // trained against. Every CLIP-family ONNX export on HF requires the
    // input to be padded to 77; transformers.js's standard recipe for
    // these models is `padding: 'max_length', max_length: 77`.
    const inputs = tokenizer([text], {
      padding: 'max_length',
      max_length: 77,
      truncation: true,
    });
    const out = await onnxModel(inputs);

    // The output key varies per model (see config.modelClass): IconClip's
    // encoder-only ONNX exposes `embeddings`; CLIPTextModelWithProjection
    // exposes `text_embeds`. We prefer the model's declared `outputKey`
    // but fall back across known names so the engine is forgiving.
    const tensor =
      (model.outputKey && out[model.outputKey]) ||
      out.embeddings ||
      out.text_embeds ||
      out.last_hidden_state;
    if (!tensor || !tensor.data) {
      throw new Error(
        `Unexpected ONNX output for ${model.repo} — expected ` +
          `'${model.outputKey || 'embeddings'}', got keys: ${Object.keys(out).join(', ')}`,
      );
    }
    return tensor.data;
  }

  return { tokenizer, model: onnxModel, encode };
}

/**
 * Look `url` up in the demo cache. The cache is purely an optimisation, so
 * any failure (storage blocked, corrupt entry, …) is reported as a miss
 * instead of aborting the caller's load.
 *
 * @param {string} url
 * @returns {Promise<ArrayBuffer | null>} cached bytes, or null on miss/failure
 */
async function readCached(url) {
  try {
    const cache = await openCache();
    const hit = await cache.match(url);
    return hit ? await hit.arrayBuffer() : null;
  } catch {
    return null;
  }
}

/**
 * Store `buf` under `url` in the demo cache. Fire and forget; Cache.put
 * expects a Response and may fail if the browser disallows caching —
 * swallow that, it's an optimisation.
 *
 * @param {string} url
 * @param {ArrayBuffer} buf
 */
function writeCached(url, buf) {
  openCache()
    // Copy the bytes so the Response body is independent of the caller's buffer.
    .then((cache) => cache.put(url, new Response(buf.slice(0))))
    .catch(() => {});
}

/**
 * Drain a fetch Response into one ArrayBuffer, reporting progress per chunk.
 *
 * @param {Response} resp
 * @param {(loaded: number, total: number) => void} onProgress
 *   `total` comes from the `content-length` header and is 0 when that header
 *   is missing or not numeric.
 * @returns {Promise<ArrayBuffer>}
 */
async function readBodyWithProgress(resp, onProgress) {
  const total = Number(resp.headers.get('content-length')) || 0;

  // `Response.body` may be null; fall back to a one-shot read in that case.
  if (!resp.body) {
    const whole = await resp.arrayBuffer();
    onProgress(whole.byteLength, total);
    return whole;
  }

  const reader = resp.body.getReader();
  const chunks = [];
  let loaded = 0;
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    loaded += value.byteLength;
    onProgress(loaded, total);
  }

  // Stitch the chunks into one contiguous buffer.
  const bytes = new Uint8Array(loaded);
  let off = 0;
  for (const chunk of chunks) {
    bytes.set(chunk, off);
    off += chunk.byteLength;
  }
  return bytes.buffer;
}

/**
 * hyparquet wants an AsyncBuffer — for an in-memory blob the simplest
 * adapter is to make `slice` resolve immediately via Promise.resolve.
 *
 * @param {ArrayBuffer} buf
 * @returns {{ byteLength: number, slice: (start: number, end?: number) => Promise<ArrayBuffer> }}
 */
function asAsyncBuffer(buf) {
  return {
    byteLength: buf.byteLength,
    slice: (start, end) => Promise.resolve(buf.slice(start, end ?? buf.byteLength)),
  };
}

/**
 * Load + decode a model's embedding parquet from the benchmark dataset.
 * Schema expected per row: `_id: string, embedding: list of numbers`
 * (dim derived from the first row). Returns a contiguous Float32Array
 * matrix + parallel ids array for cosine.js's row-major linear access.
 *
 * @param {object} model — entry from MODELS (uses `embeddingPath`)
 * @param {(loaded: number, total: number) => void} [onProgress]
 *   byte counts; `total` is 0 when the server sends no usable content-length.
 * @returns {Promise<{ ids: string[], matrix: Float32Array, dim: number }>}
 * @throws {Error} on a non-OK HTTP response, an empty parquet, or a row whose
 *   embedding length differs from the first row's.
 */
export async function loadEmbeddings(model, onProgress = () => {}) {
  const url = embeddingsUrlFor(model);

  let buf = await readCached(url);
  if (buf !== null) {
    onProgress(buf.byteLength, buf.byteLength);
  } else {
    const resp = await fetch(url);
    if (!resp.ok) {
      throw new Error(`Embeddings fetch failed: ${resp.status} ${resp.statusText} (${url})`);
    }
    buf = await readBodyWithProgress(resp, onProgress);
    writeCached(url, buf);
  }

  const { parquetReadObjects } = await loadHyparquet();
  const rows = await parquetReadObjects({
    file: asAsyncBuffer(buf),
    columns: ['_id', 'embedding'],
  });
  if (rows.length === 0) {
    throw new Error('Embeddings parquet is empty.');
  }

  const dim = rows[0].embedding.length;
  const matrix = new Float32Array(rows.length * dim);
  const ids = new Array(rows.length);
  for (let r = 0; r < rows.length; r++) {
    const { _id, embedding } = rows[r];
    // A ragged row would otherwise leak NaN (too short) or be silently
    // truncated (too long) in the row-major matrix.
    if (embedding == null || embedding.length !== dim) {
      throw new Error(
        `Embeddings parquet row ${r} has dimension ${embedding == null ? 'none' : embedding.length}, expected ${dim}.`,
      );
    }
    ids[r] = _id;
    matrix.set(embedding, r * dim);
  }
  return { ids, matrix, dim };
}

// slug → Map<_id, svg_text>
const svgCache = new Map();

/**
 * Load (and cache) the SVG dict for a single library. Lazy — only called
 * when that library is enabled in the filter chips.
 *
 * @param {string} librarySlug
 * @returns {Promise<Map<string, string>>} Map<_id, svg_text>
 * @throws {Error} on a non-OK HTTP response.
 */
export async function loadLibrarySvgs(librarySlug) {
  if (svgCache.has(librarySlug)) return svgCache.get(librarySlug);

  const url = imagesUrl(librarySlug);
  let buf = await readCached(url);
  if (buf === null) {
    const resp = await fetch(url);
    if (!resp.ok) {
      throw new Error(`Images fetch failed for ${librarySlug}: ${resp.status}`);
    }
    buf = await resp.arrayBuffer();
    writeCached(url, buf);
  }

  const { parquetReadObjects } = await loadHyparquet();
  const rows = await parquetReadObjects({
    file: asAsyncBuffer(buf),
    columns: ['_id', 'svg_text'],
  });

  const map = new Map();
  for (const row of rows) map.set(row._id, row.svg_text);
  svgCache.set(librarySlug, map);
  return map;
}

/**
 * Get a cached SVG without triggering a fetch. Returns null if not loaded.
 * The library slug is taken to be the part of `id` before the first ':';
 * ids without a ':' always yield null.
 *
 * @param {string} id
 * @returns {string | null}
 */
export function getCachedSvg(id) {
  const colon = id.indexOf(':');
  if (colon === -1) return null;
  const slug = id.slice(0, colon);
  return svgCache.get(slug)?.get(id) ?? null;
}

// Bump the cache version any time the upstream parquet schema or
// compression changes — v2 invalidates the v1 ZSTD-compressed copies that
// older visits cached before we switched to snappy.
const CACHE_NAME = 'iconclip-demo-v2';

// Stand-in used when Cache Storage is missing or refuses to open: every
// lookup misses and every write is a no-op.
const NOOP_CACHE = Object.freeze({
  match: () => Promise.resolve(undefined),
  put: () => Promise.resolve(),
});

// Set once the cleanup of older cache versions has been kicked off.
let staleCachesPurged = false;

/**
 * Open the versioned demo cache. Never rejects: when the Cache Storage API
 * is absent, or opening it fails, a no-op cache is returned so callers
 * simply fall through to the network.
 *
 * @returns {Promise<{ match: Function, put: Function }>}
 */
async function openCache() {
  if (typeof caches === 'undefined') return NOOP_CACHE;
  try {
    if (!staleCachesPurged) {
      staleCachesPurged = true;
      // Best-effort cleanup of older cache versions on first run.
      caches
        .keys()
        .then((keys) => {
          for (const k of keys) {
            if (k.startsWith('iconclip-demo-') && k !== CACHE_NAME) {
              caches.delete(k).catch(() => {});
            }
          }
        })
        .catch(() => {});
    }
    return await caches.open(CACHE_NAME);
  } catch {
    return NOOP_CACHE;
  }
}