// DuckDB-WASM singleton wrapper. Lazy-instantiated on first query so the // ~5 MB worker + WASM bundles don't show up in the home-page critical path. import * as duckdb from '@duckdb/duckdb-wasm'; import type { AsyncDuckDB, AsyncDuckDBConnection } from '@duckdb/duckdb-wasm'; let dbPromise: Promise | null = null; async function getDb(): Promise { if (dbPromise) return dbPromise; dbPromise = (async () => { const bundles = duckdb.getJsDelivrBundles(); const bundle = await duckdb.selectBundle(bundles); // Worker URL needs to be served from the same origin to satisfy the // browser's Worker constructor; wrap the jsdelivr URL in a blob to do so. const workerUrl = URL.createObjectURL( new Blob([`importScripts("${bundle.mainWorker!}");`], { type: 'text/javascript' }) ); const worker = new Worker(workerUrl); const logger = new duckdb.ConsoleLogger(duckdb.LogLevel.WARNING); const db = new duckdb.AsyncDuckDB(logger, worker); await db.instantiate(bundle.mainModule, bundle.pthreadWorker); URL.revokeObjectURL(workerUrl); return db; })().catch((err) => { dbPromise = null; throw err; }); return dbPromise; } // Rewrite the dataset README's `hf://datasets/org/repo/path` URLs to the // public HTTPS form duckdb-wasm can actually fetch. export function rewriteHfUrls(sql: string): string { return sql.replace( /hf:\/\/datasets\/([^/\s'"]+)\/([^/\s'"]+)\/([^'"\s]+)/g, 'https://huggingface.co/datasets/$1/$2/resolve/main/$3' ); } export type QueryResult = { columns: string[]; rows: Record[]; rewrittenSql: string; }; /** * Run a DuckDB SQL query against HTTPS-served parquet files. Returns plain * JS objects with BigInts downcast to numbers and Dates ISO-stringified, so * the caller doesn't have to think about Arrow types. */ export async function runQuery(sql: string, opts: { limit?: number } = {}): Promise { const limit = opts.limit ?? 1000; const db = await getDb(); const conn: AsyncDuckDBConnection = await db.connect(); try { const rewrittenSql = rewriteHfUrls(sql); // Wrap in a CTE to enforce a row cap without rewriting user SQL further. const limited = /\blimit\b/i.test(rewrittenSql) ? rewrittenSql : `WITH _q AS (${stripTrailingSemicolon(rewrittenSql)}) SELECT * FROM _q LIMIT ${limit}`; const result = await conn.query(limited); const columns = result.schema.fields.map((f) => f.name); const rows: Record[] = []; for (const row of result.toArray()) { const obj: Record = {}; const r = row.toJSON() as Record; for (const [k, v] of Object.entries(r)) { obj[k] = normalize(v); } rows.push(obj); } return { columns, rows, rewrittenSql }; } finally { await conn.close(); } } function stripTrailingSemicolon(s: string): string { return s.trim().replace(/;+\s*$/, ''); } function normalize(v: unknown): unknown { if (v == null) return v; if (typeof v === 'bigint') return Number(v); if (v instanceof Date) return v.toISOString(); if (Array.isArray(v)) return v.map(normalize); if (typeof v === 'object') { const out: Record = {}; for (const [k, vv] of Object.entries(v as Record)) out[k] = normalize(vv); return out; } return v; }