Spaces:
Running
Running
| // DuckDB-WASM singleton wrapper. Lazy-instantiated on first query so the | |
| // ~5 MB worker + WASM bundles don't show up in the home-page critical path. | |
| import * as duckdb from '@duckdb/duckdb-wasm'; | |
| import type { AsyncDuckDB, AsyncDuckDBConnection } from '@duckdb/duckdb-wasm'; | |
| let dbPromise: Promise<AsyncDuckDB> | null = null; | |
| async function getDb(): Promise<AsyncDuckDB> { | |
| if (dbPromise) return dbPromise; | |
| dbPromise = (async () => { | |
| const bundles = duckdb.getJsDelivrBundles(); | |
| const bundle = await duckdb.selectBundle(bundles); | |
| // Worker URL needs to be served from the same origin to satisfy the | |
| // browser's Worker constructor; wrap the jsdelivr URL in a blob to do so. | |
| const workerUrl = URL.createObjectURL( | |
| new Blob([`importScripts("${bundle.mainWorker!}");`], { type: 'text/javascript' }) | |
| ); | |
| const worker = new Worker(workerUrl); | |
| const logger = new duckdb.ConsoleLogger(duckdb.LogLevel.WARNING); | |
| const db = new duckdb.AsyncDuckDB(logger, worker); | |
| await db.instantiate(bundle.mainModule, bundle.pthreadWorker); | |
| URL.revokeObjectURL(workerUrl); | |
| return db; | |
| })().catch((err) => { | |
| dbPromise = null; | |
| throw err; | |
| }); | |
| return dbPromise; | |
| } | |
| // Rewrite the dataset README's `hf://datasets/org/repo/path` URLs to the | |
| // public HTTPS form duckdb-wasm can actually fetch. | |
| export function rewriteHfUrls(sql: string): string { | |
| return sql.replace( | |
| /hf:\/\/datasets\/([^/\s'"]+)\/([^/\s'"]+)\/([^'"\s]+)/g, | |
| 'https://huggingface.co/datasets/$1/$2/resolve/main/$3' | |
| ); | |
| } | |
| export type QueryResult = { | |
| columns: string[]; | |
| rows: Record<string, unknown>[]; | |
| rewrittenSql: string; | |
| }; | |
| /** | |
| * Run a DuckDB SQL query against HTTPS-served parquet files. Returns plain | |
| * JS objects with BigInts downcast to numbers and Dates ISO-stringified, so | |
| * the caller doesn't have to think about Arrow types. | |
| */ | |
| export async function runQuery(sql: string, opts: { limit?: number } = {}): Promise<QueryResult> { | |
| const limit = opts.limit ?? 1000; | |
| const db = await getDb(); | |
| const conn: AsyncDuckDBConnection = await db.connect(); | |
| try { | |
| const rewrittenSql = rewriteHfUrls(sql); | |
| // Wrap in a CTE to enforce a row cap without rewriting user SQL further. | |
| const limited = /\blimit\b/i.test(rewrittenSql) | |
| ? rewrittenSql | |
| : `WITH _q AS (${stripTrailingSemicolon(rewrittenSql)}) SELECT * FROM _q LIMIT ${limit}`; | |
| const result = await conn.query(limited); | |
| const columns = result.schema.fields.map((f) => f.name); | |
| const rows: Record<string, unknown>[] = []; | |
| for (const row of result.toArray()) { | |
| const obj: Record<string, unknown> = {}; | |
| const r = row.toJSON() as Record<string, unknown>; | |
| for (const [k, v] of Object.entries(r)) { | |
| obj[k] = normalize(v); | |
| } | |
| rows.push(obj); | |
| } | |
| return { columns, rows, rewrittenSql }; | |
| } finally { | |
| await conn.close(); | |
| } | |
| } | |
| function stripTrailingSemicolon(s: string): string { | |
| return s.trim().replace(/;+\s*$/, ''); | |
| } | |
| function normalize(v: unknown): unknown { | |
| if (v == null) return v; | |
| if (typeof v === 'bigint') return Number(v); | |
| if (v instanceof Date) return v.toISOString(); | |
| if (Array.isArray(v)) return v.map(normalize); | |
| if (typeof v === 'object') { | |
| const out: Record<string, unknown> = {}; | |
| for (const [k, vv] of Object.entries(v as Record<string, unknown>)) out[k] = normalize(vv); | |
| return out; | |
| } | |
| return v; | |
| } | |