Spaces:
Running
Running
File size: 3,305 Bytes
5a67aa1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | // DuckDB-WASM singleton wrapper. Lazy-instantiated on first query so the
// ~5 MB worker + WASM bundles don't show up in the home-page critical path.
import * as duckdb from '@duckdb/duckdb-wasm';
import type { AsyncDuckDB, AsyncDuckDBConnection } from '@duckdb/duckdb-wasm';
let dbPromise: Promise<AsyncDuckDB> | null = null;
async function getDb(): Promise<AsyncDuckDB> {
if (dbPromise) return dbPromise;
dbPromise = (async () => {
const bundles = duckdb.getJsDelivrBundles();
const bundle = await duckdb.selectBundle(bundles);
// Worker URL needs to be served from the same origin to satisfy the
// browser's Worker constructor; wrap the jsdelivr URL in a blob to do so.
const workerUrl = URL.createObjectURL(
new Blob([`importScripts("${bundle.mainWorker!}");`], { type: 'text/javascript' })
);
const worker = new Worker(workerUrl);
const logger = new duckdb.ConsoleLogger(duckdb.LogLevel.WARNING);
const db = new duckdb.AsyncDuckDB(logger, worker);
await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
URL.revokeObjectURL(workerUrl);
return db;
})().catch((err) => {
dbPromise = null;
throw err;
});
return dbPromise;
}
// Rewrite the dataset README's `hf://datasets/org/repo/path` URLs to the
// public HTTPS form duckdb-wasm can actually fetch.
export function rewriteHfUrls(sql: string): string {
return sql.replace(
/hf:\/\/datasets\/([^/\s'"]+)\/([^/\s'"]+)\/([^'"\s]+)/g,
'https://huggingface.co/datasets/$1/$2/resolve/main/$3'
);
}
export type QueryResult = {
columns: string[];
rows: Record<string, unknown>[];
rewrittenSql: string;
};
/**
* Run a DuckDB SQL query against HTTPS-served parquet files. Returns plain
* JS objects with BigInts downcast to numbers and Dates ISO-stringified, so
* the caller doesn't have to think about Arrow types.
*/
export async function runQuery(sql: string, opts: { limit?: number } = {}): Promise<QueryResult> {
const limit = opts.limit ?? 1000;
const db = await getDb();
const conn: AsyncDuckDBConnection = await db.connect();
try {
const rewrittenSql = rewriteHfUrls(sql);
// Wrap in a CTE to enforce a row cap without rewriting user SQL further.
const limited = /\blimit\b/i.test(rewrittenSql)
? rewrittenSql
: `WITH _q AS (${stripTrailingSemicolon(rewrittenSql)}) SELECT * FROM _q LIMIT ${limit}`;
const result = await conn.query(limited);
const columns = result.schema.fields.map((f) => f.name);
const rows: Record<string, unknown>[] = [];
for (const row of result.toArray()) {
const obj: Record<string, unknown> = {};
const r = row.toJSON() as Record<string, unknown>;
for (const [k, v] of Object.entries(r)) {
obj[k] = normalize(v);
}
rows.push(obj);
}
return { columns, rows, rewrittenSql };
} finally {
await conn.close();
}
}
function stripTrailingSemicolon(s: string): string {
return s.trim().replace(/;+\s*$/, '');
}
function normalize(v: unknown): unknown {
if (v == null) return v;
if (typeof v === 'bigint') return Number(v);
if (v instanceof Date) return v.toISOString();
if (Array.isArray(v)) return v.map(normalize);
if (typeof v === 'object') {
const out: Record<string, unknown> = {};
for (const [k, vv] of Object.entries(v as Record<string, unknown>)) out[k] = normalize(vv);
return out;
}
return v;
}
|