Spaces:
Running
Running
File size: 1,611 Bytes
6e64fba fe2e832 31d3580 6e64fba 31d3580 6e64fba 31d3580 fe2e832 31d3580 6e64fba 31d3580 6e64fba 31d3580 6e64fba 31d3580 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | import { parquetReadObjects } from 'hyparquet';
import { compressors } from 'hyparquet-compressors';
/** Optional overrides accepted by `fetchParquetRows`. */
export type ParquetFetchOptions = {
/** Fetch implementation to call; when omitted the global `fetch` is used. */
fetch?: typeof fetch;
/** Abort signal forwarded to the fetch call as its `signal` init option. */
signal?: AbortSignal;
};
/**
 * Download a parquet file over HTTP and decode it into plain row objects.
 *
 * Decoding uses `hyparquet` (pure JS, ~30KB) rather than `parquet-wasm`
 * (6.3MB of WASM plus init): the index parquets are tiny (~1–2KB), so the
 * JS reader wins end-to-end even though it is slower per byte.
 *
 * Top-level cell values are normalized so callers see a stable shape:
 * - `Date` (how hyparquet surfaces timestamp columns) becomes an ISO string;
 * - `bigint` becomes `number` (every numeric column we read fits in a JS
 *   number);
 * - everything else, including `null`/`undefined`, is passed through as-is.
 *
 * @param url - Location of the parquet file.
 * @param opts - Optional fetch implementation and abort signal.
 * @returns One object per row, keyed by column name.
 * @throws Error when the HTTP response is not OK.
 */
export async function fetchParquetRows<T = Record<string, unknown>>(
  url: string,
  opts: ParquetFetchOptions = {}
): Promise<T[]> {
  const doFetch = opts.fetch ?? fetch;
  const response = await doFetch(url, { signal: opts.signal });
  if (!response.ok) {
    throw new Error(`parquet fetch ${url}: ${response.status} ${response.statusText}`);
  }

  // The whole file is already in memory; expose it through the async
  // byteLength/slice interface that is handed to parquetReadObjects.
  const bytes = await response.arrayBuffer();
  const rawRows = await parquetReadObjects({
    file: {
      byteLength: bytes.byteLength,
      slice: (start: number, end?: number): Promise<ArrayBuffer> =>
        Promise.resolve(bytes.slice(start, end)),
    },
    compressors,
  });

  const normalized = rawRows.map((rawRow) => {
    const plain: Record<string, unknown> = {};
    for (const [column, value] of Object.entries(rawRow)) {
      // null/undefined match neither check below and are copied unchanged.
      plain[column] =
        value instanceof Date
          ? value.toISOString()
          : typeof value === 'bigint'
            ? Number(value)
            : value;
    }
    return plain;
  });
  return normalized as T[];
}
|