Spaces:
Running
Running
| import { parquetReadObjects } from 'hyparquet'; | |
| import { compressors } from 'hyparquet-compressors'; | |
/**
 * Optional knobs accepted by `fetchParquetRows`.
 */
export type ParquetFetchOptions = {
  /** Replacement for the global `fetch`; the global is used when omitted. */
  fetch?: typeof globalThis.fetch;
  /** Abort signal forwarded to the fetch call so the request can be cancelled. */
  signal?: AbortSignal;
};
/**
 * Fetch a parquet file over HTTP and return its rows as plain JS objects.
 * Backed by `hyparquet` (pure-JS, ~30KB) instead of `parquet-wasm` (6.3MB
 * WASM + init) — the index parquets are tiny (~1–2KB) so a JS reader is
 * dramatically faster end-to-end despite being slower per-byte.
 *
 * Timestamp columns surface as `Date` from hyparquet; we normalize them to
 * ISO strings here so the rest of the app sees a stable shape. BigInts are
 * downcast to numbers (every numeric column we read is expected to fit in a
 * JS number; values beyond `Number.MAX_SAFE_INTEGER` would lose precision).
 * The same normalization is applied inside nested arrays and plain objects,
 * so list/struct cells cannot leak a `Date` or `bigint` to callers.
 *
 * @param url - Location of the parquet file.
 * @param opts - Optional `fetch` override and abort `signal`.
 * @returns One freshly built plain object per row. The cast to `T` is not
 *   validated at runtime; callers choose a `T` that matches the file.
 * @throws Error when the HTTP response is not `ok`; the message carries the
 *   URL, status code and status text.
 */
export async function fetchParquetRows<T = Record<string, unknown>>(
  url: string,
  opts: ParquetFetchOptions = {}
): Promise<T[]> {
  const f = opts.fetch ?? fetch;
  const res = await f(url, { signal: opts.signal });
  if (!res.ok) throw new Error(`parquet fetch ${url}: ${res.status} ${res.statusText}`);

  // The whole body is pulled into memory, then exposed through the
  // `byteLength` + promise-returning `slice` object handed to the reader.
  const buffer = await res.arrayBuffer();
  const file = {
    byteLength: buffer.byteLength,
    slice(start: number, end?: number): Promise<ArrayBuffer> {
      return Promise.resolve(buffer.slice(start, end));
    }
  };

  const rows = await parquetReadObjects({ file, compressors });
  return rows.map((row) => {
    const out: Record<string, unknown> = {};
    for (const [k, v] of Object.entries(row)) {
      out[k] = normalizeParquetValue(v);
    }
    return out;
  }) as T[];
}

/**
 * Convert one decoded cell into its app-facing form: `Date` becomes an ISO
 * string, `bigint` becomes a `number`, and arrays / plain objects are rebuilt
 * with the same rule applied to every element. `null`, `undefined`,
 * primitives and any other object (e.g. typed arrays) are returned untouched.
 */
function normalizeParquetValue(value: unknown): unknown {
  if (value == null) return value;
  if (value instanceof Date) return value.toISOString();
  if (typeof value === 'bigint') return Number(value);
  if (Array.isArray(value)) return value.map((item) => normalizeParquetValue(item));
  if (typeof value === 'object') {
    // Only recurse into plain records; class instances and binary views are
    // passed through by reference so their contents are never reinterpreted.
    const proto = Object.getPrototypeOf(value);
    if (proto === Object.prototype || proto === null) {
      const out: Record<string, unknown> = {};
      for (const [k, v] of Object.entries(value)) {
        out[k] = normalizeParquetValue(v);
      }
      return out;
    }
  }
  return value;
}