Spaces:

blanchon
/

opencs2-dataset-viewer

Running

File size: 1,611 Bytes

6e64fba
fe2e832
31d3580
 
 
 
 
 
 
 
6e64fba
 
 
 
 
 
 
31d3580
 
 
 
 
 
 
 
6e64fba
 
 
 
 
 
 
31d3580
fe2e832
31d3580
6e64fba
31d3580
6e64fba
31d3580
6e64fba
31d3580

import { parquetReadObjects } from 'hyparquet';
import { compressors } from 'hyparquet-compressors';

/**
 * Optional settings accepted by `fetchParquetRows`.
 */
export type ParquetFetchOptions = {
	/** Fetch implementation to call; when omitted, the global `fetch` is used. */
	fetch?: typeof fetch;
	/** Abort signal passed through to the fetch call as its `signal` option. */
	signal?: AbortSignal;
};

/**
 * Download a parquet file over HTTP and decode it into plain row objects.
 *
 * Decoding uses `hyparquet` (pure JS, ~30KB) rather than `parquet-wasm`
 * (6.3MB WASM + init): the index parquets are tiny (~1–2KB), so a JS reader
 * wins end-to-end even though it is slower per byte.
 *
 * Each row's top-level values are normalized so callers see a stable shape:
 * - `Date` values (how hyparquet surfaces timestamps) become ISO-8601 strings;
 * - `bigint` values are converted with `Number(...)` — every numeric column
 *   we read is expected to fit in a JS number;
 * - everything else, including `null`/`undefined`, is passed through as-is.
 * Values nested inside arrays or objects are not visited.
 *
 * @param url - Location of the parquet file.
 * @param opts - Optional fetch override and abort signal.
 * @returns The decoded rows, asserted (not validated) to be of type `T`.
 * @throws Error when the HTTP response is not OK; the message carries the
 *   URL, status code and status text.
 */
export async function fetchParquetRows<T = Record<string, unknown>>(
	url: string,
	opts: ParquetFetchOptions = {}
): Promise<T[]> {
	const doFetch = opts.fetch ?? fetch;
	const response = await doFetch(url, { signal: opts.signal });
	if (!response.ok) {
		throw new Error(`parquet fetch ${url}: ${response.status} ${response.statusText}`);
	}

	// The whole file is already in memory, so the reader's async byte-range
	// interface is served by slicing the downloaded buffer.
	const bytes = await response.arrayBuffer();
	const decoded = await parquetReadObjects({
		file: {
			byteLength: bytes.byteLength,
			slice: (start: number, end?: number): Promise<ArrayBuffer> =>
				Promise.resolve(bytes.slice(start, end))
		},
		compressors
	});

	// Convert one cell to its normalized form; anything that is neither a Date
	// nor a bigint (null and undefined included) is returned untouched.
	const normalizeCell = (value: unknown): unknown => {
		if (value instanceof Date) return value.toISOString();
		if (typeof value === 'bigint') return Number(value);
		return value;
	};

	const normalized = decoded.map((row) => {
		const plain: Record<string, unknown> = {};
		for (const key of Object.keys(row)) {
			plain[key] = normalizeCell(row[key]);
		}
		return plain;
	});
	return normalized as T[];
}