Spaces:

blanchon
/

opencs2-dataset-viewer

Running

App Files Files Community

opencs2-dataset-viewer / src /lib /api /parquet.ts

blanchon

Add hyparquet-compressors so ZSTD parquets actually parse

fe2e832 24 days ago

raw

history blame contribute delete

1.61 kB

	import { parquetReadObjects } from 'hyparquet';
	import { compressors } from 'hyparquet-compressors';

	/** Optional knobs accepted by `fetchParquetRows`. */
	export type ParquetFetchOptions = {
	/** Fetch implementation used for the request; the global `fetch` is used when omitted. */
	fetch?: typeof fetch;
	/** Abort signal passed through as the `signal` option of the fetch call. */
	signal?: AbortSignal;
	};

	/**
	 * Convert one decoded cell into its app-facing form: `Date` becomes an ISO
	 * string, `bigint` becomes a `number`, anything else (including `null` and
	 * `undefined`) is returned untouched.
	 */
	function normalizeCell(cell: unknown): unknown {
		if (cell instanceof Date) return cell.toISOString();
		return typeof cell === 'bigint' ? Number(cell) : cell;
	}

	/**
	 * Download a parquet file over HTTP and decode it into an array of plain
	 * row objects.
	 *
	 * Decoding uses `hyparquet` (pure JS, ~30KB) rather than `parquet-wasm`
	 * (6.3MB WASM + init): the index parquets are tiny (~1–2KB), so the JS
	 * reader wins end-to-end even though it is slower per byte.
	 *
	 * Top-level cell values are normalized so the rest of the app sees a stable
	 * shape: `Date` values (how hyparquet surfaces timestamps) become ISO
	 * strings, and BigInts are downcast to numbers (every numeric column we
	 * read fits in a JS number).
	 *
	 * @param url  Location of the parquet file.
	 * @param opts Optional custom `fetch` implementation and abort signal.
	 * @returns One object per row, keyed by column name.
	 * @throws Error when the HTTP response is not OK.
	 */
	export async function fetchParquetRows<T = Record<string, unknown>>(
		url: string,
		opts: ParquetFetchOptions = {}
	): Promise<T[]> {
		const request = opts.fetch ?? fetch;
		const response = await request(url, { signal: opts.signal });
		if (!response.ok) {
			throw new Error(`parquet fetch ${url}: ${response.status} ${response.statusText}`);
		}

		// The whole file is already in memory; expose it through the
		// byteLength + slice shape handed to the parquet reader.
		const bytes = await response.arrayBuffer();
		const file = {
			byteLength: bytes.byteLength,
			slice: (start: number, end?: number): Promise<ArrayBuffer> =>
				Promise.resolve(bytes.slice(start, end))
		};

		const decoded = await parquetReadObjects({ file, compressors });

		const normalized = decoded.map((row) => {
			const record: Record<string, unknown> = {};
			Object.entries(row).forEach(([column, cell]) => {
				record[column] = normalizeCell(cell);
			});
			return record;
		});
		return normalized as T[];
	}