blanchon committed on
Commit
fe2e832
·
1 Parent(s): 6e64fba

Add hyparquet-compressors so ZSTD parquets actually parse

Browse files

The dataset's parquet files are ZSTD-compressed, and hyparquet's core only ships
support for the trivially decodable codecs. Pulled in `hyparquet-compressors`
(brotli/snappy/lz4/zstd/...) and now pass its `compressors` map into
`parquetReadObjects`. Adds ~100KB gzipped — still vastly under the 6.3MB
parquet-wasm blob it replaced.

Files changed (3) hide show
  1. bun.lock +7 -0
  2. package.json +1 -0
  3. src/lib/api/parquet.ts +2 -1
bun.lock CHANGED
@@ -6,6 +6,7 @@
6
  "name": "app",
7
  "dependencies": {
8
  "hyparquet": "^1.25.6",
 
9
  "mediabunny": "^1.42.0",
10
  },
11
  "devDependencies": {
@@ -322,10 +323,16 @@
322
 
323
  "fsevents": ["fsevents@2.3.3", "", { "os": "darwin" }, "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw=="],
324
 
 
 
325
  "graceful-fs": ["graceful-fs@4.2.11", "", {}, "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ=="],
326
 
327
  "hyparquet": ["hyparquet@1.25.6", "", {}, "sha512-Q9W5IjkVch3ZMnYd4qFv2q8suu5Jc36yt7J+zUNM9grwnP1S189icp0jdEQKM5HJvQkTVy8NMiQ8n/dM5QAt1A=="],
328
 
 
 
 
 
329
  "iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": "2.1.2" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="],
330
 
331
  "inline-style-parser": ["inline-style-parser@0.2.7", "", {}, "sha512-Nb2ctOyNR8DqQoR0OwRG95uNWIC0C1lCgf5Naz5H6Ji72KZ8OcFZLz2P5sNgwlyoJ8Yif11oMuYs5pBQa86csA=="],
 
6
  "name": "app",
7
  "dependencies": {
8
  "hyparquet": "^1.25.6",
9
+ "hyparquet-compressors": "^1.1.1",
10
  "mediabunny": "^1.42.0",
11
  },
12
  "devDependencies": {
 
323
 
324
  "fsevents": ["fsevents@2.3.3", "", { "os": "darwin" }, "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw=="],
325
 
326
+ "fzstd": ["fzstd@0.1.1", "", {}, "sha512-dkuVSOKKwh3eas5VkJy1AW1vFpet8TA/fGmVA5krThl8YcOVE/8ZIoEA1+U1vEn5ckxxhLirSdY837azmbaNHA=="],
327
+
328
  "graceful-fs": ["graceful-fs@4.2.11", "", {}, "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ=="],
329
 
330
  "hyparquet": ["hyparquet@1.25.6", "", {}, "sha512-Q9W5IjkVch3ZMnYd4qFv2q8suu5Jc36yt7J+zUNM9grwnP1S189icp0jdEQKM5HJvQkTVy8NMiQ8n/dM5QAt1A=="],
331
 
332
+ "hyparquet-compressors": ["hyparquet-compressors@1.1.1", "", { "dependencies": { "fzstd": "0.1.1", "hysnappy": "1.0.0" } }, "sha512-yx7aA3Rhj0YycbdV71+XznQSLAefa4cT0urpgNXy4aM6eSeCknaVDNne8y45Uz74Fb15yyXUzOStlceOJBan7A=="],
333
+
334
+ "hysnappy": ["hysnappy@1.0.0", "", {}, "sha512-MNrC4NfwDGPb889O6gIfEtbvEZCSWUsSEhsz4Oq2FRcpGtXHfeVz3KciSPp5Pnnz1NjFMgDQNfxdJozymJEDDA=="],
335
+
336
  "iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": "2.1.2" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="],
337
 
338
  "inline-style-parser": ["inline-style-parser@0.2.7", "", {}, "sha512-Nb2ctOyNR8DqQoR0OwRG95uNWIC0C1lCgf5Naz5H6Ji72KZ8OcFZLz2P5sNgwlyoJ8Yif11oMuYs5pBQa86csA=="],
package.json CHANGED
@@ -42,6 +42,7 @@
42
  },
43
  "dependencies": {
44
  "hyparquet": "^1.25.6",
 
45
  "mediabunny": "^1.42.0"
46
  }
47
  }
 
42
  },
43
  "dependencies": {
44
  "hyparquet": "^1.25.6",
45
+ "hyparquet-compressors": "^1.1.1",
46
  "mediabunny": "^1.42.0"
47
  }
48
  }
src/lib/api/parquet.ts CHANGED
@@ -1,4 +1,5 @@
1
  import { parquetReadObjects } from 'hyparquet';
 
2
 
3
  export type ParquetFetchOptions = {
4
  fetch?: typeof fetch;
@@ -30,7 +31,7 @@ export async function fetchParquetRows<T = Record<string, unknown>>(
30
  }
31
  };
32
 
33
- const rows = await parquetReadObjects({ file });
34
 
35
  return rows.map((row) => {
36
  const out: Record<string, unknown> = {};
 
1
  import { parquetReadObjects } from 'hyparquet';
2
+ import { compressors } from 'hyparquet-compressors';
3
 
4
  export type ParquetFetchOptions = {
5
  fetch?: typeof fetch;
 
31
  }
32
  };
33
 
34
+ const rows = await parquetReadObjects({ file, compressors });
35
 
36
  return rows.map((row) => {
37
  const out: Record<string, unknown> = {};