cmevs-code / tools /update_croissant_with_real_hashes.py

anon-cmevs-2026

Initial code release for NeurIPS 2026 D&B reviewer reference

5c1bb37 verified 17 days ago

3.82 kB

	#!/usr/bin/env python3
	"""Patch croissant.json with real archive-level SHA256 digests.

	After the H100 batch hash run finishes, you have:
	- manifests/SHA256SUMS_<archive_id>.txt: per-archive, line-by-line manifest
	- manifests/ARCHIVE_DIGESTS.txt: the sha256 of each manifest file

	This script reads ARCHIVE_DIGESTS.txt, maps each <archive_id> to its
	matching `cr:FileObject` in croissant.json (by `@id`), and rewrites the
	`sha256` field. Note: `contentSize` is not modified by this script; updating
	it from a sidecar count file (optional) is not implemented here.

	Usage:
	    tools/update_croissant_with_real_hashes.py \
	        --croissant dataset_metadata/croissant.json \
	        --digests manifests_local/ARCHIVE_DIGESTS.txt \
	        --out dataset_metadata/croissant.json

	Mapping (archive_id -> croissant @id) is hard-coded below to match the
	actual server layout in /data/data_wr/data_shushu/.
	"""
	import argparse
	import json
	import sys
	from pathlib import Path

	# Lookup table: archive_id used by the H100 hashing job (chosen in
	# run_hashes.sh) -> "@id" of the matching cr:FileObject in croissant.json.
	# The Croissant @id values all end in ".tar". Entries are rewritten even if
	# the real upload turns out to be a flat directory instead of a tarball,
	# because the sha256 still pins the content.
	ARCHIVE_TO_CROISSANT: dict[str, str] = {
	    "blender_indoor_round1+2": "blender-indoor-archive.tar",
	    # may not exist yet in croissant.json
	    "blender_indoor_round2": "blender-indoor-round2.tar",
	    # may not exist yet
	    "blender_1m_discrete": "blender-1m-discrete.tar",
	    "HM3D": "hm3d-adapter.tar",
	    "scannetpp": "scannetpp-adapter.tar",
	    "OB3D": "outdoor-ob3d-archive.tar",
	    "tartanground": "outdoor-tartanground-archive.tar",
	    # The remaining three may not exist yet in croissant.json.
	    "shushu_line": "shushu-line.tar",
	    "shushu_circle": "shushu-circle.tar",
	    "robustness": "robustness.tar",
	}


	def read_digests(path: Path) -> dict[str, str]:
	    """Parse ARCHIVE_DIGESTS.txt into an ``archive_id -> sha256`` mapping.

	    Each non-blank line is expected to look like
	    ``<sha> SHA256SUMS_<archive_id>.txt``. The file name may additionally
	    carry a directory prefix (``manifests/SHA256SUMS_<id>.txt``) or the
	    leading ``*`` that ``sha256sum`` emits in binary mode; both are stripped
	    before the archive id is extracted.

	    Args:
	        path: Location of the digest listing.

	    Returns:
	        Mapping from archive id to the digest string found on its line. If an
	        id occurs more than once, the last occurrence wins.

	    Raises:
	        ValueError: If a non-blank line does not contain both a digest and a
	            file name.
	    """
	    digests: dict[str, str] = {}
	    with open(path, encoding="utf-8") as f:
	        for lineno, raw in enumerate(f, start=1):
	            line = raw.strip()
	            if not line:
	                continue
	            parts = line.split(maxsplit=1)
	            if len(parts) != 2:
	                # Report where the problem is instead of an opaque
	                # "not enough values to unpack" error.
	                raise ValueError(
	                    f"{path}:{lineno}: expected '<sha> <file>', got {line!r}"
	                )
	            sha, fname = parts
	            # Only the base name identifies the archive: drop the binary-mode
	            # '*' marker and any directory components first.
	            base = Path(fname.lstrip("*")).name
	            archive_id = base.removeprefix("SHA256SUMS_").removesuffix(".txt")
	            digests[archive_id] = sha
	    return digests


	def main(argv: "list[str] | None" = None) -> int:
	    """Rewrite the ``sha256`` of matching Croissant distribution entries.

	    Loads the Croissant JSON, reads the digest listing, and for every archive
	    id that has a mapping in ``ARCHIVE_TO_CROISSANT`` and a distribution entry
	    with the mapped ``@id``, overwrites that entry's ``sha256``. The document
	    is then written to ``--out`` and a short summary is printed.

	    Args:
	        argv: Command-line arguments to parse. ``None`` (the default) makes
	            argparse fall back to ``sys.argv[1:]``.

	    Returns:
	        Always 0; archive ids without a mapping or without a matching
	        Croissant entry are only reported, not treated as failures.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--croissant", required=True, type=Path)
	    ap.add_argument("--digests", required=True, type=Path)
	    ap.add_argument("--out", required=True, type=Path)
	    args = ap.parse_args(argv)

	    # JSON is read and written as UTF-8 explicitly: the output keeps raw
	    # non-ASCII characters (ensure_ascii=False), which a non-UTF-8 locale
	    # default encoding could fail to encode.
	    cr = json.loads(args.croissant.read_text(encoding="utf-8"))
	    digests = read_digests(args.digests)

	    # Index distribution entries by @id (last entry wins on duplicates).
	    by_id = {e.get("@id"): e for e in cr.get("distribution", [])}

	    matched = 0
	    unmatched = []  # archive ids with no entry in ARCHIVE_TO_CROISSANT
	    missing_in_cr = []  # (archive_id, @id) pairs absent from croissant.json
	    for archive_id, sha in digests.items():
	        cr_id = ARCHIVE_TO_CROISSANT.get(archive_id)
	        if not cr_id:
	            unmatched.append(archive_id)
	            continue
	        entry = by_id.get(cr_id)
	        if entry is None:
	            missing_in_cr.append((archive_id, cr_id))
	            continue
	        entry["sha256"] = sha
	        matched += 1

	    args.out.write_text(
	        json.dumps(cr, indent=2, ensure_ascii=False) + "\n",
	        encoding="utf-8",
	    )

	    print(f"updated {matched} cr:FileObject entries with real sha256")
	    if unmatched:
	        print(f" no croissant mapping for: {unmatched}")
	    if missing_in_cr:
	        print(f" croissant @id not found for: {missing_in_cr}")
	    print(f" wrote: {args.out}")
	    return 0


	if __name__ == "__main__":
	    # Run the CLI and use main()'s return value as the process exit status.
	    raise SystemExit(main())