#!/bin/bash
# Rebuild the sampled datasets from scratch, starting from the public raw
# `usm3d/hoho22k_2026_trainval` dataset. Two stages:
#
#   1. cache_scenes.py       : stream raw shards -> per-scene .pt files
#                              (runs point fusion + priority grouping)
#   2. make_sampled_cache.py : per-scene .pt -> fixed-size .npz files
#                              (priority-samples to seq_len=2048 or 4096)
#
# This reproduces the content of
#   hf://usm3d/s23dr-2026-sampled_2048_v2
#   hf://usm3d/s23dr-2026-sampled_4096_v2
# without needing the intermediate (private) cached_full_pcd dataset.
#
# Usage: optionally pass an output root as the first argument (default: cache).
# ~3-4 hr on a workstation for the full train+val set (network-bound in stage 1).

set -e

OUT_ROOT="${1:-cache}"
FULL_TRAIN="$OUT_ROOT/full/train"
FULL_VAL="$OUT_ROOT/full/validation"

# ----- Stage 1: raw -> per-scene .pt -----
echo "=== Stage 1: caching train scenes from raw tars ==="
python -m s23dr_2026_example.cache_scenes --out-dir "$FULL_TRAIN" --split train --skip-existing

echo "=== Stage 1: caching validation scenes from raw tars ==="
python -m s23dr_2026_example.cache_scenes --out-dir "$FULL_VAL" --split validation --skip-existing

# ----- Stage 2: .pt -> sampled .npz -----
for split in train validation; do
  for seq_len in 2048 4096; do
    in_dir="$OUT_ROOT/full/$split"
    out_dir="$OUT_ROOT/sampled_${seq_len}/$split"
    echo "=== Stage 2: sampling $split at seq_len=$seq_len ==="
    python -m s23dr_2026_example.make_sampled_cache \
      --in-dir "$in_dir" --out-dir "$out_dir" --seq-len "$seq_len"
  done
done

echo ""
echo "All done. Sampled datasets are at:"
echo "  $OUT_ROOT/sampled_2048/{train,validation}"
echo "  $OUT_ROOT/sampled_4096/{train,validation}"
echo ""
echo "To train from these, point reproduce.sh at them via"
echo "  --cache-dir \"\$OUT_ROOT/sampled_2048/train\"  (and similar for val/4096)"
echo "instead of the default hf:// URLs."
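
# --- Optional sanity check (illustrative sketch; safe to delete) ---
# Counts the outputs of both stages so a partial or interrupted rebuild is
# easy to spot. It assumes only the directory layout and file extensions
# created above (.pt per scene from stage 1, .npz per sample from stage 2).
for split in train validation; do
  pt_count=$(find "$OUT_ROOT/full/$split" -name '*.pt' | wc -l)
  echo "full/$split: $pt_count per-scene .pt files"
  for seq_len in 2048 4096; do
    npz_count=$(find "$OUT_ROOT/sampled_${seq_len}/$split" -name '*.npz' | wc -l)
    echo "sampled_${seq_len}/$split: $npz_count sampled .npz files"
  done
done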
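
# --- Optional: spot-check one sampled file (illustrative sketch) ---
# Loads an arbitrary .npz from the 2048 train split and prints the array
# names it contains. No particular key names are assumed; np.load simply
# lists whatever make_sampled_cache.py wrote.
sample=$(find "$OUT_ROOT/sampled_2048/train" -name '*.npz' | head -n 1)
if [ -n "$sample" ]; then
  python -c 'import sys, numpy as np; d = np.load(sys.argv[1]); print(sys.argv[1], "->", sorted(d.files))' "$sample"
fi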