#!/bin/bash
# Rebuild the sampled datasets from scratch, starting from the public raw
# `usm3d/hoho22k_2026_trainval` dataset. Two stages:
#
# 1. cache_scenes.py       : stream raw shards -> per-scene .pt files
#                            (runs point fusion + priority grouping)
# 2. make_sampled_cache.py : per-scene .pt -> fixed-size .npz files
#                            (priority-samples to seq_len=2048 or 4096)
#
# This reproduces the content of
# hf://usm3d/s23dr-2026-sampled_2048_v2
# hf://usm3d/s23dr-2026-sampled_4096_v2
# without needing the intermediate (private) cached_full_pcd dataset.
#
# ~3-4 hr on a workstation for the full train+val set (network-bound in stage 1).
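#
# Usage: bash <this script> [OUT_ROOT]    # OUT_ROOT defaults to "cache"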
set -euo pipefail
OUT_ROOT="${1:-cache}"
FULL_TRAIN="$OUT_ROOT/full/train"
FULL_VAL="$OUT_ROOT/full/validation"
# ----- Stage 1: raw -> per-scene .pt -----
echo "=== Stage 1: caching train scenes from raw tars ==="
python -m s23dr_2026_example.cache_scenes --out-dir "$FULL_TRAIN" --split train --skip-existing
echo "=== Stage 1: caching validation scenes from raw tars ==="
python -m s23dr_2026_example.cache_scenes --out-dir "$FULL_VAL" --split validation --skip-existing
# ----- Stage 2: .pt -> sampled .npz -----
for split in train validation; do
    for seq_len in 2048 4096; do
        in_dir="$OUT_ROOT/full/$split"
        out_dir="$OUT_ROOT/sampled_${seq_len}/$split"
        echo "=== Stage 2: sampling $split at seq_len=$seq_len ==="
        python -m s23dr_2026_example.make_sampled_cache \
            --in-dir "$in_dir" --out-dir "$out_dir" --seq-len "$seq_len"
    done
done
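# Optional spot check: print key names and array shapes of one sampled file
# (assumes stage 2 writes standard .npz archives; numpy is already a stage 2
# dependency, so no extra install is needed).
first_npz=$(ls "$OUT_ROOT/sampled_2048/train"/*.npz 2>/dev/null | head -n 1 || true)
if [ -n "$first_npz" ]; then
    python -c 'import sys, numpy as np; d = np.load(sys.argv[1]); print({k: v.shape for k, v in d.items()})' "$first_npz"
fi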
echo ""
echo "All done. Sampled datasets are at:"
echo " $OUT_ROOT/sampled_2048/{train,validation}"
echo " $OUT_ROOT/sampled_4096/{train,validation}"
echo ""
echo "To train from these, point reproduce.sh at them via"
echo " --cache-dir \"\$OUT_ROOT/sampled_2048/train\" (and similar for val/4096)"
echo "instead of the default hf:// URLs."