#!/bin/bash
# Rebuild the sampled datasets from scratch, starting from the public raw
# `usm3d/hoho22k_2026_trainval` dataset. Two stages:
#
#   1. cache_scenes.py       : stream raw shards -> per-scene .pt files
#                              (runs point fusion + priority grouping)
#   2. make_sampled_cache.py : per-scene .pt -> fixed-size .npz files
#                              (priority-samples to seq_len=2048 or 4096)
#
# This reproduces the content of
#   hf://usm3d/s23dr-2026-sampled_2048_v2
#   hf://usm3d/s23dr-2026-sampled_4096_v2
# without needing the intermediate (private) cached_full_pcd dataset.
#
# Usage: optionally pass an output root as the first argument (default: cache).
# ~3-4 hr on a workstation for the full train+val set (network-bound in stage 1).

set -e

OUT_ROOT="${1:-cache}"
FULL_TRAIN="$OUT_ROOT/full/train"
FULL_VAL="$OUT_ROOT/full/validation"

# ----- Stage 1: raw -> per-scene .pt -----
echo "=== Stage 1: caching train scenes from raw tars ==="
python -m s23dr_2026_example.cache_scenes --out-dir "$FULL_TRAIN" --split train --skip-existing

echo "=== Stage 1: caching validation scenes from raw tars ==="
python -m s23dr_2026_example.cache_scenes --out-dir "$FULL_VAL" --split validation --skip-existing

# ----- Stage 2: .pt -> sampled .npz -----
for split in train validation; do
  for seq_len in 2048 4096; do
    in_dir="$OUT_ROOT/full/$split"
    out_dir="$OUT_ROOT/sampled_${seq_len}/$split"
    echo "=== Stage 2: sampling $split at seq_len=$seq_len ==="
    python -m s23dr_2026_example.make_sampled_cache \
      --in-dir "$in_dir" --out-dir "$out_dir" --seq-len "$seq_len"
  done
done

echo ""
echo "All done. Sampled datasets are at:"
echo "  $OUT_ROOT/sampled_2048/{train,validation}"
echo "  $OUT_ROOT/sampled_4096/{train,validation}"
echo ""
echo "To train from these, point reproduce.sh at them via"
echo "  --cache-dir \"\$OUT_ROOT/sampled_2048/train\"  (and similar for val/4096)"
echo "instead of the default hf:// URLs."
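
# --- Optional sanity check (illustrative sketch; safe to delete) ---
# Counts the outputs of both stages so a partial or interrupted rebuild is
# easy to spot. It assumes only the directory layout and file extensions
# created above (.pt per scene from stage 1, .npz per sample from stage 2).
for split in train validation; do
  pt_count=$(find "$OUT_ROOT/full/$split" -name '*.pt' | wc -l)
  echo "full/$split: $pt_count per-scene .pt files"
  for seq_len in 2048 4096; do
    npz_count=$(find "$OUT_ROOT/sampled_${seq_len}/$split" -name '*.npz' | wc -l)
    echo "sampled_${seq_len}/$split: $npz_count sampled .npz files"
  done
done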
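
# --- Optional: spot-check one sampled file (illustrative sketch) ---
# Loads an arbitrary .npz from the 2048 train split and prints the array
# names it contains. No particular key names are assumed; np.load simply
# lists whatever make_sampled_cache.py wrote.
sample=$(find "$OUT_ROOT/sampled_2048/train" -name '*.npz' | head -n 1)
if [ -n "$sample" ]; then
  python -c 'import sys, numpy as np; d = np.load(sys.argv[1]); print(sys.argv[1], "->", sorted(d.files))' "$sample"
fi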