Other
PyTorch
3d-reconstruction
wireframe
building
point-cloud
s23dr
cvpr-2026
learned-baseline-2026 / reproduce_deterministic.sh
jacklangerman's picture
4096-release (#1)
0f31e57
#!/bin/bash
# Reproduce the best checkpoint in deterministic mode (bit-reproducible).
#
# Same three stages as reproduce.sh, but with --deterministic:
# 1. Train on 2048-point data (~3hr on 1x RTX 4090)
# 2. Finetune on 4096-point data (~30min)
# 3. Cooldown with endpoint loss (~2hr)
#
# Total: ~5.5hr on a single GPU (no torch.compile, ~2x slower than reproduce.sh).
# Deterministic mode disables torch.compile and forces CUDA deterministic ops.
# Results are bit-identical across runs with the same seed. Expected HSS ~0.372.
# Abort the pipeline as soon as any command fails, so a later stage does not
# start after an earlier one has failed.
set -o errexit
# Shared arguments prepended to every training invocation. Kept as a single
# string; the call sites expand it unquoted so it splits into two words.
BASE='--args-from configs/base.json'
# Directory handed to --out-dir for all stages: the first positional
# argument, falling back to "runs" when it is missing or empty.
OUT_DIR=${1:-runs}
# ============================================================
# Step 1: Train on 2048-point data (Phase 1)
# ============================================================
echo "=== Step 1: Training on 2048 data (deterministic) ==="
# $BASE is deliberately unquoted: it holds several space-separated arguments
# that must be word-split into separate argv entries.
# shellcheck disable=SC2086
python -m s23dr_2026_example.train $BASE \
  --cache-dir hf://usm3d/s23dr-2026-sampled_2048_v2:train \
  --seq-len 2048 \
  --lr 3e-4 \
  --batch-size 32 \
  --steps 125000 \
  --deterministic \
  --out-dir "$OUT_DIR"

# Locate this stage's run directory: the parent of the most recently modified
# */args.json under $OUT_DIR. A glob loop with -nt is used instead of parsing
# `ls` output so that paths containing whitespace are handled correctly.
STEP1_DIR=""
for step1_candidate in "$OUT_DIR"/*/args.json; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "$step1_candidate" ]] || continue
  if [[ -z "$STEP1_DIR" || "$step1_candidate" -nt "$STEP1_DIR/args.json" ]]; then
    STEP1_DIR=${step1_candidate%/args.json}
  fi
done
# Fail with a clear message rather than continuing with an empty path.
if [[ -z "$STEP1_DIR" ]]; then
  echo "error: no run directory containing args.json found under '$OUT_DIR' after step 1" >&2
  exit 1
fi
echo "Step 1 complete: $STEP1_DIR"
# ============================================================
# Step 2: Finetune on 4096-point data
# ============================================================
echo "=== Step 2: Finetuning on 4096 data (deterministic) ==="
# Verify the checkpoint to resume from exists before launching the run, so a
# wrong or empty STEP1_DIR is reported here with a clear message.
STEP2_RESUME="$STEP1_DIR/checkpoints/step125000.pt"
if [[ ! -e "$STEP2_RESUME" ]]; then
  echo "error: step 1 checkpoint not found: $STEP2_RESUME" >&2
  exit 1
fi
# $BASE is deliberately unquoted: it holds several space-separated arguments
# that must be word-split into separate argv entries.
# shellcheck disable=SC2086
python -m s23dr_2026_example.train $BASE \
  --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
  --resume "$STEP2_RESUME" \
  --seq-len 4096 \
  --lr 3e-5 \
  --batch-size 64 \
  --steps 135000 \
  --deterministic \
  --out-dir "$OUT_DIR"

# Locate this stage's run directory: the parent of the most recently modified
# */args.json under $OUT_DIR. A glob loop with -nt is used instead of parsing
# `ls` output so that paths containing whitespace are handled correctly.
STEP2_DIR=""
for step2_candidate in "$OUT_DIR"/*/args.json; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "$step2_candidate" ]] || continue
  if [[ -z "$STEP2_DIR" || "$step2_candidate" -nt "$STEP2_DIR/args.json" ]]; then
    STEP2_DIR=${step2_candidate%/args.json}
  fi
done
# Fail with a clear message rather than continuing with an empty path.
if [[ -z "$STEP2_DIR" ]]; then
  echo "error: no run directory containing args.json found under '$OUT_DIR' after step 2" >&2
  exit 1
fi
echo "Step 2 complete: $STEP2_DIR"
# ============================================================
# Step 3: Cooldown with endpoint loss
# ============================================================
echo "=== Step 3: Cooldown with endpoint loss (deterministic) ==="
# Verify the checkpoint to resume from exists before launching the run, so a
# wrong or empty STEP2_DIR is reported here with a clear message.
STEP3_RESUME="$STEP2_DIR/checkpoints/step135000.pt"
if [[ ! -e "$STEP3_RESUME" ]]; then
  echo "error: step 2 checkpoint not found: $STEP3_RESUME" >&2
  exit 1
fi
# $BASE is deliberately unquoted: it holds several space-separated arguments
# that must be word-split into separate argv entries.
# shellcheck disable=SC2086
python -m s23dr_2026_example.train $BASE \
  --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
  --resume "$STEP3_RESUME" \
  --seq-len 4096 \
  --lr 3e-5 \
  --batch-size 64 \
  --endpoint-weight 0.1 \
  --cooldown-start 150000 \
  --cooldown-steps 20000 \
  --steps 170000 \
  --deterministic \
  --out-dir "$OUT_DIR"

# Locate this stage's run directory: the parent of the most recently modified
# */args.json under $OUT_DIR. A glob loop with -nt is used instead of parsing
# `ls` output so that paths containing whitespace are handled correctly.
STEP3_DIR=""
for step3_candidate in "$OUT_DIR"/*/args.json; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "$step3_candidate" ]] || continue
  if [[ -z "$STEP3_DIR" || "$step3_candidate" -nt "$STEP3_DIR/args.json" ]]; then
    STEP3_DIR=${step3_candidate%/args.json}
  fi
done
# Fail with a clear message rather than printing an empty checkpoint path.
if [[ -z "$STEP3_DIR" ]]; then
  echo "error: no run directory containing args.json found under '$OUT_DIR' after step 3" >&2
  exit 1
fi
echo "Step 3 complete: $STEP3_DIR"
echo ""
echo "Final checkpoint: $STEP3_DIR/checkpoints/final.pt"
echo "Copy to checkpoint.pt for submission."