#!/bin/bash
# Reproduce the best checkpoint in deterministic mode (bit-reproducible).
#
# Same three stages as reproduce.sh, but with --deterministic:
#   1. Train on 2048-point data (~3hr on 1x RTX 4090)
#   2. Finetune on 4096-point data (~30min)
#   3. Cooldown with endpoint loss (~2hr)
#
# Total: ~5.5hr on a single GPU (no torch.compile, ~2x slower than reproduce.sh).
# Deterministic mode disables torch.compile and forces CUDA deterministic ops.
# Results are bit-identical across runs with the same seed. Expected HSS ~0.372.
#
# Usage: <this script> [OUT_DIR]
#   OUT_DIR  directory passed to every stage as --out-dir (default: runs)

set -euo pipefail

readonly OUT_DIR="${1:-runs}"

# Arguments shared by every stage. Kept as an array so each word is passed
# as its own argument without relying on unquoted word-splitting.
readonly -a BASE_ARGS=(--args-from configs/base.json)

#######################################
# Print an error message to stderr and abort the script.
# Arguments: $* - message text
#######################################
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Print the sub-directory of $1 whose args.json has the newest mtime.
# Uses a glob and -nt instead of parsing `ls`, so directory names that
# contain whitespace are handled and "no match" is reported explicitly.
# Arguments: $1 - directory to search (one level deep)
# Outputs:   the run directory path on stdout
# Returns:   0 if a run directory was found, 1 otherwise
#######################################
latest_run_dir() {
  local root=$1
  local candidate
  local newest=""

  for candidate in "$root"/*/args.json; do
    # An unmatched glob stays literal; skip anything that does not exist.
    [[ -e "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done

  [[ -n "$newest" ]] || return 1
  printf '%s\n' "${newest%/args.json}"
}

#######################################
# Abort with a clear message if the checkpoint to resume from is missing.
# Arguments: $1 - checkpoint path
#######################################
require_checkpoint() {
  [[ -f "$1" ]] || die "expected checkpoint not found: $1"
}

#######################################
# Run one training stage.
# The argument order matches the original invocations: shared base args,
# then the stage-specific args, then --deterministic and --out-dir.
# Globals:   BASE_ARGS (read), OUT_DIR (read)
# Arguments: stage-specific trainer arguments
#######################################
train() {
  python -m s23dr_2026_example.train "${BASE_ARGS[@]}" "$@" \
    --deterministic \
    --out-dir "$OUT_DIR"
}

main() {
  local step1_dir step2_dir step3_dir resume_ckpt

  # ============================================================
  # Step 1: Train on 2048-point data (Phase 1)
  # ============================================================
  echo "=== Step 1: Training on 2048 data (deterministic) ==="
  train \
    --cache-dir hf://usm3d/s23dr-2026-sampled_2048_v2:train \
    --seq-len 2048 \
    --lr 3e-4 \
    --batch-size 32 \
    --steps 125000

  step1_dir=$(latest_run_dir "$OUT_DIR") \
    || die "no run directory containing args.json under '$OUT_DIR' after step 1"
  echo "Step 1 complete: $step1_dir"

  # ============================================================
  # Step 2: Finetune on 4096-point data
  # ============================================================
  resume_ckpt="$step1_dir/checkpoints/step125000.pt"
  require_checkpoint "$resume_ckpt"

  echo "=== Step 2: Finetuning on 4096 data (deterministic) ==="
  train \
    --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
    --resume "$resume_ckpt" \
    --seq-len 4096 \
    --lr 3e-5 \
    --batch-size 64 \
    --steps 135000

  step2_dir=$(latest_run_dir "$OUT_DIR") \
    || die "no run directory containing args.json under '$OUT_DIR' after step 2"
  echo "Step 2 complete: $step2_dir"

  # ============================================================
  # Step 3: Cooldown with endpoint loss
  # ============================================================
  resume_ckpt="$step2_dir/checkpoints/step135000.pt"
  require_checkpoint "$resume_ckpt"

  echo "=== Step 3: Cooldown with endpoint loss (deterministic) ==="
  train \
    --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
    --resume "$resume_ckpt" \
    --seq-len 4096 \
    --lr 3e-5 \
    --batch-size 64 \
    --endpoint-weight 0.1 \
    --cooldown-start 150000 \
    --cooldown-steps 20000 \
    --steps 170000

  step3_dir=$(latest_run_dir "$OUT_DIR") \
    || die "no run directory containing args.json under '$OUT_DIR' after step 3"
  echo "Step 3 complete: $step3_dir"

  echo ""
  echo "Final checkpoint: $step3_dir/checkpoints/final.pt"
  echo "Copy to checkpoint.pt for submission."
}

main "$@"