#!/bin/bash
# Reproduce the best checkpoint in deterministic mode (bit-reproducible).
#
# Same three stages as reproduce.sh, but with --deterministic:
#   1. Train on 2048-point data       (~3hr on 1x RTX 4090)
#   2. Finetune on 4096-point data    (~30min)
#   3. Cooldown with endpoint loss    (~2hr)
#
# Total: ~5.5hr on a single GPU (no torch.compile, ~2x slower than reproduce.sh).
# Deterministic mode disables torch.compile and forces CUDA deterministic ops.
# Results are bit-identical across runs with the same seed. Expected HSS ~0.372.
# Strict mode: abort on any failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# Usage: <this script> [OUT_DIR]
#   OUT_DIR  directory that receives one sub-directory per training run
#            (defaults to "runs").
OUT_DIR="${1:-runs}"

# Arguments shared by every stage. Kept as an array so they are passed as
# separate words without relying on unquoted word-splitting.
BASE=(--args-from configs/base.json)

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Print the run directory whose args.json was modified most recently.
# Uses a glob plus the -nt (newer-than, by modification time) test instead of
# parsing `ls` output, so directory names containing spaces or quotes work.
# NOTE(review): assumes each training run writes <root>/<run>/args.json and
# that no unrelated run writes into the same root concurrently -- confirm.
# Arguments: $1 - root directory containing the run directories
# Outputs:   the newest run directory on stdout
# Returns:   0 on success, 1 if no <root>/*/args.json exists
#######################################
latest_run_dir() {
  local root=$1
  local candidate
  local newest=""
  for candidate in "$root"/*/args.json; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  [[ -n "$newest" ]] || return 1
  printf '%s\n' "${newest%/args.json}"
}

#######################################
# Abort unless the given checkpoint file exists, so that a missing checkpoint
# is reported immediately instead of surfacing inside the next stage.
# Arguments: $1 - path to the checkpoint file
#######################################
require_checkpoint() {
  [[ -f "$1" ]] || die "expected checkpoint not found: $1"
}

# ============================================================
# Step 1: Train on 2048-point data (Phase 1)
# ============================================================
echo "=== Step 1: Training on 2048 data (deterministic) ==="
python -m s23dr_2026_example.train "${BASE[@]}" \
    --cache-dir hf://usm3d/s23dr-2026-sampled_2048_v2:train \
    --seq-len 2048 \
    --lr 3e-4 \
    --batch-size 32 \
    --steps 125000 \
    --deterministic \
    --out-dir "$OUT_DIR"

STEP1_DIR=$(latest_run_dir "$OUT_DIR") \
    || die "no run directory with args.json found under '$OUT_DIR' after step 1"
echo "Step 1 complete: $STEP1_DIR"

# ============================================================
# Step 2: Finetune on 4096-point data
# ============================================================
STEP1_CKPT="$STEP1_DIR/checkpoints/step125000.pt"
require_checkpoint "$STEP1_CKPT"

echo "=== Step 2: Finetuning on 4096 data (deterministic) ==="
python -m s23dr_2026_example.train "${BASE[@]}" \
    --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
    --resume "$STEP1_CKPT" \
    --seq-len 4096 \
    --lr 3e-5 \
    --batch-size 64 \
    --steps 135000 \
    --deterministic \
    --out-dir "$OUT_DIR"

STEP2_DIR=$(latest_run_dir "$OUT_DIR") \
    || die "no run directory with args.json found under '$OUT_DIR' after step 2"
echo "Step 2 complete: $STEP2_DIR"

# ============================================================
# Step 3: Cooldown with endpoint loss
# ============================================================
STEP2_CKPT="$STEP2_DIR/checkpoints/step135000.pt"
require_checkpoint "$STEP2_CKPT"

echo "=== Step 3: Cooldown with endpoint loss (deterministic) ==="
python -m s23dr_2026_example.train "${BASE[@]}" \
    --cache-dir hf://usm3d/s23dr-2026-sampled_4096_v2:train \
    --resume "$STEP2_CKPT" \
    --seq-len 4096 \
    --lr 3e-5 \
    --batch-size 64 \
    --endpoint-weight 0.1 \
    --cooldown-start 150000 \
    --cooldown-steps 20000 \
    --steps 170000 \
    --deterministic \
    --out-dir "$OUT_DIR"

STEP3_DIR=$(latest_run_dir "$OUT_DIR") \
    || die "no run directory with args.json found under '$OUT_DIR' after step 3"
echo "Step 3 complete: $STEP3_DIR"
echo ""
echo "Final checkpoint: $STEP3_DIR/checkpoints/final.pt"
echo "Copy to checkpoint.pt for submission."