#!/usr/bin/env bash
# =============================================================================
# Spider-FLEXITOKENS Remote Training Launch Script
# Target: NVIDIA RTX 6000 Pro (Blackwell sm120+, 48GB GDDR7)
# Precision: MXFP8 (rowwise_with_gw_hp) with FP8_DYNAMIC fallback
# =============================================================================
set -euo pipefail
# ---------------------------------------------------------------------------
# Configuration — adjust these for your remote environment
# ---------------------------------------------------------------------------
# Every setting can be overridden from the caller's environment; the value
# after `:-` is used when the variable is unset or empty.
export CKPT_DIR="${CKPT_DIR:-checkpoints-spider-remote}"
export SEQ_LEN="${SEQ_LEN:-2048}"
export MICRO_BATCH="${MICRO_BATCH:-8}"
export GRAD_ACCUM="${GRAD_ACCUM:-4}"
export TARGET_TOKENS="${TARGET_TOKENS:-10000000000}" # 10B tokens
export N_LOOPS="${N_LOOPS:-6}"
export LR="${LR:-3e-4}"
export CKPT_EVERY="${CKPT_EVERY:-500}"
export PRECISION="${PRECISION:-mxfp8}" # mxfp8|fp8_dynamic|bf16
export RESUME="${RESUME:-}" # path to checkpoint for resume

#######################################
# Abort unless the named variable holds a positive base-10 integer.
# Arguments: $1 - NAME of the variable to check (not its value)
# Outputs:   error message on stderr when the check fails
# Returns:   0 when valid; exits the shell with status 1 otherwise
#######################################
require_positive_int() {
  local name=$1
  local value=${!name}
  # Leading zeros are rejected too: bash arithmetic would read "08" as an
  # (invalid) octal literal.
  if [[ ! "${value}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'ERROR: %s must be a positive integer (got "%s")\n' \
      "${name}" "${value}" >&2
    exit 1
  fi
}

# Only the values used in the arithmetic below are checked: a zero would
# cause a division by zero and non-numeric text would be evaluated as an
# arithmetic expression.
for _cfg_name in SEQ_LEN MICRO_BATCH GRAD_ACCUM TARGET_TOKENS; do
  require_positive_int "${_cfg_name}"
done
unset _cfg_name

# Derived
# Tokens consumed per optimizer step, and the (floor) number of such steps
# needed to reach TARGET_TOKENS.
export GLOBAL_BATCH_TOK=$(( MICRO_BATCH * GRAD_ACCUM * SEQ_LEN ))
export TOTAL_STEPS=$(( TARGET_TOKENS / GLOBAL_BATCH_TOK ))
# Startup banner: gather the dynamic fields first, then print every line with
# a single printf call.
banner_rule="============================================="
# nvidia-smi may be absent or fail; show N/A in that case instead of aborting.
gpu_name="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'N/A')"
gpu_vram="$(nvidia-smi --query-gpu=memory.total --format=csv,noheader 2>/dev/null || echo 'N/A')"
# Integer division: token budget expressed in whole billions.
target_billions=$(( TARGET_TOKENS / 1000000000 ))
# With RESUME empty, describe the checkpoint directory instead.
resume_label="${RESUME:-none (auto-resume from ${CKPT_DIR})}"
printf '%s\n' \
  "${banner_rule}" \
  " Spider-FLEXITOKENS Remote Training" \
  "${banner_rule}" \
  " GPU: ${gpu_name}" \
  " VRAM: ${gpu_vram}" \
  " Precision: ${PRECISION}" \
  " Seq length: ${SEQ_LEN}" \
  " Micro batch: ${MICRO_BATCH}" \
  " Grad accum: ${GRAD_ACCUM}" \
  " Global batch: ${GLOBAL_BATCH_TOK} tokens/step" \
  " Target tokens: ${TARGET_TOKENS} (${target_billions}B)" \
  " Total steps: ~${TOTAL_STEPS}" \
  " LR: ${LR}" \
  " N loops: ${N_LOOPS}" \
  " Checkpoint dir: ${CKPT_DIR}" \
  " Resume from: ${resume_label}" \
  "${banner_rule}"
# ---------------------------------------------------------------------------
# Environment setup
# ---------------------------------------------------------------------------
# Fixed settings: these always replace whatever the caller had exported.
export \
  PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
  UNSLOTH_MOE_BACKEND="grouped_mm" \
  CUDA_DEVICE_ORDER="PCI_BUS_ID"
# Caller-overridable: an existing non-empty value is kept, otherwise 8.
: "${OMP_NUM_THREADS:=8}"
export OMP_NUM_THREADS
# ---------------------------------------------------------------------------
# Dependency check
# ---------------------------------------------------------------------------
# Mandatory: PyTorch with a usable CUDA device. A missing package and a
# missing GPU are reported separately. The compute capability is printed as
# major+minor (e.g. sm120) because the major number alone (sm12) is ambiguous.
python3 -c '
import sys
try:
    import torch
except ImportError as exc:
    sys.exit(f"ERROR: PyTorch not found ({exc})")
if not torch.cuda.is_available():
    sys.exit(f"ERROR: PyTorch {torch.__version__} is installed but no CUDA device is usable")
major, minor = torch.cuda.get_device_capability()
print(f"PyTorch {torch.__version__} | CUDA {torch.version.cuda} | sm{major}{minor}")
' || { echo "ERROR: PyTorch/CUDA preflight check failed" >&2; exit 1; }
# Optional packages: a failed import only prints a notice and the script
# continues.
# NOTE(review): this script never changes PRECISION after these warnings; the
# BF16 fallback mentioned below presumably happens in train_spider.py — confirm.
python3 -c "import torchao; print(f'torchao {torchao.__version__}')" || echo "WARNING: torchao not found — FP8/MXFP8 unavailable, will use BF16"
python3 -c "from torchao.float8 import Float8LinearConfig; from torchao.float8.config import Float8LinearRecipeName; print(f'Float8LinearConfig OK (recipes: {[n.value for n in Float8LinearRecipeName]})')" || echo "WARNING: torchao.float8 not available"
python3 -c "import bitsandbytes; print(f'bitsandbytes {bitsandbytes.__version__}')" || echo "INFO: bitsandbytes not found — using standard AdamW (expected for FP8+ modes)"
# stderr is silenced here so a missing Unsloth does not print a traceback.
python3 -c "import unsloth; print('Unsloth available')" 2>/dev/null || echo "INFO: Unsloth not available — using standard PyTorch training"
# ---------------------------------------------------------------------------
# Create checkpoint directory
# ---------------------------------------------------------------------------
# `-p` makes this a no-op when the directory already exists; `--` stops option
# parsing so a CKPT_DIR beginning with "-" is treated as a path.
mkdir -p -- "${CKPT_DIR}"
# ---------------------------------------------------------------------------
# Build command
# ---------------------------------------------------------------------------
# Assemble the trainer invocation as an array so every value stays exactly one
# argument, even if it contains spaces or glob characters.
train_cmd=(
  python3 scripts/train_spider.py
  --precision "${PRECISION}"
  --ckpt_dir "${CKPT_DIR}"
  --seq_len "${SEQ_LEN}"
  --micro_batch "${MICRO_BATCH}"
  --n_loops "${N_LOOPS}"
  --lr "${LR}"
)
# NOTE(review): GRAD_ACCUM, CKPT_EVERY and TOTAL_STEPS are exported but not
# passed as flags — presumably train_spider.py reads them from the
# environment; confirm.
# --resume is only forwarded when RESUME names an existing regular file.
if [[ -n "${RESUME}" ]]; then
  if [[ -f "${RESUME}" ]]; then
    train_cmd+=(--resume "${RESUME}")
  else
    echo "WARNING: RESUME='${RESUME}' is not a regular file — --resume will not be passed" >&2
  fi
fi
# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
echo ""
echo "Launching training..."
# %q shell-quotes each argument so the logged line can be pasted back into a
# shell unchanged; simple values print exactly as written.
printf 'Command:'
printf ' %q' "${train_cmd[@]}"
printf '\n'
echo ""
# exec replaces this shell with the trainer, so signals and the exit status
# belong to the training process directly.
exec "${train_cmd[@]}"
|