Upload train_remote.sh with huggingface_hub
Browse files- train_remote.sh +90 -0
train_remote.sh
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# =============================================================================
# Spider-FLEXITOKENS Remote Training Launch Script
# Target: NVIDIA RTX 6000 Pro (Blackwell sm120+, 48GB GDDR7)
# Precision: MXFP8 (rowwise_with_gw_hp) with FP8_DYNAMIC fallback
#
# Every setting below can be overridden from the environment, e.g.:
#   SEQ_LEN=4096 MICRO_BATCH=4 PRECISION=bf16 ./train_remote.sh
# =============================================================================
set -euo pipefail

#######################################
# Abort unless the named variable holds a positive decimal integer.
# Values used in $(( ... )) must be validated first: bash would otherwise
# evaluate arbitrary strings as arithmetic expressions, treat a leading 0
# as octal, or die with an opaque "division by 0" message.
# Arguments: $1 - NAME of the variable to check (not its value)
# Outputs:   error message on stderr when the check fails
# Returns:   exits the shell with status 1 on failure
#######################################
require_positive_int() {
  local name=$1
  local value=${!name-}
  if [[ ! "${value}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'ERROR: %s must be a positive integer, got: %q\n' "${name}" "${value}" >&2
    exit 1
  fi
}

# ---------------------------------------------------------------------------
# Configuration — adjust these for your remote environment
# ---------------------------------------------------------------------------
export CKPT_DIR="${CKPT_DIR:-checkpoints-spider-remote}"
export SEQ_LEN="${SEQ_LEN:-2048}"
export MICRO_BATCH="${MICRO_BATCH:-8}"
export GRAD_ACCUM="${GRAD_ACCUM:-4}"
export TARGET_TOKENS="${TARGET_TOKENS:-10000000000}"  # 10B tokens
export N_LOOPS="${N_LOOPS:-6}"
export LR="${LR:-3e-4}"
export CKPT_EVERY="${CKPT_EVERY:-500}"
export PRECISION="${PRECISION:-mxfp8}"  # mxfp8|fp8_dynamic|bf16
export RESUME="${RESUME:-}"  # path to checkpoint for resume

# Only the values that feed the arithmetic below are validated here.
require_positive_int SEQ_LEN
require_positive_int MICRO_BATCH
require_positive_int GRAD_ACCUM
require_positive_int TARGET_TOKENS

# Derived
# Tokens consumed per optimizer step, and the (floor) number of steps needed
# to reach TARGET_TOKENS.
export GLOBAL_BATCH_TOK=$(( MICRO_BATCH * GRAD_ACCUM * SEQ_LEN ))
export TOTAL_STEPS=$(( TARGET_TOKENS / GLOBAL_BATCH_TOK ))
|
| 26 |
+
|
| 27 |
+
# Run summary. The two nvidia-smi probes fall back to "N/A" when the tool is
# missing or fails, so the banner never aborts the script.
cat <<EOF
=============================================
 Spider-FLEXITOKENS Remote Training
=============================================
 GPU: $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'N/A')
 VRAM: $(nvidia-smi --query-gpu=memory.total --format=csv,noheader 2>/dev/null || echo 'N/A')
 Precision: ${PRECISION}
 Seq length: ${SEQ_LEN}
 Micro batch: ${MICRO_BATCH}
 Grad accum: ${GRAD_ACCUM}
 Global batch: ${GLOBAL_BATCH_TOK} tokens/step
 Target tokens: ${TARGET_TOKENS} ($(( TARGET_TOKENS / 1000000000 ))B)
 Total steps: ~${TOTAL_STEPS}
 LR: ${LR}
 N loops: ${N_LOOPS}
 Checkpoint dir: ${CKPT_DIR}
 Resume from: ${RESUME:-none (auto-resume from ${CKPT_DIR})}
=============================================
EOF
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
# Environment setup
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
# Fixed runtime settings; only OMP_NUM_THREADS honours a value already present
# in the caller's environment (default 8).
# PYTORCH_CUDA_ALLOC_CONF is PyTorch's CUDA allocator configuration variable;
# UNSLOTH_MOE_BACKEND is presumably read by Unsloth — confirm against its docs.
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
UNSLOTH_MOE_BACKEND="grouped_mm"
CUDA_DEVICE_ORDER="PCI_BUS_ID"
: "${OMP_NUM_THREADS:=8}"
export PYTORCH_CUDA_ALLOC_CONF UNSLOTH_MOE_BACKEND CUDA_DEVICE_ORDER OMP_NUM_THREADS
|
| 52 |
+
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
# Dependency check
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# Hard requirement: PyTorch must import AND see a CUDA device. The failure
# causes are reported separately, because get_device_capability() raises when
# no GPU is usable and that is not the same problem as a missing package.
# The compute capability is printed as major+minor (e.g. sm120).
python3 - <<'PY' || { echo "ERROR: PyTorch/CUDA preflight check failed" >&2; exit 1; }
import sys

try:
    import torch
except ImportError as exc:
    sys.exit(f"ERROR: PyTorch not found ({exc})")

if not torch.cuda.is_available():
    sys.exit("ERROR: PyTorch is installed but no CUDA device is available")

major, minor = torch.cuda.get_device_capability()
print(f"PyTorch {torch.__version__} | CUDA {torch.version.cuda} | sm{major}{minor}")
PY

# Optional packages: each probe only reports; none of them aborts the launch.
# NOTE(review): this script never changes PRECISION when torchao is missing,
# so the BF16 fallback mentioned below must happen inside the trainer — confirm.
python3 -c "import torchao; print(f'torchao {torchao.__version__}')" \
  || echo "WARNING: torchao not found — FP8/MXFP8 unavailable, will use BF16"
python3 -c "from torchao.float8 import Float8LinearConfig; from torchao.float8.config import Float8LinearRecipeName; print(f'Float8LinearConfig OK (recipes: {[n.value for n in Float8LinearRecipeName]})')" \
  || echo "WARNING: torchao.float8 not available"
python3 -c "import bitsandbytes; print(f'bitsandbytes {bitsandbytes.__version__}')" \
  || echo "INFO: bitsandbytes not found — using standard AdamW (expected for FP8+ modes)"
# stderr is silenced here only: a missing Unsloth is an expected configuration.
python3 -c "import unsloth; print('Unsloth available')" 2>/dev/null \
  || echo "INFO: Unsloth not available — using standard PyTorch training"
|
| 61 |
+
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
# Create checkpoint directory
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
# "--" ends option parsing so a CKPT_DIR that starts with "-" is still treated
# as a path; -p makes this a no-op when the directory already exists.
mkdir -p -- "${CKPT_DIR}"
|
| 66 |
+
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
# Build command
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
# The trainer invocation is kept as an array so that every value reaches
# train_spider.py as exactly one argument, even if it contains spaces or glob
# characters (a flat string expanded unquoted would be re-split by the shell).
# NOTE(review): GRAD_ACCUM, TARGET_TOKENS, CKPT_EVERY and TOTAL_STEPS are not
# passed as flags; presumably the trainer reads the exported variables —
# confirm against scripts/train_spider.py.
train_cmd=(
  python3 scripts/train_spider.py
  --precision "${PRECISION}"
  --ckpt_dir "${CKPT_DIR}"
  --seq_len "${SEQ_LEN}"
  --micro_batch "${MICRO_BATCH}"
  --n_loops "${N_LOOPS}"
  --lr "${LR}"
)

# --resume is only forwarded for an existing regular file. A RESUME value that
# points nowhere is still ignored, but no longer silently.
if [[ -n "${RESUME}" ]]; then
  if [[ -f "${RESUME}" ]]; then
    train_cmd+=(--resume "${RESUME}")
  else
    echo "WARNING: RESUME='${RESUME}' is not a regular file — ignoring it" >&2
  fi
fi

# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
echo ""
echo "Launching training..."
# %q prints each argument shell-quoted, so the logged line can be pasted back
# into a shell unchanged.
printf 'Command:'
printf ' %q' "${train_cmd[@]}"
printf '\n'
echo ""

# exec replaces this shell with the trainer, so signals and the exit status
# belong to the training process directly.
exec "${train_cmd[@]}"
|