CLIWorks
/

Spider-FLEXITOKENS

Model card Files Files and versions

xet

Community

CLIWorks committed 14 days ago

Commit

513d5a8

verified ·

1 Parent(s): 546fd8f

Upload train_remote.sh with huggingface_hub

Browse files

Files changed (1) hide show

train_remote.sh +90 -0

train_remote.sh ADDED Viewed

	@@ -0,0 +1,90 @@

+#!/usr/bin/env bash
+# =============================================================================
+# Spider-FLEXITOKENS Remote Training Launch Script
+# Target: NVIDIA RTX 6000 Pro (Blackwell sm120+, 48GB GDDR7)
+# Precision: MXFP8 (rowwise_with_gw_hp) with FP8_DYNAMIC fallback
+#
+# Usage: [VAR=value ...] ./train_remote.sh
+# Every setting in the Configuration section can be overridden from the
+# environment. The script prints a run summary, probes Python dependencies,
+# creates the checkpoint directory and finally exec's scripts/train_spider.py
+# (resolved relative to the current working directory).
+# =============================================================================
+set -euo pipefail
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+# Print an error message to stderr and abort with status 1.
+die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; }
+# Print a warning to stderr and keep going.
+warn() { printf 'WARNING: %s\n' "$*" >&2; }
+# Abort unless the variable named by $1 holds a positive decimal integer.
+# Protects the arithmetic below from division by zero, from octal parsing of
+# values such as "010", and from the shell evaluating arbitrary text that
+# arrived through the environment.
+require_positive_int() {
+  local name=$1
+  local value=${!name}
+  [[ "${value}" =~ ^[1-9][0-9]*$ ]] \
+    || die "${name} must be a positive integer, got '${value}'"
+}
+# ---------------------------------------------------------------------------
+# Configuration — adjust these for your remote environment
+# ---------------------------------------------------------------------------
+export CKPT_DIR="${CKPT_DIR:-checkpoints-spider-remote}"
+export SEQ_LEN="${SEQ_LEN:-2048}"
+export MICRO_BATCH="${MICRO_BATCH:-8}"
+export GRAD_ACCUM="${GRAD_ACCUM:-4}"
+export TARGET_TOKENS="${TARGET_TOKENS:-10000000000}"   # 10B tokens
+export N_LOOPS="${N_LOOPS:-6}"
+export LR="${LR:-3e-4}"
+export CKPT_EVERY="${CKPT_EVERY:-500}"
+export PRECISION="${PRECISION:-mxfp8}"                  # mxfp8|fp8_dynamic|bf16
+export RESUME="${RESUME:-}"                             # path to checkpoint for resume
+# Validate everything that feeds the arithmetic below before using it.
+for var_name in SEQ_LEN MICRO_BATCH GRAD_ACCUM TARGET_TOKENS; do
+  require_positive_int "${var_name}"
+done
+# Derived: tokens consumed per optimizer step, and the (floored) step count
+# needed to reach TARGET_TOKENS.
+export GLOBAL_BATCH_TOK=$(( MICRO_BATCH * GRAD_ACCUM * SEQ_LEN ))
+export TOTAL_STEPS=$(( TARGET_TOKENS / GLOBAL_BATCH_TOK ))
+# NOTE(review): GRAD_ACCUM, TARGET_TOKENS, CKPT_EVERY, GLOBAL_BATCH_TOK and
+# TOTAL_STEPS are exported but never passed as command-line flags below —
+# presumably train_spider.py reads them from the environment; TODO confirm.
+echo "============================================="
+echo " Spider-FLEXITOKENS Remote Training"
+echo "============================================="
+echo " GPU:             $(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo 'N/A')"
+echo " VRAM:            $(nvidia-smi --query-gpu=memory.total --format=csv,noheader 2>/dev/null || echo 'N/A')"
+echo " Precision:       ${PRECISION}"
+echo " Seq length:      ${SEQ_LEN}"
+echo " Micro batch:     ${MICRO_BATCH}"
+echo " Grad accum:      ${GRAD_ACCUM}"
+echo " Global batch:    ${GLOBAL_BATCH_TOK} tokens/step"
+echo " Target tokens:   ${TARGET_TOKENS} ($(( TARGET_TOKENS / 1000000000 ))B)"
+echo " Total steps:     ~${TOTAL_STEPS}"
+echo " LR:              ${LR}"
+echo " N loops:         ${N_LOOPS}"
+echo " Checkpoint dir:  ${CKPT_DIR}"
+echo " Resume from:     ${RESUME:-none (auto-resume from ${CKPT_DIR})}"
+echo "============================================="
+# ---------------------------------------------------------------------------
+# Environment setup
+# ---------------------------------------------------------------------------
+export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
+export UNSLOTH_MOE_BACKEND="grouped_mm"
+export CUDA_DEVICE_ORDER="PCI_BUS_ID"
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"
+# ---------------------------------------------------------------------------
+# Dependency check
+# ---------------------------------------------------------------------------
+# PyTorch is the only hard requirement. This probe also queries the CUDA
+# device, so it fails both when torch is missing and when no GPU is visible;
+# the error message covers both. Major and minor capability are printed
+# together (e.g. sm120) rather than the major number alone.
+python3 -c "import torch; cap = torch.cuda.get_device_capability(); print(f'PyTorch {torch.__version__} | CUDA {torch.version.cuda} | sm{cap[0]}{cap[1]}')" \
+  || die "PyTorch check failed (torch not importable or no CUDA device visible)"
+# The remaining probes are advisory only. This script passes PRECISION through
+# unchanged; any fallback to BF16 is presumably handled inside
+# train_spider.py — TODO confirm.
+python3 -c "import torchao; print(f'torchao {torchao.__version__}')" \
+  || warn "torchao not found — FP8/MXFP8 unavailable, will use BF16"
+python3 -c "from torchao.float8 import Float8LinearConfig; from torchao.float8.config import Float8LinearRecipeName; print(f'Float8LinearConfig OK (recipes: {[n.value for n in Float8LinearRecipeName]})')" \
+  || warn "torchao.float8 not available"
+python3 -c "import bitsandbytes; print(f'bitsandbytes {bitsandbytes.__version__}')" \
+  || echo "INFO: bitsandbytes not found — using standard AdamW (expected for FP8+ modes)"
+python3 -c "import unsloth; print('Unsloth available')" 2>/dev/null \
+  || echo "INFO: Unsloth not available — using standard PyTorch training"
+# ---------------------------------------------------------------------------
+# Create checkpoint directory
+# ---------------------------------------------------------------------------
+mkdir -p -- "${CKPT_DIR}"
+# ---------------------------------------------------------------------------
+# Build command
+# ---------------------------------------------------------------------------
+TRAIN_SCRIPT="scripts/train_spider.py"
+[[ -f "${TRAIN_SCRIPT}" ]] \
+  || die "${TRAIN_SCRIPT} not found relative to the current directory ($(pwd))"
+# Build the argv as an array so values containing spaces or glob characters
+# (for example a checkpoint path) reach the trainer as single arguments.
+cmd=(
+  python3 "${TRAIN_SCRIPT}"
+  --precision "${PRECISION}"
+  --ckpt_dir "${CKPT_DIR}"
+  --seq_len "${SEQ_LEN}"
+  --micro_batch "${MICRO_BATCH}"
+  --n_loops "${N_LOOPS}"
+  --lr "${LR}"
+)
+if [[ -n "${RESUME}" ]]; then
+  if [[ -f "${RESUME}" ]]; then
+    cmd+=(--resume "${RESUME}")
+  else
+    # Keep the original best-effort behaviour (continue without --resume),
+    # but say so instead of dropping the request silently.
+    warn "RESUME='${RESUME}' is not a regular file — ignoring it; --resume will not be passed"
+  fi
+fi
+# ---------------------------------------------------------------------------
+# Launch
+# ---------------------------------------------------------------------------
+echo ""
+echo "Launching training..."
+# %q prints each argument shell-quoted, so the logged line can be pasted back
+# into a shell verbatim.
+printf 'Command:'
+printf ' %q' "${cmd[@]}"
+printf '\n'
+echo ""
+# Replace this shell with the trainer so signals and the exit status belong to
+# the training process directly.
+exec "${cmd[@]}"