CLIWorks committed on
Commit
513d5a8
·
verified ·
1 Parent(s): 546fd8f

Upload train_remote.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_remote.sh +90 -0
train_remote.sh ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# =============================================================================
# Spider-FLEXITOKENS Remote Training Launch Script
# Target:    NVIDIA RTX 6000 Pro (Blackwell sm120+, 48GB GDDR7)
# Precision: MXFP8 (rowwise_with_gw_hp) with FP8_DYNAMIC fallback
#
# Every setting in the configuration section can be overridden from the
# environment, e.g.:
#   SEQ_LEN=4096 MICRO_BATCH=4 ./train_remote.sh
# =============================================================================
set -euo pipefail

#######################################
# Abort unless a setting holds a positive base-10 integer.
# Zero would make the step computation divide by zero, and anything that is
# not a plain number would be evaluated as an arithmetic expression
# (a leading zero, for instance, is read as octal).
# Arguments: $1 - setting name (used in the error message only)
#            $2 - value to validate
# Outputs:   error message on stderr when the value is rejected
# Returns:   0 when valid; exits the shell with status 1 otherwise
#######################################
require_positive_int() {
  local name=$1 value=$2
  if [[ ! "${value}" =~ ^[1-9][0-9]*$ ]]; then
    printf 'ERROR: %s must be a positive integer, got: %s\n' \
      "${name}" "${value}" >&2
    exit 1
  fi
}

# ---------------------------------------------------------------------------
# Configuration — adjust these for your remote environment
# ---------------------------------------------------------------------------
export CKPT_DIR="${CKPT_DIR:-checkpoints-spider-remote}"
export SEQ_LEN="${SEQ_LEN:-2048}"
export MICRO_BATCH="${MICRO_BATCH:-8}"
export GRAD_ACCUM="${GRAD_ACCUM:-4}"
export TARGET_TOKENS="${TARGET_TOKENS:-10000000000}"  # 10B tokens
export N_LOOPS="${N_LOOPS:-6}"
export LR="${LR:-3e-4}"
export CKPT_EVERY="${CKPT_EVERY:-500}"
export PRECISION="${PRECISION:-mxfp8}"  # mxfp8|fp8_dynamic|bf16
export RESUME="${RESUME:-}"             # path to checkpoint for resume

# Only the values that feed the arithmetic below are validated here.
require_positive_int SEQ_LEN "${SEQ_LEN}"
require_positive_int MICRO_BATCH "${MICRO_BATCH}"
require_positive_int GRAD_ACCUM "${GRAD_ACCUM}"
require_positive_int TARGET_TOKENS "${TARGET_TOKENS}"

# Derived
# Tokens consumed per optimizer step, and the (floored) number of steps
# needed to reach TARGET_TOKENS.
export GLOBAL_BATCH_TOK=$(( MICRO_BATCH * GRAD_ACCUM * SEQ_LEN ))
export TOTAL_STEPS=$(( TARGET_TOKENS / GLOBAL_BATCH_TOK ))

# ---------------------------------------------------------------------------
# Run summary banner
# ---------------------------------------------------------------------------

#######################################
# Report one nvidia-smi GPU property on a single line.
# nvidia-smi prints one line per GPU; the lines are joined with ", " so the
# banner keeps its layout on multi-GPU hosts.
# Arguments: $1 - field name passed to `nvidia-smi --query-gpu=`
# Outputs:   the joined values on stdout, or "N/A" when nvidia-smi is
#            missing, fails, or prints nothing
#######################################
gpu_query() {
  local field=$1 raw
  if raw=$(nvidia-smi --query-gpu="${field}" --format=csv,noheader 2>/dev/null) \
    && [[ -n "${raw}" ]]; then
    printf '%s\n' "${raw//$'\n'/, }"
  else
    printf 'N/A\n'
  fi
}

echo "============================================="
echo " Spider-FLEXITOKENS Remote Training"
echo "============================================="
echo " GPU: $(gpu_query name)"
echo " VRAM: $(gpu_query memory.total)"
echo " Precision: ${PRECISION}"
echo " Seq length: ${SEQ_LEN}"
echo " Micro batch: ${MICRO_BATCH}"
echo " Grad accum: ${GRAD_ACCUM}"
echo " Global batch: ${GLOBAL_BATCH_TOK} tokens/step"
# Integer division: targets below one billion tokens are shown as 0B.
echo " Target tokens: ${TARGET_TOKENS} ($(( TARGET_TOKENS / 1000000000 ))B)"
echo " Total steps: ~${TOTAL_STEPS}"
echo " LR: ${LR}"
echo " N loops: ${N_LOOPS}"
echo " Checkpoint dir: ${CKPT_DIR}"
# NOTE(review): the "auto-resume" wording describes behavior that is not
# implemented in this script — presumably the training script does it; confirm.
echo " Resume from: ${RESUME:-none (auto-resume from ${CKPT_DIR})}"
echo "============================================="

# ---------------------------------------------------------------------------
# Environment setup
# ---------------------------------------------------------------------------
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export UNSLOTH_MOE_BACKEND="grouped_mm"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-8}"

# ---------------------------------------------------------------------------
# Dependency check
# ---------------------------------------------------------------------------
# PyTorch is the only hard requirement; every other probe just reports.
# The probe also queries the CUDA device capability, so any failure of that
# query lands in the same error branch as a missing torch install — the
# message names both causes. Major and minor capability are printed together
# (e.g. sm120 for 12.0).
python3 -c "import torch; cap = torch.cuda.get_device_capability(); print(f'PyTorch {torch.__version__} | CUDA {torch.version.cuda} | sm{cap[0]}{cap[1]}')" \
  || { echo "ERROR: PyTorch not found or no usable CUDA device" >&2; exit 1; }

# NOTE(review): this script forwards PRECISION unchanged even when torchao is
# missing; the BF16 fallback named in the message is presumably handled by the
# training script — confirm.
python3 -c "import torchao; print(f'torchao {torchao.__version__}')" \
  || echo "WARNING: torchao not found — FP8/MXFP8 unavailable, will use BF16" >&2

# Lists the float8 recipe names exposed by the installed torchao.
python3 -c "from torchao.float8 import Float8LinearConfig; from torchao.float8.config import Float8LinearRecipeName; print(f'Float8LinearConfig OK (recipes: {[n.value for n in Float8LinearRecipeName]})')" \
  || echo "WARNING: torchao.float8 not available" >&2

python3 -c "import bitsandbytes; print(f'bitsandbytes {bitsandbytes.__version__}')" \
  || echo "INFO: bitsandbytes not found — using standard AdamW (expected for FP8+ modes)"

# Unsloth is optional, so the import traceback is deliberately silenced.
python3 -c "import unsloth; print('Unsloth available')" 2>/dev/null \
  || echo "INFO: Unsloth not available — using standard PyTorch training"

# ---------------------------------------------------------------------------
# Create checkpoint directory
# ---------------------------------------------------------------------------
mkdir -p -- "${CKPT_DIR}"

# ---------------------------------------------------------------------------
# Build command
# ---------------------------------------------------------------------------
# The command is kept in an array so each value stays one argument even when
# it contains whitespace or glob characters (e.g. a CKPT_DIR with a space).
# NOTE(review): GRAD_ACCUM, TARGET_TOKENS, TOTAL_STEPS and CKPT_EVERY are not
# forwarded as flags here — presumably the training script reads them from
# the environment; confirm.
train_cmd=(
  python3 scripts/train_spider.py
  --precision "${PRECISION}"
  --ckpt_dir "${CKPT_DIR}"
  --seq_len "${SEQ_LEN}"
  --micro_batch "${MICRO_BATCH}"
  --n_loops "${N_LOOPS}"
  --lr "${LR}"
)

# --resume is only forwarded for an existing regular file. A RESUME value
# that does not qualify is reported instead of being dropped silently.
if [[ -n "${RESUME}" ]]; then
  if [[ -f "${RESUME}" ]]; then
    train_cmd+=(--resume "${RESUME}")
  else
    echo "WARNING: RESUME='${RESUME}' is not a regular file — launching without --resume" >&2
  fi
fi

# ---------------------------------------------------------------------------
# Launch
# ---------------------------------------------------------------------------
# %q renders the command shell-quoted, so the logged line can be pasted back
# into a shell verbatim; the trailing separator is trimmed for display.
printf -v cmd_display '%q ' "${train_cmd[@]}"

echo ""
echo "Launching training..."
echo "Command: ${cmd_display% }"
echo ""

# exec replaces this shell, so signals and the exit status belong to the
# training process directly.
exec "${train_cmd[@]}"