Upload run_train.slurm with huggingface_hub
Browse files- run_train.slurm +43 -0
run_train.slurm
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#SBATCH --job-name=arka-llmae-train
|
| 3 |
+
#SBATCH --chdir=/srv/disk00/arkanatp/LLMAE
|
| 4 |
+
#SBATCH --account=compvision.a
|
| 5 |
+
#SBATCH -p dept.p
|
| 6 |
+
#SBATCH -w denali-6
|
| 7 |
+
#SBATCH --nodes=1
|
| 8 |
+
#SBATCH --ntasks=1
|
| 9 |
+
#SBATCH --gres=gpu:8
|
| 10 |
+
#SBATCH --cpus-per-task=32
|
| 11 |
+
#SBATCH --mem=1tb
|
| 12 |
+
#SBATCH -t 2-00:00:00
|
| 13 |
+
#SBATCH --output=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.out
|
| 14 |
+
#SBATCH --error=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.err
|
| 15 |
+
|
| 16 |
+
set -euo pipefail
|
| 17 |
+
|
| 18 |
+
mkdir -p /srv/disk00/arkanatp/LLMAE/
|
| 19 |
+
|
| 20 |
+
# Read the Hugging Face token from the user's home directory.
# Assign first, then export: `export VAR=$(cmd)` reports export's own exit
# status, so a missing/unreadable token file would not abort under `set -e`.
HF_TOKEN=$(cat /home/arkanatp/.hf_token)
export HF_TOKEN
|
| 21 |
+
export HF_HOME=/srv/disk00/arkanatp/.hf_home
|
| 22 |
+
export HF_DATASETS_CACHE=/srv/disk00/arkanatp/.hf_home/datasets
|
| 23 |
+
export HF_HUB_DOWNLOAD_TIMEOUT=600
|
| 24 |
+
export HF_HUB_ETAG_TIMEOUT=600
|
| 25 |
+
mkdir -p "$HF_DATASETS_CACHE"
|
| 26 |
+
|
| 27 |
+
# Copy /home/arkanatp/arkanatp/ into /srv/disk00/arkanatp/ in archive mode,
# printing overall transfer progress. Virtualenvs, git metadata, Python
# bytecode caches and anything matching 'checkpoint*' are skipped.
# No --delete is passed, so files that exist only at the destination are kept.
rsync -a --info=progress2 --exclude='.venv/' --exclude='.git/' --exclude='__pycache__/' --exclude='checkpoint*' /home/arkanatp/arkanatp/ /srv/disk00/arkanatp/
|
| 28 |
+
|
| 29 |
+
# # Pull from denali-16 /srv disk over SSH (shared key in home: visible on all cluster nodes).
|
| 30 |
+
# RSYNC_RSH='ssh -i /home/arkanatp/keys/denali-disk-rsync/id_ed25519 -o BatchMode=yes -o StrictHostKeyChecking=accept-new'
|
| 31 |
+
# RSYNC_SRC_HOST="${RSYNC_SRC_HOST:-denali-16}"
|
| 32 |
+
# rsync -anv --info=progress2 -e "$RSYNC_RSH" \
|
| 33 |
+
# --exclude='.venv/' --exclude='.hf_home/' --exclude='.git/' --exclude='__pycache__/' --exclude='*checkpoint*' --exclude='outputs/' --exclude='generated_texts/' \
|
| 34 |
+
# --exclude='slurm_output/' --exclude='data/' --exclude='metrics/' --exclude='results/' --exclude='cache/' \
|
| 35 |
+
# "arkanatp@${RSYNC_SRC_HOST}:/srv/disk00/arkanatp/" /srv/disk00/arkanatp/
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Install the packages listed in requirements.txt into the project's .venv,
# selecting that environment explicitly via its interpreter path (--python).
# NOTE(review): '.venv/' is excluded from the rsync earlier in this script, so
# this presumably relies on the virtualenv already existing on this node — confirm.
uv pip install --python /srv/disk00/arkanatp/LLMAE/.venv/bin/python -r /srv/disk00/arkanatp/LLMAE/requirements.txt
|
| 39 |
+
|
| 40 |
+
# Start training through the virtualenv's `accelerate` launcher: 8 processes
# (matching the 8 GPUs requested with --gres=gpu:8) and bf16 mixed precision.
# train.py is a relative path, so it is resolved against the job's working
# directory, which the #SBATCH --chdir directive sets to the LLMAE checkout.
/srv/disk00/arkanatp/LLMAE/.venv/bin/accelerate launch \
|
| 41 |
+
--num_processes=8 \
|
| 42 |
+
--mixed_precision=bf16 \
|
| 43 |
+
train.py
|