tb-diffusion-paradetox / run_train.slurm
arkanathp's picture
Upload run_train.slurm with huggingface_hub
d43e4d7 verified
#!/bin/bash
# Slurm batch job that syncs the project to node-local scratch, installs the
# Python requirements, and launches train.py on 8 GPUs via accelerate.
#
# Submit with: sbatch run_train.slurm
#
# Job identity and working directory. The rest of the script launches
# train.py by relative path, so it depends on this --chdir taking effect.
#SBATCH --job-name=arka-llmae-train
#SBATCH --chdir=/srv/disk00/arkanatp/LLMAE
# Accounting, partition, and an explicit node pin (-w) to denali-6.
#SBATCH --account=compvision.a
#SBATCH -p dept.p
#SBATCH -w denali-6
# Resources: one task on one node with 8 GPUs, 32 CPU cores and 1 TB of memory.
# The GPU count here matches --num_processes=8 passed to accelerate below.
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=32
#SBATCH --mem=1tb
# Wall-clock limit in days-hours:minutes:seconds form (2 days).
#SBATCH -t 2-00:00:00
# stdout/stderr log files; %x expands to the job name and %j to the job id.
# NOTE(review): Slurm opens these files before the script body runs, so the
# slurm_output directory presumably has to exist on the node at submit time —
# nothing in this script creates it. Confirm on the target node.
#SBATCH --output=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.out
#SBATCH --error=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.err
# Abort on any failing command, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Fixed locations used by this job.
readonly SCRATCH_ROOT=/srv/disk00/arkanatp
readonly PROJECT_DIR="${SCRATCH_ROOT}/LLMAE"
readonly VENV_BIN="${PROJECT_DIR}/.venv/bin"

mkdir -p "${PROJECT_DIR}"

# Hugging Face credentials and caches.
# The token is assigned first and exported separately: `export VAR=$(cmd)`
# returns the status of `export`, so a missing token file would otherwise go
# unnoticed despite `set -e` and the job would run with an empty token.
HF_TOKEN=$(cat /home/arkanatp/.hf_token)
export HF_TOKEN
export HF_HOME="${SCRATCH_ROOT}/.hf_home"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export HF_HUB_DOWNLOAD_TIMEOUT=600
export HF_HUB_ETAG_TIMEOUT=600
mkdir -p "${HF_DATASETS_CACHE}"

# Mirror the home-directory working tree onto the scratch disk, skipping the
# virtualenv, git metadata, bytecode caches and checkpoints.
rsync -a --info=progress2 \
  --exclude='.venv/' --exclude='.git/' --exclude='__pycache__/' --exclude='checkpoint*' \
  /home/arkanatp/arkanatp/ "${SCRATCH_ROOT}/"

# Disabled alternative: pull from the denali-16 /srv disk over SSH (shared key
# in home: visible on all cluster nodes). As written it uses -n (dry run);
# drop the -n to actually transfer files.
# RSYNC_RSH='ssh -i /home/arkanatp/keys/denali-disk-rsync/id_ed25519 -o BatchMode=yes -o StrictHostKeyChecking=accept-new'
# RSYNC_SRC_HOST="${RSYNC_SRC_HOST:-denali-16}"
# rsync -anv --info=progress2 -e "$RSYNC_RSH" \
#   --exclude='.venv/' --exclude='.hf_home/' --exclude='.git/' --exclude='__pycache__/' --exclude='*checkpoint*' --exclude='outputs/' --exclude='generated_texts/' \
#   --exclude='slurm_output/' --exclude='data/' --exclude='metrics/' --exclude='results/' --exclude='cache/' \
#   "arkanatp@${RSYNC_SRC_HOST}:/srv/disk00/arkanatp/" /srv/disk00/arkanatp/

# Install the requirements into the project's virtualenv. The sync above
# excludes .venv/, so the virtualenv must already exist at this path.
uv pip install --python "${VENV_BIN}/python" -r "${PROJECT_DIR}/requirements.txt"

# train.py is launched by relative path, so enter the project directory
# explicitly instead of relying only on the #SBATCH --chdir directive (which
# does not apply when the script is run outside sbatch).
cd "${PROJECT_DIR}"

# One process per requested GPU, with bf16 mixed precision.
"${VENV_BIN}/accelerate" launch \
  --num_processes=8 \
  --mixed_precision=bf16 \
  train.py