#!/bin/bash
#SBATCH --job-name=arka-llmae-train
#SBATCH --chdir=/srv/disk00/arkanatp/LLMAE
#SBATCH --account=compvision.a
#SBATCH -p dept.p
#SBATCH -w denali-6
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=32
#SBATCH --mem=1tb
#SBATCH -t 2-00:00:00
#SBATCH --output=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.out
#SBATCH --error=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.err
#
# Slurm batch script: syncs the working tree from the home directory to the
# /srv/disk00 scratch area, installs the Python requirements into the project
# virtualenv, and launches train.py on 8 processes with bf16 mixed precision
# via `accelerate launch`.
#
# NOTE: every #SBATCH directive must sit on its own line, before the first
# executable command; sbatch stops scanning for directives at the first
# non-comment, non-blank line.

set -euo pipefail

readonly SCRATCH_ROOT=/srv/disk00/arkanatp
readonly PROJECT_DIR="${SCRATCH_ROOT}/LLMAE"
readonly VENV_BIN="${PROJECT_DIR}/.venv/bin"
readonly HF_TOKEN_FILE=/home/arkanatp/.hf_token

# Creates the project directory (as before) plus the log directory named in
# the --output/--error directives above.
# NOTE(review): Slurm presumably opens those log files before this script
# runs, so this only guarantees the directory for later submissions - confirm.
mkdir -p "${PROJECT_DIR}/slurm_output"

# Assign first, export afterwards: `export VAR=$(cmd)` would report the exit
# status of `export` and hide a failed read from `set -e`.
HF_TOKEN=$(<"${HF_TOKEN_FILE}")
export HF_TOKEN
export HF_HOME="${SCRATCH_ROOT}/.hf_home"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export HF_HUB_DOWNLOAD_TIMEOUT=600
export HF_HUB_ETAG_TIMEOUT=600
mkdir -p "${HF_DATASETS_CACHE}"

# Mirror the home-directory tree onto the scratch disk, skipping virtualenvs,
# git metadata, bytecode caches and checkpoints.
rsync -a --info=progress2 \
  --exclude='.venv/' \
  --exclude='.git/' \
  --exclude='__pycache__/' \
  --exclude='checkpoint*' \
  /home/arkanatp/arkanatp/ "${SCRATCH_ROOT}/"

# # Pull from denali-16 /srv disk over SSH (shared key in home: visible on all cluster nodes).
# # As written this uses -n, i.e. it is a dry run; drop the `n` to actually copy.
# RSYNC_RSH='ssh -i /home/arkanatp/keys/denali-disk-rsync/id_ed25519 -o BatchMode=yes -o StrictHostKeyChecking=accept-new'
# RSYNC_SRC_HOST="${RSYNC_SRC_HOST:-denali-16}"
# rsync -anv --info=progress2 -e "$RSYNC_RSH" \
#   --exclude='.venv/' --exclude='.hf_home/' --exclude='.git/' --exclude='__pycache__/' --exclude='*checkpoint*' --exclude='outputs/' --exclude='generated_texts/' \
#   --exclude='slurm_output/' --exclude='data/' --exclude='metrics/' --exclude='results/' --exclude='cache/' \
#   "arkanatp@${RSYNC_SRC_HOST}:/srv/disk00/arkanatp/" /srv/disk00/arkanatp/

uv pip install --python "${VENV_BIN}/python" -r "${PROJECT_DIR}/requirements.txt"

# train.py is given as a relative path, so make the working directory explicit
# rather than depending solely on the --chdir directive.
cd "${PROJECT_DIR}"

"${VENV_BIN}/accelerate" launch \
  --num_processes=8 \
  --mixed_precision=bf16 \
  train.py