#!/bin/bash
# Slurm batch job for LLMAE training.
#
# Steps performed on the allocated node:
#   1. Export the Hugging Face token and cache locations.
#   2. rsync the home-directory working tree onto /srv/disk00.
#   3. Install requirements.txt into the project's existing virtualenv.
#   4. Launch train.py through `accelerate launch`, one process per GPU.
#
# NOTE(review): sbatch does not create the directories named in --output /
# --error; slurm_output/ presumably has to exist before submission -- confirm.
#
#SBATCH --job-name=arka-llmae-train
#SBATCH --chdir=/srv/disk00/arkanatp/LLMAE
#SBATCH --account=compvision.a
#SBATCH -p dept.p
#SBATCH -w denali-6
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=32
#SBATCH --mem=1tb
#SBATCH -t 2-00:00:00
#SBATCH --output=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.out
#SBATCH --error=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.err
set -euo pipefail

# Paths used throughout the job.
readonly HOME_SRC=/home/arkanatp/arkanatp
readonly DISK_ROOT=/srv/disk00/arkanatp
readonly PROJECT_DIR="${DISK_ROOT}/LLMAE"
readonly VENV_BIN="${PROJECT_DIR}/.venv/bin"
readonly HF_TOKEN_FILE=/home/arkanatp/.hf_token

mkdir -p "${PROJECT_DIR}/"

# --- Hugging Face credentials and caches ------------------------------------
# Assign and export separately: `export VAR=$(cmd)` returns the status of
# `export`, so a failed read would slip past `set -e` and leave an empty token.
if [[ ! -r "$HF_TOKEN_FILE" ]]; then
  printf 'error: cannot read Hugging Face token file: %s\n' "$HF_TOKEN_FILE" >&2
  exit 1
fi
HF_TOKEN=$(<"$HF_TOKEN_FILE")
export HF_TOKEN

export HF_HOME="${DISK_ROOT}/.hf_home"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export HF_HUB_DOWNLOAD_TIMEOUT=600
export HF_HUB_ETAG_TIMEOUT=600
mkdir -p "$HF_DATASETS_CACHE"

# --- Sync the working tree from home onto the /srv disk ---------------------
# The virtualenv, git metadata, bytecode caches and checkpoints are excluded,
# so whatever already exists under those names at the destination is untouched.
sync_excludes=(
  --exclude='.venv/'
  --exclude='.git/'
  --exclude='__pycache__/'
  --exclude='checkpoint*'
)
rsync -a --info=progress2 "${sync_excludes[@]}" "${HOME_SRC}/" "${DISK_ROOT}/"

# # Pull from denali-16 /srv disk over SSH (shared key in home: visible on all cluster nodes).
# # NOTE: `-n` below makes this a dry run; drop it to actually transfer files.
# RSYNC_RSH='ssh -i /home/arkanatp/keys/denali-disk-rsync/id_ed25519 -o BatchMode=yes -o StrictHostKeyChecking=accept-new'
# RSYNC_SRC_HOST="${RSYNC_SRC_HOST:-denali-16}"
# rsync -anv --info=progress2 -e "$RSYNC_RSH" \
#   --exclude='.venv/' --exclude='.hf_home/' --exclude='.git/' --exclude='__pycache__/' --exclude='*checkpoint*' --exclude='outputs/' --exclude='generated_texts/' \
#   --exclude='slurm_output/' --exclude='data/' --exclude='metrics/' --exclude='results/' --exclude='cache/' \
#   "arkanatp@${RSYNC_SRC_HOST}:/srv/disk00/arkanatp/" /srv/disk00/arkanatp/

# --- Python environment -----------------------------------------------------
# The venv is excluded from the sync above, so it must already exist here.
uv pip install --python "${VENV_BIN}/python" -r "${PROJECT_DIR}/requirements.txt"

# --- Training ---------------------------------------------------------------
# train.py is given as a relative path; cd explicitly so it resolves even if
# --chdir is overridden at submit time (a no-op when --chdir is honoured).
cd "$PROJECT_DIR"

# One process per allocated GPU. Slurm exports SLURM_GPUS_ON_NODE for --gres
# allocations; fall back to the 8 requested above if it is not set.
"${VENV_BIN}/accelerate" launch \
  --num_processes="${SLURM_GPUS_ON_NODE:-8}" \
  --mixed_precision=bf16 \
  train.py