File size: 1,803 Bytes
4abdc05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/bash
# Slurm batch script for the LLMAE training run: sets up the Hugging Face
# environment, rsyncs the project tree to /srv/disk00/arkanatp, installs the
# requirements into the project venv, and starts train.py via `accelerate launch`.
#
# The '#SBATCH' lines below are scheduler directives read by sbatch; bash itself
# treats them as comments. They must stay ahead of the first executable command.
#
# Job identity and working directory (the relative `train.py` at the end of the
# script is resolved against --chdir).
#SBATCH --job-name=arka-llmae-train
#SBATCH --chdir=/srv/disk00/arkanatp/LLMAE
# Accounting group, partition (-p) and the specific node (-w) to run on.
#SBATCH --account=compvision.a
#SBATCH -p dept.p
#SBATCH -w denali-6
# Resources: a single task on one node with 8 GPUs, 32 CPU cores and 1 TB of
# memory; wall-clock limit (-t) of 2 days.
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=32
#SBATCH --mem=1tb
#SBATCH -t 2-00:00:00
# stdout / stderr files; %x expands to the job name and %j to the job id.
# NOTE(review): the scheduler opens these files before this script runs, so the
# slurm_output/ directory presumably has to exist at submit time — confirm.
#SBATCH --output=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.out
#SBATCH --error=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.err

# Strict mode: stop on the first failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# --- Workspace and Hugging Face environment ---------------------------------
# Make sure the project directory exists before anything is copied into it.
mkdir -p /srv/disk00/arkanatp/LLMAE/

# Load the Hugging Face access token from the user's home directory.
# Assignment and export are deliberately separate statements: in
# `export VAR=$(cmd)` the exit status of `cmd` is replaced by that of `export`,
# so a missing or unreadable token file would otherwise go unnoticed and the
# job would carry on with an empty token.
HF_TOKEN=$(< /home/arkanatp/.hf_token) || {
  printf 'error: cannot read Hugging Face token from %s\n' /home/arkanatp/.hf_token >&2
  exit 1
}
export HF_TOKEN

# Keep the Hugging Face caches under /srv/disk00 instead of the default
# location in the home directory; the datasets cache lives inside HF_HOME.
export HF_HOME=/srv/disk00/arkanatp/.hf_home
export HF_DATASETS_CACHE="${HF_HOME}/datasets"

# Raise the Hub download / ETag request timeouts (values are in seconds).
export HF_HUB_DOWNLOAD_TIMEOUT=600
export HF_HUB_ETAG_TIMEOUT=600

mkdir -p "$HF_DATASETS_CACHE"

# Mirror the source tree from the home directory onto /srv/disk00, leaving out
# the virtualenv, git metadata, Python bytecode caches and checkpoint files.
sync_opts=(
  -a
  --info=progress2
  --exclude='.venv/'
  --exclude='.git/'
  --exclude='__pycache__/'
  --exclude='checkpoint*'
)
rsync "${sync_opts[@]}" /home/arkanatp/arkanatp/ /srv/disk00/arkanatp/

# # Pull from denali-16 /srv disk over SSH (shared key in home: visible on all cluster nodes).
# RSYNC_RSH='ssh -i /home/arkanatp/keys/denali-disk-rsync/id_ed25519 -o BatchMode=yes -o StrictHostKeyChecking=accept-new'
# RSYNC_SRC_HOST="${RSYNC_SRC_HOST:-denali-16}"
# rsync -anv --info=progress2 -e "$RSYNC_RSH" \
#   --exclude='.venv/' --exclude='.hf_home/' --exclude='.git/' --exclude='__pycache__/' --exclude='*checkpoint*' --exclude='outputs/' --exclude='generated_texts/' \
#   --exclude='slurm_output/' --exclude='data/' --exclude='metrics/' --exclude='results/' --exclude='cache/' \
#   "arkanatp@${RSYNC_SRC_HOST}:/srv/disk00/arkanatp/" /srv/disk00/arkanatp/


# --- Dependencies and training launch ---------------------------------------
project_dir=/srv/disk00/arkanatp/LLMAE
venv_bin="${project_dir}/.venv/bin"

# Install the pinned requirements into the project's virtualenv (selected via
# its interpreter path rather than by activating the venv).
uv pip install --python "${venv_bin}/python" -r "${project_dir}/requirements.txt"

# train.py is passed as a relative path, so enter the project directory
# explicitly instead of relying solely on the scheduler's --chdir directive
# (which is not applied when the script is run by hand with bash).
cd "${project_dir}" || {
  printf 'error: cannot cd to %s\n' "${project_dir}" >&2
  exit 1
}

# One worker process per GPU requested in the job header (--gres=gpu:8),
# training in bf16 mixed precision.
"${venv_bin}/accelerate" launch \
  --num_processes=8 \
  --mixed_precision=bf16 \
  train.py