arkanathp committed on
Commit
d43e4d7
·
verified ·
1 Parent(s): 1a57d93

Upload run_train.slurm with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_train.slurm +43 -0
run_train.slurm ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#SBATCH --job-name=arka-llmae-train
#SBATCH --chdir=/srv/disk00/arkanatp/LLMAE
#SBATCH --account=compvision.a
#SBATCH -p dept.p
#SBATCH -w denali-6
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=32
#SBATCH --mem=1tb
#SBATCH -t 2-00:00:00
#SBATCH --output=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.out
#SBATCH --error=/srv/disk00/arkanatp/LLMAE/slurm_output/slurm-%x.%j.err

# Slurm batch job: sync the project from the home directory to node-local
# scratch, install Python requirements into the project venv, then launch
# train.py on 8 processes with bf16 mixed precision via `accelerate`.
#
# Requirements visible from this script:
#   - /home/arkanatp/.hf_token must be readable (Hugging Face access token).
#   - ${PROJECT_DIR}/.venv must already exist on the scratch disk: the rsync
#     below excludes '.venv/', so it is never copied from the home directory.
#   - `uv` must be on PATH.
#
# NOTE: all #SBATCH directives stay above the first executable command;
# sbatch stops reading directives once it reaches one.

set -euo pipefail

readonly SCRATCH_ROOT=/srv/disk00/arkanatp
readonly PROJECT_DIR="${SCRATCH_ROOT}/LLMAE"
readonly VENV_BIN="${PROJECT_DIR}/.venv/bin"
readonly HF_TOKEN_FILE=/home/arkanatp/.hf_token

# Also create the log directory named in --output/--error above.
# NOTE(review): Slurm presumably opens those log files before this script
# starts, so this only benefits later submissions - confirm.
mkdir -p "${PROJECT_DIR}" "${PROJECT_DIR}/slurm_output"

# Assign first, export second: `export VAR=$(cmd)` returns export's status,
# so a missing/unreadable token file would go unnoticed despite `set -e`.
HF_TOKEN=$(cat "${HF_TOKEN_FILE}")
export HF_TOKEN
export HF_HOME="${SCRATCH_ROOT}/.hf_home"
export HF_DATASETS_CACHE="${HF_HOME}/datasets"
export HF_HUB_DOWNLOAD_TIMEOUT=600
export HF_HUB_ETAG_TIMEOUT=600
mkdir -p "${HF_DATASETS_CACHE}"

# Mirror the working tree from home onto scratch. Environments, VCS metadata,
# bytecode caches and checkpoints are skipped. There is no --delete, so files
# already on scratch are never removed.
rsync_args=(
  -a --info=progress2
  --exclude='.venv/'
  --exclude='.git/'
  --exclude='__pycache__/'
  --exclude='checkpoint*'
)
rsync "${rsync_args[@]}" /home/arkanatp/arkanatp/ "${SCRATCH_ROOT}/"

# Disabled alternative: pull from denali-16's /srv disk over SSH (shared key in
# home: visible on all cluster nodes). Note the `-n` flag: as written this is a
# dry run only.
# RSYNC_RSH='ssh -i /home/arkanatp/keys/denali-disk-rsync/id_ed25519 -o BatchMode=yes -o StrictHostKeyChecking=accept-new'
# RSYNC_SRC_HOST="${RSYNC_SRC_HOST:-denali-16}"
# rsync -anv --info=progress2 -e "$RSYNC_RSH" \
#   --exclude='.venv/' --exclude='.hf_home/' --exclude='.git/' --exclude='__pycache__/' --exclude='*checkpoint*' --exclude='outputs/' --exclude='generated_texts/' \
#   --exclude='slurm_output/' --exclude='data/' --exclude='metrics/' --exclude='results/' --exclude='cache/' \
#   "arkanatp@${RSYNC_SRC_HOST}:/srv/disk00/arkanatp/" /srv/disk00/arkanatp/

# Install/refresh dependencies into the existing project virtualenv.
uv pip install --python "${VENV_BIN}/python" -r "${PROJECT_DIR}/requirements.txt"

# train.py is referenced relative to the project directory. Do not rely solely
# on --chdir having taken effect; fail fast here if the directory is unusable.
cd "${PROJECT_DIR}"

"${VENV_BIN}/accelerate" launch \
  --num_processes=8 \
  --mixed_precision=bf16 \
  train.py