hirann committed on
Commit
4593aaf
·
verified ·
1 Parent(s): bbc88e8

Upload scripts/hpc/run_all.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/hpc/run_all.sh +116 -0
scripts/hpc/run_all.sh ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# Submit the full ImmunoOrg HPC pipeline as a chain of SLURM jobs with
# `--dependency=afterok:` between stages. One command, then walk away.
#
# Usage:
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
#   bash scripts/hpc/run_all.sh                        # default settings
#   bash scripts/hpc/run_all.sh --skip-sft             # GRPO only (no warmstart)
#   bash scripts/hpc/run_all.sh --multigpu 4           # GRPO on 4 GPUs
#   bash scripts/hpc/run_all.sh --partition gpu-h100   # custom GPU partition
#   bash scripts/hpc/run_all.sh --partition-cpu short  # custom CPU partition
#   bash scripts/hpc/run_all.sh --help                 # print this header
#
# Environment:
#   HF_TOKEN                 if unset, a warning is printed (nothing is pushed)
#   IMMUNOORG_PARTITION      default GPU partition (fallback: gpu)
#   IMMUNOORG_PARTITION_CPU  default CPU partition (fallback: cpu)
#   HF_PUSH_REPO             used here only to print the model result URL
#   HF_DATASET_REPO          used here only to print the dataset result URL
#
# Pre-req: bash scripts/hpc/setup_env.sh has been run on the login node.

set -euo pipefail

# Resolve absolute paths BEFORE changing directory: a relative "$0" is no
# longer valid after `cd`, which would break --help when the script is
# invoked from outside the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_PATH="$SCRIPT_DIR/$(basename "${BASH_SOURCE[0]}")"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$REPO_ROOT"

# Job ids submitted so far; used for the final count and the failure hint.
SUBMITTED_JOBS=()

# Print an error message to stderr and abort with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the header comment block of this script (the comment lines that
# directly follow the shebang), with the leading "# " stripped.
usage() {
  awk '
    /^#!/          { seen = 1; next }
    seen && /^#/   { sub(/^# ?/, ""); print; next }
    seen           { exit }
  ' "$SCRIPT_PATH"
}

# Abort with a readable message when a flag that needs a value is last.
#   $1 - flag name, $2 - number of arguments remaining (flag included)
need_value() {
  (( $2 >= 2 )) || die "flag $1 requires a value"
}

# Submit one job and print its bare job id on stdout.
#   $@ - arguments forwarded to `sbatch --parsable`
# `--parsable` may print "jobid;cluster"; only the id part is usable in a
# --dependency expression, so the cluster suffix is stripped.
submit() {
  local out
  # errexit is cleared inside command substitutions, so check explicitly.
  out=$(sbatch --parsable "$@") || return
  out="${out%%;*}"
  if [[ -z "$out" ]]; then
    printf 'ERROR: sbatch returned no job id\n' >&2
    return 1
  fi
  printf '%s\n' "$out"
}

# On a failed run, tell the user which jobs are already queued so a
# half-submitted chain is not silently left behind.
on_exit() {
  local status=$?
  if (( status != 0 && ${#SUBMITTED_JOBS[@]} > 0 )); then
    printf 'Submission aborted; jobs already queued: %s\n' \
      "${SUBMITTED_JOBS[*]}" >&2
    printf 'Cancel them with: scancel %s\n' "${SUBMITTED_JOBS[*]}" >&2
  fi
}
trap on_exit EXIT

main() {
  # -- Defaults (override via flags) ----------------------------------------
  local partition="${IMMUNOORG_PARTITION:-gpu}"
  local partition_cpu="${IMMUNOORG_PARTITION_CPU:-cpu}"
  local gpu_count=1
  local skip_sft=0

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-sft)
        skip_sft=1
        shift
        ;;
      --multigpu)
        need_value "$1" "$#"
        gpu_count="$2"
        shift 2
        ;;
      --partition)
        need_value "$1" "$#"
        partition="$2"
        shift 2
        ;;
      --partition-cpu)
        need_value "$1" "$#"
        partition_cpu="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        printf 'unknown flag: %s\n' "$1" >&2
        exit 1
        ;;
    esac
  done

  # The GPU count feeds --gres and --ntasks-per-node; reject junk early.
  [[ "$gpu_count" =~ ^[1-9][0-9]*$ ]] \
    || die "--multigpu expects a positive integer, got '$gpu_count'"

  command -v sbatch >/dev/null 2>&1 \
    || die "sbatch not found in PATH; run this on a SLURM login node"

  local hf_token_set="no"
  if [[ -n "${HF_TOKEN:-}" ]]; then
    hf_token_set="yes"
  else
    echo "WARNING: HF_TOKEN not set — datasets, model, and evidence will NOT be pushed to the Hub." >&2
  fi

  mkdir -p logs

  local rule="===================================================================="
  printf '%s\n' "$rule"
  printf '  ImmunoOrg HPC pipeline submission\n'
  printf '%s\n' "$rule"
  printf '  partition (gpu) : %s\n' "$partition"
  printf '  partition (cpu) : %s\n' "$partition_cpu"
  printf '  GPUs / job      : %s\n' "$gpu_count"
  printf '  skip SFT?       : %s\n' "$skip_sft"
  printf '  HF_TOKEN set?   : %s\n' "$hf_token_set"
  printf '%s\n\n' "$rule"

  # -- Stage 0: datasets (CPU) ----------------------------------------------
  local job0
  job0=$(submit \
    --partition="$partition_cpu" \
    --export=ALL,REPO_ROOT="$REPO_ROOT" \
    scripts/hpc/slurm/00_datasets.sbatch)
  SUBMITTED_JOBS+=("$job0")
  echo "[stage 0] datasets        -> job $job0 (partition=$partition_cpu)"

  # -- Stage 1: SFT (depends on stage 0) ------------------------------------
  # sft_dep is appended to the stage-2 dependency list; empty when skipped.
  local job1="" sft_dep=""
  if (( skip_sft == 1 )); then
    echo "[stage 1] SFT warmstart   -> SKIPPED"
  else
    job1=$(submit \
      --partition="$partition" \
      --dependency=afterok:"$job0" \
      --export=ALL,REPO_ROOT="$REPO_ROOT" \
      scripts/hpc/slurm/01_sft.sbatch)
    SUBMITTED_JOBS+=("$job1")
    echo "[stage 1] SFT warmstart   -> job $job1 (after $job0, partition=$partition)"
    sft_dep=":$job1"
  fi

  # -- Stage 2: GRPO (depends on stage 0 + stage 1 if present) --------------
  local grpo_dep="afterok:${job0}${sft_dep}"
  local grpo_gres="gpu:$gpu_count"
  local job2
  job2=$(submit \
    --partition="$partition" \
    --dependency="$grpo_dep" \
    --gres="$grpo_gres" \
    --ntasks-per-node="$gpu_count" \
    --export=ALL,REPO_ROOT="$REPO_ROOT",IMMUNOORG_SKIP_SFT="$skip_sft" \
    scripts/hpc/slurm/02_grpo.sbatch)
  SUBMITTED_JOBS+=("$job2")
  echo "[stage 2] GRPO training   -> job $job2 (after $grpo_dep, $grpo_gres)"

  # -- Stage 3: evaluation (depends on GRPO) --------------------------------
  local job3
  job3=$(submit \
    --partition="$partition" \
    --dependency=afterok:"$job2" \
    --export=ALL,REPO_ROOT="$REPO_ROOT" \
    scripts/hpc/slurm/03_eval.sbatch)
  SUBMITTED_JOBS+=("$job3")
  echo "[stage 3] evaluation      -> job $job3 (after $job2)"

  # -- Stage 4: push (depends on eval) --------------------------------------
  local job4
  job4=$(submit \
    --partition="$partition_cpu" \
    --dependency=afterok:"$job3" \
    --export=ALL,REPO_ROOT="$REPO_ROOT" \
    scripts/hpc/slurm/04_push.sbatch)
  SUBMITTED_JOBS+=("$job4")
  echo "[stage 4] push artifacts  -> job $job4 (after $job3, partition=$partition_cpu)"

  echo
  printf '%s\n' "$rule"
  # Report the real count: only 4 jobs exist when SFT is skipped.
  printf '  All %d jobs submitted. Watch with:\n' "${#SUBMITTED_JOBS[@]}"
  printf '    squeue -u $USER\n'
  printf '    tail -f logs/stage*-*.out\n'
  printf '%s\n' "$rule"
  echo

  if [[ "$hf_token_set" == "yes" ]]; then
    echo "When the chain finishes, results land at:"
    echo "  https://huggingface.co/${HF_PUSH_REPO:-hirann/immunoorg-grpo-defender}"
    echo "  https://huggingface.co/datasets/${HF_DATASET_REPO:-hirann/immunoorg-grpo-dataset}"
  else
    # Consistent with the warning above: no token means no Hub upload.
    echo "HF_TOKEN is not set, so results will NOT be pushed to the Hub."
  fi
}

main "$@"