Spaces:
Paused
Paused
Upload scripts/hpc/run_all.sh with huggingface_hub
Browse files- scripts/hpc/run_all.sh +116 -0
scripts/hpc/run_all.sh
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Submit the full ImmunoOrg HPC pipeline as a chain of SLURM jobs with
|
| 3 |
+
# `--dependency=afterok:` between stages. One command, then walk away.
|
| 4 |
+
#
|
| 5 |
+
# Usage:
|
| 6 |
+
# export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
| 7 |
+
# bash scripts/hpc/run_all.sh # default settings
|
| 8 |
+
# bash scripts/hpc/run_all.sh --skip-sft # GRPO only (no warmstart)
|
| 9 |
+
# bash scripts/hpc/run_all.sh --multigpu 4 # GRPO on 4 GPUs
|
| 10 |
+
# bash scripts/hpc/run_all.sh --partition gpu-h100 # custom partition
|
| 11 |
+
#
|
| 12 |
+
# Pre-req: bash scripts/hpc/setup_env.sh has been run on the login node.
|
| 13 |
+
|
| 14 |
+
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Resolve the repository root as the directory two levels above this script,
# then work from there so the relative paths used below (logs/, scripts/hpc/...)
# resolve no matter where the script was invoked from.
#
# CDPATH is cleared for the lookup: when `cd` resolves a relative directory
# through CDPATH it prints the resulting path to stdout, which would end up
# inside the captured value. `--` guards against paths starting with a dash.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
cd -- "$REPO_ROOT"
|
| 18 |
+
|
| 19 |
+
# ── Defaults (override via flags) ───────────────────────────────────────────
|
| 20 |
+
# Default settings. The two partitions can be preset through the
# IMMUNOORG_PARTITION / IMMUNOORG_PARTITION_CPU environment variables; the
# command-line flags handled below take precedence over both.
PARTITION="${IMMUNOORG_PARTITION:-gpu}"
PARTITION_CPU="${IMMUNOORG_PARTITION_CPU:-cpu}"
GPU_COUNT=1
SKIP_SFT=0

# Print the usage text: the comment block at the top of this file, starting
# after the shebang and stopping at the first non-comment line, so that
# section-header comments further down are not mixed into the help output.
usage() {
  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
}

# Parse the command-line flags into PARTITION, PARTITION_CPU, GPU_COUNT and
# SKIP_SFT.
# Arguments: the script's own arguments ("$@").
# Exits:     0 after printing help; 1 on an unknown flag, a flag that is
#            missing its value, or a --multigpu value that is not a positive
#            integer. Errors are reported on stderr.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-sft)
        SKIP_SFT=1
        shift
        ;;
      --multigpu | --partition | --partition-cpu)
        # Check explicitly: under `set -u` a missing "$2" would otherwise die
        # with a bare "unbound variable" message.
        if [[ $# -lt 2 ]]; then
          echo "missing value for flag: $1" >&2
          exit 1
        fi
        case "$1" in
          --multigpu)
            # Reject bad GPU counts up front; otherwise the error would only
            # surface when the GRPO job is submitted, after earlier stages
            # have already been queued.
            if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
              echo "--multigpu expects a positive integer, got: $2" >&2
              exit 1
            fi
            GPU_COUNT="$2"
            ;;
          --partition)
            PARTITION="$2"
            ;;
          --partition-cpu)
            PARTITION_CPU="$2"
            ;;
        esac
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown flag: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
|
| 37 |
+
|
| 38 |
+
# Warn, without aborting, when HF_TOKEN is unset or empty. The message goes to
# stderr so it stays visible (and separable) when stdout is captured or piped.
# Globals: HF_TOKEN (read)
warn_if_no_hf_token() {
  if [[ -z "${HF_TOKEN:-}" ]]; then
    echo "WARNING: HF_TOKEN not set — datasets, model, and evidence will NOT be pushed to the Hub." >&2
  fi
}

warn_if_no_hf_token
|
| 41 |
+
|
| 42 |
+
# Make sure ./logs exists before anything is submitted.
# NOTE(review): presumably the sbatch scripts write their output here (the
# script later suggests `tail -f logs/stage*-*.out`) — confirm against them.
mkdir -p logs

# Print a summary of the effective settings before submitting anything.
# Globals: PARTITION, PARTITION_CPU, GPU_COUNT, SKIP_SFT, HF_TOKEN (all read)
# Outputs: the banner on stdout, followed by one blank line.
print_banner() {
  local rule="===================================================================="
  # Expands to "yes" only when HF_TOKEN is set and non-empty; the fallback
  # below prints an explicit "no" instead of leaving the field blank.
  local hf_state="${HF_TOKEN:+yes}"

  echo "$rule"
  echo " ImmunoOrg HPC pipeline submission"
  echo "$rule"
  echo " partition (gpu) : $PARTITION"
  echo " partition (cpu) : $PARTITION_CPU"
  echo " GPUs / job : $GPU_COUNT"
  echo " skip SFT? : $SKIP_SFT"
  echo " HF_TOKEN set? : ${hf_state:-no}"
  echo "$rule"
  echo
}

print_banner
|
| 54 |
+
|
| 55 |
+
# ── Stage 0: datasets (CPU) ─────────────────────────────────────────────────
|
| 56 |
+
# Submit the datasets job on the CPU partition. It has no dependency, so it is
# the head of the chain. The caller's environment plus REPO_ROOT is exported
# into the job.
JOB0="$(sbatch --parsable \
  --partition="$PARTITION_CPU" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/00_datasets.sbatch)"
# `--parsable` prints "jobid;cluster" on multi-cluster installations; keep only
# the job id so it can be used in later `--dependency=afterok:` expressions.
JOB0="${JOB0%%;*}"
echo "[stage 0] datasets -> job $JOB0 (partition=$PARTITION_CPU)"
|
| 61 |
+
|
| 62 |
+
# ── Stage 1: SFT (depends on stage 0) ───────────────────────────────────────
|
| 63 |
+
# Submit the SFT warmstart job unless --skip-sft was given.
# Sets JOB1 (job id, empty when skipped) and SFT_DEP (":<jobid>" suffix that
# the GRPO stage appends to its dependency list, empty when skipped).
if [[ "$SKIP_SFT" == 1 ]]; then
  echo "[stage 1] SFT warmstart -> SKIPPED"
  SFT_DEP=""
  JOB1=""
else
  # Starts only after the datasets job finishes successfully. Any ";cluster"
  # suffix that `sbatch --parsable` may have attached to JOB0 is stripped so
  # the dependency expression stays valid.
  JOB1="$(sbatch --parsable \
    --partition="$PARTITION" \
    --dependency="afterok:${JOB0%%;*}" \
    --export=ALL,REPO_ROOT="$REPO_ROOT" \
    scripts/hpc/slurm/01_sft.sbatch)"
  # Keep only the job id ("jobid;cluster" on multi-cluster installations).
  JOB1="${JOB1%%;*}"
  echo "[stage 1] SFT warmstart -> job $JOB1 (after $JOB0, partition=$PARTITION)"
  SFT_DEP=":$JOB1"
fi
|
| 76 |
+
|
| 77 |
+
# ── Stage 2: GRPO (depends on stage 0 + stage 1 if present) ────────────────
|
| 78 |
+
# Submit the GRPO training job. It waits for the datasets job and, when SFT was
# not skipped, for the SFT job as well (SFT_DEP is ":<jobid>" or empty).
# Any ";cluster" suffix from `sbatch --parsable` is stripped from both parts so
# the result is always of the form "afterok:<id>[:<id>]".
GRPO_DEP="afterok:${JOB0%%;*}${SFT_DEP%%;*}"
# Request GPU_COUNT GPUs and the same number of tasks per node.
GRPO_GRES="gpu:$GPU_COUNT"
GRPO_NTASKS="$GPU_COUNT"

# IMMUNOORG_SKIP_SFT is exported into the job alongside REPO_ROOT.
# NOTE(review): presumably 02_grpo.sbatch uses it to decide whether an SFT
# checkpoint is available — confirm against that script.
JOB2="$(sbatch --parsable \
  --partition="$PARTITION" \
  --dependency="$GRPO_DEP" \
  --gres="$GRPO_GRES" \
  --ntasks-per-node="$GRPO_NTASKS" \
  --export=ALL,REPO_ROOT="$REPO_ROOT",IMMUNOORG_SKIP_SFT="$SKIP_SFT" \
  scripts/hpc/slurm/02_grpo.sbatch)"
# Keep only the job id ("jobid;cluster" on multi-cluster installations).
JOB2="${JOB2%%;*}"
echo "[stage 2] GRPO training -> job $JOB2 (after $GRPO_DEP, $GRPO_GRES)"
|
| 90 |
+
|
| 91 |
+
# ── Stage 3: evaluation (depends on GRPO) ──────────────────────────────────
|
| 92 |
+
# Submit the evaluation job on the GPU partition; it starts only after the GRPO
# job finishes successfully. Any ";cluster" suffix that `sbatch --parsable` may
# have attached to JOB2 is stripped so the dependency expression stays valid.
JOB3="$(sbatch --parsable \
  --partition="$PARTITION" \
  --dependency="afterok:${JOB2%%;*}" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/03_eval.sbatch)"
# Keep only the job id ("jobid;cluster" on multi-cluster installations).
JOB3="${JOB3%%;*}"
echo "[stage 3] evaluation -> job $JOB3 (after $JOB2)"
|
| 98 |
+
|
| 99 |
+
# ── Stage 4: push (depends on eval) ────────────────────────────────────────
|
| 100 |
+
# Submit the final push job on the CPU partition; it starts only after the
# evaluation job finishes successfully. Any ";cluster" suffix that
# `sbatch --parsable` may have attached to JOB3 is stripped so the dependency
# expression stays valid.
JOB4="$(sbatch --parsable \
  --partition="$PARTITION_CPU" \
  --dependency="afterok:${JOB3%%;*}" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/04_push.sbatch)"
# Keep only the job id ("jobid;cluster" on multi-cluster installations).
JOB4="${JOB4%%;*}"
echo "[stage 4] push artifacts -> job $JOB4 (after $JOB3, partition=$PARTITION_CPU)"
|
| 106 |
+
|
| 107 |
+
# Print the closing summary: how many jobs were queued, how to watch them, and
# the Hub URLs printed as the destination of the results.
# Globals: SKIP_SFT (read), HF_PUSH_REPO / HF_DATASET_REPO (read, optional
#          overrides for the repository names shown in the URLs)
# Outputs: the summary on stdout.
print_summary() {
  local rule="===================================================================="
  # The chain has five stages, but the SFT stage is not submitted when
  # --skip-sft is in effect, so report the number actually queued.
  local job_count=5
  if [[ "${SKIP_SFT:-0}" == 1 ]]; then
    job_count=4
  fi

  echo
  echo "$rule"
  echo " All $job_count jobs submitted. Watch with:"
  # \$USER is printed literally so the hint can be copy-pasted as a command.
  echo " squeue -u \$USER"
  echo " tail -f logs/stage*-*.out"
  echo "$rule"
  echo
  echo "When the chain finishes, results land at:"
  echo " https://huggingface.co/${HF_PUSH_REPO:-hirann/immunoorg-grpo-defender}"
  echo " https://huggingface.co/datasets/${HF_DATASET_REPO:-hirann/immunoorg-grpo-dataset}"
}

print_summary
|