immunoorg-v3 / scripts /hpc /run_all.sh
hirann's picture
Upload scripts/hpc/run_all.sh with huggingface_hub
4593aaf verified
#!/usr/bin/env bash
# Submit the full ImmunoOrg HPC pipeline as a chain of SLURM jobs with
# `--dependency=afterok:` between stages. One command, then walk away.
#
# Usage:
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
#   bash scripts/hpc/run_all.sh                         # default settings
#   bash scripts/hpc/run_all.sh --skip-sft              # GRPO only (no warmstart)
#   bash scripts/hpc/run_all.sh --multigpu 4            # GRPO on 4 GPUs
#   bash scripts/hpc/run_all.sh --partition gpu-h100    # custom GPU partition
#   bash scripts/hpc/run_all.sh --partition-cpu short   # custom CPU partition
#
# Prerequisite: run `bash scripts/hpc/setup_env.sh` on the login node first.
# Strict mode: abort on any failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Resolve the directory two levels above this script to an absolute path and
# work from there, so the relative paths used below (scripts/hpc/slurm/...,
# logs/) do not depend on where the script was launched from.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
REPO_ROOT="$(cd "$script_dir/../.." && pwd)"
cd "$REPO_ROOT"
# ── Defaults (override via flags) ───────────────────────────────────────────
# The two partitions can also be preset through IMMUNOORG_PARTITION and
# IMMUNOORG_PARTITION_CPU; a command-line flag wins over the environment.
PARTITION="${IMMUNOORG_PARTITION:-gpu}"
PARTITION_CPU="${IMMUNOORG_PARTITION_CPU:-cpu}"
GPU_COUNT=1
SKIP_SFT=0

#######################################
# Parse command-line flags into the globals above.
# Globals:   PARTITION, PARTITION_CPU, GPU_COUNT, SKIP_SFT (written)
# Arguments: the script's own arguments ("$@")
# Outputs:   usage text on stdout for -h/--help; errors on stderr
# Returns:   exits 0 after printing help; exits 1 on an unknown flag, a flag
#            missing its value, or a non-positive-integer GPU count
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-sft)
        SKIP_SFT=1
        shift
        ;;
      --multigpu | --partition | --partition-cpu)
        # Check explicitly: with `set -u` a missing value would otherwise die
        # with an unhelpful "unbound variable" message on "$2".
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "error: $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --multigpu) GPU_COUNT="$2" ;;
          --partition) PARTITION="$2" ;;
          --partition-cpu) PARTITION_CPU="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        # Print only the leading comment header (minus the shebang), stopping
        # at the first non-comment line after it, so the section banners
        # further down the file are not dumped into the help text.
        awk '/^#!/ { next }
             /^#/  { seen = 1; sub(/^# ?/, ""); print; next }
             seen  { exit }' "$0"
        exit 0
        ;;
      *)
        echo "unknown flag: $1" >&2
        exit 1
        ;;
    esac
  done

  # GPU_COUNT is passed on to sbatch as --gres=gpu:N and --ntasks-per-node=N,
  # so reject anything that is not a positive integer before submitting.
  if [[ ! "$GPU_COUNT" =~ ^[1-9][0-9]*$ ]]; then
    echo "error: --multigpu expects a positive integer, got '$GPU_COUNT'" >&2
    exit 1
  fi
}

parse_args "$@"
# The chain is still submitted without HF_TOKEN, so a missing token is only a
# warning. It goes to stderr so it is not mixed into the stdout summary.
hf_token_state="no"
if [[ -n "${HF_TOKEN:-}" ]]; then
  hf_token_state="yes"
else
  echo "WARNING: HF_TOKEN not set — datasets, model, and evidence will NOT be pushed to the Hub." >&2
fi

# Presumably where the .sbatch scripts write their output (the closing hint
# tails logs/stage*-*.out) — confirm against the sbatch files.
mkdir -p logs

# Echo the effective settings before anything is submitted.
echo "===================================================================="
echo " ImmunoOrg HPC pipeline submission"
echo "===================================================================="
echo " partition (gpu) : $PARTITION"
echo " partition (cpu) : $PARTITION_CPU"
echo " GPUs / job : $GPU_COUNT"
echo " skip SFT? : $SKIP_SFT"
# Always print an explicit yes/no (never the token itself).
echo " HF_TOKEN set? : $hf_token_state"
echo "===================================================================="
echo
# ── Stage 0: datasets (CPU) ─────────────────────────────────────────────────
# First job of the chain: submitted to the CPU partition with no dependency.
# The caller's environment is exported to the job, plus REPO_ROOT.
JOB0=$(sbatch --parsable \
  --partition="$PARTITION_CPU" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/00_datasets.sbatch)
# `sbatch --parsable` prints "jobid;cluster" when a cluster name is present;
# keep only the job id so later --dependency=afterok: arguments stay valid.
JOB0="${JOB0%%;*}"
echo "[stage 0] datasets -> job $JOB0 (partition=$PARTITION_CPU)"
# ── Stage 1: SFT (depends on stage 0) ───────────────────────────────────────
# Optional warmstart on the GPU partition. It leaves behind JOB1 (empty when
# skipped) and SFT_DEP, the ":<jobid>" suffix that stage 2 appends to its
# dependency list (empty when skipped).
if [[ "$SKIP_SFT" -eq 1 ]]; then
  echo "[stage 1] SFT warmstart -> SKIPPED"
  SFT_DEP=""
  JOB1=""
else
  # ${VAR%%;*} drops the ";cluster" suffix that `sbatch --parsable` emits
  # when a cluster name is present; a dependency needs the bare job id.
  JOB1=$(sbatch --parsable \
    --partition="$PARTITION" \
    --dependency=afterok:"${JOB0%%;*}" \
    --export=ALL,REPO_ROOT="$REPO_ROOT" \
    scripts/hpc/slurm/01_sft.sbatch)
  JOB1="${JOB1%%;*}"
  echo "[stage 1] SFT warmstart -> job $JOB1 (after $JOB0, partition=$PARTITION)"
  SFT_DEP=":$JOB1"
fi
# ── Stage 2: GRPO (depends on stage 0 + stage 1 if present) ────────────────
# Builds "afterok:<job0>[:<job1>]" so GRPO starts only after every listed job
# has succeeded. ${VAR%%;*} strips the ";cluster" suffix that
# `sbatch --parsable` emits when a cluster name is present.
GRPO_DEP="afterok:${JOB0%%;*}${SFT_DEP%%;*}"
# The GPU count drives both the GPU request and the tasks per node.
GRPO_GRES="gpu:$GPU_COUNT"
GRPO_NTASKS="$GPU_COUNT"
# IMMUNOORG_SKIP_SFT passes the --skip-sft choice on to the job's environment.
JOB2=$(sbatch --parsable \
  --partition="$PARTITION" \
  --dependency="$GRPO_DEP" \
  --gres="$GRPO_GRES" \
  --ntasks-per-node="$GRPO_NTASKS" \
  --export=ALL,REPO_ROOT="$REPO_ROOT",IMMUNOORG_SKIP_SFT="$SKIP_SFT" \
  scripts/hpc/slurm/02_grpo.sbatch)
JOB2="${JOB2%%;*}"
echo "[stage 2] GRPO training -> job $JOB2 (after $GRPO_DEP, $GRPO_GRES)"
# ── Stage 3: evaluation (depends on GRPO) ──────────────────────────────────
# Submitted to the GPU partition; starts only if the GRPO job succeeds.
# ${VAR%%;*} drops the ";cluster" suffix that `sbatch --parsable` emits when
# a cluster name is present, leaving the bare job id a dependency needs.
JOB3=$(sbatch --parsable \
  --partition="$PARTITION" \
  --dependency=afterok:"${JOB2%%;*}" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/03_eval.sbatch)
JOB3="${JOB3%%;*}"
echo "[stage 3] evaluation -> job $JOB3 (after $JOB2)"
# ── Stage 4: push (depends on eval) ────────────────────────────────────────
# Last job of the chain: submitted to the CPU partition, starts only if the
# evaluation job succeeds. ${VAR%%;*} drops the ";cluster" suffix that
# `sbatch --parsable` emits when a cluster name is present.
JOB4=$(sbatch --parsable \
  --partition="$PARTITION_CPU" \
  --dependency=afterok:"${JOB3%%;*}" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/04_push.sbatch)
JOB4="${JOB4%%;*}"
echo "[stage 4] push artifacts -> job $JOB4 (after $JOB3, partition=$PARTITION_CPU)"
#######################################
# Count how many of the given job ids are non-empty.
# Arguments: job ids, one per argument (an empty string marks a skipped stage)
# Outputs:   the count on stdout
#######################################
count_submitted_jobs() {
  local id
  local total=0
  for id in "$@"; do
    if [[ -n "$id" ]]; then
      total=$((total + 1))
    fi
  done
  echo "$total"
}

# JOB1 is empty when the SFT stage was skipped, so the number reported here
# reflects what was actually submitted rather than a fixed "5".
submitted_jobs="$(count_submitted_jobs \
  "${JOB0:-}" "${JOB1:-}" "${JOB2:-}" "${JOB3:-}" "${JOB4:-}")"

echo
echo "===================================================================="
echo " All $submitted_jobs jobs submitted. Watch with:"
# \$USER is escaped on purpose: the hint shows the literal command to type.
echo " squeue -u \$USER"
echo " tail -f logs/stage*-*.out"
echo "===================================================================="
echo
# Target repositories can be overridden with HF_PUSH_REPO / HF_DATASET_REPO.
echo "When the chain finishes, results land at:"
echo " https://huggingface.co/${HF_PUSH_REPO:-hirann/immunoorg-grpo-defender}"
echo " https://huggingface.co/datasets/${HF_DATASET_REPO:-hirann/immunoorg-grpo-dataset}"