#!/usr/bin/env bash
# Submit the full ImmunoOrg HPC pipeline as a chain of SLURM jobs with
# `--dependency=afterok:` between stages. One command, then walk away.
#
# Usage:
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
#   bash scripts/hpc/run_all.sh                       # default settings
#   bash scripts/hpc/run_all.sh --skip-sft            # GRPO only (no warmstart)
#   bash scripts/hpc/run_all.sh --multigpu 4          # GRPO on 4 GPUs
#   bash scripts/hpc/run_all.sh --partition gpu-h100  # custom partition
#
# Pre-req: bash scripts/hpc/setup_env.sh has been run on the login node.

set -euo pipefail

# Resolve the repo root relative to this script so it works from any cwd.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$REPO_ROOT"

# Fatal-error helper: message to stderr, non-zero exit.
die() { printf 'error: %s\n' "$*" >&2; exit 1; }

# ── Defaults (override via flags) ───────────────────────────────────────────
PARTITION="${IMMUNOORG_PARTITION:-gpu}"
PARTITION_CPU="${IMMUNOORG_PARTITION_CPU:-cpu}"
GPU_COUNT=1
SKIP_SFT=0

while [[ $# -gt 0 ]]; do
  case "$1" in
    --skip-sft)
      SKIP_SFT=1; shift ;;
    --multigpu)
      # Guard $2: under `set -u` a missing value would die with an opaque
      # "unbound variable" message instead of a usable one.
      [[ $# -ge 2 ]] || die "--multigpu requires a GPU count"
      GPU_COUNT="$2"; shift 2 ;;
    --partition)
      [[ $# -ge 2 ]] || die "--partition requires a partition name"
      PARTITION="$2"; shift 2 ;;
    --partition-cpu)
      [[ $# -ge 2 ]] || die "--partition-cpu requires a partition name"
      PARTITION_CPU="$2"; shift 2 ;;
    -h|--help)
      # Self-documenting help: print this file's leading comment block.
      grep '^# ' "$0" | sed 's/^# //'
      exit 0 ;;
    *)
      echo "unknown flag: $1"; exit 1 ;;
  esac
done

# A non-numeric or zero GPU count would produce a broken --gres request that
# only fails later, inside SLURM — reject it up front.
[[ "$GPU_COUNT" =~ ^[1-9][0-9]*$ ]] \
  || die "--multigpu expects a positive integer, got '$GPU_COUNT'"

if [ -z "${HF_TOKEN:-}" ]; then
  echo "WARNING: HF_TOKEN not set — datasets, model, and evidence will NOT be pushed to the Hub."
fi

mkdir -p logs

# Report an explicit yes/no (never the token itself); previously an unset
# token printed an empty field, which was easy to misread.
if [ -n "${HF_TOKEN:-}" ]; then HF_TOKEN_STATE="yes"; else HF_TOKEN_STATE="no"; fi

echo "===================================================================="
echo " ImmunoOrg HPC pipeline submission"
echo "===================================================================="
echo " partition (gpu) : $PARTITION"
echo " partition (cpu) : $PARTITION_CPU"
echo " GPUs / job      : $GPU_COUNT"
echo " skip SFT?       : $SKIP_SFT"
echo " HF_TOKEN set?   : $HF_TOKEN_STATE"
echo "===================================================================="
echo

# ── Stage 0: datasets (CPU) ─────────────────────────────────────────────────
JOB0=$(sbatch --parsable \
  --partition="$PARTITION_CPU" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/00_datasets.sbatch)
echo "[stage 0] datasets -> job $JOB0 (partition=$PARTITION_CPU)"

# ── Stage 1: SFT (depends on stage 0) ───────────────────────────────────────
if [ "$SKIP_SFT" -eq 1 ]; then
  echo "[stage 1] SFT warmstart -> SKIPPED"
  SFT_DEP=""
  JOB1=""
else
  JOB1=$(sbatch --parsable \
    --partition="$PARTITION" \
    --dependency=afterok:"$JOB0" \
    --export=ALL,REPO_ROOT="$REPO_ROOT" \
    scripts/hpc/slurm/01_sft.sbatch)
  echo "[stage 1] SFT warmstart -> job $JOB1 (after $JOB0, partition=$PARTITION)"
  # Appended to the afterok list for stage 2 (":<jobid>" form).
  SFT_DEP=":$JOB1"
fi

# ── Stage 2: GRPO (depends on stage 0 + stage 1 if present) ────────────────
GRPO_DEP="afterok:$JOB0$SFT_DEP"
GRPO_GRES="gpu:$GPU_COUNT"
GRPO_NTASKS="$GPU_COUNT"
JOB2=$(sbatch --parsable \
  --partition="$PARTITION" \
  --dependency="$GRPO_DEP" \
  --gres="$GRPO_GRES" \
  --ntasks-per-node="$GRPO_NTASKS" \
  --export=ALL,REPO_ROOT="$REPO_ROOT",IMMUNOORG_SKIP_SFT="$SKIP_SFT" \
  scripts/hpc/slurm/02_grpo.sbatch)
echo "[stage 2] GRPO training -> job $JOB2 (after $GRPO_DEP, $GRPO_GRES)"

# ── Stage 3: evaluation (depends on GRPO) ──────────────────────────────────
JOB3=$(sbatch --parsable \
  --partition="$PARTITION" \
  --dependency=afterok:"$JOB2" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/03_eval.sbatch)
echo "[stage 3] evaluation -> job $JOB3 (after $JOB2)"

# ── Stage 4: push (depends on eval) ────────────────────────────────────────
JOB4=$(sbatch --parsable \
  --partition="$PARTITION_CPU" \
  --dependency=afterok:"$JOB3" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/04_push.sbatch)
echo "[stage 4] push artifacts -> job $JOB4 (after $JOB3, partition=$PARTITION_CPU)"

# With --skip-sft only 4 jobs exist; the banner used to hard-code "5".
if [ "$SKIP_SFT" -eq 1 ]; then TOTAL_JOBS=4; else TOTAL_JOBS=5; fi

echo
echo "===================================================================="
echo " All $TOTAL_JOBS jobs submitted. Watch with:"
echo "   squeue -u \$USER"
echo "   tail -f logs/stage*-*.out"
echo "===================================================================="
echo
echo "When the chain finishes, results land at:"
echo "  https://huggingface.co/${HF_PUSH_REPO:-hirann/immunoorg-grpo-defender}"
echo "  https://huggingface.co/datasets/${HF_DATASET_REPO:-hirann/immunoorg-grpo-dataset}"