Spaces:
Paused
Paused
# Submit the full ImmunoOrg HPC pipeline as a chain of SLURM jobs with
# `--dependency=afterok:` between stages. One command, then walk away.
#
# Usage:
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
#   bash scripts/hpc/run_all.sh                        # default settings
#   bash scripts/hpc/run_all.sh --skip-sft             # GRPO only (no warmstart)
#   bash scripts/hpc/run_all.sh --multigpu 4           # GRPO on 4 GPUs
#   bash scripts/hpc/run_all.sh --partition gpu-h100   # custom partition
#
# Pre-req: bash scripts/hpc/setup_env.sh has been run on the login node.
set -euo pipefail

# Resolve this script's absolute location *before* changing directory: the
# --help text is read back from the script file itself, and a relative "$0"
# no longer resolves once we have cd'ed to the repository root.
SCRIPT_DIR="$(cd "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_PATH="$SCRIPT_DIR/$(basename -- "${BASH_SOURCE[0]}")"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$REPO_ROOT"

# ── Defaults (override via flags, or IMMUNOORG_PARTITION[_CPU] env vars) ────
PARTITION="${IMMUNOORG_PARTITION:-gpu}"
PARTITION_CPU="${IMMUNOORG_PARTITION_CPU:-cpu}"
GPU_COUNT=1
SKIP_SFT=0

#######################################
# Print a message to stderr and abort with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Print the usage header: the first run of column-0 comment lines in this
# script, with the leading "# " removed. Stops at the first non-comment line
# after the header so internal section comments do not leak into --help.
# Globals:   SCRIPT_PATH (read)
# Outputs:   usage text on stdout
#######################################
usage() {
  awk '
    /^#!/     { next }
    /^#/      { in_header = 1; sub(/^# ?/, ""); print; next }
    in_header { exit }
  ' "$SCRIPT_PATH"
}

#######################################
# Parse command-line flags into the configuration globals.
# Globals:   SKIP_SFT, GPU_COUNT, PARTITION, PARTITION_CPU (written)
# Arguments: the script's command-line arguments
# Returns:   exits 0 after printing help; exits 1 on an unknown flag, a
#            missing option value, or a non-positive-integer GPU count
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-sft)
        SKIP_SFT=1
        shift
        ;;
      --multigpu)
        # "${2:-}" keeps `set -u` from aborting with a bare
        # "unbound variable" when the value is missing.
        [[ "${2:-}" =~ ^[1-9][0-9]*$ ]] \
          || die "--multigpu requires a positive integer GPU count (got: '${2:-}')"
        GPU_COUNT="$2"
        shift 2
        ;;
      --partition)
        [[ -n "${2:-}" ]] || die "--partition requires a partition name"
        PARTITION="$2"
        shift 2
        ;;
      --partition-cpu)
        [[ -n "${2:-}" ]] || die "--partition-cpu requires a partition name"
        PARTITION_CPU="$2"
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        die "unknown flag: $1"
        ;;
    esac
  done
}

parse_args "$@"
# The pipeline still runs without a Hub token, but nothing gets pushed; say so
# up front (on stderr, so it is not mixed into the banner on stdout).
if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "WARNING: HF_TOKEN not set — datasets, model, and evidence will NOT be pushed to the Hub." >&2
fi

# The closing summary points users at logs/stage*-*.out, so make sure the
# directory exists before anything is submitted.
mkdir -p logs

#######################################
# Print the submission banner summarising the effective settings.
# Globals:   PARTITION, PARTITION_CPU, GPU_COUNT, SKIP_SFT, HF_TOKEN (read)
# Outputs:   banner on stdout, followed by one blank line
#######################################
print_banner() {
  local rule="===================================================================="
  # Report an explicit yes/no; never echo the token itself.
  local token_state="no"
  if [[ -n "${HF_TOKEN:-}" ]]; then
    token_state="yes"
  fi

  printf '%s\n' \
    "$rule" \
    " ImmunoOrg HPC pipeline submission" \
    "$rule" \
    " partition (gpu) : $PARTITION" \
    " partition (cpu) : $PARTITION_CPU" \
    " GPUs / job : $GPU_COUNT" \
    " skip SFT? : $SKIP_SFT" \
    " HF_TOKEN set? : $token_state" \
    "$rule" \
    ""
}

print_banner
# Fail fast with a readable message when run somewhere without SLURM.
command -v sbatch >/dev/null 2>&1 || {
  echo "error: sbatch not found on PATH (run this on a SLURM login node)" >&2
  exit 1
}

#######################################
# Submit one batch script and print the bare job ID.
# `sbatch --parsable` prints "jobid" or, on multi-cluster setups,
# "jobid;cluster"; the cluster suffix is stripped so the ID can be used
# directly in a later `--dependency=afterok:` expression.
# Arguments: $1  - human-readable stage label (used in error messages)
#            $2+ - arguments passed through to sbatch
# Outputs:   job ID on stdout; diagnostics on stderr
# Returns:   0 on success, 1 if sbatch fails or prints no numeric job ID
#######################################
submit_stage() {
  local label=$1
  shift
  local raw job_id

  if ! raw=$(sbatch --parsable "$@"); then
    printf 'error: sbatch failed while submitting %s\n' "$label" >&2
    return 1
  fi

  job_id=${raw%%;*}
  if [[ ! "$job_id" =~ ^[0-9]+$ ]]; then
    printf 'error: unexpected sbatch output for %s: %s\n' "$label" "$raw" >&2
    return 1
  fi

  printf '%s\n' "$job_id"
}

# ── Stage 0: datasets (CPU) ─────────────────────────────────────────────────
# Each submission ends in an explicit `|| exit 1`: a stage must never be
# chained onto an empty job ID, whether or not errexit is in effect.
JOB0=$(submit_stage "stage 0 (datasets)" \
  --partition="$PARTITION_CPU" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/00_datasets.sbatch) || exit 1
echo "[stage 0] datasets -> job $JOB0 (partition=$PARTITION_CPU)"

# ── Stage 1: SFT (depends on stage 0) ───────────────────────────────────────
if [[ "$SKIP_SFT" -eq 1 ]]; then
  echo "[stage 1] SFT warmstart -> SKIPPED"
  SFT_DEP=""
  JOB1=""
else
  JOB1=$(submit_stage "stage 1 (SFT)" \
    --partition="$PARTITION" \
    --dependency=afterok:"$JOB0" \
    --export=ALL,REPO_ROOT="$REPO_ROOT" \
    scripts/hpc/slurm/01_sft.sbatch) || exit 1
  echo "[stage 1] SFT warmstart -> job $JOB1 (after $JOB0, partition=$PARTITION)"
  # Appended to stage 2's dependency list as an extra ":jobid" entry.
  SFT_DEP=":$JOB1"
fi

# ── Stage 2: GRPO (depends on stage 0 + stage 1 if present) ─────────────────
GRPO_DEP="afterok:$JOB0$SFT_DEP"
# GPU request and tasks-per-node are both driven by --multigpu.
GRPO_GRES="gpu:$GPU_COUNT"
GRPO_NTASKS="$GPU_COUNT"
# IMMUNOORG_SKIP_SFT is forwarded to the job's environment; how the batch
# script uses it is defined in 02_grpo.sbatch, not here.
JOB2=$(submit_stage "stage 2 (GRPO)" \
  --partition="$PARTITION" \
  --dependency="$GRPO_DEP" \
  --gres="$GRPO_GRES" \
  --ntasks-per-node="$GRPO_NTASKS" \
  --export=ALL,REPO_ROOT="$REPO_ROOT",IMMUNOORG_SKIP_SFT="$SKIP_SFT" \
  scripts/hpc/slurm/02_grpo.sbatch) || exit 1
echo "[stage 2] GRPO training -> job $JOB2 (after $GRPO_DEP, $GRPO_GRES)"

# ── Stage 3: evaluation (depends on GRPO) ───────────────────────────────────
JOB3=$(submit_stage "stage 3 (evaluation)" \
  --partition="$PARTITION" \
  --dependency=afterok:"$JOB2" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/03_eval.sbatch) || exit 1
echo "[stage 3] evaluation -> job $JOB3 (after $JOB2)"

# ── Stage 4: push (depends on eval) ─────────────────────────────────────────
JOB4=$(submit_stage "stage 4 (push)" \
  --partition="$PARTITION_CPU" \
  --dependency=afterok:"$JOB3" \
  --export=ALL,REPO_ROOT="$REPO_ROOT" \
  scripts/hpc/slurm/04_push.sbatch) || exit 1
echo "[stage 4] push artifacts -> job $JOB4 (after $JOB3, partition=$PARTITION_CPU)"
#######################################
# Print the closing summary: how many jobs were queued, how to monitor them,
# and where the results are expected to be published.
# Globals:   SKIP_SFT (read) - 1 means the SFT stage was not submitted
#            HF_PUSH_REPO, HF_DATASET_REPO (read, optional overrides)
# Outputs:   summary on stdout
#######################################
print_summary() {
  local rule="===================================================================="
  # The chain is 5 jobs, minus the SFT warm-start when it was skipped.
  local job_count=5
  if [[ "${SKIP_SFT:-0}" -eq 1 ]]; then
    job_count=4
  fi

  # Single quotes on the squeue line keep $USER literal so the user can
  # copy-paste the command as shown.
  printf '%s\n' \
    "" \
    "$rule" \
    " All $job_count jobs submitted. Watch with:" \
    ' squeue -u $USER' \
    " tail -f logs/stage*-*.out" \
    "$rule" \
    "" \
    "When the chain finishes, results land at:" \
    " https://huggingface.co/${HF_PUSH_REPO:-hirann/immunoorg-grpo-defender}" \
    " https://huggingface.co/datasets/${HF_DATASET_REPO:-hirann/immunoorg-grpo-dataset}"
}

print_summary