immunoorg-v3 / scripts /hpc /setup_env.sh
hirann's picture
Upload scripts/hpc/setup_env.sh with huggingface_hub
2f59eb7 verified
#!/usr/bin/env bash
# ImmunoOrg HPC env setup (4-stage pipeline edition)
# ===================================================
#
# Run ONCE on the *login node* (not inside a SLURM job).
# Idempotent: re-running just verifies the env exists.
#
# Installs:
# - uv (single-binary Python package manager, no conda needed)
# - Python 3.11 venv at .venv-hpc/
# - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
# - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
# - Unsloth (single-GPU 2-3x speedup for <13B)
# - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
# - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
#
# Optional flags:
# --no-flash-attn skip flash-attention install (some clusters lack the headers)
# --no-deepspeed skip deepspeed install (only matters for >2x GPU runs)
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$REPO_ROOT"
INSTALL_FLASH_ATTN=1
INSTALL_DEEPSPEED=1
while [[ $# -gt 0 ]]; do
case "$1" in
--no-flash-attn) INSTALL_FLASH_ATTN=0; shift ;;
--no-deepspeed) INSTALL_DEEPSPEED=0; shift ;;
-h|--help) grep '^# ' "$0" | sed 's/^# //'; exit 0 ;;
*) echo "unknown flag: $1"; exit 1 ;;
esac
done
echo "===================================================================="
echo " ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)"
echo " Repo: $REPO_ROOT"
echo "===================================================================="
# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
if ! command -v uv >/dev/null 2>&1; then
echo
echo "[1/5] installing uv..."
curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
fi
echo "[1/5] uv: $(uv --version)"
# ── 2. Try to load CUDA + GCC modules if Lmod is present ────────────────
echo
echo "[2/5] looking for CUDA / GCC modules..."
if command -v module >/dev/null 2>&1; then
module purge 2>/dev/null || true
for mod in cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA; do
if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
done
for mod in gcc/11 gcc/10 gcc; do
if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
done
nvcc --version 2>/dev/null || echo " (no nvcc on login node - GPU node will have it)"
else
echo " (no Lmod - assuming system CUDA / GCC)"
fi
# ── 3. Create venv ───────────────────────────────────────────────────────
echo
echo "[3/5] creating venv at .venv-hpc with Python 3.11..."
if [ ! -d ".venv-hpc" ]; then
uv venv --python 3.11 .venv-hpc
fi
# shellcheck disable=SC1091
source .venv-hpc/bin/activate
python -V
# ── 4. Install training stack ────────────────────────────────────────────
echo
echo "[4/5] installing GRPO / SFT training stack (~5 min)..."
uv pip install --upgrade pip wheel setuptools
# Pinned baseline
uv pip install --no-cache \
"torch==2.4.*" \
"transformers>=4.45,<5.0" \
"trl>=0.15.0,<1.0" \
"datasets>=2.19" \
"accelerate>=0.30" \
"peft>=0.11" \
"bitsandbytes>=0.43" \
"sentencepiece" \
"safetensors" \
"huggingface_hub>=0.24" \
"matplotlib>=3.7" \
"numpy<2.0" \
"pandas" \
"fastapi" \
"uvicorn[standard]" \
"pydantic>=2.6" \
"networkx>=3.2" \
"pyyaml" \
"rich"
# Unsloth (single-GPU speedup; pulls in xformers / triton matching torch)
uv pip install --no-cache "unsloth"
# Flash-attention 2 (optional; fails on some older clusters / non-Ampere GPUs)
if [ "$INSTALL_FLASH_ATTN" -eq 1 ]; then
echo
echo " installing flash-attn (skip with --no-flash-attn if it fails)..."
uv pip install --no-cache "flash-attn>=2.5" --no-build-isolation || \
echo " flash-attn install failed (not fatal β€” Unsloth has its own kernels)"
fi
# DeepSpeed for multi-GPU GRPO (optional; only used when --multigpu N > 1)
if [ "$INSTALL_DEEPSPEED" -eq 1 ]; then
echo
echo " installing deepspeed (only used for multi-GPU runs)..."
uv pip install --no-cache "deepspeed>=0.14" || \
echo " deepspeed install failed (not fatal β€” single-GPU still works)"
fi
# ── 5. Sanity check ──────────────────────────────────────────────────────
echo
echo "[5/5] sanity check..."
python - <<'PY'
import importlib
mods = ["torch", "transformers", "trl", "peft", "datasets", "accelerate", "bitsandbytes", "huggingface_hub"]
for m in mods:
try:
v = getattr(importlib.import_module(m), "__version__", "?")
print(f" {m:18s} {v}")
except Exception as e:
print(f" {m:18s} FAILED ({e})")
try:
import unsloth
print(f" unsloth {unsloth.__version__}")
except Exception:
print(" unsloth (not installed - single-GPU will fall back to plain HF)")
PY
mkdir -p logs outputs
echo
echo "===================================================================="
echo " ENV READY"
echo "===================================================================="
echo " next:"
echo " export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
echo " bash scripts/hpc/run_all.sh"
echo "===================================================================="