hirann committed on
Commit
2f59eb7
·
verified ·
1 Parent(s): 4593aaf

Upload scripts/hpc/setup_env.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/hpc/setup_env.sh +151 -0
scripts/hpc/setup_env.sh ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# ImmunoOrg HPC env setup (4-stage pipeline edition)
# ===================================================
#
# Run ONCE on the *login node* (not inside a SLURM job).
# Idempotent: re-running just verifies the env exists.
#
# Installs:
#   - uv (single-binary Python package manager, no conda needed)
#   - Python 3.11 venv at .venv-hpc/
#   - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
#   - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
#   - Unsloth (single-GPU 2-3x speedup for <13B)
#   - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
#   - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
#
# Optional flags:
#   --no-flash-attn   skip flash-attention install (some clusters lack the headers)
#   --no-deepspeed    skip deepspeed install (only matters for >2x GPU runs)

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$REPO_ROOT"

#######################################
# Print the contiguous comment header at the top of this file as usage text.
# (The previous `grep '^# ' "$0"` dumped EVERY "# " comment in the file —
# section dividers, "# Pinned baseline", shellcheck pragmas — not just
# the header. This stops at the first non-comment line after the shebang.)
#######################################
usage() {
  awk 'NR > 1 && !/^#/ { exit } NR > 1 { sub(/^# ?/, ""); print }' "$0"
}

# ── Flag parsing ─────────────────────────────────────────────────────────
INSTALL_FLASH_ATTN=1
INSTALL_DEEPSPEED=1
while [[ $# -gt 0 ]]; do
  case "$1" in
    --no-flash-attn) INSTALL_FLASH_ATTN=0; shift ;;
    --no-deepspeed)  INSTALL_DEEPSPEED=0; shift ;;
    -h|--help)       usage; exit 0 ;;
    *)               echo "unknown flag: $1" >&2; exit 1 ;;
  esac
done

echo "===================================================================="
echo " ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)"
echo " Repo: $REPO_ROOT"
echo "===================================================================="

# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
if ! command -v uv >/dev/null 2>&1; then
  echo
  echo "[1/5] installing uv..."
  curl -LsSf https://astral.sh/uv/install.sh | sh
  # The installer drops uv into ~/.local/bin (or ~/.cargo/bin on older
  # versions); neither is guaranteed to be on a login node's PATH yet.
  export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
fi
# Under `set -e` a silent PATH miss would abort on the next line with a
# cryptic "command not found"; fail loudly instead.
if ! command -v uv >/dev/null 2>&1; then
  echo "ERROR: uv not found on PATH after install — add ~/.local/bin to PATH and re-run" >&2
  exit 1
fi
echo "[1/5] uv: $(uv --version)"

# ── 2. Try to load CUDA + GCC modules if Lmod is present ────────────────
echo
echo "[2/5] looking for CUDA / GCC modules..."
if command -v module >/dev/null 2>&1; then
  module purge 2>/dev/null || true
  # Newest-first preference; stop at the first module that loads.
  for mod in cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA; do
    if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
  done
  for mod in gcc/11 gcc/10 gcc; do
    if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
  done
  nvcc --version 2>/dev/null || echo " (no nvcc on login node - GPU node will have it)"
else
  echo " (no Lmod - assuming system CUDA / GCC)"
fi

# ── 3. Create venv ───────────────────────────────────────────────────────
echo
echo "[3/5] creating venv at .venv-hpc with Python 3.11..."
if [ ! -d ".venv-hpc" ]; then
  uv venv --python 3.11 .venv-hpc
fi
# shellcheck disable=SC1091
source .venv-hpc/bin/activate
python -V

# ── 4. Install training stack ────────────────────────────────────────────
echo
echo "[4/5] installing GRPO / SFT training stack (~5 min)..."
uv pip install --upgrade pip wheel setuptools

# Pinned baseline (numpy<2.0: torch 2.4 wheels are built against numpy 1.x)
uv pip install --no-cache \
  "torch==2.4.*" \
  "transformers>=4.45,<5.0" \
  "trl>=0.15.0,<1.0" \
  "datasets>=2.19" \
  "accelerate>=0.30" \
  "peft>=0.11" \
  "bitsandbytes>=0.43" \
  "sentencepiece" \
  "safetensors" \
  "huggingface_hub>=0.24" \
  "matplotlib>=3.7" \
  "numpy<2.0" \
  "pandas" \
  "fastapi" \
  "uvicorn[standard]" \
  "pydantic>=2.6" \
  "networkx>=3.2" \
  "pyyaml" \
  "rich"

# Unsloth (single-GPU speedup; pulls in xformers / triton matching torch)
uv pip install --no-cache "unsloth"

# Flash-attention 2 (optional; fails on some older clusters / non-Ampere GPUs)
if [ "$INSTALL_FLASH_ATTN" -eq 1 ]; then
  echo
  echo " installing flash-attn (skip with --no-flash-attn if it fails)..."
  # --no-build-isolation: flash-attn's build needs the torch we just installed.
  uv pip install --no-cache "flash-attn>=2.5" --no-build-isolation || \
    echo " flash-attn install failed (not fatal — Unsloth has its own kernels)"
fi

# DeepSpeed for multi-GPU GRPO (optional; only used when --multigpu N > 1)
if [ "$INSTALL_DEEPSPEED" -eq 1 ]; then
  echo
  echo " installing deepspeed (only used for multi-GPU runs)..."
  uv pip install --no-cache "deepspeed>=0.14" || \
    echo " deepspeed install failed (not fatal — single-GPU still works)"
fi

# ── 5. Sanity check ──────────────────────────────────────────────────────
echo
echo "[5/5] sanity check..."
python - <<'PY'
import importlib
mods = ["torch", "transformers", "trl", "peft", "datasets", "accelerate", "bitsandbytes", "huggingface_hub"]
for m in mods:
    try:
        v = getattr(importlib.import_module(m), "__version__", "?")
        print(f" {m:18s} {v}")
    except Exception as e:
        print(f" {m:18s} FAILED ({e})")
try:
    import unsloth
    print(f" unsloth {unsloth.__version__}")
except Exception:
    print(" unsloth (not installed - single-GPU will fall back to plain HF)")
PY

mkdir -p logs outputs

echo
echo "===================================================================="
echo " ENV READY"
echo "===================================================================="
echo " next:"
echo "   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
echo "   bash scripts/hpc/run_all.sh"
echo "===================================================================="