Spaces:
Paused
Paused
File size: 5,659 Bytes
2f59eb7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | #!/usr/bin/env bash
# ImmunoOrg HPC env setup (4-stage pipeline edition)
# ===================================================
#
# Run ONCE on the *login node* (not inside a SLURM job).
# Idempotent: re-running just verifies the env exists.
#
# Installs:
# - uv (single-binary Python package manager, no conda needed)
# - Python 3.11 venv at .venv-hpc/
# - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
# - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
# - Unsloth (single-GPU 2-3x speedup for <13B)
# - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
# - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
#
# Optional flags:
# --no-flash-attn skip flash-attention install (some clusters lack the headers)
# --no-deepspeed skip deepspeed install (only used for multi-GPU runs, i.e. 2 or more GPUs)
set -euo pipefail

# Remember where this script lives as an absolute path *before* changing
# directory: --help re-reads the script file, and a relative "$0" would no
# longer resolve once we have cd'ed to the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SCRIPT_PATH="$SCRIPT_DIR/$(basename "${BASH_SOURCE[0]}")"

# The repository root is two directories above this script; all relative
# paths used later (.venv-hpc, logs, outputs) are resolved against it.
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
cd "$REPO_ROOT"

# Optional components are installed unless switched off by a flag.
INSTALL_FLASH_ATTN=1
INSTALL_DEEPSPEED=1

# Print the script's header comment block as help text.
# Only the first contiguous run of comment lines is printed (the shebang is
# skipped), so section markers and linter directives further down the file
# do not leak into the output, and bare "#" separator lines are kept as
# blank lines.
print_usage() {
  awk '
    /^#!/ { next }
    /^#/  { seen = 1; sub(/^# ?/, ""); print; next }
    seen  { exit }
  ' "$SCRIPT_PATH"
}

# Parse command-line flags.
# Arguments: the script's positional parameters.
# Globals:   INSTALL_FLASH_ATTN, INSTALL_DEEPSPEED (set to 0 by --no-* flags).
# Exits:     0 after printing help; 1 with a message on stderr for an
#            unrecognised flag.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --no-flash-attn) INSTALL_FLASH_ATTN=0 ;;
      --no-deepspeed) INSTALL_DEEPSPEED=0 ;;
      -h | --help)
        print_usage
        exit 0
        ;;
      *)
        echo "unknown flag: $1" >&2
        exit 1
        ;;
    esac
    shift
  done
}
parse_args "$@"
# Startup banner: names the pipeline edition and shows the resolved repo root.
banner_rule="===================================================================="
printf '%s\n' \
  "$banner_rule" \
  " ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)" \
  " Repo: $REPO_ROOT" \
  "$banner_rule"
# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
# If uv is not on PATH, first look in the two per-user directories this
# script expects the installer to use. An earlier run may already have put
# uv there without the current shell having picked up the PATH change, and
# checking first avoids downloading the installer again on every re-run.
if ! command -v uv >/dev/null 2>&1; then
  export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
fi
if ! command -v uv >/dev/null 2>&1; then
  echo
  echo "[1/5] installing uv..."
  curl -LsSf https://astral.sh/uv/install.sh | sh
  # Fail here with a clear message rather than later: a failing command
  # substitution inside `echo` does not stop the script, even under `set -e`.
  if ! command -v uv >/dev/null 2>&1; then
    echo "uv installer finished but uv was not found in \$HOME/.local/bin or \$HOME/.cargo/bin" >&2
    exit 1
  fi
fi
echo "[1/5] uv: $(uv --version)"
# ── 2. Try to load CUDA + GCC modules if Lmod is present ────────────────
echo
echo "[2/5] looking for CUDA / GCC modules..."

# Try each module name in order; announce and stop at the first one that
# `module load` accepts. Prints nothing when none of them loads.
load_first_module() {
  local candidate
  for candidate in "$@"; do
    if module load "$candidate" 2>/dev/null; then
      echo " loaded: $candidate"
      return 0
    fi
  done
  return 0
}

if ! command -v module >/dev/null 2>&1; then
  echo " (no Lmod - assuming system CUDA / GCC)"
else
  # Best-effort reset of the module environment; a failure is ignored.
  module purge 2>/dev/null || true
  load_first_module cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA
  load_first_module gcc/11 gcc/10 gcc
  # A missing nvcc is reported but is not an error.
  nvcc --version 2>/dev/null || echo " (no nvcc on login node - GPU node will have it)"
fi
# ── 3. Create venv ───────────────────────────────────────────────────────
echo
echo "[3/5] creating venv at .venv-hpc with Python 3.11..."
hpc_venv=".venv-hpc"
# An existing directory is reused as-is; a new venv is built only when absent.
[[ -d "$hpc_venv" ]] || uv venv --python 3.11 "$hpc_venv"
# Activate in the current shell so the `python` calls below use the venv.
# shellcheck disable=SC1091
. "$hpc_venv/bin/activate"
python -V
# ── 4. Install training stack ────────────────────────────────────────────
echo
echo "[4/5] installing GRPO / SFT training stack (~5 min)..."
uv pip install --upgrade pip wheel setuptools

# Required baseline. A failure here is fatal (the script runs under `set -e`).
baseline_pkgs=(
  "torch==2.4.*"
  "transformers>=4.45,<5.0"
  "trl>=0.15.0,<1.0"
  "datasets>=2.19"
  "accelerate>=0.30"
  "peft>=0.11"
  "bitsandbytes>=0.43"
  "sentencepiece"
  "safetensors"
  "huggingface_hub>=0.24"
  "matplotlib>=3.7"
  "numpy<2.0"
  "pandas"
  "fastapi"
  "uvicorn[standard]"
  "pydantic>=2.6"
  "networkx>=3.2"
  "pyyaml"
  "rich"
)
uv pip install --no-cache "${baseline_pkgs[@]}"

# Install an optional extra; on failure warn on stderr and carry on.
# Arguments: $1 - label used in the warning
#            $2 - short reason why the failure is tolerable
#            $3.. - arguments passed to `uv pip install --no-cache`
install_optional() {
  local label=$1 fallback=$2
  shift 2
  if ! uv pip install --no-cache "$@"; then
    echo " ${label} install failed (not fatal - ${fallback})" >&2
  fi
}

# Unsloth (single-GPU speedup). Treated as optional so that a failed install
# does not abort the whole setup: the sanity check at the end of this script
# already reports a missing Unsloth as "fall back to plain HF".
install_optional "unsloth" "single-GPU will fall back to plain HF" "unsloth"

# Flash-attention 2 (optional; skipped with --no-flash-attn).
if [[ "$INSTALL_FLASH_ATTN" -eq 1 ]]; then
  echo
  echo " installing flash-attn (skip with --no-flash-attn if it fails)..."
  install_optional "flash-attn" "Unsloth has its own kernels" \
    "flash-attn>=2.5" --no-build-isolation
fi

# DeepSpeed for multi-GPU GRPO (optional; skipped with --no-deepspeed).
if [[ "$INSTALL_DEEPSPEED" -eq 1 ]]; then
  echo
  echo " installing deepspeed (only used for multi-GPU runs)..."
  install_optional "deepspeed" "single-GPU still works" "deepspeed>=0.14"
fi
# ── 5. Sanity check ──────────────────────────────────────────────────────
echo
echo "[5/5] sanity check..."
# Import each core package with the `python` on PATH and print its version.
# Import errors are reported inline and never raised, so this step always
# completes.
python - <<'PY'
from importlib import import_module

CORE = (
    "torch", "transformers", "trl", "peft",
    "datasets", "accelerate", "bitsandbytes", "huggingface_hub",
)


def report(name):
    """Print one line for `name`: its version, or FAILED with the error."""
    try:
        version = getattr(import_module(name), "__version__", "?")
        print(f" {name:18s} {version}")
    except Exception as err:
        print(f" {name:18s} FAILED ({err})")


for name in CORE:
    report(name)

# Unsloth gets its own message: any problem importing it (or reading its
# version) is reported as "not installed" with the plain-HF fallback note.
try:
    import unsloth as _unsloth
    unsloth_line = f" unsloth {_unsloth.__version__}"
except Exception:
    unsloth_line = " unsloth (not installed - single-GPU will fall back to plain HF)"
print(unsloth_line)
PY
# Create the logs/ and outputs/ directories (no-op when they already exist).
mkdir -p logs outputs

# Closing banner with the follow-up commands; the quoted delimiter keeps the
# text literal.
cat <<'EOF'

====================================================================
 ENV READY
====================================================================
 next:
 export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 bash scripts/hpc/run_all.sh
====================================================================
EOF
|