Spaces:
Paused
Paused
Upload scripts/hpc/setup_env.sh with huggingface_hub
Browse files- scripts/hpc/setup_env.sh +151 -0
scripts/hpc/setup_env.sh
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# ImmunoOrg HPC env setup (4-stage pipeline edition)
|
| 3 |
+
# ===================================================
|
| 4 |
+
#
|
| 5 |
+
# Run ONCE on the *login node* (not inside a SLURM job).
|
| 6 |
+
# Safe to re-run: an existing .venv-hpc/ is reused and the installs are re-applied.
|
| 7 |
+
#
|
| 8 |
+
# Installs:
|
| 9 |
+
# - uv (single-binary Python package manager, no conda needed)
|
| 10 |
+
# - Python 3.11 venv at .venv-hpc/
|
| 11 |
+
# - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
|
| 12 |
+
# - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
|
| 13 |
+
# - Unsloth (single-GPU 2-3x speedup for <13B)
|
| 14 |
+
# - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
|
| 15 |
+
# - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
|
| 16 |
+
#
|
| 17 |
+
# Optional flags:
|
| 18 |
+
# --no-flash-attn skip flash-attention install (some clusters lack the headers)
|
| 19 |
+
# --no-deepspeed skip deepspeed install (only matters for multi-GPU runs)
|
| 20 |
+
|
| 21 |
+
set -euo pipefail
|
| 22 |
+
|
| 23 |
+
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
| 24 |
+
cd "$REPO_ROOT"
|
| 25 |
+
|
| 26 |
+
INSTALL_FLASH_ATTN=1
|
| 27 |
+
INSTALL_DEEPSPEED=1
|
| 28 |
+
while [[ $# -gt 0 ]]; do
|
| 29 |
+
case "$1" in
|
| 30 |
+
--no-flash-attn) INSTALL_FLASH_ATTN=0; shift ;;
|
| 31 |
+
--no-deepspeed) INSTALL_DEEPSPEED=0; shift ;;
|
| 32 |
+
-h|--help) grep '^# ' "$0" | sed 's/^# //'; exit 0 ;;
|
| 33 |
+
*) echo "unknown flag: $1" >&2; exit 1 ;;  # diagnostics belong on stderr
|
| 34 |
+
esac
|
| 35 |
+
done
|
| 36 |
+
|
| 37 |
+
echo "===================================================================="
|
| 38 |
+
echo " ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)"
|
| 39 |
+
echo " Repo: $REPO_ROOT"
|
| 40 |
+
echo "===================================================================="
|
| 41 |
+
|
| 42 |
+
# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
|
| 43 |
+
if ! command -v uv >/dev/null 2>&1; then
|
| 44 |
+
echo
|
| 45 |
+
echo "[1/5] installing uv..."
|
| 46 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 47 |
+
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
|
| 48 |
+
fi
|
| 49 |
+
echo "[1/5] uv: $(uv --version)"
|
| 50 |
+
|
| 51 |
+
# ── 2. Try to load CUDA + GCC modules if Lmod is present ─────────────────
|
| 52 |
+
echo
|
| 53 |
+
echo "[2/5] looking for CUDA / GCC modules..."
|
| 54 |
+
if command -v module >/dev/null 2>&1; then
|
| 55 |
+
module purge 2>/dev/null || true
|
| 56 |
+
for mod in cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA; do
|
| 57 |
+
if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
|
| 58 |
+
done
|
| 59 |
+
for mod in gcc/11 gcc/10 gcc; do
|
| 60 |
+
if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
|
| 61 |
+
done
|
| 62 |
+
nvcc --version 2>/dev/null || echo " (no nvcc on login node - GPU node will have it)"
|
| 63 |
+
else
|
| 64 |
+
echo " (no Lmod - assuming system CUDA / GCC)"
|
| 65 |
+
fi
|
| 66 |
+
|
| 67 |
+
# ── 3. Create venv ───────────────────────────────────────────────────────
|
| 68 |
+
echo
|
| 69 |
+
echo "[3/5] creating venv at .venv-hpc with Python 3.11..."
|
| 70 |
+
if [ ! -d ".venv-hpc" ]; then
|
| 71 |
+
uv venv --python 3.11 .venv-hpc
|
| 72 |
+
fi
|
| 73 |
+
# shellcheck disable=SC1091
|
| 74 |
+
source .venv-hpc/bin/activate
|
| 75 |
+
python -V
|
| 76 |
+
|
| 77 |
+
# ── 4. Install training stack ────────────────────────────────────────────
|
| 78 |
+
echo
|
| 79 |
+
echo "[4/5] installing GRPO / SFT training stack (~5 min)..."
|
| 80 |
+
uv pip install --upgrade pip wheel setuptools
|
| 81 |
+
|
| 82 |
+
# Pinned baseline
|
| 83 |
+
uv pip install --no-cache \
|
| 84 |
+
"torch==2.4.*" \
|
| 85 |
+
"transformers>=4.45,<5.0" \
|
| 86 |
+
"trl>=0.15.0,<1.0" \
|
| 87 |
+
"datasets>=2.19" \
|
| 88 |
+
"accelerate>=0.30" \
|
| 89 |
+
"peft>=0.11" \
|
| 90 |
+
"bitsandbytes>=0.43" \
|
| 91 |
+
"sentencepiece" \
|
| 92 |
+
"safetensors" \
|
| 93 |
+
"huggingface_hub>=0.24" \
|
| 94 |
+
"matplotlib>=3.7" \
|
| 95 |
+
"numpy<2.0" \
|
| 96 |
+
"pandas" \
|
| 97 |
+
"fastapi" \
|
| 98 |
+
"uvicorn[standard]" \
|
| 99 |
+
"pydantic>=2.6" \
|
| 100 |
+
"networkx>=3.2" \
|
| 101 |
+
"pyyaml" \
|
| 102 |
+
"rich"
|
| 103 |
+
|
| 104 |
+
# Unsloth (single-GPU speedup; pulls in xformers / triton matching torch)
|
| 105 |
+
# Restate the torch pin so that resolving unsloth's dependencies (xformers,
# triton, torchvision) cannot move torch off the 2.4 baseline installed above;
# if no compatible combination exists the install fails loudly instead.
uv pip install --no-cache "unsloth" "torch==2.4.*"
|
| 106 |
+
|
| 107 |
+
# Flash-attention 2 (optional; fails on some older clusters / non-Ampere GPUs)
|
| 108 |
+
if [ "$INSTALL_FLASH_ATTN" -eq 1 ]; then
|
| 109 |
+
echo
|
| 110 |
+
echo " installing flash-attn (skip with --no-flash-attn if it fails)..."
|
| 111 |
+
uv pip install --no-cache "flash-attn>=2.5" --no-build-isolation || \
|
| 112 |
+
echo " flash-attn install failed (not fatal — Unsloth has its own kernels)"
|
| 113 |
+
fi
|
| 114 |
+
|
| 115 |
+
# DeepSpeed for multi-GPU GRPO (optional; only used when --multigpu N > 1)
|
| 116 |
+
if [ "$INSTALL_DEEPSPEED" -eq 1 ]; then
|
| 117 |
+
echo
|
| 118 |
+
echo " installing deepspeed (only used for multi-GPU runs)..."
|
| 119 |
+
uv pip install --no-cache "deepspeed>=0.14" || \
|
| 120 |
+
echo " deepspeed install failed (not fatal — single-GPU still works)"
|
| 121 |
+
fi
|
| 122 |
+
|
| 123 |
+
# ── 5. Sanity check ──────────────────────────────────────────────────────
|
| 124 |
+
echo
|
| 125 |
+
echo "[5/5] sanity check..."
|
| 126 |
+
python - <<'PY'
|
| 127 |
+
import importlib
|
| 128 |
+
mods = ["torch", "transformers", "trl", "peft", "datasets", "accelerate", "bitsandbytes", "huggingface_hub"]
|
| 129 |
+
for m in mods:
|
| 130 |
+
try:
|
| 131 |
+
v = getattr(importlib.import_module(m), "__version__", "?")
|
| 132 |
+
print(f" {m:18s} {v}")
|
| 133 |
+
except Exception as e:
|
| 134 |
+
print(f" {m:18s} FAILED ({e})")
|
| 135 |
+
try:
|
| 136 |
+
import unsloth
|
| 137 |
+
print(f" unsloth {unsloth.__version__}")
|
| 138 |
+
except Exception:
|
| 139 |
+
print(" unsloth (not installed - single-GPU will fall back to plain HF)")
|
| 140 |
+
PY
|
| 141 |
+
|
| 142 |
+
mkdir -p logs outputs
|
| 143 |
+
|
| 144 |
+
echo
|
| 145 |
+
echo "===================================================================="
|
| 146 |
+
echo " ENV READY"
|
| 147 |
+
echo "===================================================================="
|
| 148 |
+
echo " next:"
|
| 149 |
+
echo " export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
| 150 |
+
echo " bash scripts/hpc/run_all.sh"
|
| 151 |
+
echo "===================================================================="
|