hirann committed on
Commit
2f59eb7
·
verified ·
1 Parent(s): 4593aaf

Upload scripts/hpc/setup_env.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/hpc/setup_env.sh +151 -0
scripts/hpc/setup_env.sh ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# ImmunoOrg HPC env setup (4-stage pipeline edition)
# ===================================================
#
# Run ONCE on the *login node* (not inside a SLURM job).
# Idempotent: re-running just verifies the env exists.
#
# Installs:
#   - uv (single-binary Python package manager, no conda needed)
#   - Python 3.11 venv at .venv-hpc/
#   - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
#   - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
#   - Unsloth (single-GPU 2-3x speedup for <13B)
#   - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
#   - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
#
# Optional flags:
#   --no-flash-attn   skip flash-attention install (some clusters lack the headers)
#   --no-deepspeed    skip deepspeed install (only matters for >2x GPU runs)

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$REPO_ROOT"

#######################################
# Print the contiguous comment header at the top of this file as usage text.
# (The previous `grep '^# ' "$0"` dumped EVERY "# " comment in the file —
# section dividers, "# Pinned baseline", shellcheck pragmas — not just
# the header. This stops at the first non-comment line after the shebang.)
#######################################
usage() {
  awk 'NR > 1 && !/^#/ { exit } NR > 1 { sub(/^# ?/, ""); print }' "$0"
}

# ── Flag parsing ─────────────────────────────────────────────────────────
INSTALL_FLASH_ATTN=1
INSTALL_DEEPSPEED=1
while [[ $# -gt 0 ]]; do
  case "$1" in
    --no-flash-attn) INSTALL_FLASH_ATTN=0; shift ;;
    --no-deepspeed)  INSTALL_DEEPSPEED=0; shift ;;
    -h|--help)       usage; exit 0 ;;
    *)               echo "unknown flag: $1" >&2; exit 1 ;;
  esac
done

echo "===================================================================="
echo " ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)"
echo " Repo: $REPO_ROOT"
echo "===================================================================="

# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
if ! command -v uv >/dev/null 2>&1; then
  echo
  echo "[1/5] installing uv..."
  curl -LsSf https://astral.sh/uv/install.sh | sh
  # The installer drops uv into ~/.local/bin (or ~/.cargo/bin on older
  # versions); neither is guaranteed to be on a login node's PATH yet.
  export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
fi
# Under `set -e` a silent PATH miss would abort on the next line with a
# cryptic "command not found"; fail loudly instead.
if ! command -v uv >/dev/null 2>&1; then
  echo "ERROR: uv not found on PATH after install — add ~/.local/bin to PATH and re-run" >&2
  exit 1
fi
echo "[1/5] uv: $(uv --version)"

# ── 2. Try to load CUDA + GCC modules if Lmod is present ────────────────
echo
echo "[2/5] looking for CUDA / GCC modules..."
if command -v module >/dev/null 2>&1; then
  module purge 2>/dev/null || true
  # Newest-first preference; stop at the first module that loads.
  for mod in cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA; do
    if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
  done
  for mod in gcc/11 gcc/10 gcc; do
    if module load "$mod" 2>/dev/null; then echo " loaded: $mod"; break; fi
  done
  nvcc --version 2>/dev/null || echo " (no nvcc on login node - GPU node will have it)"
else
  echo " (no Lmod - assuming system CUDA / GCC)"
fi

# ── 3. Create venv ───────────────────────────────────────────────────────
echo
echo "[3/5] creating venv at .venv-hpc with Python 3.11..."
if [ ! -d ".venv-hpc" ]; then
  uv venv --python 3.11 .venv-hpc
fi
# shellcheck disable=SC1091
source .venv-hpc/bin/activate
python -V

# ── 4. Install training stack ────────────────────────────────────────────
echo
echo "[4/5] installing GRPO / SFT training stack (~5 min)..."
uv pip install --upgrade pip wheel setuptools

# Pinned baseline (numpy<2.0: torch 2.4 wheels are built against numpy 1.x)
uv pip install --no-cache \
  "torch==2.4.*" \
  "transformers>=4.45,<5.0" \
  "trl>=0.15.0,<1.0" \
  "datasets>=2.19" \
  "accelerate>=0.30" \
  "peft>=0.11" \
  "bitsandbytes>=0.43" \
  "sentencepiece" \
  "safetensors" \
  "huggingface_hub>=0.24" \
  "matplotlib>=3.7" \
  "numpy<2.0" \
  "pandas" \
  "fastapi" \
  "uvicorn[standard]" \
  "pydantic>=2.6" \
  "networkx>=3.2" \
  "pyyaml" \
  "rich"

# Unsloth (single-GPU speedup; pulls in xformers / triton matching torch)
uv pip install --no-cache "unsloth"

# Flash-attention 2 (optional; fails on some older clusters / non-Ampere GPUs)
if [ "$INSTALL_FLASH_ATTN" -eq 1 ]; then
  echo
  echo " installing flash-attn (skip with --no-flash-attn if it fails)..."
  # --no-build-isolation: flash-attn's build needs the torch we just installed.
  uv pip install --no-cache "flash-attn>=2.5" --no-build-isolation || \
    echo " flash-attn install failed (not fatal — Unsloth has its own kernels)"
fi

# DeepSpeed for multi-GPU GRPO (optional; only used when --multigpu N > 1)
if [ "$INSTALL_DEEPSPEED" -eq 1 ]; then
  echo
  echo " installing deepspeed (only used for multi-GPU runs)..."
  uv pip install --no-cache "deepspeed>=0.14" || \
    echo " deepspeed install failed (not fatal — single-GPU still works)"
fi

# ── 5. Sanity check ──────────────────────────────────────────────────────
echo
echo "[5/5] sanity check..."
python - <<'PY'
import importlib
mods = ["torch", "transformers", "trl", "peft", "datasets", "accelerate", "bitsandbytes", "huggingface_hub"]
for m in mods:
    try:
        v = getattr(importlib.import_module(m), "__version__", "?")
        print(f" {m:18s} {v}")
    except Exception as e:
        print(f" {m:18s} FAILED ({e})")
try:
    import unsloth
    print(f" unsloth {unsloth.__version__}")
except Exception:
    print(" unsloth (not installed - single-GPU will fall back to plain HF)")
PY

mkdir -p logs outputs

echo
echo "===================================================================="
echo " ENV READY"
echo "===================================================================="
echo " next:"
echo "   export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
echo "   bash scripts/hpc/run_all.sh"
echo "===================================================================="