File size: 5,659 Bytes
2f59eb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env bash
# ImmunoOrg HPC env setup (4-stage pipeline edition)
# ===================================================
#
# Run ONCE on the *login node* (not inside a SLURM job).
# Idempotent: re-running reuses an existing .venv-hpc/ and re-runs the package installs and sanity check.
#
# Installs:
#   - uv (single-binary Python package manager, no conda needed)
#   - Python 3.11 venv at .venv-hpc/
#   - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
#   - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
#   - Unsloth (single-GPU 2-3x speedup for <13B)
#   - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
#   - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
#
# Optional flags:
#   --no-flash-attn   skip flash-attention install (some clusters lack the headers)
#   --no-deepspeed    skip deepspeed install (only matters for multi-GPU runs)

# Strict mode: abort on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -o errexit -o nounset -o pipefail

# The repository root is two directories above the one holding this script;
# every relative path used afterwards is resolved from there.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
REPO_ROOT="$(cd "$script_dir/../.." && pwd)"
cd "$REPO_ROOT"

# Feature toggles: both optional installs stay enabled unless the matching
# --no-* flag is given.
INSTALL_FLASH_ATTN=1
INSTALL_DEEPSPEED=1

# Print the usage text, i.e. the leading comment block of a script.
# Skips the shebang and stops at the first non-comment line, so comments
# further down the file are not shown as help.
# Arguments: $1 - file to read (optional; defaults to this script, $0)
# Outputs:   the header text on stdout, with the leading "# " removed
print_usage() {
    awk '
        NR == 1 && /^#!/ { next }
        /^#/             { sub(/^# ?/, ""); print; next }
                         { exit }
    ' "${1:-$0}"
}

# Translate command-line flags into INSTALL_FLASH_ATTN / INSTALL_DEEPSPEED.
# Globals:   INSTALL_FLASH_ATTN, INSTALL_DEEPSPEED (written)
# Arguments: the script's positional parameters
# Exits:     0 after printing help; 1 on an unknown flag (message on stderr)
parse_flags() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --no-flash-attn) INSTALL_FLASH_ATTN=0 ;;
            --no-deepspeed)  INSTALL_DEEPSPEED=0 ;;
            -h|--help)       print_usage; exit 0 ;;
            *)
                # Diagnostics belong on stderr so they survive stdout redirection.
                echo "unknown flag: $1" >&2
                exit 1
                ;;
        esac
        shift
    done
}

parse_flags "$@"

# Startup banner: names the tool and shows which checkout is being set up.
printf '%s\n' \
    "====================================================================" \
    "  ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)" \
    "  Repo: $REPO_ROOT" \
    "===================================================================="

# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
# Bootstrap uv with its installer script only when no uv is on PATH yet.
if ! command -v uv >/dev/null 2>&1; then
    echo
    echo "[1/5] installing uv..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    # Make the freshly installed binary visible to the rest of this run
    # (these are the install locations this script expects).
    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
fi

# Query the version in its own assignment: a command substitution that fails
# inside an `echo` argument does not trip `set -e`, so an unusable uv would
# otherwise go unnoticed until a later step.
if ! uv_version="$(uv --version)"; then
    echo "[1/5] uv is not runnable; make sure it is on PATH and re-run" >&2
    exit 1
fi
echo "[1/5] uv: $uv_version"

# ── 2. Try to load CUDA + GCC modules if Lmod is present ────────────────
echo
echo "[2/5] looking for CUDA / GCC modules..."
if ! command -v module >/dev/null 2>&1; then
    echo "    (no Lmod - assuming system CUDA / GCC)"
else
    # Start from a clean module environment; a failing purge is tolerated.
    module purge 2>/dev/null || true

    # Candidates are tried in order of preference; the first one that loads wins.
    cuda_candidates=(cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA)
    gcc_candidates=(gcc/11 gcc/10 gcc)

    for mod in "${cuda_candidates[@]}"; do
        module load "$mod" 2>/dev/null || continue
        echo "    loaded: $mod"
        break
    done

    for mod in "${gcc_candidates[@]}"; do
        module load "$mod" 2>/dev/null || continue
        echo "    loaded: $mod"
        break
    done

    # A missing nvcc is only reported, never treated as an error.
    if ! nvcc --version 2>/dev/null; then
        echo "    (no nvcc on login node - GPU node will have it)"
    fi
fi

# ── 3. Create venv ───────────────────────────────────────────────────────
echo
echo "[3/5] creating venv at .venv-hpc with Python 3.11..."
venv_dir=".venv-hpc"
# An existing directory is reused as-is; it is only created when absent.
[[ -d "$venv_dir" ]] || uv venv --python 3.11 "$venv_dir"
# Activate it so the `python` and `uv pip` calls below target this venv.
# shellcheck disable=SC1091
. "$venv_dir/bin/activate"
python -V

# ── 4. Install training stack ────────────────────────────────────────────
echo
echo "[4/5] installing GRPO / SFT training stack (~5 min)..."

# Packaging tools are upgraded first.
build_tooling=(pip wheel setuptools)
uv pip install --upgrade "${build_tooling[@]}"

# Baseline requirements, resolved together in a single install call.
core_requirements=(
    "torch==2.4.*"
    "transformers>=4.45,<5.0"
    "trl>=0.15.0,<1.0"
    "datasets>=2.19"
    "accelerate>=0.30"
    "peft>=0.11"
    "bitsandbytes>=0.43"
    "sentencepiece"
    "safetensors"
    "huggingface_hub>=0.24"
    "matplotlib>=3.7"
    "numpy<2.0"
    "pandas"
    "fastapi"
    "uvicorn[standard]"
    "pydantic>=2.6"
    "networkx>=3.2"
    "pyyaml"
    "rich"
)
uv pip install --no-cache "${core_requirements[@]}"

# Unsloth is installed in its own call, after the baseline is in place.
uv pip install --no-cache "unsloth"

# Attempt an optional `uv pip install`; a failure is reported but never fatal.
# The message uses a plain ASCII " - " separator, like the script's other
# status lines, instead of the mis-encoded dash the messages used to contain.
# Arguments: $1  - package label used in the failure message
#            $2  - reassurance printed after "not fatal - "
#            $3+ - arguments appended to `uv pip install --no-cache`
# Outputs:   a one-line notice on stdout when the install fails
# Returns:   0 whether or not the install succeeded
install_optional() {
    local label=$1 hint=$2
    shift 2
    if ! uv pip install --no-cache "$@"; then
        echo "    ${label} install failed (not fatal - ${hint})"
    fi
}

# Flash-attention 2 (optional; fails on some older clusters / non-Ampere GPUs)
if [[ "$INSTALL_FLASH_ATTN" -eq 1 ]]; then
    echo
    echo "    installing flash-attn (skip with --no-flash-attn if it fails)..."
    install_optional "flash-attn" "Unsloth has its own kernels" \
        "flash-attn>=2.5" --no-build-isolation
fi

# DeepSpeed for multi-GPU GRPO (optional; only used when --multigpu N > 1)
if [[ "$INSTALL_DEEPSPEED" -eq 1 ]]; then
    echo
    echo "    installing deepspeed (only used for multi-GPU runs)..."
    install_optional "deepspeed" "single-GPU still works" "deepspeed>=0.14"
fi

# ── 5. Sanity check ──────────────────────────────────────────────────────
echo
echo "[5/5] sanity check..."
# Import every core package and print its version. The interpreter exits
# non-zero when any of them fails, so a broken install stops the script here
# instead of being followed by the final success banner.
if ! python - <<'PY'
import importlib
import sys

CORE = ["torch", "transformers", "trl", "peft", "datasets", "accelerate", "bitsandbytes", "huggingface_hub"]
failed = []
for name in CORE:
    try:
        version = getattr(importlib.import_module(name), "__version__", "?")
        print(f"  {name:18s} {version}")
    except Exception as exc:
        # Keep going so every broken package is listed in one run.
        failed.append(name)
        print(f"  {name:18s} FAILED ({exc})")

# Unsloth is reported but never counted as a failure. The real error is shown
# because an import can fail for reasons other than the package being absent.
try:
    import unsloth
    print(f"  unsloth            {getattr(unsloth, '__version__', '?')}")
except Exception as exc:
    print(f"  unsloth            (import failed: {exc} - single-GPU will fall back to plain HF)")

sys.exit(1 if failed else 0)
PY
then
    echo "[5/5] one or more core packages failed to import (see FAILED lines above)" >&2
    exit 1
fi

# Make sure the logs/ and outputs/ directories exist under the repo root.
run_dirs=(logs outputs)
mkdir -p "${run_dirs[@]}"

# Closing banner with the suggested next commands (literal text, no expansion).
cat <<'EOF'

====================================================================
  ENV READY
====================================================================
  next:
    export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
    bash scripts/hpc/run_all.sh
====================================================================
EOF