Spaces:

hirann
/

immunoorg-v3

Paused

App Files Files Community

immunoorg-v3 / scripts /hpc /setup_env.sh

hirann

Upload scripts/hpc/setup_env.sh with huggingface_hub

2f59eb7 verified 12 days ago

raw

history blame contribute delete

5.66 kB

	#!/usr/bin/env bash
	# ImmunoOrg HPC env setup (4-stage pipeline edition)
	# ===================================================
	#
	# Run ONCE on the login node (not inside a SLURM job).
	# Safe to re-run: an existing .venv-hpc/ is reused; the package installs and the sanity check run again.
	#
	# Installs:
	# - uv (single-binary Python package manager, no conda needed)
	# - Python 3.11 venv at .venv-hpc/
	# - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
	# - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
	# - Unsloth (single-GPU 2-3x speedup for <13B)
	# - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
	# - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
	#
	# Optional flags:
	# --no-flash-attn skip flash-attention install (some clusters lack the headers)
	# --no-deepspeed skip deepspeed install (only used for multi-GPU runs)

	# Strict mode: abort on command failure, on use of an unset variable, and
	# when any stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail

	# The repository root is two directory levels above the directory holding
	# this script; make it the working directory for everything that follows.
	script_dir="$(dirname "${BASH_SOURCE[0]}")"
	REPO_ROOT="$(cd "${script_dir}/../.." && pwd)"
	cd "$REPO_ROOT"

	# ── CLI flags ────────────────────────────────────────────────────────────
	# Both optional components are installed unless a flag opts out.
	INSTALL_FLASH_ATTN=1
	INSTALL_DEEPSPEED=1

	#######################################
	# Print the leading comment header of a script as help text.
	# Arguments: $1 - file to read (defaults to this script, $0)
	# Outputs:   header comment lines on stdout with the "# " prefix removed.
	#            A shebang on line 1 is skipped and output stops at the first
	#            non-comment line, so comments further down the file are not
	#            part of the help text.
	#######################################
	print_usage() {
	  awk '
	    NR == 1 && /^[ \t]*#!/ { next }
	    /^[ \t]*#/ { sub(/^[ \t]*# ?/, ""); print; next }
	    { exit }
	  ' "${1:-$0}"
	}

	#######################################
	# Parse command-line flags.
	# Globals:   INSTALL_FLASH_ATTN, INSTALL_DEEPSPEED (set to 0 by the
	#            matching --no-* flag; otherwise left as they are)
	# Arguments: the script's command-line arguments
	# Outputs:   help text on stdout for -h/--help; an error on stderr for an
	#            unrecognised flag
	# Returns:   exits 0 after printing help, exits 1 on an unknown flag
	#######################################
	parse_args() {
	  while [[ $# -gt 0 ]]; do
	    case "$1" in
	      --no-flash-attn) INSTALL_FLASH_ATTN=0 ;;
	      --no-deepspeed) INSTALL_DEEPSPEED=0 ;;
	      -h | --help)
	        print_usage
	        exit 0
	        ;;
	      *)
	        echo "unknown flag: $1" >&2
	        exit 1
	        ;;
	    esac
	    shift
	  done
	}

	parse_args "$@"

	# Startup banner: title plus the resolved repository root, framed by rules.
	banner_rule="===================================================================="
	printf '%s\n' \
	  "$banner_rule" \
	  " ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)" \
	  " Repo: $REPO_ROOT" \
	  "$banner_rule"

	# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
	# When `uv` is not already on PATH, download the upstream installer script
	# and pipe it to sh, then extend PATH for the rest of this run.
	if ! command -v uv >/dev/null 2>&1; then
	  echo
	  echo "[1/5] installing uv..."
	  curl -LsSf https://astral.sh/uv/install.sh | sh
	  # These are the two directories this script expects the installer to use.
	  export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
	  # A failing $(uv --version) inside the echo below would not stop the
	  # script, so check explicitly that the binary is now reachable.
	  if ! command -v uv >/dev/null 2>&1; then
	    echo "error: uv was installed but is not on PATH (looked in $HOME/.local/bin and $HOME/.cargo/bin)" >&2
	    exit 1
	  fi
	fi
	echo "[1/5] uv: $(uv --version)"

	# ── 2. Try to load CUDA + GCC modules if Lmod is present ────────────────

	#######################################
	# Load the first module from a list of candidates that loads successfully.
	# Arguments: candidate module names, in order of preference
	# Outputs:   " loaded: NAME" on stdout for the module that loaded; nothing
	#            when none of the candidates could be loaded
	# Returns:   always 0 (a missing module is not an error here)
	#######################################
	load_first_module() {
	  local candidate
	  for candidate in "$@"; do
	    if module load "$candidate" 2>/dev/null; then
	      echo " loaded: $candidate"
	      return 0
	    fi
	  done
	  return 0
	}

	echo
	echo "[2/5] looking for CUDA / GCC modules..."
	if command -v module >/dev/null 2>&1; then
	  # Best effort: a failing purge must not abort the script under set -e.
	  module purge 2>/dev/null || true
	  load_first_module cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA
	  load_first_module gcc/11 gcc/10 gcc
	  # Not finding nvcc is reported but not treated as fatal.
	  nvcc --version 2>/dev/null || echo " (no nvcc on login node - GPU node will have it)"
	else
	  echo " (no Lmod - assuming system CUDA / GCC)"
	fi

	# ── 3. Create venv ───────────────────────────────────────────────────────
	# The Python 3.11 virtualenv is created only when its directory is absent;
	# it is then activated for the rest of the script and the interpreter
	# version is printed.
	printf '\n%s\n' "[3/5] creating venv at .venv-hpc with Python 3.11..."
	venv_dir=".venv-hpc"
	[[ -d "$venv_dir" ]] || uv venv --python 3.11 "$venv_dir"
	# shellcheck disable=SC1091
	. "$venv_dir/bin/activate"
	python -V

	# ── 4. Install training stack ────────────────────────────────────────────
	printf '\n%s\n' "[4/5] installing GRPO / SFT training stack (~5 min)..."
	uv pip install --upgrade pip wheel setuptools

	# Baseline requirements with their version constraints, handed to a single
	# uv invocation.
	baseline_requirements=(
	  "torch==2.4.*"
	  "transformers>=4.45,<5.0"
	  "trl>=0.15.0,<1.0"
	  "datasets>=2.19"
	  "accelerate>=0.30"
	  "peft>=0.11"
	  "bitsandbytes>=0.43"
	  "sentencepiece"
	  "safetensors"
	  "huggingface_hub>=0.24"
	  "matplotlib>=3.7"
	  "numpy<2.0"
	  "pandas"
	  "fastapi"
	  "uvicorn[standard]"
	  "pydantic>=2.6"
	  "networkx>=3.2"
	  "pyyaml"
	  "rich"
	)
	uv pip install --no-cache "${baseline_requirements[@]}"

	# Unsloth (single-GPU speedup) is installed as its own step after the
	# baseline stack.
	uv pip install --no-cache "unsloth"

	#######################################
	# Install an optional package set with uv; a failure is reported, not fatal.
	# Arguments: $1  - note to print on stdout when the install fails
	#            $2… - arguments appended to `uv pip install --no-cache`
	# Returns:   0 whether or not the install succeeded, so that set -e does
	#            not abort the script over an optional component
	#######################################
	optional_install() {
	  local failure_note=$1
	  shift
	  uv pip install --no-cache "$@" || echo "$failure_note"
	}

	# Flash-attention 2 (optional; fails on some older clusters / non-Ampere GPUs)
	if [[ "$INSTALL_FLASH_ATTN" -eq 1 ]]; then
	  echo
	  echo " installing flash-attn (skip with --no-flash-attn if it fails)..."
	  optional_install \
	    " flash-attn install failed (not fatal — Unsloth has its own kernels)" \
	    "flash-attn>=2.5" --no-build-isolation
	fi

	# DeepSpeed for multi-GPU GRPO (optional; only used when --multigpu N > 1)
	if [[ "$INSTALL_DEEPSPEED" -eq 1 ]]; then
	  echo
	  echo " installing deepspeed (only used for multi-GPU runs)..."
	  optional_install \
	    " deepspeed install failed (not fatal — single-GPU still works)" \
	    "deepspeed>=0.14"
	fi

	# ── 5. Sanity check ──────────────────────────────────────────────────────
	# Import each core package inside the activated venv and print its version.
	# An import failure is printed as FAILED but does not change the exit
	# status. The heredoc uses <<- so that tab-indented lines (including the
	# closing PY) are accepted; the Python code itself is indented with spaces,
	# which <<- leaves untouched.
	echo
	echo "[5/5] sanity check..."
	python - <<-'PY'
	import importlib
	mods = ["torch", "transformers", "trl", "peft", "datasets", "accelerate", "bitsandbytes", "huggingface_hub"]
	for m in mods:
	    try:
	        v = getattr(importlib.import_module(m), "__version__", "?")
	        print(f" {m:18s} {v}")
	    except Exception as e:
	        print(f" {m:18s} FAILED ({e})")
	# Unsloth is reported separately; any error while importing it is shown as
	# "not installed".
	try:
	    import unsloth
	    print(f" unsloth {unsloth.__version__}")
	except Exception:
	    print(" unsloth (not installed - single-GPU will fall back to plain HF)")
	PY

	# Create the logs/ and outputs/ directories (no error if they exist).
	mkdir -p logs outputs

	# Closing banner with the suggested next commands.
	closing_rule="===================================================================="
	printf '%s\n' \
	  "" \
	  "$closing_rule" \
	  " ENV READY" \
	  "$closing_rule" \
	  " next:" \
	  " export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" \
	  " bash scripts/hpc/run_all.sh" \
	  "$closing_rule"