#!/usr/bin/env bash
# ImmunoOrg HPC env setup (4-stage pipeline edition)
# ===================================================
#
# Run ONCE on the *login node* (not inside a SLURM job).
# Idempotent: re-running just verifies the env exists.
#
# Installs:
#   - uv (single-binary Python package manager, no conda needed)
#   - Python 3.11 venv at .venv-hpc/
#   - PyTorch 2.4 + CUDA 12 wheels (broad cluster compat)
#   - TRL >= 0.15, transformers >= 4.45, peft, accelerate, datasets
#   - Unsloth (single-GPU 2-3x speedup for <13B)
#   - bitsandbytes (4-bit quantisation), safetensors, sentencepiece
#   - matplotlib, pyyaml, networkx, pydantic, fastapi (for env package)
#
# Optional flags:
#   --no-flash-attn   skip flash-attention install (some clusters lack the headers)
#   --no-deepspeed    skip deepspeed install (only matters for >2x GPU runs)

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$REPO_ROOT"

INSTALL_FLASH_ATTN=1
INSTALL_DEEPSPEED=1
while [[ $# -gt 0 ]]; do
    case "$1" in
        --no-flash-attn) INSTALL_FLASH_ATTN=0; shift ;;
        --no-deepspeed)  INSTALL_DEEPSPEED=0; shift ;;
        -h|--help)       grep '^# ' "$0" | sed 's/^# //'; exit 0 ;;
        *) echo "unknown flag: $1"; exit 1 ;;
    esac
done

echo "===================================================================="
echo "  ImmunoOrg 2.0 HPC env setup (4-stage pipeline edition)"
echo "  Repo: $REPO_ROOT"
echo "===================================================================="

# ── 1. Install uv if missing (no sudo, single binary) ────────────────────
if ! command -v uv >/dev/null 2>&1; then
    echo
    echo "[1/5] installing uv..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
fi
echo "[1/5] uv: $(uv --version)"

# ── 2. Try to load CUDA + GCC modules if Lmod is present ────────────────
echo
echo "[2/5] looking for CUDA / GCC modules..."
if command -v module >/dev/null 2>&1; then
    module purge 2>/dev/null || true
    for mod in cuda/12.4 cuda/12.1 cuda/12.0 cuda/11.8 cuda CUDA; do
        if module load "$mod" 2>/dev/null; then echo "    loaded: $mod"; break; fi
    done
    for mod in gcc/11 gcc/10 gcc; do
        if module load "$mod" 2>/dev/null; then echo "    loaded: $mod"; break; fi
    done
    nvcc --version 2>/dev/null || echo "    (no nvcc on login node - GPU node will have it)"
else
    echo "    (no Lmod - assuming system CUDA / GCC)"
fi

# ── 3. Create venv ───────────────────────────────────────────────────────
echo
echo "[3/5] creating venv at .venv-hpc with Python 3.11..."
if [ ! -d ".venv-hpc" ]; then
    uv venv --python 3.11 .venv-hpc
fi
# shellcheck disable=SC1091
source .venv-hpc/bin/activate
python -V

# ── 4. Install training stack ────────────────────────────────────────────
echo
echo "[4/5] installing GRPO / SFT training stack (~5 min)..."
uv pip install --upgrade pip wheel setuptools

# Pinned baseline
uv pip install --no-cache \
    "torch==2.4.*" \
    "transformers>=4.45,<5.0" \
    "trl>=0.15.0,<1.0" \
    "datasets>=2.19" \
    "accelerate>=0.30" \
    "peft>=0.11" \
    "bitsandbytes>=0.43" \
    "sentencepiece" \
    "safetensors" \
    "huggingface_hub>=0.24" \
    "matplotlib>=3.7" \
    "numpy<2.0" \
    "pandas" \
    "fastapi" \
    "uvicorn[standard]" \
    "pydantic>=2.6" \
    "networkx>=3.2" \
    "pyyaml" \
    "rich"

# Unsloth (single-GPU speedup; pulls in xformers / triton matching torch)
uv pip install --no-cache "unsloth"

# Flash-attention 2 (optional; fails on some older clusters / non-Ampere GPUs)
if [ "$INSTALL_FLASH_ATTN" -eq 1 ]; then
    echo
    echo "    installing flash-attn (skip with --no-flash-attn if it fails)..."
    uv pip install --no-cache "flash-attn>=2.5" --no-build-isolation || \
        echo "    flash-attn install failed (not fatal — Unsloth has its own kernels)"
fi

# DeepSpeed for multi-GPU GRPO (optional; only used when --multigpu N > 1)
if [ "$INSTALL_DEEPSPEED" -eq 1 ]; then
    echo
    echo "    installing deepspeed (only used for multi-GPU runs)..."
    uv pip install --no-cache "deepspeed>=0.14" || \
        echo "    deepspeed install failed (not fatal — single-GPU still works)"
fi

# ── 5. Sanity check ──────────────────────────────────────────────────────
echo
echo "[5/5] sanity check..."
python - <<'PY'
import importlib
mods = ["torch", "transformers", "trl", "peft", "datasets", "accelerate", "bitsandbytes", "huggingface_hub"]
for m in mods:
    try:
        v = getattr(importlib.import_module(m), "__version__", "?")
        print(f"  {m:18s} {v}")
    except Exception as e:
        print(f"  {m:18s} FAILED ({e})")
try:
    import unsloth
    print(f"  unsloth            {unsloth.__version__}")
except Exception:
    print("  unsloth            (not installed - single-GPU will fall back to plain HF)")
PY

mkdir -p logs outputs

echo
echo "===================================================================="
echo "  ENV READY"
echo "===================================================================="
echo "  next:"
echo "    export HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
echo "    bash scripts/hpc/run_all.sh"
echo "===================================================================="