jampuramprem committed
Commit a4b12d5 · 1 Parent(s): ec4ae03

Fix Space Dockerfile and add requirements.space.txt

Files changed (3)
  1. Dockerfile +21 -118
  2. README.md +1 -1
  3. requirements.space.txt +3 -0
Dockerfile CHANGED
@@ -1,130 +1,33 @@
- # AxiomForgeAI — GRPO Training Image
+ # AxiomForgeAI — Hugging Face Docker Space
  # ─────────────────────────────────────────────────────────────────────────────
- # Hardware target : 1× A100 PCIE 80 GB | AMD EPYC 7V13 | NVMe 300 GB
+ # CPU-only build that exposes the OpenEnv math RL environment server.
+ # Listens on port 7860 (required by Hugging Face Spaces).
  #
- # CUDA driver  : >= 13.0 (enforced at container start via entrypoint)
- # CUDA toolkit : 12.4.1 (backward-compatible with driver 13.x)
- # PyTorch      : 2.5.1+cu124 (pinned in requirements.txt)
- # Flash-Attn   : 2.8.3 (pinned in requirements.txt)
- #
- # All Python package versions are taken exclusively from requirements.txt.
- # No versions are hard-coded in this file.
- #
- # ── Build ─────────────────────────────────────────────────────────────────────
- # docker build -t axiomforgeai-train:latest .
- #
- # ── Interactive shell ─────────────────────────────────────────────────────────
- # docker run --gpus all --ipc=host --ulimit memlock=-1 \
- #   -v $(pwd)/data:/workspace/data \
- #   -v $(pwd)/checkpoints:/workspace/checkpoints \
- #   -v $(pwd)/logs:/workspace/logs \
- #   -it axiomforgeai-train:latest bash
- #
- # ── GRPO training (one-shot) ──────────────────────────────────────────────────
- # docker run --gpus all --ipc=host --ulimit memlock=-1 \
- #   -v $(pwd)/data:/workspace/data \
- #   -v $(pwd)/checkpoints:/workspace/checkpoints \
- #   -v $(pwd)/logs:/workspace/logs \
- #   axiomforgeai-train:latest \
- #   python scripts/run_grpo_training.py \
- #     --base-model checkpoints/dual_task_v1 \
- #     --gsm8k-data data/sft/gsm8k_sft.jsonl \
- #     --num-iterations 30 --group-size 8 --questions-per-iter 16
+ # The heavy ML stack (torch, PRM, flash-attn) is NOT installed here.
+ # The environment falls back to SymPy-only scoring, which works on CPU.
  # ─────────────────────────────────────────────────────────────────────────────

- # CUDA toolkit 12.4.1 — matches the cu124 wheels in requirements.txt and is
- # fully compatible with the A100's CUDA 13.2 driver (driver is always ≥ toolkit).
- FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
-
- LABEL org.opencontainers.image.title="AxiomForgeAI Training" \
-       cuda.driver.minimum="13.0" \
-       cuda.toolkit="12.4.1" \
-       torch.version="2.5.1+cu124" \
-       flash_attn.version="2.8.3"
-
- # ── System packages ────────────────────────────────────────────────────────────
- ENV DEBIAN_FRONTEND=noninteractive
- RUN apt-get update && apt-get install -y --no-install-recommends \
-     python3.11 \
-     python3.11-dev \
-     python3-pip \
-     python3.11-venv \
-     git \
-     git-lfs \
-     curl \
-     wget \
-     build-essential \
-     ninja-build \
-     pkg-config \
-     libssl-dev \
-     libffi-dev \
-     ca-certificates \
-     && ln -sf /usr/bin/python3.11 /usr/bin/python3 \
-     && ln -sf /usr/bin/python3 /usr/bin/python \
-     && rm -rf /var/lib/apt/lists/*
-
- # ── Upgrade pip + build tooling ───────────────────────────────────────────────
- RUN python -m pip install --upgrade --no-cache-dir pip setuptools wheel
+ FROM python:3.11-slim

- # ── PyTorch (CUDA 12.4 wheels) ────────────────────────────────────────────────
- # Must be installed before flash-attn because flash-attn runs a torch version
- # check at install time. The cu124 index is also used for all CUDA-linked wheels.
- # Version is taken from requirements.txt — the --constraint flag keeps pip from
- # re-resolving to a different version when requirements.txt is processed next.
- RUN pip install --no-cache-dir \
-     --extra-index-url https://download.pytorch.org/whl/cu124 \
-     "torch==2.5.1" "torchvision==0.20.1" "torchaudio==2.5.1"
+ # HF requires a non-root user
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"

- # ── All remaining pinned requirements (from requirements.txt) ─────────────────
- # flash-attn, xformers, vllm, triton, bitsandbytes, transformers, accelerate,
- # peft, ray, sympy, scipy, numpy, openenv-core, fastapi, uvicorn, … are all
- # installed here at the exact versions pinned in requirements.txt.
- # The cu124 index is provided so CUDA-linked wheels resolve correctly.
- COPY requirements.txt /tmp/requirements.txt
- RUN pip install --no-cache-dir \
-     --extra-index-url https://download.pytorch.org/whl/cu124 \
-     -r /tmp/requirements.txt
+ WORKDIR /app

- # ── Project source ────────────────────────────────────────────────────────────
- WORKDIR /workspace
- COPY . /workspace/
+ # Minimal runtime dependencies only (no torch/cuda/flash-attn)
+ COPY --chown=user requirements.space.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt

- # ── Environment variables ─────────────────────────────────────────────────────
- # Repo root on PYTHONPATH so `from src.rl.X import Y` works without editable install
- ENV PYTHONPATH="/workspace:$PYTHONPATH"
+ # Copy project source
+ COPY --chown=user . /app

- # HuggingFace model cache — mount a host path here to persist model downloads:
- #   -v /host/hf_cache:/workspace/.hf_cache
- ENV HF_HOME="/workspace/.hf_cache"
- ENV TRANSFORMERS_CACHE="/workspace/.hf_cache"
+ ENV PYTHONPATH="/app:$PYTHONPATH"

- # A100 CUDA / NCCL tuning
- ENV CUDA_DEVICE_MAX_CONNECTIONS=1
- ENV NCCL_P2P_DISABLE=0
- ENV NCCL_IB_DISABLE=0
- # Required for Flash-Attn 2 with bfloat16 on Ampere
- ENV TORCH_CUDNN_V8_API_ENABLED=1
+ # Disable HF telemetry inside the Space
+ ENV HF_HUB_DISABLE_TELEMETRY=1

- # ── Runtime entrypoint: enforce CUDA driver >= 13.0 ──────────────────────────
- # nvidia-smi is injected at runtime via --gpus, so this check runs when the
- # container starts, not at build time.
- RUN printf '%s\n' \
-     '#!/bin/sh' \
-     'if command -v nvidia-smi >/dev/null 2>&1; then' \
-     '  CUDA_VER=$(nvidia-smi 2>/dev/null | grep -oP "CUDA Version: \K[0-9.]+" || echo "0.0")' \
-     '  MAJOR=$(echo "$CUDA_VER" | cut -d. -f1)' \
-     '  echo "[AxiomForgeAI] CUDA driver reports toolkit: $CUDA_VER"' \
-     '  if [ "${MAJOR:-0}" -lt 13 ] 2>/dev/null; then' \
-     '    echo "[ERROR] CUDA driver >= 13.0 required; detected $CUDA_VER. Upgrade your NVIDIA driver."' \
-     '    exit 1' \
-     '  fi' \
-     '  echo "[AxiomForgeAI] CUDA $CUDA_VER >= 13.0 — OK"' \
-     'else' \
-     '  echo "[WARNING] nvidia-smi not found — CUDA driver version check skipped."' \
-     'fi' \
-     'exec "$@"' \
-     > /usr/local/bin/entrypoint.sh \
-     && chmod +x /usr/local/bin/entrypoint.sh
+ EXPOSE 7860

- ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
- CMD ["bash"]
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
  colorTo: pink
  sdk: docker
  pinned: false
- app_port: 8000
+ app_port: 7860
  base_path: /web
  tags:
  - openenv
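
With app_port now matching the port uvicorn binds to in the Dockerfile, the container can be smoke-tested locally before pushing. A hedged sketch follows; it probes FastAPI's auto-generated /docs page because the environment server's own routes are not shown in this diff, so substitute a real route if /docs is disabled.

# Hypothetical local smoke test. Assumes the image was built and started with
# something like:
#   docker build -t axiomforge-space . && docker run -p 7860:7860 axiomforge-space
import urllib.request

with urllib.request.urlopen("http://localhost:7860/docs", timeout=10) as resp:
    print(resp.status)  # 200 means uvicorn is answering on the Space port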
requirements.space.txt ADDED
@@ -0,0 +1,3 @@
+ openenv[core]>=0.2.0
+ fastapi>=0.115.0
+ uvicorn>=0.24.0