K446 committed on
Commit
f4d773c
·
1 Parent(s): 9b70933

Fix: install torchao 0.8.0 separately and install unsloth with --no-deps to avoid the torchao>=0.13 dependency conflict

Browse files
Files changed (2) hide show
  1. Dockerfile +15 -22
  2. requirements-training.txt +1 -9
Dockerfile CHANGED
@@ -1,5 +1,4 @@
1
  # Hugging Face Docker Space — OpenGrid
2
- # Serves both the UI dashboard AND GRPO training.
3
  # Set env OPENGRID_MODE=training for training mode.
4
 
5
  FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
@@ -8,7 +7,6 @@ LABEL org.opencontainers.image.title="OpenGrid"
8
  LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
9
  LABEL openenv="true"
10
 
11
- # Install Python 3.10 and build tools (needed by Triton/Unsloth)
12
  USER root
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
  python3.10 python3-pip python3.10-venv python3-dev \
@@ -26,37 +24,32 @@ ENV PATH="/home/user/.local/bin:$PATH"
26
 
27
  WORKDIR /app
28
 
29
- # --- Dependencies ---
30
- # 1. Install server deps (cached across builds)
31
  COPY --chown=user requirements.txt .
32
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
33
 
34
- # 2. Install PyTorch 2.6.0 with CUDA 12.1 (supports torch.int1 for torchao)
35
- RUN pip install --no-cache-dir \
36
- torch==2.6.0 \
37
- --extra-index-url https://download.pytorch.org/whl/cu121
38
 
39
- # 3. Install training deps with pinned compatible versions
 
 
 
40
  COPY --chown=user requirements-training.txt .
41
- RUN pip install --no-cache-dir --upgrade -r requirements-training.txt
 
 
 
42
 
43
- # --- Application code ---
44
  COPY --chown=user src/ /app/src/
45
  COPY --chown=user training/ /app/training/
46
-
47
- COPY --chown=user app.py /app/
48
- COPY --chown=user run_training.py /app/
49
- COPY --chown=user inference.py /app/
50
- COPY --chown=user entrypoint.sh /app/
51
-
52
  COPY --chown=user static/ /app/static/
53
-
54
- COPY --chown=user pyproject.toml /app/
55
- COPY --chown=user openenv.yaml /app/
56
 
57
  RUN chmod +x entrypoint.sh
58
 
59
- # OPENGRID_MODE=training → GRPO pipeline, OPENGRID_MODE=server → UI (default)
60
  EXPOSE 7860
61
 
62
  HEALTHCHECK --interval=60s --timeout=10s --start-period=600s \
 
1
  # Hugging Face Docker Space — OpenGrid
 
2
  # Set env OPENGRID_MODE=training for training mode.
3
 
4
  FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
 
7
  LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
8
  LABEL openenv="true"
9
 
 
10
  USER root
11
  RUN apt-get update && apt-get install -y --no-install-recommends \
12
  python3.10 python3-pip python3.10-venv python3-dev \
 
24
 
25
  WORKDIR /app
26
 
27
+ # 1. Server deps
 
28
  COPY --chown=user requirements.txt .
29
+ RUN pip install --no-cache-dir -r requirements.txt
30
 
31
+ # 2. PyTorch 2.6.0 + CUDA 12.1
32
+ RUN pip install --no-cache-dir torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu121
 
 
33
 
34
+ # 3. torchao 0.8.0 (compatible with torch 2.6, satisfies transformers import)
35
+ RUN pip install --no-cache-dir torchao==0.8.0
36
+
37
+ # 4. Training deps (no unsloth here)
38
  COPY --chown=user requirements-training.txt .
39
+ RUN pip install --no-cache-dir -r requirements-training.txt
40
+
41
+ # 5. Unsloth --no-deps (avoids torchao>=0.13 conflict)
42
+ RUN pip install --no-cache-dir --no-deps unsloth==2025.11.1 unsloth_zoo
43
 
44
+ # --- App code ---
45
  COPY --chown=user src/ /app/src/
46
  COPY --chown=user training/ /app/training/
47
+ COPY --chown=user app.py run_training.py inference.py entrypoint.sh /app/
 
 
 
 
 
48
  COPY --chown=user static/ /app/static/
49
+ COPY --chown=user pyproject.toml openenv.yaml /app/
 
 
50
 
51
  RUN chmod +x entrypoint.sh
52
 
 
53
  EXPOSE 7860
54
 
55
  HEALTHCHECK --interval=60s --timeout=10s --start-period=600s \
requirements-training.txt CHANGED
@@ -1,18 +1,10 @@
1
- # Training dependencies (torch 2.6.0 installed separately in Dockerfile)
2
- # -------------------------------------------------------------------
3
- # Pin versions that are mutually compatible with:
4
- # torch==2.6.0, unsloth==2025.11.1, CUDA 12.1
5
- # -------------------------------------------------------------------
6
-
7
  transformers>=4.51.3,<5.0
8
  trl>=0.12.0,<1.0
9
  peft>=0.13.0
10
  accelerate>=1.0.0
11
  datasets>=3.0.0
12
  bitsandbytes
13
- unsloth==2025.11.1
14
- unsloth_zoo
15
- torchao>=0.5.0,<0.9
16
 
17
  # Shared with environment
18
  fastapi
 
1
+ # Training deps (torch, torchao, unsloth installed separately in Dockerfile)
 
 
 
 
 
2
  transformers>=4.51.3,<5.0
3
  trl>=0.12.0,<1.0
4
  peft>=0.13.0
5
  accelerate>=1.0.0
6
  datasets>=3.0.0
7
  bitsandbytes
 
 
 
8
 
9
  # Shared with environment
10
  fastapi