Fix: install torchao 0.8.0 separately and install unsloth with --no-deps to avoid the torchao>=0.13 conflict
Browse files
- Dockerfile +15 -22
- requirements-training.txt +1 -9
Dockerfile
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
# Hugging Face Docker Space — OpenGrid
|
| 2 |
-
# Serves both the UI dashboard AND GRPO training.
|
| 3 |
# Set env OPENGRID_MODE=training for training mode.
|
| 4 |
|
| 5 |
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
|
|
@@ -8,7 +7,6 @@ LABEL org.opencontainers.image.title="OpenGrid"
|
|
| 8 |
LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
|
| 9 |
LABEL openenv="true"
|
| 10 |
|
| 11 |
-
# Install Python 3.10 and build tools (needed by Triton/Unsloth)
|
| 12 |
USER root
|
| 13 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 14 |
python3.10 python3-pip python3.10-venv python3-dev \
|
|
@@ -26,37 +24,32 @@ ENV PATH="/home/user/.local/bin:$PATH"
|
|
| 26 |
|
| 27 |
WORKDIR /app
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
# 1. Install server deps (cached across builds)
|
| 31 |
COPY --chown=user requirements.txt .
|
| 32 |
-
RUN pip install --no-cache-dir -
|
| 33 |
|
| 34 |
-
# 2.
|
| 35 |
-
RUN pip install --no-cache-dir
|
| 36 |
-
torch==2.6.0 \
|
| 37 |
-
--extra-index-url https://download.pytorch.org/whl/cu121
|
| 38 |
|
| 39 |
-
# 3.
|
|
|
|
|
|
|
|
|
|
| 40 |
COPY --chown=user requirements-training.txt .
|
| 41 |
-
RUN pip install --no-cache-dir -
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
-
# ---
|
| 44 |
COPY --chown=user src/ /app/src/
|
| 45 |
COPY --chown=user training/ /app/training/
|
| 46 |
-
|
| 47 |
-
COPY --chown=user app.py /app/
|
| 48 |
-
COPY --chown=user run_training.py /app/
|
| 49 |
-
COPY --chown=user inference.py /app/
|
| 50 |
-
COPY --chown=user entrypoint.sh /app/
|
| 51 |
-
|
| 52 |
COPY --chown=user static/ /app/static/
|
| 53 |
-
|
| 54 |
-
COPY --chown=user pyproject.toml /app/
|
| 55 |
-
COPY --chown=user openenv.yaml /app/
|
| 56 |
|
| 57 |
RUN chmod +x entrypoint.sh
|
| 58 |
|
| 59 |
-
# OPENGRID_MODE=training → GRPO pipeline, OPENGRID_MODE=server → UI (default)
|
| 60 |
EXPOSE 7860
|
| 61 |
|
| 62 |
HEALTHCHECK --interval=60s --timeout=10s --start-period=600s \
|
|
|
|
| 1 |
# Hugging Face Docker Space — OpenGrid
|
|
|
|
| 2 |
# Set env OPENGRID_MODE=training for training mode.
|
| 3 |
|
| 4 |
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
|
|
|
|
| 7 |
LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
|
| 8 |
LABEL openenv="true"
|
| 9 |
|
|
|
|
| 10 |
USER root
|
| 11 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 12 |
python3.10 python3-pip python3.10-venv python3-dev \
|
|
|
|
| 24 |
|
| 25 |
WORKDIR /app
|
| 26 |
|
| 27 |
+
# 1. Server deps
|
|
|
|
| 28 |
COPY --chown=user requirements.txt .
|
| 29 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 30 |
|
| 31 |
+
# 2. PyTorch 2.6.0 + CUDA 12.1
|
| 32 |
+
RUN pip install --no-cache-dir torch==2.6.0 --extra-index-url https://download.pytorch.org/whl/cu121
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
# 3. torchao 0.8.0 (compatible with torch 2.6, satisfies transformers import)
|
| 35 |
+
RUN pip install --no-cache-dir torchao==0.8.0
|
| 36 |
+
|
| 37 |
+
# 4. Training deps (no unsloth here)
|
| 38 |
COPY --chown=user requirements-training.txt .
|
| 39 |
+
RUN pip install --no-cache-dir -r requirements-training.txt
|
| 40 |
+
|
| 41 |
+
# 5. Unsloth installed with --no-deps (avoids pulling torchao>=0.13, which conflicts with the torchao 0.8.0 pin above)
|
| 42 |
+
RUN pip install --no-cache-dir --no-deps unsloth==2025.11.1 unsloth_zoo
|
| 43 |
|
| 44 |
+
# --- App code ---
|
| 45 |
COPY --chown=user src/ /app/src/
|
| 46 |
COPY --chown=user training/ /app/training/
|
| 47 |
+
COPY --chown=user app.py run_training.py inference.py entrypoint.sh /app/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
COPY --chown=user static/ /app/static/
|
| 49 |
+
COPY --chown=user pyproject.toml openenv.yaml /app/
|
|
|
|
|
|
|
| 50 |
|
| 51 |
RUN chmod +x entrypoint.sh
|
| 52 |
|
|
|
|
| 53 |
EXPOSE 7860
|
| 54 |
|
| 55 |
HEALTHCHECK --interval=60s --timeout=10s --start-period=600s \
|
requirements-training.txt
CHANGED
|
@@ -1,18 +1,10 @@
|
|
| 1 |
-
# Training
|
| 2 |
-
# -------------------------------------------------------------------
|
| 3 |
-
# Pin versions that are mutually compatible with:
|
| 4 |
-
# torch==2.6.0, unsloth==2025.11.1, CUDA 12.1
|
| 5 |
-
# -------------------------------------------------------------------
|
| 6 |
-
|
| 7 |
transformers>=4.51.3,<5.0
|
| 8 |
trl>=0.12.0,<1.0
|
| 9 |
peft>=0.13.0
|
| 10 |
accelerate>=1.0.0
|
| 11 |
datasets>=3.0.0
|
| 12 |
bitsandbytes
|
| 13 |
-
unsloth==2025.11.1
|
| 14 |
-
unsloth_zoo
|
| 15 |
-
torchao>=0.5.0,<0.9
|
| 16 |
|
| 17 |
# Shared with environment
|
| 18 |
fastapi
|
|
|
|
| 1 |
+
# Training deps (torch, torchao, unsloth, and unsloth_zoo are installed separately in the Dockerfile)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
transformers>=4.51.3,<5.0
|
| 3 |
trl>=0.12.0,<1.0
|
| 4 |
peft>=0.13.0
|
| 5 |
accelerate>=1.0.0
|
| 6 |
datasets>=3.0.0
|
| 7 |
bitsandbytes
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Shared with environment
|
| 10 |
fastapi
|