K446 commited on
Commit
9b70933
·
1 Parent(s): 371b620

Pin compatible versions: torch 2.6.0 + torchao <0.9 + transformers <5.0

Browse files
Files changed (2) hide show
  1. Dockerfile +10 -13
  2. requirements-training.txt +8 -2
Dockerfile CHANGED
@@ -2,7 +2,7 @@
2
  # Serves both the UI dashboard AND GRPO training.
3
  # Set env OPENGRID_MODE=training for training mode.
4
 
5
- FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
6
 
7
  LABEL org.opencontainers.image.title="OpenGrid"
8
  LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
@@ -12,7 +12,7 @@ LABEL openenv="true"
12
  USER root
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
  python3.10 python3-pip python3.10-venv python3-dev \
15
- build-essential gcc g++ && \
16
  ln -sf /usr/bin/python3.10 /usr/bin/python && \
17
  ln -sf /usr/bin/pip3 /usr/bin/pip && \
18
  rm -rf /var/lib/apt/lists/*
@@ -27,39 +27,36 @@ ENV PATH="/home/user/.local/bin:$PATH"
27
  WORKDIR /app
28
 
29
  # --- Dependencies ---
30
- # Install server deps first (cached across builds)
31
  COPY --chown=user requirements.txt .
32
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
33
 
34
- # Install PyTorch (latest version to support torchao/torch.int1)
35
- RUN pip install --no-cache-dir torch
 
 
36
 
37
- # Install training deps (only re-runs if training reqs change)
38
  COPY --chown=user requirements-training.txt .
39
  RUN pip install --no-cache-dir --upgrade -r requirements-training.txt
40
 
41
- # --- Application code (selective COPY for lean images) ---
42
- # Core Python modules
43
  COPY --chown=user src/ /app/src/
44
  COPY --chown=user training/ /app/training/
45
 
46
- # Entry points
47
  COPY --chown=user app.py /app/
48
  COPY --chown=user run_training.py /app/
49
  COPY --chown=user inference.py /app/
50
  COPY --chown=user entrypoint.sh /app/
51
 
52
- # Frontend (small, needed for server mode)
53
  COPY --chown=user static/ /app/static/
54
 
55
- # Config
56
  COPY --chown=user pyproject.toml /app/
57
  COPY --chown=user openenv.yaml /app/
58
 
59
  RUN chmod +x entrypoint.sh
60
 
61
- # Default mode controlled by OPENGRID_MODE env var (set via HF secrets)
62
- # server = FastAPI UI, training = GRPO pipeline
63
  EXPOSE 7860
64
 
65
  HEALTHCHECK --interval=60s --timeout=10s --start-period=600s \
 
2
  # Serves both the UI dashboard AND GRPO training.
3
  # Set env OPENGRID_MODE=training for training mode.
4
 
5
+ FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
6
 
7
  LABEL org.opencontainers.image.title="OpenGrid"
8
  LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
 
12
  USER root
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
14
  python3.10 python3-pip python3.10-venv python3-dev \
15
+ build-essential gcc g++ git && \
16
  ln -sf /usr/bin/python3.10 /usr/bin/python && \
17
  ln -sf /usr/bin/pip3 /usr/bin/pip && \
18
  rm -rf /var/lib/apt/lists/*
 
27
  WORKDIR /app
28
 
29
  # --- Dependencies ---
30
+ # 1. Install server deps (cached across builds)
31
  COPY --chown=user requirements.txt .
32
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
33
 
34
+ # 2. Install PyTorch 2.6.0 with CUDA 12.1 (supports torch.int1 for torchao)
35
+ RUN pip install --no-cache-dir \
36
+ torch==2.6.0 \
37
+ --extra-index-url https://download.pytorch.org/whl/cu121
38
 
39
+ # 3. Install training deps with pinned compatible versions
40
  COPY --chown=user requirements-training.txt .
41
  RUN pip install --no-cache-dir --upgrade -r requirements-training.txt
42
 
43
+ # --- Application code ---
 
44
  COPY --chown=user src/ /app/src/
45
  COPY --chown=user training/ /app/training/
46
 
 
47
  COPY --chown=user app.py /app/
48
  COPY --chown=user run_training.py /app/
49
  COPY --chown=user inference.py /app/
50
  COPY --chown=user entrypoint.sh /app/
51
 
 
52
  COPY --chown=user static/ /app/static/
53
 
 
54
  COPY --chown=user pyproject.toml /app/
55
  COPY --chown=user openenv.yaml /app/
56
 
57
  RUN chmod +x entrypoint.sh
58
 
59
+ # OPENGRID_MODE=training GRPO pipeline, OPENGRID_MODE=server UI (default)
 
60
  EXPOSE 7860
61
 
62
  HEALTHCHECK --interval=60s --timeout=10s --start-period=600s \
requirements-training.txt CHANGED
@@ -1,5 +1,10 @@
1
- # Training dependencies (torch installed separately in Dockerfile with CUDA)
2
- transformers>=4.51.3
 
 
 
 
 
3
  trl>=0.12.0,<1.0
4
  peft>=0.13.0
5
  accelerate>=1.0.0
@@ -7,6 +12,7 @@ datasets>=3.0.0
7
  bitsandbytes
8
  unsloth==2025.11.1
9
  unsloth_zoo
 
10
 
11
  # Shared with environment
12
  fastapi
 
1
+ # Training dependencies (torch 2.6.0 installed separately in Dockerfile)
2
+ # -------------------------------------------------------------------
3
+ # Pin versions that are mutually compatible with:
4
+ # torch==2.6.0, unsloth==2025.11.1, CUDA 12.1
5
+ # -------------------------------------------------------------------
6
+
7
+ transformers>=4.51.3,<5.0
8
  trl>=0.12.0,<1.0
9
  peft>=0.13.0
10
  accelerate>=1.0.0
 
12
  bitsandbytes
13
  unsloth==2025.11.1
14
  unsloth_zoo
15
+ torchao>=0.5.0,<0.9
16
 
17
  # Shared with environment
18
  fastapi