K446 committed on
Commit
b2a04c7
·
1 Parent(s): 1dfed79

fix: lean Dockerfile + remove unsloth from training deps

Browse files
Files changed (2) hide show
  1. Dockerfile +26 -16
  2. requirements-training.txt +10 -13
Dockerfile CHANGED
@@ -1,8 +1,6 @@
1
  # Hugging Face Docker Space — OpenGrid
2
- # Docs: https://huggingface.co/docs/hub/spaces-sdks-docker
3
- #
4
- # This Dockerfile serves both the UI Space and the Training Space.
5
- # Set OPENGRID_MODE=training to run GRPO training instead of the server.
6
 
7
  FROM python:3.10-slim
8
 
@@ -10,34 +8,46 @@ LABEL org.opencontainers.image.title="OpenGrid"
10
  LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
11
  LABEL openenv="true"
12
 
13
- # Create non-root user required by HF Spaces
14
  RUN useradd -m -u 1000 user
15
  USER user
16
  ENV PATH="/home/user/.local/bin:$PATH"
17
 
18
  WORKDIR /app
19
 
20
- # Install dependencies (both server and training)
 
21
  COPY --chown=user requirements.txt .
 
 
 
22
  COPY --chown=user requirements-training.txt .
23
- RUN pip install --no-cache-dir --upgrade -r requirements.txt \
24
- && pip install --no-cache-dir --upgrade -r requirements-training.txt
 
 
 
 
 
 
 
 
 
 
25
 
26
- # Copy application code
27
- COPY --chown=user . /app
 
 
 
 
28
 
29
- # Make entrypoint executable
30
  RUN chmod +x entrypoint.sh
31
 
32
- # Default to server mode (override with OPENGRID_MODE=training)
33
  ENV OPENGRID_MODE=server
34
-
35
- # Expose HF Spaces default port
36
  EXPOSE 7860
37
 
38
- # Healthcheck (only applies in server mode)
39
  HEALTHCHECK --interval=30s --timeout=5s --start-period=15s \
40
  CMD python -c "import httpx; httpx.get('http://localhost:7860/health').raise_for_status()" || exit 1
41
 
42
- # Entrypoint switches between server and training
43
  CMD ["./entrypoint.sh"]
 
1
  # Hugging Face Docker Space — OpenGrid
2
+ # Serves both the UI dashboard AND GRPO training.
3
+ # Set env OPENGRID_MODE=training for training mode.
 
 
4
 
5
  FROM python:3.10-slim
6
 
 
8
  LABEL org.opencontainers.image.description="Renewable energy grid load-balancing environment"
9
  LABEL openenv="true"
10
 
 
11
  RUN useradd -m -u 1000 user
12
  USER user
13
  ENV PATH="/home/user/.local/bin:$PATH"
14
 
15
  WORKDIR /app
16
 
17
+ # --- Dependencies ---
18
+ # Install server deps first (cached across builds)
19
  COPY --chown=user requirements.txt .
20
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
21
+
22
+ # Install training deps (re-runs when training reqs or any earlier layer, e.g. requirements.txt, changes)
23
  COPY --chown=user requirements-training.txt .
24
+ RUN pip install --no-cache-dir --upgrade -r requirements-training.txt
25
+
26
+ # --- Application code (selective COPY for lean images) ---
27
+ # Core Python modules
28
+ COPY --chown=user src/ /app/src/
29
+ COPY --chown=user training/ /app/training/
30
+
31
+ # Entry points
32
+ COPY --chown=user app.py /app/
33
+ COPY --chown=user run_training.py /app/
34
+ COPY --chown=user inference.py /app/
35
+ COPY --chown=user entrypoint.sh /app/
36
 
37
+ # Frontend (small, needed for server mode)
38
+ COPY --chown=user static/ /app/static/
39
+
40
+ # Config
41
+ COPY --chown=user pyproject.toml /app/
42
+ COPY --chown=user openenv.yaml /app/
43
 
 
44
  RUN chmod +x entrypoint.sh
45
 
46
+ # Default: server mode. Set OPENGRID_MODE=training for GRPO.
47
  ENV OPENGRID_MODE=server
 
 
48
  EXPOSE 7860
49
 
 
50
  HEALTHCHECK --interval=30s --timeout=5s --start-period=15s \
51
  CMD python -c "import httpx; httpx.get('http://localhost:7860/health').raise_for_status()" || exit 1
52
 
 
53
  CMD ["./entrypoint.sh"]
requirements-training.txt CHANGED
@@ -1,20 +1,17 @@
1
- # Core env
 
 
 
 
 
 
 
 
 
2
  fastapi
3
  uvicorn[standard]
4
  pydantic>=2.0
5
  numpy
6
  networkx
7
  matplotlib
8
- openai
9
  httpx
10
- openenv-core>=0.2.0
11
-
12
- # Training
13
- torch
14
- transformers
15
- trl>=0.17.0
16
- peft
17
- accelerate
18
- bitsandbytes
19
- datasets
20
- unsloth
 
1
+ # Training dependencies — unsloth intentionally omitted (causes 10+ min dependency resolution)
2
+ torch
3
+ transformers>=4.45.0,<5.0
4
+ trl>=0.12.0,<2.0
5
+ peft>=0.13.0
6
+ accelerate>=1.0.0
7
+ datasets>=3.0.0
8
+ bitsandbytes
9
+
10
+ # Shared with environment
11
  fastapi
12
  uvicorn[standard]
13
  pydantic>=2.0
14
  numpy
15
  networkx
16
  matplotlib
 
17
  httpx