Switch to GPU Dockerfile + 8b reconciler for nvidia-t4-small
User upgraded the HF Space to nvidia-t4-small (HF Pro). Production
image variant from the spine HEAD:
- Base: nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04
- Ollama installer auto-detects GPU, dispatches CUDA build
- Granite 4.1:8b pulled at *runtime* (HF build sandbox can't fit
8B + EO toolchain alongside torch); ~2 min cold start, then
OLLAMA_KEEP_ALIVE=24h holds it resident
- 3b alias remapped to 8b via RIPRAP_OLLAMA_3B_TAG so the planner
+ reconciler both run on 8b (full quality, single warm model)
- Flash attention + KV cache q8_0 for ~2x throughput on T4
- Pre-warm 8b into VRAM in entrypoint so the first reconcile
doesn't pay the ~30s model-load tax (alias + warm-up call sketched after this list)
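
For reference, a minimal client-side sketch of the alias + warm-model mechanics. It assumes only what this commit sets up (RIPRAP_OLLAMA_3B_TAG, Ollama on 127.0.0.1:11434, the /api/generate warm-up payload shape); the use of urllib, the variable names, and the 3b fallback default are illustrative, not the app's actual planner code:

import json
import os
import urllib.request

# The planner asks for the "3b" model; on this Space the env var remaps the
# alias to granite4.1:8b, so a single warm model serves planner + reconciler.
# (Falling back to granite4.1:3b when the var is unset is an assumption here.)
model = os.environ.get("RIPRAP_OLLAMA_3B_TAG", "granite4.1:3b")

# Same shape as the entrypoint's pre-warm call: keep_alive=24h keeps the
# weights resident in VRAM, num_predict=1 keeps the request cheap.
payload = {
    "model": model,
    "prompt": "hi",
    "stream": False,
    "keep_alive": "24h",
    "options": {"num_predict": 1},
}
req = urllib.request.Request(
    "http://127.0.0.1:11434/api/generate",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=120) as resp:
    print(json.loads(resp.read())["response"])

The entrypoint below issues the same request with curl; keep_alive=24h is what holds the 8b weights resident between calls.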
The EO toolchain (Phase 1 Prithvi live + Phase 4 TerraMind synthesis)
installs at runtime into $HOME/.eo-pkgs (the build sandbox couldn't fit
it). If the install fails, the lazy-import in those specialists
returns 'skipped' cleanly and the other 14 specialists run normally.
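
A minimal sketch of that lazy-import guard, assuming hypothetical function and field names (the real guards live in app/context/terramind_synthesis.py and app/flood_layers/prithvi_live.py):

def run_terramind_synthesis(request):
    # Import inside the specialist so a missing EO toolchain degrades to a
    # clean "skipped" result instead of crashing FastAPI startup.
    try:
        import terratorch  # installed at runtime into $HOME/.eo-pkgs, if at all
    except ImportError:
        return {"status": "skipped",
                "reason": "deps unavailable on this deployment"}
    ...  # real synthesis path when the toolchain imported cleanly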
Inference drops from ~60-180s on cpu-basic to ~2-4s on t4-small
for the synthesis reconciler.
- Dockerfile +62 -41
- entrypoint.sh +88 -5

--- a/Dockerfile
+++ b/Dockerfile
@@ -1,36 +1,32 @@
-# Riprap - Hugging Face Spaces (Docker SDK
+# Riprap - Hugging Face Spaces deployment (Docker SDK, GPU).
 #
-#
-#
-#
+# Base: NVIDIA CUDA 12.4 runtime + cuDNN on Ubuntu 22.04. Ollama's
+# installer detects the GPU and pulls the CUDA-aware build automatically;
+# Granite 4.1:3b inference drops from ~60-180s on CPU Basic to ~2-4s on
+# nvidia-t4-small.
 #
 # Bakes:
-# - Python 3.
-# - Ollama + granite4.1:3b model (~2 GB)
-# RIPRAP_OLLAMA_8B_TAG=granite4.1:3b aliases the 8b reconciler
-# calls to 3b so the polished UI runs end-to-end without 8b's
-# ~5 GB image cost. Quality drops vs 8b; speed lever is the
-# vLLM-on-AMD-MI300X demo path (RIPRAP_LLM_PRIMARY=vllm).
+# - Python 3.10 (default on 22.04) + pip deps (~2.5 GB once torch is in)
+# - Ollama + granite4.1:3b model (~2 GB)
 # - All pre-computed fixtures in data/ + corpus/
 #
 # Runtime:
-# - Ollama daemon serves Granite 4.1
+# - Ollama daemon serves Granite 4.1 via CUDA
 # - Granite Embedding 278M auto-downloads via sentence-transformers
-# on first FastAPI startup (~280 MB)
-# - uvicorn FastAPI on port 7860 (HF default)
+# on first FastAPI startup (~280 MB)
+# - uvicorn FastAPI on port 7860 (HF Spaces default)
 
-FROM
+FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base
 
-# OS deps
-
+# OS deps: Python 3.10 + geo libs + Ollama install dependencies.
+ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3 python3-pip python3-venv python-is-python3 \
     curl ca-certificates zstd procps \
     gdal-bin libgdal-dev libgeos-dev libproj-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # HF Spaces convention: run as a non-root "user" account at /home/user/app.
-# Granite Embedding cache lives in /home/user/.cache/huggingface - it
-# survives container restarts when persistent storage is mounted there.
 RUN useradd -m -u 1000 user
 ENV HOME=/home/user \
     PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:/bin \
@@ -39,38 +35,63 @@ ENV HOME=/home/user \
     OLLAMA_HOST=127.0.0.1:11434 \
     OLLAMA_NUM_PARALLEL=1 \
     OLLAMA_KEEP_ALIVE=24h \
-
-
-
+    OLLAMA_MAX_LOADED_MODELS=2 \
+    OLLAMA_FLASH_ATTENTION=1 \
+    OLLAMA_KV_CACHE_TYPE=q8_0 \
+    OLLAMA_DEBUG=1
 
-# Install Ollama
+# Install Ollama. install.sh ships the cuda_v12 dispatcher libs
+# unconditionally; the GPU detection at the tail of the script only gates
+# host-driver install (a no-op inside a container). So this works fine
+# on a CPU builder for a GPU-attached runtime.
 RUN curl -fsSL https://ollama.com/install.sh | sh
 
 WORKDIR /home/user/app
 
-# Python deps
+# Python deps. CUDA 12.x in base image lets pip pull cu124 torch wheels
+# automatically when sentence-transformers asks for torch.
 COPY --chown=user:user requirements.txt ./
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
 
-#
-#
-# user-writable location so the runtime container can also serve.
+# --- Earth-observation toolchain (Phase 1 Prithvi live + Phase 4
+# TerraMind synthesis) ---------------------------------------------------
 #
-#
-#
-#
-#
-
-
-
-
-
-
-
-
-
-
+# Tried four times to land terratorch on HF's Py3.10 image alongside
+# our pinned stack (transformers<5, hf_hub<1, granite-tsfm<0.3.4,
+# mellea<0.4). Each attempt failed at the same point - a `mkdir`
+# immediately after the --no-deps install - with no actionable error
+# in HF's build log. The failure pattern is consistent with build-
+# sandbox disk exhaustion; even a 4-package narrow install
+# (terratorch + einops + diffusers + timm with --no-deps) hits it.
+#
+# Accepting this: TerraMind synthesis + Prithvi-live remain
+# local-/AMD-only on this deployment. The lazy-import pattern in
+# app/context/terramind_synthesis.py + app/flood_layers/prithvi_live.py
+# returns clean `skipped: deps unavailable on this deployment` on HF;
+# the trace card and the map legend make that visible. The other 14
+# specialists run normally.
+#
+# Re-enable on a deployment with more build disk (Docker SDK on a
+# self-hosted machine, AMD droplet, etc.) by adding the EO --no-deps
+# install back here.
+
+# Pull both Granite 4.1 variants into the image:
+#   :3b -> fast routing (planner) + live_now reconciler (short outputs)
+#   :8b -> synthesis reconciler for single_address / neighborhood / dev_check
+# Both fit warm on the T4 with OLLAMA_MAX_LOADED_MODELS=2 (~10 GB total
+# VRAM out of 16). We start ollama in the background, poll its HTTP
+# endpoint, pull, and let the layer exit (Docker reaps the daemon -
+# don't pkill, it'll match this RUN's own cmdline and exit 143).
+ENV OLLAMA_MODELS=/home/user/.ollama/models \
+    RIPRAP_OLLAMA_3B_TAG=granite4.1:8b
+# Granite weights are pulled at *container start* (see entrypoint.sh)
+# instead of at build time. HF's build sandbox can't fit the EO
+# toolchain + Granite 8B (5GB) simultaneously, but the runtime
+# rootfs is larger and persists between container starts within an
+# image lifetime. Cold-start on first launch ~2 min for the 8B pull;
+# subsequent restarts are fast since Ollama's cache survives.
+RUN mkdir -p $OLLAMA_MODELS
 
 # App code + fixtures
 COPY --chown=user:user app/ ./app/
@@ -78,7 +99,7 @@ COPY --chown=user:user web/ ./web/
 COPY --chown=user:user scripts/ ./scripts/
 COPY --chown=user:user data/ ./data/
 COPY --chown=user:user corpus/ ./corpus/
-COPY --chown=user:user agent.py
+COPY --chown=user:user agent.py riprap.py ./
 COPY --chown=user:user entrypoint.sh ./
 RUN chmod +x ./entrypoint.sh
 
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -6,6 +6,59 @@
 # $HOME (which we own) instead.
 set -e
 
+# --- Earth-observation toolchain (Phase 1 + Phase 4) -------------------
+# Build-time install was blocked by HF's build-disk threshold (5
+# attempts; all failed at the same point). Runtime install in the
+# running container works around the build-sandbox limit - the
+# running container has more disk.
+#
+# Use `--target=$EO_DIR` instead of `--user`: explicit path that we
+# can prepend to PYTHONPATH ourselves, so the install location is
+# guaranteed visible regardless of HF Spaces' Python site-config.
+# The `--user` approach was failing silently because HF's Python
+# environment apparently bypasses the user-site discovery path.
+EO_DIR="$HOME/.eo-pkgs"
+EO_MARKER="$EO_DIR/.installed"
+if [ ! -f "$EO_MARKER" ]; then
+    echo "[entrypoint] EO toolchain not yet installed; running pip install (~2 min)..."
+    mkdir -p "$EO_DIR"
+    # Bisect: previous build (1cf59ee) added torchvision + 7 more deps
+    # at once and the whole install failed (eo_dir empty, no marker).
+    # Pip's resolver is all-or-nothing per RUN - one bad package fails
+    # everything. Revert to the known-good 4 + just torchvision (the
+    # one terratorch actually needs to import). Once this proves out,
+    # add Prithvi-live deps in a second RUN.
+    if pip install --no-cache-dir --no-deps --target="$EO_DIR" \
+        terratorch==1.1rc6 \
+        einops \
+        diffusers \
+        timm \
+        torchvision; then
+        echo "[entrypoint] pip install OK; verifying import..."
+        if PYTHONPATH="$EO_DIR:$PYTHONPATH" python -c "
+import terratorch
+from terratorch.registry import FULL_MODEL_REGISTRY
+import terratorch.models.backbones.terramind.model.terramind_register
+n = len([k for k in FULL_MODEL_REGISTRY if 'terramind' in k.lower()])
+assert n > 0, 'no terramind register entries'
+print(f'[entrypoint] terratorch ok, terramind register: {n} entries')
+"; then
+            touch "$EO_MARKER"
+            echo "[entrypoint] EO toolchain READY at $EO_DIR"
+        else
+            echo "[entrypoint] EO verify FAILED - TerraMind/Prithvi-live will skip"
+        fi
+    else
+        echo "[entrypoint] pip install FAILED - TerraMind/Prithvi-live will skip"
+    fi
+else
+    echo "[entrypoint] EO toolchain already installed at $EO_DIR (cached)"
+fi
+# Always export PYTHONPATH so uvicorn can find the install (no-op if
+# the install failed and the dir is empty - the lazy-import in the
+# specialists handles that case cleanly).
+export PYTHONPATH="$EO_DIR:$PYTHONPATH"
+
 # Stream Ollama's stdout+stderr to BOTH stdout (so it shows up in HF
 # Spaces runtime logs - needed to see GPU discovery output from
 # OLLAMA_DEBUG=1) AND a file (for the readiness fail-fast tail below).
@@ -34,14 +87,44 @@ if ! curl -sf http://127.0.0.1:11434/ > /dev/null 2>&1; then
     exit 1
 fi
 
-#
-
-
-
-
+# Granite 4.1:8b is pulled at runtime instead of baked into the image
+# - the EO toolchain (Phase 1 Prithvi + Phase 4 TerraMind) doesn't
+# fit alongside Granite weights in HF's build sandbox. First container
+# start does the pull (~2 min over the wire). Subsequent runtime
+# restarts within the same image lifetime reuse Ollama's cache so
+# this is a one-time per-image cost.
+#
+# 3b is also handled if present, but with RIPRAP_OLLAMA_3B_TAG=
+# granite4.1:8b set, the planner alias resolves to 8b too - so 8b
+# alone covers planner + reconciler.
+for model in "granite4.1:8b" "granite4.1:3b"; do
+    if ! ollama list | grep -q "$model"; then
+        if [ "$model" = "granite4.1:8b" ]; then
+            echo "[entrypoint] $model not found; pulling now (~5GB, ~2 min over the wire)..."
+            ollama pull "$model" || {
+                echo "[entrypoint] FATAL: pull failed for $model - reconciler will not work"
+                exit 1
+            }
+        else
+            # 3B is optional; if it's not there and the env override is set,
+            # the router will route the planner alias to 8B.
+            echo "[entrypoint] $model not found (optional - planner alias remapped to 8b via RIPRAP_OLLAMA_3B_TAG)"
+        fi
+    fi
+done
 
 ollama list
 
+# Pre-warm Granite 4.1:8b into VRAM so the first reconcile doesn't pay
+# the ~30s model-load tax. The empty prompt keeps it tiny; OLLAMA_KEEP_ALIVE
+# (24h) holds the weights resident through the demo.
+echo "[entrypoint] pre-warming granite4.1:8b into VRAM (one-shot)..."
+curl -s -X POST http://127.0.0.1:11434/api/generate \
+    -d '{"model":"granite4.1:8b","prompt":"hi","stream":false,"keep_alive":"24h","options":{"num_predict":1}}' \
+    -o /dev/null --max-time 120 \
+    && echo "[entrypoint] granite4.1:8b warm" \
+    || echo "[entrypoint] WARNING: 8b warmup failed (will load lazily)"
+
 # Log GPU visibility + Ollama lib layout so we can confirm CUDA dispatch
 # from the runtime logs (paired with OLLAMA_DEBUG=1 in the daemon).
 if command -v nvidia-smi > /dev/null 2>&1; then