# Riprap — Hugging Face Spaces deployment (Docker SDK, GPU).
#
# Base: NVIDIA CUDA 12.4 runtime + cuDNN on Ubuntu 22.04. Ollama's
# installer ships its CUDA-aware build unconditionally (see the install
# note below), so the daemon uses the GPU at runtime; Granite 4.1:3b
# inference drops from ~60-180s on CPU Basic to ~2-4s on nvidia-t4-small.
#
# Bakes:
#   - Python 3.10 (default on 22.04) + pip deps (~2.5 GB once torch is in)
#   - Ollama itself (Granite weights are pulled at container start; see
#     the entrypoint notes below)
#   - All pre-computed fixtures in data/ + corpus/
#
# Runtime:
#   - Ollama daemon serves Granite 4.1 (:3b and :8b) via CUDA
#   - Granite Embedding 278M auto-downloads via sentence-transformers
#     on first FastAPI startup (~280 MB)
#   - uvicorn FastAPI on port 7860 (HF Spaces default)
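#
# Quick smoke test once the Space is up (a sketch; run it from inside
# the container since Ollama binds 127.0.0.1, and the app's own routes
# aren't enumerated here):
#   curl -s http://127.0.0.1:11434/api/tags    # daemon up, models listed
#   curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:7860/   # FastAPI answering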

FROM nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04 AS base

# OS deps: Python 3.10 + geo libs + Ollama install dependencies.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3 python3-pip python3-venv python-is-python3 \
        curl ca-certificates zstd procps \
        gdal-bin libgdal-dev libgeos-dev libproj-dev \
    && rm -rf /var/lib/apt/lists/*

# HF Spaces convention: run as a non-root "user" account at /home/user/app.
RUN useradd -m -u 1000 user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:/usr/local/bin:/usr/bin:/bin \
    PYTHONUNBUFFERED=1 \
    HF_HOME=/home/user/.cache/huggingface \
    OLLAMA_HOST=127.0.0.1:11434 \
    OLLAMA_NUM_PARALLEL=1 \
    OLLAMA_KEEP_ALIVE=24h \
    OLLAMA_MAX_LOADED_MODELS=2 \
    OLLAMA_FLASH_ATTENTION=1 \
    OLLAMA_KV_CACHE_TYPE=q8_0 \
    OLLAMA_DEBUG=1
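# Tuning notes: OLLAMA_KV_CACHE_TYPE=q8_0 stores the KV cache in 8-bit,
# roughly halving its VRAM cost vs the f16 default, and Ollama only
# honors it when OLLAMA_FLASH_ATTENTION=1 is also set (it is, above).
# OLLAMA_DEBUG=1 is verbose logging for bring-up; drop it once stable.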

# Install Ollama. install.sh ships the cuda_v12 dispatcher libs
# unconditionally; the GPU detection at the tail of the script only gates
# host-driver install (a no-op inside a container). So this works fine
# on a CPU builder for a GPU-attached runtime.
RUN curl -fsSL https://ollama.com/install.sh | sh

WORKDIR /home/user/app

# Python deps. The default PyPI torch wheel bundles CUDA 12.x, matching
# the base image, so sentence-transformers' torch dependency resolves
# to a GPU-capable build without an extra index URL.
COPY --chown=user:user requirements.txt ./
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt
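
# Optional sanity check that a CUDA-enabled wheel actually landed (an
# illustrative step, left commented out; torch.version.cuda is None on
# a CPU-only build):
# RUN python -c "import torch; assert torch.version.cuda and torch.version.cuda.startswith('12')"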

# --- Earth-observation toolchain (Phase 1 Prithvi live + Phase 4
# TerraMind synthesis) ---------------------------------------------------
#
# Tried four times to land terratorch on HF's Py3.10 image alongside
# our pinned stack (transformers<5, hf_hub<1, granite-tsfm<0.3.4,
# mellea<0.4). Each attempt failed at the same point — a `mkdir`
# immediately after the --no-deps install — with no actionable error
# in HF's build log. The failure pattern is consistent with build-
# sandbox disk exhaustion; even a 4-package narrow install
# (terratorch + einops + diffusers + timm with --no-deps) hits it.
#
# Accepting this: TerraMind synthesis + Prithvi-live remain
# local-/AMD-only on this deployment. The lazy-import pattern in
# app/context/terramind_synthesis.py + app/flood_layers/prithvi_live.py
# returns clean `skipped: deps unavailable on this deployment` on HF;
# the trace card and the map legend make that visible. The other 14
# specialists run normally.
#
# Re-enable on a deployment with more build disk (Docker SDK on a
# self-hosted machine, AMD droplet, etc.) by adding the EO --no-deps
# install back here.
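#
# A sketch of that re-enable step (package set taken from the narrow
# attempt above; pins deliberately omitted):
#
# RUN pip install --no-cache-dir --no-deps \
#         terratorch einops diffusers timm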

# Both Granite 4.1 variants are served at runtime:
#   :3b - fast routing (planner) + live_now reconciler (short outputs)
#   :8b - synthesis reconciler for single_address / neighborhood / dev_check
# Both fit warm on the T4 with OLLAMA_MAX_LOADED_MODELS=2 (~10 GB total
# VRAM out of 16). entrypoint.sh starts the ollama daemon in the
# background, polls its HTTP endpoint, and pulls both tags before
# handing off to uvicorn (see the sketch below).
ENV OLLAMA_MODELS=/home/user/.ollama/models \
    RIPRAP_OLLAMA_3B_TAG=granite4.1:3b
# Granite weights are pulled at *container start* (see entrypoint.sh)
# instead of at build time. HF's build sandbox can't fit the EO
# toolchain + Granite 8B (5GB) simultaneously, but the runtime
# rootfs is larger and persists between container starts within an
# image lifetime. Cold-start on first launch ~2 min for the 8B pull;
# subsequent restarts are fast since Ollama's cache survives.
RUN mkdir -p $OLLAMA_MODELS
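
# Roughly what entrypoint.sh does at start (a sketch, not the script
# itself; the uvicorn target app.main:app is an assumed module path):
#   ollama serve &
#   until curl -fsS http://127.0.0.1:11434/api/version >/dev/null; do sleep 1; done
#   ollama pull "$RIPRAP_OLLAMA_3B_TAG" && ollama pull granite4.1:8b
#   exec uvicorn app.main:app --host 0.0.0.0 --port 7860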

# App code + fixtures
COPY --chown=user:user app/ ./app/
COPY --chown=user:user web/ ./web/
COPY --chown=user:user scripts/ ./scripts/
COPY --chown=user:user data/ ./data/
COPY --chown=user:user corpus/ ./corpus/
COPY --chown=user:user agent.py riprap.py ./
COPY --chown=user:user entrypoint.sh ./
RUN chmod +x ./entrypoint.sh

# Hand off to a non-root user the way HF Spaces expects
RUN chown -R user:user /home/user
USER user

EXPOSE 7860
CMD ["./entrypoint.sh"]