# =============================================================================
# RareDx — Hugging Face Spaces Dockerfile
# Single container: FastAPI (8080, internal) + Streamlit (8501, public)
# =============================================================================

FROM python:3.11-slim

# --------------------------------------------------------------------------
# System dependencies
#   gcc / g++ / libxml2-dev / libxslt-dev — toolchain and headers for Python
#     packages that build native extensions at install time
#   curl       — used by the HEALTHCHECK at the bottom of this file
#   supervisor — process manager that starts both services (see CMD)
# --------------------------------------------------------------------------
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        g++ \
        gcc \
        libxml2-dev \
        libxslt-dev \
        supervisor \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# --------------------------------------------------------------------------
# Python dependencies
# Install before copying source so this layer is cached on code-only changes
# --------------------------------------------------------------------------
COPY backend/requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# --------------------------------------------------------------------------
# Pre-download the embedding model into the image
# This avoids a ~500MB download on every Space restart.
#
# EMBED_MODEL is declared here (rather than with the other runtime variables
# further down) so that the model baked into the image and the model the
# application is told to load come from a single definition and cannot drift.
# --------------------------------------------------------------------------
ENV HF_HOME=/app/.cache/huggingface \
    EMBED_MODEL=FremyCompany/BioLORD-2023
RUN python -c "\
import os; \
from sentence_transformers import SentenceTransformer; \
print('Downloading BioLORD-2023...'); \
SentenceTransformer(os.environ['EMBED_MODEL']); \
print('Model cached.')"

# --------------------------------------------------------------------------
# Application source
# --------------------------------------------------------------------------
COPY backend/ ./backend/

# --------------------------------------------------------------------------
# Pre-built knowledge data (bundled — no runtime download needed)
#   data/graph_store.json — 33MB  Orphanet+HPO knowledge graph (NetworkX JSON)
#   data/chromadb/        — 149MB BioLORD disease embeddings (ChromaDB)
#   data/hpo_index/       — 26MB  BioLORD HPO term embeddings (numpy + JSON)
# --------------------------------------------------------------------------
COPY data/graph_store.json ./data/graph_store.json
COPY data/chromadb/ ./data/chromadb/
COPY data/hpo_index/ ./data/hpo_index/

# --------------------------------------------------------------------------
# supervisord config
# --------------------------------------------------------------------------
COPY supervisord.conf /etc/supervisor/conf.d/raredx.conf

# --------------------------------------------------------------------------
# Runtime environment
# Tell pipeline to use embedded ChromaDB and local graph store
# (no Neo4j or external ChromaDB server needed)
#
# NOTE(review): CHROMA_HOST/CHROMA_PORT point at localhost:9999, where nothing
#   in this image is started — presumably the pipeline falls back to the
#   embedded store when that address is unreachable; confirm in backend/.
# NOTE(review): ORPHANET_DATA_DIR is /app/data/orphanet, but this Dockerfile
#   never copies anything to that path — confirm it is unused at runtime or
#   created by the application.
# --------------------------------------------------------------------------
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    CHROMA_HOST=localhost \
    CHROMA_PORT=9999 \
    CHROMA_COLLECTION=rare_diseases \
    ORPHANET_DATA_DIR=/app/data/orphanet

# Port Streamlit listens on (declared for HF Spaces)
EXPOSE 8501

# --------------------------------------------------------------------------
# Container health
# Probes Streamlit's built-in health endpoint on the public port. The long
# start period leaves time for the services to come up before failures count.
# --------------------------------------------------------------------------
HEALTHCHECK --interval=30s --timeout=5s --start-period=120s --retries=3 \
    CMD curl -fsS http://localhost:8501/_stcore/health || exit 1

# --------------------------------------------------------------------------
# Start both services via supervisord
#   FastAPI:   127.0.0.1:8080  (internal — Streamlit calls it)
#   Streamlit: 0.0.0.0:8501    (public — HF Spaces exposes this)
#
# NOTE(review): no USER directive is set, so the image's default user is root.
#   Switching to a non-root user would also require supervisord.conf to use
#   writable log/pid paths — verify before changing.
# --------------------------------------------------------------------------
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/raredx.conf"]