# syntax=docker/dockerfile:1.6
# Build stage "base", started from the python:3.12-slim image.
FROM python:3.12-slim AS base

# Interpreter / pip settings, present at build time and in the running container:
#   PIP_DISABLE_PIP_VERSION_CHECK=1  - pip does not check for a newer pip release
#   PIP_NO_CACHE_DIR=1               - pip keeps no download/wheel cache in the image
#   PYTHONDONTWRITEBYTECODE=1        - Python writes no .pyc files
#   PYTHONUNBUFFERED=1               - stdout/stderr are unbuffered, so logs show up immediately
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
# System packages (kept in alphabetical order for easy diffing):
#   curl              - used by the container HEALTHCHECK
#   libmupdf-dev      - native library for PyMuPDF
#   poppler-utils     - pdfplumber table extraction
#   tesseract-ocr     - OCR fallback for scanned PDFs, with German (deu),
#                       English (eng) and Hungarian (hun) language data
#                       for the multilingual demo
# The apt package lists are deleted in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        libmupdf-dev \
        poppler-utils \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-hun \
    && rm -rf /var/lib/apt/lists/*
# Application directory; later relative paths resolve against it.
WORKDIR /app

# Python dependencies. Only requirements.txt is copied at this point, so this
# layer is rebuilt when the dependency list changes rather than on every
# source edit.
COPY requirements.txt ./

# Install order matters: torch comes first, from the PyTorch CPU-only wheel
# index (smaller image), and the remaining requirements are installed afterwards.
# NOTE(review): if requirements.txt pins a torch version different from the one
# installed here, pip may replace the CPU build with the default PyPI wheel —
# confirm against requirements.txt.
RUN pip install --upgrade pip \
    && pip install torch --index-url https://download.pytorch.org/whl/cpu \
    && pip install --requirement requirements.txt
# Run as an unprivileged account instead of root. Hugging Face Docker Spaces
# start the container as UID 1000, so the account is created with that UID and
# given ownership of /app (created by WORKDIR as root) so the app can write there.
RUN useradd --create-home --uid 1000 user \
    && chown user:user /app
USER user

# HOME is set explicitly so tools that write under the home directory use a
# writable location. HF_HOME pins the Hugging Face cache to a fixed path, so the
# model downloaded below is found at runtime regardless of how HOME is resolved.
ENV HOME=/home/user \
    HF_HOME=/home/user/.cache/huggingface

# Sentence-transformers model pre-download (no runtime network call).
# BAAI/bge-m3 = 2.27 GB, 1024 dim, multilingual (EN/HU/DE/FR/...).
# Executed as the runtime user so the cached files are owned by, and readable
# to, the account that loads them when the container starts.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-m3')"

# Source code — copied last so code edits do not invalidate the dependency and
# model layers; owned by the runtime user.
COPY --chown=user:user . .

# Streamlit listens on port 7860 for HF Space deployment (HF expects this).
EXPOSE 7860

# Health probe against Streamlit's built-in health endpoint.
# curl flags: -f fail on HTTP errors, -s no progress meter, -S still print errors.
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
    CMD curl -fsS http://localhost:7860/_stcore/health || exit 1

# Exec-form CMD: no wrapping shell is started, so stop signals go straight to
# the streamlit process.
CMD ["streamlit", "run", "app/main.py", \
     "--server.address=0.0.0.0", \
     "--server.port=7860", \
     "--server.headless=true"]