# syntax=docker/dockerfile:1.6

# Image for the Streamlit app (app/main.py), served on port 7860 for
# Hugging Face Space deployment.
FROM python:3.12-slim AS base

# PYTHONUNBUFFERED / PYTHONDONTWRITEBYTECODE: unbuffered logs, no .pyc files.
# PIP_NO_CACHE_DIR / PIP_DISABLE_PIP_VERSION_CHECK: keep pip from caching
# downloaded wheels in image layers and from checking for pip upgrades.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# OS-level dependencies (sorted alphabetically for diffability):
#   - curl: used by the HEALTHCHECK below
#   - libmupdf-dev: PyMuPDF native lib
#   - poppler-utils: pdfplumber table extraction
#   - tesseract-ocr (+ deu, eng, hun language packs): scanned-PDF OCR fallback
#     (multilingual demo support)
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
        libmupdf-dev \
        poppler-utils \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-hun \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Python deps — CPU-only torch first (smaller image), then the rest.
# Only requirements.txt is copied here so this layer stays cached until the
# dependency list changes.
# NOTE(review): if requirements.txt pins a torch version different from the one
# installed here, pip may replace the CPU-only wheel with the default PyPI
# build — confirm the pin (or lack of one) in requirements.txt.
COPY requirements.txt .
RUN pip install --upgrade pip \
    && pip install --index-url https://download.pytorch.org/whl/cpu torch \
    && pip install -r requirements.txt

# Run as an unprivileged user instead of root. UID 1000 matches the user that
# Hugging Face Docker Spaces run containers as (see HF Spaces Docker docs), so
# everything created below is readable and writable at runtime.
# /app is handed to that user so the app can create files next to its sources.
RUN useradd --create-home --uid 1000 app \
    && chown app:app /app
# HOME is set explicitly so cache paths derived from "~" resolve to the same
# directory during the build and at runtime.
ENV HOME=/home/app
USER app

# Sentence-transformers model pre-download (no runtime network call).
# BAAI/bge-m3 = 2.27 GB, 1024 dim, multilingual (EN/HU/DE/FR/...).
# Executed as the runtime user so the model cache is written under that user's
# home directory rather than root's, where the app could not read it.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-m3')"

# Source code — copied last so code changes do not invalidate the dependency
# and model layers; owned by the runtime user.
COPY --chown=app:app . .

# Streamlit healthcheck — port 7860 for HF Space deployment (HF expects this).
# -f makes curl exit non-zero on HTTP errors; -sS hides progress output but
# still prints error messages.
EXPOSE 7860
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
    CMD curl -fsS http://localhost:7860/_stcore/health || exit 1

# Exec form: streamlit runs as PID 1 and receives stop signals directly.
CMD ["streamlit", "run", "app/main.py", \
     "--server.address=0.0.0.0", \
     "--server.port=7860", \
     "--server.headless=true"]