paperhawk / Dockerfile
Nándorfi Vince
Initial paperhawk push to HF Space (LFS for binaries)
7ff7119
# syntax=docker/dockerfile:1.6
# Base stage: slim CPython 3.12 image; everything below builds on it.
FROM python:3.12-slim AS base

# Interpreter / pip settings, active for the remaining build steps and in the
# running container:
#   PIP_DISABLE_PIP_VERSION_CHECK - skip pip's "new version available" check
#   PIP_NO_CACHE_DIR              - pip keeps no download cache in the image
#   PYTHONDONTWRITEBYTECODE       - no .pyc files are written
#   PYTHONUNBUFFERED              - stdout/stderr are unbuffered (logs appear immediately)
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
# System packages. The index refresh, the install and the removal of the apt
# lists all happen in this one layer, so no package index is left in the image.
#   curl             - used by the HEALTHCHECK probe
#   libmupdf-dev     - native library for PyMuPDF
#   poppler-utils    - pdfplumber table extraction
#   tesseract-ocr    - OCR fallback for scanned PDFs
#   tesseract-ocr-*  - language data: deu + eng + hun (multilingual demo support)
# NOTE(review): PyMuPDF wheels usually bundle MuPDF - confirm libmupdf-dev is
# still required before removing it.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        libmupdf-dev \
        poppler-utils \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-hun \
    && rm -rf /var/lib/apt/lists/*
# Relative paths in the following instructions, and the container's working
# directory at runtime, resolve against /app.
WORKDIR /app

# Python dependencies. Only requirements.txt is copied at this point, so this
# layer is reused until that file changes.
# Install order: upgrade pip, then torch from the PyTorch CPU wheel index
# (smaller image), then everything listed in requirements.txt.
# NOTE(review): if requirements.txt pins a torch version that differs from the
# one resolved here, the last step will replace the CPU build - confirm.
COPY requirements.txt .
RUN pip install --upgrade pip \
    && pip install torch --index-url https://download.pytorch.org/whl/cpu \
    && pip install -r requirements.txt
# Sentence-transformers model pre-download, so the model does not have to be
# fetched when the app starts.
# BAAI/bge-m3 = 2.27 GB, 1024 dim, multilingual (EN/HU/DE/FR/...).
#
# The Hugging Face cache is placed in an explicit, user-independent directory
# instead of the build user's home (~/.cache/huggingface, i.e. under /root).
# HF Spaces start the container as UID 1000, which normally cannot read /root,
# so a model cached there would not be found at runtime. HF_HOME is set with
# ENV, so the running app resolves the same cache path as this build step.
# The cache tree is opened to every UID in the same layer (no extra copy of the
# 2.27 GB); write access is kept so the hub client can update refs/lock files.
ENV HF_HOME=/opt/huggingface
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('BAAI/bge-m3')" \
    && chmod -R a+rwX "${HF_HOME}"
# Application source. Copied after the dependency and model steps so that a
# code change does not invalidate those layers.
COPY . .

# Streamlit is served on port 7860, the port HF Space deployment expects.
EXPOSE 7860

# Container health: poll Streamlit's health endpoint; curl -f turns an HTTP
# error status into a non-zero exit, which marks the probe as failed.
HEALTHCHECK --start-period=30s --interval=30s --timeout=5s --retries=3 \
    CMD curl -f http://localhost:7860/_stcore/health || exit 1

# NOTE(review): no USER instruction is visible in this file, so the image's
# default user is root unless the platform overrides it - confirm intended.
# Exec-form start command: headless Streamlit bound to all interfaces on 7860.
CMD ["streamlit", "run", "app/main.py", "--server.address=0.0.0.0", "--server.port=7860", "--server.headless=true"]