Upload model artifacts and classifier scripts
Browse files- .gitattributes +6 -35
- README.md +79 -0
- classifiers/__init__.py +144 -0
- classifiers/base_classifier.py +205 -0
- classifiers/classifier_onnx.py +90 -0
- classifiers/classifier_ov.py +114 -0
- classifiers/classifier_torch.py +159 -0
- classifiers/models.py +209 -0
- config.json +128 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,6 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.
|
| 3 |
-
*.
|
| 4 |
-
*.
|
| 5 |
-
*.
|
| 6 |
-
*.
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.xml filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
pipeline_tag: image-classification
|
| 4 |
+
tags:
|
| 5 |
+
- image-classification
|
| 6 |
+
- multi-label-classification
|
| 7 |
+
- onnx
|
| 8 |
+
- openvino
|
| 9 |
+
- pdf
|
| 10 |
+
- document-understanding
|
| 11 |
+
- rag
|
| 12 |
+
datasets:
|
| 13 |
+
- Wikit/PdfVisClassif
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# PDF Page Classifier
|
| 17 |
+
|
| 18 |
+
Multi-label classifier for PDF page images. Determines whether a PDF page
|
| 19 |
+
requires image embedding (vs. text-only) in RAG pipelines.
|
| 20 |
+
|
| 21 |
+
Backbone: EfficientNet-Lite0. Exported to ONNX and OpenVINO INT8 via
|
| 22 |
+
Quantization-Aware Training (QAT). **No PyTorch required at inference time.**
|
| 23 |
+
|
| 24 |
+
## Classes
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
Pages matching any of the following classes should trigger image embedding:
|
| 29 |
+
|
| 30 |
+
- `Visual Essential`
|
| 31 |
+
- `Complex Table`
|
| 32 |
+
|
| 33 |
+
Default threshold: `0.5`
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
### With [chunknorris](https://github.com/wikit-ai/chunknorris) (recommended)
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
pip install "chunknorris[ml-onnx]" # ONNX backend
|
| 41 |
+
pip install "chunknorris[ml-openvino]" # OpenVINO INT8, fastest on CPU
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
from chunknorris.ml import load_classifier
|
| 46 |
+
|
| 47 |
+
clf = load_classifier("Wikit/pdf-pages-classifier") # auto-selects best available backend
|
| 48 |
+
result = clf.predict("page.png")
|
| 49 |
+
# {"needs_image_embedding": True, "predicted_classes": [...], "probabilities": {...}}
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
### Standalone (no chunknorris)
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
git clone https://huggingface.co/Wikit/pdf-pages-classifier
|
| 56 |
+
cd pdf-pages-classifier
|
| 57 |
+
pip install onnxruntime Pillow numpy # or: openvino Pillow numpy
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
```python
|
| 61 |
+
from classifiers import load_classifier
|
| 62 |
+
|
| 63 |
+
clf = load_classifier(".") # auto-selects available backend
|
| 64 |
+
result = clf.predict("page.png")
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
## Files
|
| 68 |
+
|
| 69 |
+
| File | Format | Notes |
|
| 70 |
+
|------|--------|-------|
|
| 71 |
+
| `model.onnx` | ONNX FP32 | Cross-platform CPU/GPU inference |
|
| 72 |
+
| `openvino_model.xml/.bin` | OpenVINO INT8 | Fastest CPU inference (QAT) |
|
| 73 |
+
| `pytorch_model.bin` | PyTorch | Raw checkpoint; requires `torch` + `timm` |
|
| 74 |
+
| `config.json` | JSON | Preprocessing config and class names |
|
| 75 |
+
| `classifiers/` | Python | Standalone inference scripts (no chunknorris needed) |
|
| 76 |
+
|
| 77 |
+
## Dataset
|
| 78 |
+
|
| 79 |
+
Trained on [Wikit/PdfVisClassif](https://huggingface.co/datasets/Wikit/PdfVisClassif).
|
classifiers/__init__.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF page classifier — public factory with HuggingFace auto-download.
|
| 2 |
+
|
| 3 |
+
Standalone usage (files downloaded from HF repo):
|
| 4 |
+
from classifiers import load_classifier
|
| 5 |
+
clf = load_classifier(".") # local directory with model files
|
| 6 |
+
result = clf.predict("page.png")
|
| 7 |
+
|
| 8 |
+
HuggingFace usage:
|
| 9 |
+
from classifiers import load_classifier
|
| 10 |
+
clf = load_classifier("Wikit/pdf-pages-classifier")
|
| 11 |
+
result = clf.predict("page.png")
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Any
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# INT8 preferred over FP32 for both backends — matches classifier lookup order
# (classifier_onnx tries "model_int8.onnx" before "model.onnx"; classifier_ov
# tries "openvino_model_int8.xml" before "openvino_model.xml").
# config.json is included in every set because the classifiers read all
# preprocessing parameters and class names from it.
_HF_ONNX_INT8_FILES = ["model_int8.onnx", "config.json"]
_HF_ONNX_FP32_FILES = ["model.onnx", "config.json"]

# OpenVINO IR models are a pair: .xml (topology) + .bin (weights).
_HF_OV_INT8_FILES = ["openvino_model_int8.xml", "openvino_model_int8.bin", "config.json"]
_HF_OV_FP32_FILES = ["openvino_model.xml", "openvino_model.bin", "config.json"]
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _is_hf_repo_id(path: str) -> bool:
    """Heuristically decide whether *path* names a HuggingFace repo.

    A repo ID has the form ``owner/repo``: exactly one forward slash with a
    non-blank segment on each side. Anything that exists on the local
    filesystem, or that starts like a filesystem path (``.``, ``/``, ``~``),
    is treated as a local directory instead.
    """
    if os.path.exists(path):
        return False
    candidate = path.replace("\\", "/")
    if candidate.startswith((".", "/", "~")):
        return False
    owner, sep, repo = candidate.partition("/")
    # Exactly one separator, and neither side may be empty or whitespace-only.
    return bool(sep) and "/" not in repo and bool(owner.strip()) and bool(repo.strip())
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _download_from_hf(repo_id: str, filenames: list[str], cache_dir: str | None) -> Path:
    """Download specific files from a HF repo and return the local snapshot directory.

    Args:
        repo_id: HuggingFace repo ID, e.g. ``"Wikit/pdf-pages-classifier"``.
        filenames: Non-empty list of file names to fetch from the repo.
        cache_dir: Custom cache directory for downloads, or ``None`` for the
            huggingface_hub default.

    Returns:
        The local snapshot directory containing the downloaded files (parent
        of the last downloaded file; all files of one repo revision land in
        the same snapshot directory).

    Raises:
        ValueError: If ``filenames`` is empty.
        ImportError: If huggingface_hub is not installed.
    """
    # Explicit check instead of a trailing `assert`: asserts are stripped
    # under `python -O`, which would turn an empty list into an obscure
    # AttributeError on `None.parent`.
    if not filenames:
        raise ValueError("filenames must not be empty")

    try:
        from huggingface_hub import hf_hub_download
    except ImportError as e:
        raise ImportError(
            "huggingface_hub is required to load from a HuggingFace repo.\n"
            "Install with: pip install huggingface-hub"
        ) from e

    last: Path | None = None
    for filename in filenames:
        last = Path(hf_hub_download(repo_id=repo_id, filename=filename, cache_dir=cache_dir))

    return last.parent
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _download_with_int8_fallback(
    repo_id: str,
    int8_files: list[str],
    fp32_files: list[str],
    cache_dir: str | None,
) -> Path:
    """Download model files from HF, preferring the INT8 set over FP32.

    Tries to fetch ``int8_files`` first; if the repo does not ship the INT8
    artifacts, falls back to ``fp32_files``. Any error other than a missing
    repo entry propagates to the caller.
    """
    try:
        from huggingface_hub import EntryNotFoundError
    except ImportError as e:
        raise ImportError(
            "huggingface_hub is required to load from a HuggingFace repo.\n"
            "Install with: pip install huggingface-hub"
        ) from e

    try:
        return _download_from_hf(repo_id, int8_files, cache_dir)
    except EntryNotFoundError:
        # Repo only ships the FP32 variant — fall through to it below.
        pass
    return _download_from_hf(repo_id, fp32_files, cache_dir)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def load_classifier(
    repo_or_dir: str = "Wikit/pdf-pages-classifier",
    backend: str = "auto",
    device: str = "CPU",
    cache_dir: str | None = None,
) -> Any:
    """Load a PDF page classifier, picking the best available backend.

    Args:
        repo_or_dir: Either a HuggingFace repo ID (``"owner/repo"`` form,
            e.g. ``"Wikit/pdf-pages-classifier"``) or a local directory that
            holds ``config.json`` plus the model files.
        backend: ``"auto"`` attempts OpenVINO first and falls back to ONNX
            when OpenVINO is unavailable or its model files are missing;
            ``"openvino"`` or ``"onnx"`` forces that backend.
        device: OpenVINO device string (``"CPU"``, ``"GPU"``, ``"AUTO"``);
            ignored by the ONNX backend.
        cache_dir: Optional custom cache directory for HuggingFace downloads.

    Returns:
        A classifier instance exposing ``predict(images)``.

    Raises:
        ValueError: If ``backend`` is not one of the three accepted values.

    Example::

        clf = load_classifier("Wikit/pdf-pages-classifier")
        result = clf.predict("page.png")
        print(result["needs_image_embedding"], result["predicted_classes"])
    """
    if backend not in ("auto", "onnx", "openvino"):
        raise ValueError(f"Unknown backend {backend!r}. Choose 'auto', 'onnx', or 'openvino'.")

    is_hf = _is_hf_repo_id(repo_or_dir)

    # Forced ONNX: skip the OpenVINO attempt entirely.
    if backend == "onnx":
        return _load_onnx(repo_or_dir, cache_dir=cache_dir, is_hf=is_hf)

    try:
        return _load_openvino(repo_or_dir, device=device, cache_dir=cache_dir, is_hf=is_hf)
    except (ImportError, FileNotFoundError):
        # In "auto" mode a missing openvino install or missing IR files just
        # means we fall back; a forced "openvino" backend surfaces the error.
        if backend == "openvino":
            raise

    return _load_onnx(repo_or_dir, cache_dir=cache_dir, is_hf=is_hf)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def _load_onnx(repo_or_dir: str, cache_dir: str | None, is_hf: bool) -> Any:
    """Resolve model files (downloading from HF when needed) and build the ONNX classifier."""
    # Works both as a package module (relative import) and as a loose script
    # cloned straight from the HF repo (flat import).
    try:
        from .classifier_onnx import PDFPageClassifierONNX
    except ImportError:
        from classifier_onnx import PDFPageClassifierONNX  # type: ignore[no-redef]

    if is_hf:
        model_dir = _download_with_int8_fallback(
            repo_or_dir, _HF_ONNX_INT8_FILES, _HF_ONNX_FP32_FILES, cache_dir
        )
    else:
        model_dir = Path(repo_or_dir)

    return PDFPageClassifierONNX.from_pretrained(str(model_dir))
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _load_openvino(repo_or_dir: str, device: str, cache_dir: str | None, is_hf: bool) -> Any:
    """Resolve model files (downloading from HF when needed) and build the OpenVINO classifier."""
    # Works both as a package module (relative import) and as a loose script
    # cloned straight from the HF repo (flat import).
    try:
        from .classifier_ov import PDFPageClassifierOV
    except ImportError:
        from classifier_ov import PDFPageClassifierOV  # type: ignore[no-redef]

    if is_hf:
        model_dir = _download_with_int8_fallback(
            repo_or_dir, _HF_OV_INT8_FILES, _HF_OV_FP32_FILES, cache_dir
        )
    else:
        model_dir = Path(repo_or_dir)

    return PDFPageClassifierOV.from_pretrained(str(model_dir), device=device)
|
classifiers/base_classifier.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import abstractmethod, ABC
|
| 2 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 3 |
+
from typing import Any, Union
|
| 4 |
+
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import numpy as np
|
| 7 |
+
import numpy.typing as npt
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class _BasePDFPageClassifier(ABC):
    """Shared preprocessing, formatting, and predict logic.

    Subclasses must implement ``_run_batch`` to perform backend-specific
    inference on a (N, C, H, W) float32 numpy array.

    NOTE(review): the preprocessing here (center-crop, bicubic resize,
    header whiteout, mean/std normalization) presumably mirrors the training
    pipeline exactly — keep any change in sync with training.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Read preprocessing and labeling parameters from a deployment config.

        Required keys: ``image_size``, ``mean``, ``std``, ``class_names``.
        Optional keys (defaults): ``center_crop_shortest`` (True),
        ``whiteout_header`` (False), ``whiteout_fraction`` (0.15),
        ``threshold`` (0.5), ``image_required_classes`` ([]).
        """
        # Square side length (pixels) the model input expects.
        self._image_size: int = config["image_size"]
        # Per-channel normalization stats, broadcast over (H, W, C) later.
        self._mean = np.array(config["mean"], dtype=np.float32)
        self._std = np.array(config["std"], dtype=np.float32)
        self._center_crop: bool = config.get("center_crop_shortest", True)
        self._whiteout: bool = config.get("whiteout_header", False)
        # Number of top pixel rows to blank when header whiteout is enabled.
        self._whiteout_cutoff: int = int(
            self._image_size * config.get("whiteout_fraction", 0.15)
        )
        self._class_names: list[str] = config["class_names"]
        # Default probability cutoff; overridable per predict() call.
        self._threshold: float = float(config.get("threshold", 0.5))
        # Classes whose prediction flags the page as needing image embedding.
        self._image_required_classes: set[str] = set(
            config.get("image_required_classes", [])
        )

    @abstractmethod
    def _run_batch(self, batch_input: "npt.NDArray[np.float32]") -> "npt.NDArray[np.float32]":
        """Run inference on a (N, C, H, W) float32 batch.

        Returns:
            (N, num_classes) float32 array of probabilities.
        """

    @staticmethod
    def _load_image(item: Any) -> "Image.Image":
        """Load an image from a file path or PIL image and convert to RGB.

        Args:
            item: File path string or PIL image (any mode).

        Returns:
            RGB PIL image.

        Raises:
            TypeError: If ``item`` is neither a str nor a PIL.Image.
        """
        if isinstance(item, str):
            return Image.open(item).convert("RGB")
        if isinstance(item, Image.Image):
            # convert() returns a copy, so a caller's image is never mutated.
            return item.convert("RGB")
        raise TypeError(f"Expected str or PIL.Image, got {type(item).__name__}")

    def _pil_to_array(self, img: "Image.Image") -> "npt.NDArray[np.float32]":
        """Apply spatial transforms and return a (H, W, C) float32 array in [0, 1].

        Normalization and the channel transpose are intentionally deferred so
        they can be applied in a single vectorised pass over the whole batch in
        ``_normalize_batch``.

        Steps:
            1. Center-crop to square (shortest side), if enabled.
            2. Resize to (image_size, image_size) with bicubic interpolation.
            3. Scale pixel values to [0, 1].
            4. White out top header rows, if enabled.

        Args:
            img: RGB PIL image.

        Returns:
            Float32 array of shape (image_size, image_size, 3).
        """
        if self._center_crop:
            w, h = img.size
            sq = min(w, h)
            # Symmetric crop box centred on the image; integer division keeps
            # the box inside the image for odd differences.
            img = img.crop(
                ((w - sq) // 2, (h - sq) // 2, (w + sq) // 2, (h + sq) // 2)
            )

        img = img.resize((self._image_size, self._image_size), Image.Resampling.BICUBIC)
        arr = np.asarray(img, dtype=np.float32) * (1.0 / 255.0)  # (H, W, C)

        if self._whiteout:
            # Blank the top rows (1.0 == white after the [0, 1] rescale).
            arr[: self._whiteout_cutoff] = 1.0

        return arr

    def _normalize_batch(
        self, arrays: list["npt.NDArray[np.float32]"]
    ) -> "npt.NDArray[np.float32]":
        """Stack a list of (H, W, C) arrays and apply ImageNet normalization.

        Args:
            arrays: List of float32 arrays, each of shape (H, W, C) in [0, 1].

        Returns:
            Float32 array of shape (N, C, H, W), normalized with ImageNet stats.
        """
        batch = np.stack(arrays, axis=0)  # (N, H, W, C)
        batch = (batch - self._mean) / self._std  # broadcast over (H, W, C)
        return batch.transpose(0, 3, 1, 2)  # (N, C, H, W)

    def _format(
        self,
        probabilities: "npt.NDArray[np.float32]",
        threshold: float,
    ) -> dict[str, Any]:
        """Format model output probabilities into a result dict.

        Args:
            probabilities: 1-D float32 array of per-class probabilities.
            threshold: Probability cutoff for a positive prediction.

        Returns:
            Dict with keys ``needs_image_embedding``, ``predicted_classes``,
            and ``probabilities``.
        """
        predicted_classes = [
            name
            for name, prob in zip(self._class_names, probabilities)
            if prob >= threshold
        ]
        return {
            # True when any predicted class is in the image-required set.
            "needs_image_embedding": any(
                c in self._image_required_classes for c in predicted_classes
            ),
            "predicted_classes": predicted_classes,
            # Cast numpy scalars to plain floats so the dict is JSON-friendly.
            "probabilities": {
                name: float(prob)
                for name, prob in zip(self._class_names, probabilities)
            },
        }

    def predict(
        self,
        images: Union[str, "Image.Image", list[Any]],
        threshold: float | None = None,
        batch_size: int = 32,
        num_workers: int = 4,
    ) -> Union[dict[str, Any], list[dict[str, Any]]]:
        """Classify one or more PDF page images.

        Args:
            images: A single image (file path string or PIL.Image) or a list
                of images.
            threshold: Override the default probability threshold from config.
                The override is local to this call and does not mutate the
                classifier instance.
            batch_size: Number of images to process per inference call.
            num_workers: Number of threads for parallel image loading and
                preprocessing. Set to 1 to disable threading (must be >= 1;
                ThreadPoolExecutor rejects 0).

        Returns:
            A single result dict when ``images`` is not a list, or a list of
            result dicts otherwise. Each dict contains:
                - ``needs_image_embedding`` (bool)
                - ``predicted_classes`` (list[str])
                - ``probabilities`` (dict[str, float])
        """
        effective_threshold = self._threshold if threshold is None else threshold

        is_single = not isinstance(images, list)
        image_list: list[Any] = [images] if is_single else images

        all_results: list[dict[str, Any]] = []

        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            for batch_start in range(0, len(image_list), batch_size):
                batch_items = image_list[batch_start : batch_start + batch_size]

                # Load (file I/O + RGB conversion) in parallel, then free after use.
                loaded: list[Image.Image] = list(
                    executor.map(self._load_image, batch_items)
                )
                # PIL transforms (crop + bicubic resize) in parallel.
                arrays: list[npt.NDArray[np.float32]] = list(
                    executor.map(self._pil_to_array, loaded)
                )

                # Vectorised normalization + transpose, then inference.
                batch_input = self._normalize_batch(arrays)  # (N, C, H, W)
                probs_batch: npt.NDArray[np.float32] = self._run_batch(batch_input)

                all_results.extend(
                    self._format(probs, effective_threshold) for probs in probs_batch
                )

        return all_results[0] if is_single else all_results

    def __call__(
        self,
        images: Union[str, "Image.Image", list[Any]],
        threshold: float | None = None,
        batch_size: int = 32,
        num_workers: int = 4,
    ) -> Union[dict[str, Any], list[dict[str, Any]]]:
        """Delegate to predict(). See predict() for full documentation."""
        return self.predict(
            images, threshold=threshold, batch_size=batch_size, num_workers=num_workers
        )
|
classifiers/classifier_onnx.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PDF page classifier for production inference."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import numpy.typing as npt
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from .base_classifier import _BasePDFPageClassifier
|
| 12 |
+
except ImportError:
|
| 13 |
+
from base_classifier import _BasePDFPageClassifier # standalone / HF usage
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
import onnxruntime as ort
|
| 17 |
+
except ImportError as _e:
|
| 18 |
+
raise ImportError(
|
| 19 |
+
"onnxruntime is required for inference.\n"
|
| 20 |
+
"Install with: pip install onnxruntime"
|
| 21 |
+
) from _e
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class PDFPageClassifierONNX(_BasePDFPageClassifier):
    """Classify PDF pages using a deployed ONNX model.

    Loads a self-contained deployment directory produced by
    ``export_onnx.save_for_deployment`` and exposes a simple ``predict``
    interface. All preprocessing (center-crop, resize, normalization) is
    performed in pure PIL + numpy, matching the pipeline used during training.

    Example::

        clf = PDFPageClassifierONNX.from_pretrained("outputs/run-42/deployment")
        result = clf.predict("page_001.png")
        print(result["needs_image_embedding"], result["predicted_classes"])
    """

    def __init__(self, model_path: str, config: dict[str, Any]) -> None:
        """Initialise the classifier.

        Args:
            model_path: Path to the ONNX model file.
            config: Deployment config dict (same schema as config.json written
                by save_for_deployment).
        """
        super().__init__(config)
        self._session = ort.InferenceSession(model_path)
        # Cache the graph's (single) input name for the feed dict in _run_batch.
        self._input_name: str = self._session.get_inputs()[0].name

    @classmethod
    def from_pretrained(cls, model_dir: str) -> "PDFPageClassifierONNX":
        """Load a classifier from a deployment directory.

        The directory must contain:
            - ``model.onnx`` or ``model_int8.onnx`` — exported by save_for_deployment
            - ``config.json`` — written by save_for_deployment

        The INT8 model (``model_int8.onnx``) is preferred when present.

        Args:
            model_dir: Path to the deployment directory.

        Returns:
            Initialised PDFPageClassifierONNX.

        Raises:
            FileNotFoundError: If config.json or an ONNX model file is missing.
        """
        path = Path(model_dir)
        config_path = path / "config.json"

        if not config_path.exists():
            raise FileNotFoundError(f"config.json not found in {model_dir}")

        # Prefer INT8 (QAT export) over FP32 when both are present
        candidates = ["model_int8.onnx", "model.onnx"]
        for candidate in candidates:
            if (path / candidate).exists():
                model_path = path / candidate
                break
        else:
            raise FileNotFoundError(
                f"No ONNX model found in {model_dir}. "
                f"Expected one of: {', '.join(candidates)}."
            )

        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)

        return cls(str(model_path), config)

    def _run_batch(self, batch_input: "npt.NDArray[np.float32]") -> "npt.NDArray[np.float32]":
        # First graph output holds the per-class probabilities.
        return self._session.run(None, {self._input_name: batch_input})[0]
|
classifiers/classifier_ov.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenVINO-based PDF page classifier for production inference."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import numpy.typing as npt
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from openvino import Core
|
| 12 |
+
from openvino import CompiledModel
|
| 13 |
+
except ImportError as _e:
|
| 14 |
+
raise ImportError(
|
| 15 |
+
"openvino is required for OpenVINO inference.\n"
|
| 16 |
+
"Install with: pip install openvino"
|
| 17 |
+
) from _e
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from .base_classifier import _BasePDFPageClassifier
|
| 21 |
+
except ImportError:
|
| 22 |
+
from base_classifier import _BasePDFPageClassifier # standalone / HF usage
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class PDFPageClassifierOV(_BasePDFPageClassifier):
|
| 26 |
+
"""Classify PDF pages using a deployed OpenVINO IR model.
|
| 27 |
+
|
| 28 |
+
Loads a self-contained deployment directory produced by
|
| 29 |
+
``export_onnx.save_for_deployment`` (with ``export_openvino=True``) and
|
| 30 |
+
exposes the same ``predict`` interface as ``PDFPageClassifier``.
|
| 31 |
+
|
| 32 |
+
Automatically selects the INT8 variant (``model_ov_int8.xml``) when it
|
| 33 |
+
exists alongside the FP32 model, falling back to ``model_ov.xml``.
|
| 34 |
+
|
| 35 |
+
Example::
|
| 36 |
+
|
| 37 |
+
clf = PDFPageClassifierOV.from_pretrained("outputs/run-42/deployment")
|
| 38 |
+
result = clf.predict("page_001.png")
|
| 39 |
+
print(result["needs_image_embedding"], result["predicted_classes"])
|
| 40 |
+
"""
|
| 41 |
+
|
| 42 |
+
def __init__(
|
| 43 |
+
self,
|
| 44 |
+
model_path: str,
|
| 45 |
+
config: dict[str, Any],
|
| 46 |
+
device: str = "CPU",
|
| 47 |
+
) -> None:
|
| 48 |
+
"""Initialise the classifier.
|
| 49 |
+
|
| 50 |
+
Args:
|
| 51 |
+
model_path: Path to the OpenVINO IR ``.xml`` file.
|
| 52 |
+
config: Deployment config dict (same schema as config.json written
|
| 53 |
+
by save_for_deployment).
|
| 54 |
+
device: OpenVINO device string (``"CPU"``, ``"GPU"``, ``"AUTO"``).
|
| 55 |
+
"""
|
| 56 |
+
super().__init__(config)
|
| 57 |
+
compiled: CompiledModel = Core().compile_model(model_path, device)
|
| 58 |
+
self._session: CompiledModel = compiled
|
| 59 |
+
self._input_name: str = compiled.input(0).get_any_name()
|
| 60 |
+
self._output = compiled.output(0)
|
| 61 |
+
|
| 62 |
+
@classmethod
|
| 63 |
+
def from_pretrained(
|
| 64 |
+
cls,
|
| 65 |
+
model_dir: str,
|
| 66 |
+
device: str = "CPU",
|
| 67 |
+
) -> "PDFPageClassifierOV":
|
| 68 |
+
"""Load a classifier from a deployment directory.
|
| 69 |
+
|
| 70 |
+
The directory must contain:
|
| 71 |
+
- ``model_ov.xml`` / ``model_ov_int8.xml`` — exported by
|
| 72 |
+
save_for_deployment with ``export_openvino=True``
|
| 73 |
+
- ``config.json`` — written by save_for_deployment
|
| 74 |
+
|
| 75 |
+
The INT8 model (``model_ov_int8.xml``) is preferred when present.
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
model_dir: Path to the deployment directory.
|
| 79 |
+
device: OpenVINO device string (``"CPU"``, ``"GPU"``, ``"AUTO"``).
|
| 80 |
+
|
| 81 |
+
Returns:
|
| 82 |
+
Initialised PDFPageClassifierOV.
|
| 83 |
+
"""
|
| 84 |
+
path = Path(model_dir)
|
| 85 |
+
config_path = path / "config.json"
|
| 86 |
+
|
| 87 |
+
if not config_path.exists():
|
| 88 |
+
raise FileNotFoundError(f"config.json not found in {model_dir}")
|
| 89 |
+
|
| 90 |
+
# Search order: prefer INT8 over FP32, HF/Optimum names over legacy names
|
| 91 |
+
candidates = [
|
| 92 |
+
"openvino_model_int8.xml", # HF-style INT8 (preferred)
|
| 93 |
+
"openvino_model.xml", # HF-style FP32
|
| 94 |
+
"model_ov_int8.xml", # legacy local INT8
|
| 95 |
+
"model_ov.xml", # legacy local FP32
|
| 96 |
+
]
|
| 97 |
+
for candidate in candidates:
|
| 98 |
+
if (path / candidate).exists():
|
| 99 |
+
model_path = path / candidate
|
| 100 |
+
break
|
| 101 |
+
else:
|
| 102 |
+
raise FileNotFoundError(
|
| 103 |
+
f"No OpenVINO model found in {model_dir}. "
|
| 104 |
+
f"Expected one of: {', '.join(candidates)}. "
|
| 105 |
+
"Export with save_for_deployment(..., export_openvino=True)."
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
with open(config_path, encoding="utf-8") as f:
|
| 109 |
+
config = json.load(f)
|
| 110 |
+
|
| 111 |
+
return cls(str(model_path), config, device=device)
|
| 112 |
+
|
| 113 |
+
def _run_batch(self, batch_input: "npt.NDArray[np.float32]") -> "npt.NDArray[np.float32]":
    """Forward one preprocessed batch through the compiled OpenVINO model."""
    feeds = {self._input_name: batch_input}
    results = self._session(feeds)
    return results[self._output]
|
classifiers/classifier_torch.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""PyTorch-based PDF page classifier for native inference."""

import json
from pathlib import Path
from typing import Any

import numpy as np
import numpy.typing as npt

try:
    import torch
except ImportError as _e:
    raise ImportError(
        "torch is required for PyTorch inference.\n"
        "Install with: pip install torch"
    ) from _e

# Use the package-qualified path, consistent with the base_classifier import
# above; the bare `from models import ...` only resolved when classifiers/
# itself happened to be on sys.path.
from classifiers.base_classifier import _BasePDFPageClassifier
from classifiers.models import create_model
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class PDFPageClassifierTorch(_BasePDFPageClassifier):
    """Classify PDF pages with a native PyTorch checkpoint.

    Exposes the same ``predict`` interface as the ONNX and OpenVINO
    classifiers. Preprocessing (center-crop, resize, normalization) lives in
    the shared base class, so this class only owns model loading and the
    forward pass.

    Example::

        clf = PDFPageClassifierTorch.from_checkpoint("outputs/run-42/best_model.pt")
        result = clf.predict("page_001.png")
        print(result["needs_image_embedding"], result["predicted_classes"])
    """

    def __init__(
        self,
        model: "torch.nn.Module",
        config: dict[str, Any],
        device: "torch.device | str" = "cpu",
    ) -> None:
        """Wrap an already-loaded model for inference.

        Args:
            model: PyTorch model with weights loaded; it is moved to
                ``device`` and switched to eval mode here.
            config: Flat config dict matching the base-classifier schema.
            device: Torch device for inference (``"cpu"``, ``"cuda"``, etc.).
        """
        super().__init__(config)
        self._device = torch.device(device)
        self._model = model.to(self._device)
        self._model.eval()

    @classmethod
    def from_checkpoint(
        cls,
        checkpoint_path: str,
        device: "torch.device | str" = "cpu",
        image_required_classes: list[str] | None = None,
        threshold: float = 0.5,
    ) -> "PDFPageClassifierTorch":
        """Load a classifier from a training checkpoint.

        The checkpoint must contain ``model_state_dict`` (weights),
        ``config`` (training config with ``model`` and ``data`` keys) and
        ``class_names`` (ordered list of class names).

        Args:
            checkpoint_path: Path to the ``.pt`` checkpoint file.
            device: Torch device string (``"cpu"``, ``"cuda"``, ``"mps"``).
            image_required_classes: Class names that trigger image embedding.
                Defaults to an empty list when not provided.
            threshold: Default prediction threshold (can be overridden per call).

        Returns:
            Initialised PDFPageClassifierTorch.
        """
        # NOTE(review): weights_only=False unpickles arbitrary objects — only
        # load checkpoints from trusted sources.
        checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)

        labels: list[str] = checkpoint["class_names"]
        train_cfg = checkpoint["config"]
        model_cfg = train_cfg["model"]
        data_cfg = train_cfg["data"]

        net = create_model(
            model_name=model_cfg["name"],
            num_classes=len(labels),
            pretrained=False,
            dropout=model_cfg["dropout"],
            use_spatial_pooling=model_cfg.get("use_spatial_pooling", False),
        )
        net.load_state_dict(checkpoint["model_state_dict"])

        # Flatten the training config into the schema the base class expects.
        flat_config: dict[str, Any] = {
            "image_size": data_cfg["image_size"],
            "mean": data_cfg.get("mean", [0.485, 0.456, 0.406]),
            "std": data_cfg.get("std", [0.229, 0.224, 0.225]),
            "center_crop_shortest": data_cfg.get("center_crop_shortest", True),
            "whiteout_header": data_cfg.get("whiteout_header", False),
            "whiteout_fraction": data_cfg.get("whiteout_fraction", 0.15),
            "class_names": labels,
            "threshold": threshold,
            "image_required_classes": image_required_classes or [],
        }

        return cls(net, flat_config, device=device)

    @classmethod
    def from_pretrained(
        cls,
        model_dir: str,
        device: "torch.device | str" = "cpu",
    ) -> "PDFPageClassifierTorch":
        """Load a classifier from a deployment directory.

        The directory must contain ``model.pt`` (PyTorch checkpoint) and
        ``config.json`` (deployment config), both written by
        save_for_deployment.

        Args:
            model_dir: Path to the deployment directory.
            device: Torch device string (``"cpu"``, ``"cuda"``, ``"mps"``).

        Returns:
            Initialised PDFPageClassifierTorch.

        Raises:
            FileNotFoundError: If ``config.json`` or ``model.pt`` is missing.
        """
        root = Path(model_dir)
        config_file = root / "config.json"
        weights_file = root / "model.pt"

        if not config_file.exists():
            raise FileNotFoundError(f"config.json not found in {model_dir}")
        if not weights_file.exists():
            raise FileNotFoundError(f"model.pt not found in {model_dir}")

        config: dict[str, Any] = json.loads(config_file.read_text(encoding="utf-8"))

        checkpoint = torch.load(str(weights_file), map_location="cpu", weights_only=False)

        net = create_model(
            model_name=config["model_name"],
            num_classes=len(config["class_names"]),
            pretrained=False,
            dropout=config.get("dropout", 0.2),
            use_spatial_pooling=config.get("use_spatial_pooling", False),
        )
        # Accept both a wrapped training checkpoint and a bare state dict.
        net.load_state_dict(checkpoint.get("model_state_dict", checkpoint))

        return cls(net, config, device=device)

    def _run_batch(self, batch_input: "npt.NDArray[np.float32]") -> "npt.NDArray[np.float32]":
        """Run one preprocessed batch and return per-class sigmoid probabilities."""
        with torch.no_grad():
            inputs = torch.from_numpy(batch_input).to(self._device)  # type: ignore
            scores = torch.sigmoid(self._model(inputs))
        return scores.cpu().numpy()  # type: ignore
|
classifiers/models.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model definitions for PDF page classification."""
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import timm
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class MultiLabelClassifier(nn.Module):
    """Multi-label image classifier built on a configurable timm backbone.

    Args:
        model_name: Name of the timm model to use as backbone
        num_classes: Number of output classes
        pretrained: Whether to use pretrained weights
        dropout: Dropout probability before final layer
        use_spatial_pooling: If True, classify per spatial location and
            max-pool over positions (CAM-style) instead of global pooling
    """

    def __init__(
        self,
        model_name: str,
        num_classes: int,
        pretrained: bool = True,
        dropout: float = 0.2,
        use_spatial_pooling: bool = False
    ):
        super().__init__()

        self.model_name = model_name
        self.num_classes = num_classes
        self.use_spatial_pooling = use_spatial_pooling

        # Headless timm backbone: keep the spatial map for CAM-style
        # pooling, otherwise let timm apply global average pooling.
        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            num_classes=0,  # strip the classification head
            global_pool='' if use_spatial_pooling else 'avg',
        )

        # Probe the backbone once to discover its output width.
        with torch.no_grad():
            probe = self.backbone(torch.randn(1, 3, 224, 224))

        # Channel dim is index 1 for both [B, C] and [B, C, H, W] outputs.
        self.feature_dim = probe.shape[1]
        if use_spatial_pooling:
            print(f"Spatial pooling enabled - feature map shape: {probe.shape}")

        if use_spatial_pooling:
            # 1x1 conv scores every spatial position; Dropout2d drops
            # whole channels (spatial dropout).
            self.classifier = nn.Sequential(
                nn.Dropout2d(p=dropout),
                nn.Conv2d(self.feature_dim, num_classes, kernel_size=1),
            )
        else:
            # Plain dropout + linear head on pooled features.
            self.classifier = nn.Sequential(
                nn.Dropout(p=dropout),
                nn.Linear(self.feature_dim, num_classes),
            )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x: Input tensor of shape (batch_size, 3, H, W)

        Returns:
            Logits of shape (batch_size, num_classes)
        """
        feats = self.backbone(x)
        if not self.use_spatial_pooling:
            # feats: [B, C] -> logits: [B, num_classes]
            return self.classifier(feats)
        # feats: [B, C, H, W]; per-position logits max-pooled over space.
        return torch.amax(self.classifier(feats), dim=(2, 3))

    def get_features(self, x: torch.Tensor) -> torch.Tensor:
        """Extract features without the classification head.

        Useful for feature visualization or transfer learning.

        Args:
            x: Input tensor of shape (batch_size, 3, H, W)

        Returns:
            Features of shape (batch_size, feature_dim) or (batch_size, feature_dim, H, W)
        """
        return self.backbone(x)

    def get_activation_maps(self, x: torch.Tensor) -> torch.Tensor:
        """Get spatial activation maps (only for spatial pooling mode).

        Args:
            x: Input tensor of shape (batch_size, 3, H, W)

        Returns:
            Activation maps of shape (batch_size, num_classes, H, W)

        Raises:
            ValueError: If spatial pooling is not enabled
        """
        if not self.use_spatial_pooling:
            raise ValueError("Activation maps only available with spatial pooling enabled")

        return self.classifier(self.backbone(x))
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def create_model(
    model_name: str,
    num_classes: int,
    pretrained: bool = True,
    dropout: float = 0.2,
    use_spatial_pooling: bool = False
) -> MultiLabelClassifier:
    """Factory function to create a model.

    Args:
        model_name: Name of the model architecture. Example : mobilenetv3_small_100
        num_classes: Number of output classes
        pretrained: Whether to use pretrained weights
        dropout: Dropout probability
        use_spatial_pooling: If True, use spatial max pooling (CAM-style)

    Returns:
        Initialized model

    Raises:
        ValueError: If ``model_name`` is not a known timm architecture.
    """
    # timm.list_models(name) returns names matching the pattern; an empty
    # result means the architecture does not exist.
    available_models = timm.list_models(model_name)
    if not available_models:
        # Bug fix: the two f-string fragments previously joined with no
        # separator, producing "...not found in timm.Available options:...".
        raise ValueError(
            f"Model '{model_name}' not found in timm. "
            f"Available options: {timm.list_models()}"
        )

    model = MultiLabelClassifier(
        model_name=model_name,
        num_classes=num_classes,
        pretrained=pretrained,
        dropout=dropout,
        use_spatial_pooling=use_spatial_pooling
    )

    return model
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def count_parameters(model: nn.Module) -> dict[str, int | float]:
|
| 174 |
+
"""Count model parameters.
|
| 175 |
+
|
| 176 |
+
Args:
|
| 177 |
+
model: PyTorch model
|
| 178 |
+
|
| 179 |
+
Returns:
|
| 180 |
+
Dictionary with parameter counts
|
| 181 |
+
"""
|
| 182 |
+
total_params = sum(p.numel() for p in model.parameters())
|
| 183 |
+
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 184 |
+
|
| 185 |
+
return {
|
| 186 |
+
'total': total_params,
|
| 187 |
+
'trainable': trainable_params,
|
| 188 |
+
'non_trainable': total_params - trainable_params,
|
| 189 |
+
'total_millions': total_params / 1e6,
|
| 190 |
+
'trainable_millions': trainable_params / 1e6
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def print_model_info(model: nn.Module, model_name: str = "Model"):
    """Print model information.

    Args:
        model: PyTorch model
        model_name: Name to display
    """
    stats = count_parameters(model)
    rule = '=' * 60

    print(f"\n{rule}")
    print(f"{model_name} Information")
    print(rule)
    print(f"Total parameters: {stats['total']:,} ({stats['total_millions']:.2f}M)")
    print(f"Trainable parameters: {stats['trainable']:,} ({stats['trainable_millions']:.2f}M)")
    print(f"Non-trainable params: {stats['non_trainable']:,}")
    print(f"{rule}\n")
|
config.json
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": {
|
| 3 |
+
"name": "efficientnet_lite0",
|
| 4 |
+
"pretrained": true,
|
| 5 |
+
"dropout": 0,
|
| 6 |
+
"use_spatial_pooling": false
|
| 7 |
+
},
|
| 8 |
+
"classes": [
|
| 9 |
+
"Visual - Essential",
|
| 10 |
+
"Simple Table",
|
| 11 |
+
"Chart/Graph",
|
| 12 |
+
"Visual - Supportive",
|
| 13 |
+
"Annotated figure",
|
| 14 |
+
"No Specific Feature",
|
| 15 |
+
"Diagram/Flowchart",
|
| 16 |
+
"Visual - Decorative",
|
| 17 |
+
"Complex Table",
|
| 18 |
+
"Infographic",
|
| 19 |
+
"Form",
|
| 20 |
+
"Text to OCR"
|
| 21 |
+
],
|
| 22 |
+
"class_mapping": {
|
| 23 |
+
"Form": null,
|
| 24 |
+
"No Specific Feature": null,
|
| 25 |
+
"Text to OCR": null,
|
| 26 |
+
"Visual - Decorative": null,
|
| 27 |
+
"Infographic": null,
|
| 28 |
+
"Chart/Graph": null,
|
| 29 |
+
"Annotated figure": null,
|
| 30 |
+
"Diagram/Flowchart": null
|
| 31 |
+
},
|
| 32 |
+
"image_required_classes": [
|
| 33 |
+
"Visual - Essential",
|
| 34 |
+
"Complex Table"
|
| 35 |
+
],
|
| 36 |
+
"data": {
|
| 37 |
+
"train_split": 0.8,
|
| 38 |
+
"val_split": 0.1,
|
| 39 |
+
"test_split": 0.1,
|
| 40 |
+
"image_size": 224,
|
| 41 |
+
"batch_size": 32,
|
| 42 |
+
"num_workers": 4,
|
| 43 |
+
"seed": 42
|
| 44 |
+
},
|
| 45 |
+
"augmentation": {
|
| 46 |
+
"center_crop_shortest": true,
|
| 47 |
+
"whiteout_header": false,
|
| 48 |
+
"whiteout_fraction": 0.15,
|
| 49 |
+
"train": {
|
| 50 |
+
"horizontal_flip": 0.5,
|
| 51 |
+
"rotation_degrees": 5,
|
| 52 |
+
"color_jitter": {
|
| 53 |
+
"brightness": 0.2,
|
| 54 |
+
"contrast": 0.2,
|
| 55 |
+
"saturation": 0.1,
|
| 56 |
+
"hue": 0.05
|
| 57 |
+
},
|
| 58 |
+
"random_erasing": 0.1
|
| 59 |
+
},
|
| 60 |
+
"val": {
|
| 61 |
+
"enabled": false
|
| 62 |
+
}
|
| 63 |
+
},
|
| 64 |
+
"training": {
|
| 65 |
+
"epochs": 40,
|
| 66 |
+
"learning_rate": 0.0001,
|
| 67 |
+
"weight_decay": 0.0001,
|
| 68 |
+
"optimizer": "adamw",
|
| 69 |
+
"scheduler": "cosine",
|
| 70 |
+
"warmup_epochs": 5,
|
| 71 |
+
"label_smoothing": 0.0,
|
| 72 |
+
"gradient_clip_norm": 1.0,
|
| 73 |
+
"pos_weight": [
|
| 74 |
+
3.6715595722198486,
|
| 75 |
+
6.668674468994141,
|
| 76 |
+
2.3281044960021973,
|
| 77 |
+
6.0722222328186035
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
"monitoring": {
|
| 81 |
+
"metric": "val_f1",
|
| 82 |
+
"mode": "max"
|
| 83 |
+
},
|
| 84 |
+
"early_stopping": {
|
| 85 |
+
"enabled": true,
|
| 86 |
+
"patience": 20
|
| 87 |
+
},
|
| 88 |
+
"evaluation": {
|
| 89 |
+
"threshold": 0.5,
|
| 90 |
+
"save_confusion_matrix": true,
|
| 91 |
+
"save_per_class_metrics": true
|
| 92 |
+
},
|
| 93 |
+
"checkpointing": {
|
| 94 |
+
"save_best_only": true,
|
| 95 |
+
"save_last": true
|
| 96 |
+
},
|
| 97 |
+
"paths": {
|
| 98 |
+
"data_dir": "data",
|
| 99 |
+
"output_dir": "outputs",
|
| 100 |
+
"checkpoint_dir": "checkpoints",
|
| 101 |
+
"logs_dir": "logs"
|
| 102 |
+
},
|
| 103 |
+
"logging": {
|
| 104 |
+
"use_tensorboard": false,
|
| 105 |
+
"use_wandb": true,
|
| 106 |
+
"wandb_project": "pdf-page-classifier",
|
| 107 |
+
"log_interval": 10,
|
| 108 |
+
"wandb_run_name": "silver-line-69"
|
| 109 |
+
},
|
| 110 |
+
"qat": {
|
| 111 |
+
"enabled": true,
|
| 112 |
+
"epochs": 5,
|
| 113 |
+
"learning_rate": "1e-5",
|
| 114 |
+
"preset": "mixed",
|
| 115 |
+
"num_init_samples": 300
|
| 116 |
+
},
|
| 117 |
+
"onnx": {
|
| 118 |
+
"opset_version": 14,
|
| 119 |
+
"dynamic_axes": true,
|
| 120 |
+
"simplify": true,
|
| 121 |
+
"input_names": [
|
| 122 |
+
"image"
|
| 123 |
+
],
|
| 124 |
+
"output_names": [
|
| 125 |
+
"probabilities"
|
| 126 |
+
]
|
| 127 |
+
}
|
| 128 |
+
}
|