emelryan commited on
Commit
4c3050e
·
1 Parent(s): 23a4218

Clean up docs, make syncs optional, update examples for new modes

Browse files

- Remove unused _SUPPORTS_PAD_COLOR constant and the unused `inspect` import
- Rename _HF_* constants to _DEFAULT_*
- Update module and class docstrings for current features
- Gate all CUDA syncs and per-batch logging behind verbose_post flag
- Update example.py to use NemotronOCRV2 with --detector-only and
--skip-relational flags
- Add inference modes section to README.md and quickstart.md

README.md CHANGED
@@ -245,6 +245,27 @@ for pred in predictions:
245
  )
246
  ```
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  **Constructor rules**
249
 
250
  - You can choose model weights with either **`lang`** or **`model_dir`**.
 
245
  )
246
  ```
247
 
248
+ #### Inference modes
249
+
250
+ ```python
251
+ # Detector only — returns bounding boxes without text recognition.
252
+ # Loads only the detector (~37% less GPU memory, ~20% faster).
253
+ ocr_det = NemotronOCRV2(detector_only=True)
254
+ boxes = ocr_det("page.png")
255
+ # Each prediction has: confidence, left, right, upper, lower, quad
256
+
257
+ # Skip relational — returns per-word text without reading-order grouping.
258
+ # Skips the relational model (~35% less GPU memory, ~8% faster).
259
+ ocr_fast = NemotronOCRV2(skip_relational=True)
260
+ words = ocr_fast("page.png", merge_level="word")
261
+ # Each prediction has: text, confidence, left, right, upper, lower
262
+
263
+ # Profiling mode — enables per-phase CUDA-synced timing in the logs.
264
+ import logging
265
+ logging.basicConfig(level=logging.INFO)
266
+ ocr_profile = NemotronOCRV2(verbose_post=True)
267
+ ```
268
+
269
  **Constructor rules**
270
 
271
  - You can choose model weights with either **`lang`** or **`model_dir`**.
example.py CHANGED
@@ -4,51 +4,68 @@
4
 
5
  import argparse
6
 
7
- from nemotron_ocr.inference.pipeline import NemotronOCR
8
 
9
 
10
- def main(image_path, merge_level, no_visualize, model_dir, lang):
 
 
11
  if model_dir is not None:
12
- ocr_pipeline = NemotronOCR(model_dir=model_dir)
13
  else:
14
- ocr_pipeline = NemotronOCR(lang=lang)
 
 
 
 
15
 
16
- predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)
 
 
17
 
18
  print(f"Found {len(predictions)} text regions.")
19
  for pred in predictions:
20
- print(
21
- f" - Text: '{pred['text']}', "
22
- f"Confidence: {pred['confidence']:.2f}, "
23
- f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
24
- f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
25
- )
 
 
 
 
 
 
 
26
 
27
 
28
  if __name__ == "__main__":
29
- parser = argparse.ArgumentParser(description="Run OCR inference and annotate image.")
30
  parser.add_argument("image_path", type=str, help="Path to the input image.")
31
  parser.add_argument(
32
  "--merge-level",
33
  type=str,
34
  choices=["word", "sentence", "paragraph"],
35
  default="paragraph",
36
- help="Merge level for OCR output (word, sentence, paragraph).",
37
  )
38
- parser.add_argument("--no-visualize", action="store_true", help="Do not save the annotated image.")
39
  parser.add_argument(
40
- "--model-dir",
41
- type=str,
42
- default=None,
43
- help="Path to a directory with detector.pth, recognizer.pth, relational.pth, charset.txt. "
44
- "If omitted, weights are downloaded from Hugging Face (default: v2 multilingual).",
45
  )
46
  parser.add_argument(
47
- "--lang",
48
- type=str,
49
- choices=["en", "multi", "v1"],
50
- default=None,
51
- help="Hub checkpoint when --model-dir is omitted: en=v2 English, multi=v2 multilingual (default), v1=legacy.",
 
 
 
 
 
52
  )
53
  args = parser.parse_args()
54
 
@@ -58,4 +75,6 @@ if __name__ == "__main__":
58
  no_visualize=args.no_visualize,
59
  model_dir=args.model_dir,
60
  lang=args.lang,
 
 
61
  )
 
4
 
5
  import argparse
6
 
7
+ from nemotron_ocr.inference.pipeline_v2 import NemotronOCRV2
8
 
9
 
10
+ def main(image_path, merge_level, no_visualize, model_dir, lang,
11
+ detector_only, skip_relational):
12
+ kwargs = {}
13
  if model_dir is not None:
14
+ kwargs["model_dir"] = model_dir
15
  else:
16
+ kwargs["lang"] = lang
17
+ if detector_only:
18
+ kwargs["detector_only"] = True
19
+ if skip_relational:
20
+ kwargs["skip_relational"] = True
21
 
22
+ ocr = NemotronOCRV2(**kwargs)
23
+
24
+ predictions = ocr(image_path, merge_level=merge_level)
25
 
26
  print(f"Found {len(predictions)} text regions.")
27
  for pred in predictions:
28
+ if "text" in pred:
29
+ print(
30
+ f" - Text: '{pred['text']}', "
31
+ f"Confidence: {pred['confidence']:.2f}, "
32
+ f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
33
+ f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
34
+ )
35
+ else:
36
+ print(
37
+ f" - Confidence: {pred['confidence']:.2f}, "
38
+ f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
39
+ f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
40
+ )
41
 
42
 
43
  if __name__ == "__main__":
44
+ parser = argparse.ArgumentParser(description="Run OCR inference on an image.")
45
  parser.add_argument("image_path", type=str, help="Path to the input image.")
46
  parser.add_argument(
47
  "--merge-level",
48
  type=str,
49
  choices=["word", "sentence", "paragraph"],
50
  default="paragraph",
51
+ help="Merge level for OCR output (default: paragraph).",
52
  )
53
+ parser.add_argument("--no-visualize", action="store_true", help="(unused, kept for compat)")
54
  parser.add_argument(
55
+ "--model-dir", type=str, default=None,
56
+ help="Local checkpoint directory. If omitted, downloads from Hugging Face.",
 
 
 
57
  )
58
  parser.add_argument(
59
+ "--lang", type=str, choices=["en", "multi", "v1"], default=None,
60
+ help="Hub checkpoint: en, multi (default), or v1.",
61
+ )
62
+ parser.add_argument(
63
+ "--detector-only", action="store_true",
64
+ help="Run detector only — returns boxes without text.",
65
+ )
66
+ parser.add_argument(
67
+ "--skip-relational", action="store_true",
68
+ help="Skip relational model — returns per-word text without reading order.",
69
  )
70
  args = parser.parse_args()
71
 
 
75
  no_visualize=args.no_visualize,
76
  model_dir=args.model_dir,
77
  lang=args.lang,
78
+ detector_only=args.detector_only,
79
+ skip_relational=args.skip_relational,
80
  )
nemotron-ocr/src/nemotron_ocr/inference/pipeline_v2.py CHANGED
@@ -1,22 +1,16 @@
1
  # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
  # SPDX-License-Identifier: Apache-2.0
3
 
4
- """Batched OCR inference pipeline — single-file drop-in.
5
-
6
- Extends :class:`NemotronOCR` (HuggingFace release or locally modified) with:
7
- - Multi-image detector batching (N images in one forward pass)
8
- - Chunked recognizer (controls VRAM for dense pages)
9
- - Per-image shape tracking for correct coordinate un-normalization
10
- - Vectorized scale factor computation
11
-
12
- All configuration (pad_color, pad_how, infer_length, …) is optional and
13
- detected automatically from the parent class when available.
14
-
15
- Requirements beyond the HuggingFace release: ``nemotron_ocr_cpp`` (already
16
- required by the base pipeline).
17
  """
18
 
19
- import inspect as _inspect
20
  import logging
21
  import time
22
 
@@ -48,47 +42,38 @@ from nemotron_ocr_cpp import (
48
 
49
  logger = logging.getLogger(__name__)
50
 
51
- # ── Capability detection ─────────────────────────────────────────────
52
- # pad_to_square gained an optional ``pad_color`` kwarg in later versions.
53
- # Detect once at import time so _preprocess_batch can adapt.
54
- _SUPPORTS_PAD_COLOR = "pad_color" in _inspect.signature(pad_to_square).parameters
55
-
56
- # Defaults matching the HuggingFace-released pipeline.py.
57
- _HF_PAD_COLOR = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float16)
58
- _HF_INFER_LENGTH = 1024
59
- _HF_MAX_WIDTH = 32
60
- _HF_NUM_TOKENS = 858
61
 
62
 
63
  class NemotronOCRV2(NemotronOCR):
64
  """Batched OCR inference pipeline.
65
 
66
- Inherits all model loading and configuration from :class:`NemotronOCR`.
67
- Adds multi-image batching through the detector, chunked recognizer
68
- execution, and vectorized coordinate scaling.
69
-
70
- Configuration priority for ``pad_color``, ``pad_how``, and
71
- ``infer_length``: explicit constructor arg → parent attribute (if set
72
- by a modified :class:`NemotronOCR`) → HuggingFace default.
73
 
74
  Args:
75
  detector_max_batch_size: Max images per detector forward pass.
76
- recognizer_chunk_size: Fixed number of regions per recognizer
77
- forward call. Regions are padded to this size for consistent
78
- CUDA kernel shapes. Larger → more GPU utilisation, more VRAM.
79
- relational_chunk_size: Pad-to multiple for per-image region counts
80
- inside the relational model. Controls the fixed tensor shapes
81
- seen by convolutions and the transformer encoder.
82
- pad_color: RGB padding color as a 3-element list of floats in [0, 1].
83
- ``None`` inherits from the parent or uses the HF default
84
- (ImageNet mean).
85
- pad_how: Padding placement — ``"bottom_right"`` or ``"center"``.
86
- ``None`` inherits from the parent or defaults to
87
- ``"bottom_right"``.
88
- infer_length: Detector input resolution in pixels. ``None`` inherits
89
- from the parent or defaults to 1024.
90
- verbose_post: Emit detailed per-phase timing at ``logging.INFO``.
91
- **kwargs: Forwarded to :class:`NemotronOCR` (e.g. ``model_dir``).
 
92
  """
93
 
94
  def __init__(
@@ -128,7 +113,7 @@ class NemotronOCRV2(NemotronOCR):
128
  self._pad_color_cpu = torch.tensor(pad_color, dtype=torch.float16)
129
  self._pad_color = None # reset lazy CUDA cache
130
  if not hasattr(self, "_pad_color_cpu"):
131
- self._pad_color_cpu = _HF_PAD_COLOR.clone()
132
  if not hasattr(self, "_pad_color"):
133
  self._pad_color = None
134
 
@@ -142,13 +127,13 @@ class NemotronOCRV2(NemotronOCR):
142
  if infer_length is not None:
143
  self.infer_length = infer_length
144
  if not hasattr(self, "infer_length"):
145
- self.infer_length = _HF_INFER_LENGTH
146
 
147
  # ── recognizer dims (may already be set by a local parent) ───
148
  if not hasattr(self, "max_width"):
149
- self.max_width = _HF_MAX_WIDTH
150
  if not hasattr(self, "num_tokens"):
151
- self.num_tokens = _HF_NUM_TOKENS
152
 
153
  if hasattr(self, "relational"):
154
  self.relational.chunk_size = relational_chunk_size
 
1
  # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
  # SPDX-License-Identifier: Apache-2.0
3
 
4
+ """Batched OCR inference pipeline.
5
+
6
+ Extends :class:`NemotronOCR` with:
7
+ - Multi-image detector batching
8
+ - Chunked recognizer with early argmax (low VRAM)
9
+ - Pre-NMS centerness + peak filter for consistent speed
10
+ - Detector-only and skip-relational inference modes
11
+ - Optional per-phase timing via ``verbose_post``
 
 
 
 
 
12
  """
13
 
 
14
  import logging
15
  import time
16
 
 
42
 
43
  logger = logging.getLogger(__name__)
44
 
45
+ # Fallback defaults (used when the parent class doesn't set these).
46
+ _DEFAULT_PAD_COLOR = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float16)
47
+ _DEFAULT_INFER_LENGTH = 1024
48
+ _DEFAULT_MAX_WIDTH = 32
49
+ _DEFAULT_NUM_TOKENS = 858
 
 
 
 
 
50
 
51
 
52
  class NemotronOCRV2(NemotronOCR):
53
  """Batched OCR inference pipeline.
54
 
55
+ Inherits model loading from :class:`NemotronOCR` and adds batched
56
+ detection, chunked recognition, and optional relational grouping.
 
 
 
 
 
57
 
58
  Args:
59
  detector_max_batch_size: Max images per detector forward pass.
60
+ recognizer_chunk_size: Regions per recognizer forward call.
61
+ Padded to this size for consistent CUDA kernel shapes.
62
+ relational_chunk_size: Pad-to multiple for per-image region
63
+ counts inside the relational model.
64
+ use_prefilter: Apply centerness + local-peak filter before NMS
65
+ to prevent O(n^2) slowdowns on dense confidence maps.
66
+ prefilter_peak_kernel: Kernel size for the local-max filter.
67
+ detector_only: Load only the detector; ``__call__`` returns
68
+ bounding boxes without text.
69
+ skip_relational: Skip the relational model; returns per-word
70
+ text without reading-order grouping.
71
+ pad_color: RGB padding color as ``[R, G, B]`` floats in [0, 1].
72
+ pad_how: ``"bottom_right"`` or ``"center"`` padding placement.
73
+ infer_length: Detector input resolution in pixels (default 1024).
74
+ verbose_post: When True, CUDA-syncs each phase and emits
75
+ per-batch timing via ``logger.info``.
76
+ **kwargs: Forwarded to :class:`NemotronOCR` (``model_dir``, etc.).
77
  """
78
 
79
  def __init__(
 
113
  self._pad_color_cpu = torch.tensor(pad_color, dtype=torch.float16)
114
  self._pad_color = None # reset lazy CUDA cache
115
  if not hasattr(self, "_pad_color_cpu"):
116
+ self._pad_color_cpu = _DEFAULT_PAD_COLOR.clone()
117
  if not hasattr(self, "_pad_color"):
118
  self._pad_color = None
119
 
 
127
  if infer_length is not None:
128
  self.infer_length = infer_length
129
  if not hasattr(self, "infer_length"):
130
+ self.infer_length = _DEFAULT_INFER_LENGTH
131
 
132
  # ── recognizer dims (may already be set by a local parent) ───
133
  if not hasattr(self, "max_width"):
134
+ self.max_width = _DEFAULT_MAX_WIDTH
135
  if not hasattr(self, "num_tokens"):
136
+ self.num_tokens = _DEFAULT_NUM_TOKENS
137
 
138
  if hasattr(self, "relational"):
139
  self.relational.chunk_size = relational_chunk_size
quickstart.md CHANGED
@@ -29,45 +29,44 @@ uv run python -c "import nemotron_ocr; import nemotron_ocr_cpp"
29
 
30
  ## Usage
31
 
32
- `nemotron_ocr.inference.pipeline.NemotronOCR` is the main entry point for performing OCR inference; it can be used to iterate over predictions for a given input image:
33
 
34
  ```python
35
- from nemotron_ocr.inference.pipeline import NemotronOCR
36
-
37
- ocr = NemotronOCR()
38
 
 
39
  predictions = ocr("ocr-example-input-1.png")
40
 
41
  for pred in predictions:
42
- print(
43
- f" - Text: '{pred['text']}', "
44
- f"Confidence: {pred['confidence']:.2f}, "
45
- f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
46
- )
47
  ```
48
 
49
- Or predictions can be superimposed on the input image for visualization:
50
 
51
  ```python
52
- ocr(image_path, visualize=True)
 
 
53
  ```
54
 
55
- The level of detection merging can be adjusted by modifying the `merge_level` argument (defaulting to "paragraph"):
56
 
57
  ```python
58
- ocr(image_path, merge_level="word") # leave detected words unmerged
59
- ocr(image_path, merge_level="sentence") # merge detected words into sentences
60
- ```
61
 
62
- An example script `example.py` is provided for convenience:
 
63
 
64
- ```bash
65
- uv run python example.py ocr-example-input-1.png
66
  ```
67
 
68
- Detection merging can be adjusted by modifying the `--merge-level` option:
69
 
70
  ```bash
 
71
  uv run python example.py ocr-example-input-1.png --merge-level word
72
- uv run python example.py ocr-example-input-1.png --merge-level sentence
 
73
  ```
 
29
 
30
  ## Usage
31
 
32
+ `NemotronOCRV2` is the recommended entry point for OCR inference:
33
 
34
  ```python
35
+ from nemotron_ocr.inference.pipeline_v2 import NemotronOCRV2
 
 
36
 
37
+ ocr = NemotronOCRV2()
38
  predictions = ocr("ocr-example-input-1.png")
39
 
40
  for pred in predictions:
41
+ print(f" - Text: '{pred['text']}', Confidence: {pred['confidence']:.2f}")
 
 
 
 
42
  ```
43
 
44
+ The level of detection merging can be adjusted with `merge_level`:
45
 
46
  ```python
47
+ ocr(image_path, merge_level="word") # individual words
48
+ ocr(image_path, merge_level="sentence") # merged into sentences
49
+ ocr(image_path, merge_level="paragraph") # merged into paragraphs (default)
50
  ```
51
 
52
+ ### Inference modes
53
 
54
  ```python
55
+ # Detector only — bounding boxes, no text (fastest, lowest memory)
56
+ ocr_det = NemotronOCRV2(detector_only=True)
 
57
 
58
+ # Skip relational — per-word text, no reading-order grouping
59
+ ocr_fast = NemotronOCRV2(skip_relational=True)
60
 
61
+ # Profiling — per-phase CUDA-synced timing in logs
62
+ ocr_profile = NemotronOCRV2(verbose_post=True)
63
  ```
64
 
65
+ ### Example script
66
 
67
  ```bash
68
+ uv run python example.py ocr-example-input-1.png
69
  uv run python example.py ocr-example-input-1.png --merge-level word
70
+ uv run python example.py ocr-example-input-1.png --detector-only
71
+ uv run python example.py ocr-example-input-1.png --skip-relational
72
  ```