emelryan commited on
Commit
4c3050e
·
1 Parent(s): 23a4218

Clean up docs, make syncs optional, update examples for new modes

Browse files

- Remove unused _SUPPORTS_PAD_COLOR constant and the unused `inspect` import
- Rename _HF_* constants to _DEFAULT_*
- Update module and class docstrings for current features
- Gate all CUDA syncs and per-batch logging behind verbose_post flag
- Update example.py to use NemotronOCRV2 with --detector-only and
--skip-relational flags
- Add inference modes section to README.md and quickstart.md

README.md CHANGED
@@ -245,6 +245,27 @@ for pred in predictions:
245
  )
246
  ```
247
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  **Constructor rules**
249
 
250
  - You can choose model weights with either **`lang`** or **`model_dir`**.
 
245
  )
246
  ```
247
 
248
+ #### Inference modes
249
+
250
+ ```python
251
+ # Detector only — returns bounding boxes without text recognition.
252
+ # Loads only the detector (~37% less GPU memory, ~20% faster).
253
+ ocr_det = NemotronOCRV2(detector_only=True)
254
+ boxes = ocr_det("page.png")
255
+ # Each prediction has: confidence, left, right, upper, lower, quad
256
+
257
+ # Skip relational — returns per-word text without reading-order grouping.
258
+ # Skips the relational model (~35% less GPU memory, ~8% faster).
259
+ ocr_fast = NemotronOCRV2(skip_relational=True)
260
+ words = ocr_fast("page.png", merge_level="word")
261
+ # Each prediction has: text, confidence, left, right, upper, lower
262
+
263
+ # Profiling mode — enables per-phase CUDA-synced timing in the logs.
264
+ import logging
265
+ logging.basicConfig(level=logging.INFO)
266
+ ocr_profile = NemotronOCRV2(verbose_post=True)
267
+ ```
268
+
269
  **Constructor rules**
270
 
271
  - You can choose model weights with either **`lang`** or **`model_dir`**.
example.py CHANGED
@@ -4,51 +4,68 @@
4
 
5
  import argparse
6
 
7
- from nemotron_ocr.inference.pipeline import NemotronOCR
8
 
9
 
10
- def main(image_path, merge_level, no_visualize, model_dir, lang):
 
 
11
  if model_dir is not None:
12
- ocr_pipeline = NemotronOCR(model_dir=model_dir)
13
  else:
14
- ocr_pipeline = NemotronOCR(lang=lang)
 
 
 
 
15
 
16
- predictions = ocr_pipeline(image_path, merge_level=merge_level, visualize=not no_visualize)
 
 
17
 
18
  print(f"Found {len(predictions)} text regions.")
19
  for pred in predictions:
20
- print(
21
- f" - Text: '{pred['text']}', "
22
- f"Confidence: {pred['confidence']:.2f}, "
23
- f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
24
- f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
25
- )
 
 
 
 
 
 
 
26
 
27
 
28
  if __name__ == "__main__":
29
- parser = argparse.ArgumentParser(description="Run OCR inference and annotate image.")
30
  parser.add_argument("image_path", type=str, help="Path to the input image.")
31
  parser.add_argument(
32
  "--merge-level",
33
  type=str,
34
  choices=["word", "sentence", "paragraph"],
35
  default="paragraph",
36
- help="Merge level for OCR output (word, sentence, paragraph).",
37
  )
38
- parser.add_argument("--no-visualize", action="store_true", help="Do not save the annotated image.")
39
  parser.add_argument(
40
- "--model-dir",
41
- type=str,
42
- default=None,
43
- help="Path to a directory with detector.pth, recognizer.pth, relational.pth, charset.txt. "
44
- "If omitted, weights are downloaded from Hugging Face (default: v2 multilingual).",
45
  )
46
  parser.add_argument(
47
- "--lang",
48
- type=str,
49
- choices=["en", "multi", "v1"],
50
- default=None,
51
- help="Hub checkpoint when --model-dir is omitted: en=v2 English, multi=v2 multilingual (default), v1=legacy.",
 
 
 
 
 
52
  )
53
  args = parser.parse_args()
54
 
@@ -58,4 +75,6 @@ if __name__ == "__main__":
58
  no_visualize=args.no_visualize,
59
  model_dir=args.model_dir,
60
  lang=args.lang,
 
 
61
  )
 
4
 
5
  import argparse
6
 
7
+ from nemotron_ocr.inference.pipeline_v2 import NemotronOCRV2
8
 
9
 
10
+ def main(image_path, merge_level, no_visualize, model_dir, lang,
11
+ detector_only, skip_relational):
12
+ kwargs = {}
13
  if model_dir is not None:
14
+ kwargs["model_dir"] = model_dir
15
  else:
16
+ kwargs["lang"] = lang
17
+ if detector_only:
18
+ kwargs["detector_only"] = True
19
+ if skip_relational:
20
+ kwargs["skip_relational"] = True
21
 
22
+ ocr = NemotronOCRV2(**kwargs)
23
+
24
+ predictions = ocr(image_path, merge_level=merge_level)
25
 
26
  print(f"Found {len(predictions)} text regions.")
27
  for pred in predictions:
28
+ if "text" in pred:
29
+ print(
30
+ f" - Text: '{pred['text']}', "
31
+ f"Confidence: {pred['confidence']:.2f}, "
32
+ f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
33
+ f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
34
+ )
35
+ else:
36
+ print(
37
+ f" - Confidence: {pred['confidence']:.2f}, "
38
+ f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, "
39
+ f"right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
40
+ )
41
 
42
 
43
  if __name__ == "__main__":
44
+ parser = argparse.ArgumentParser(description="Run OCR inference on an image.")
45
  parser.add_argument("image_path", type=str, help="Path to the input image.")
46
  parser.add_argument(
47
  "--merge-level",
48
  type=str,
49
  choices=["word", "sentence", "paragraph"],
50
  default="paragraph",
51
+ help="Merge level for OCR output (default: paragraph).",
52
  )
53
+ parser.add_argument("--no-visualize", action="store_true", help="(unused, kept for compat)")
54
  parser.add_argument(
55
+ "--model-dir", type=str, default=None,
56
+ help="Local checkpoint directory. If omitted, downloads from Hugging Face.",
 
 
 
57
  )
58
  parser.add_argument(
59
+ "--lang", type=str, choices=["en", "multi", "v1"], default=None,
60
+ help="Hub checkpoint: en, multi (default), or v1.",
61
+ )
62
+ parser.add_argument(
63
+ "--detector-only", action="store_true",
64
+ help="Run detector only — returns boxes without text.",
65
+ )
66
+ parser.add_argument(
67
+ "--skip-relational", action="store_true",
68
+ help="Skip relational model — returns per-word text without reading order.",
69
  )
70
  args = parser.parse_args()
71
 
 
75
  no_visualize=args.no_visualize,
76
  model_dir=args.model_dir,
77
  lang=args.lang,
78
+ detector_only=args.detector_only,
79
+ skip_relational=args.skip_relational,
80
  )
nemotron-ocr/src/nemotron_ocr/inference/pipeline_v2.py CHANGED
@@ -1,22 +1,16 @@
1
  # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
  # SPDX-License-Identifier: Apache-2.0
3
 
4
- """Batched OCR inference pipeline — single-file drop-in.
5
-
6
- Extends :class:`NemotronOCR` (HuggingFace release or locally modified) with:
7
- - Multi-image detector batching (N images in one forward pass)
8
- - Chunked recognizer (controls VRAM for dense pages)
9
- - Per-image shape tracking for correct coordinate un-normalization
10
- - Vectorized scale factor computation
11
-
12
- All configuration (pad_color, pad_how, infer_length, …) is optional and
13
- detected automatically from the parent class when available.
14
-
15
- Requirements beyond the HuggingFace release: ``nemotron_ocr_cpp`` (already
16
- required by the base pipeline).
17
  """
18
 
19
- import inspect as _inspect
20
  import logging
21
  import time
22
 
@@ -48,47 +42,38 @@ from nemotron_ocr_cpp import (
48
 
49
  logger = logging.getLogger(__name__)
50
 
51
- # ── Capability detection ─────────────────────────────────────────────
52
- # pad_to_square gained an optional ``pad_color`` kwarg in later versions.
53
- # Detect once at import time so _preprocess_batch can adapt.
54
- _SUPPORTS_PAD_COLOR = "pad_color" in _inspect.signature(pad_to_square).parameters
55
-
56
- # Defaults matching the HuggingFace-released pipeline.py.
57
- _HF_PAD_COLOR = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float16)
58
- _HF_INFER_LENGTH = 1024
59
- _HF_MAX_WIDTH = 32
60
- _HF_NUM_TOKENS = 858
61
 
62
 
63
  class NemotronOCRV2(NemotronOCR):
64
  """Batched OCR inference pipeline.
65
 
66
- Inherits all model loading and configuration from :class:`NemotronOCR`.
67
- Adds multi-image batching through the detector, chunked recognizer
68
- execution, and vectorized coordinate scaling.
69
-
70
- Configuration priority for ``pad_color``, ``pad_how``, and
71
- ``infer_length``: explicit constructor arg → parent attribute (if set
72
- by a modified :class:`NemotronOCR`) → HuggingFace default.
73
 
74
  Args:
75
  detector_max_batch_size: Max images per detector forward pass.
76
- recognizer_chunk_size: Fixed number of regions per recognizer
77
- forward call. Regions are padded to this size for consistent
78
- CUDA kernel shapes. Larger → more GPU utilisation, more VRAM.
79
- relational_chunk_size: Pad-to multiple for per-image region counts
80
- inside the relational model. Controls the fixed tensor shapes
81
- seen by convolutions and the transformer encoder.
82
- pad_color: RGB padding color as a 3-element list of floats in [0, 1].
83
- ``None`` inherits from the parent or uses the HF default
84
- (ImageNet mean).
85
- pad_how: Padding placement — ``"bottom_right"`` or ``"center"``.
86
- ``None`` inherits from the parent or defaults to
87
- ``"bottom_right"``.
88
- infer_length: Detector input resolution in pixels. ``None`` inherits
89
- from the parent or defaults to 1024.
90
- verbose_post: Emit detailed per-phase timing at ``logging.INFO``.
91
- **kwargs: Forwarded to :class:`NemotronOCR` (e.g. ``model_dir``).
 
92
  """
93
 
94
  def __init__(
@@ -128,7 +113,7 @@ class NemotronOCRV2(NemotronOCR):
128
  self._pad_color_cpu = torch.tensor(pad_color, dtype=torch.float16)
129
  self._pad_color = None # reset lazy CUDA cache
130
  if not hasattr(self, "_pad_color_cpu"):
131
- self._pad_color_cpu = _HF_PAD_COLOR.clone()
132
  if not hasattr(self, "_pad_color"):
133
  self._pad_color = None
134
 
@@ -142,13 +127,13 @@ class NemotronOCRV2(NemotronOCR):
142
  if infer_length is not None:
143
  self.infer_length = infer_length
144
  if not hasattr(self, "infer_length"):
145
- self.infer_length = _HF_INFER_LENGTH
146
 
147
  # ── recognizer dims (may already be set by a local parent) ───
148
  if not hasattr(self, "max_width"):
149
- self.max_width = _HF_MAX_WIDTH
150
  if not hasattr(self, "num_tokens"):
151
- self.num_tokens = _HF_NUM_TOKENS
152
 
153
  if hasattr(self, "relational"):
154
  self.relational.chunk_size = relational_chunk_size
 
1
  # SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
  # SPDX-License-Identifier: Apache-2.0
3
 
4
+ """Batched OCR inference pipeline.
5
+
6
+ Extends :class:`NemotronOCR` with:
7
+ - Multi-image detector batching
8
+ - Chunked recognizer with early argmax (low VRAM)
9
+ - Pre-NMS centerness + peak filter for consistent speed
10
+ - Detector-only and skip-relational inference modes
11
+ - Optional per-phase timing via ``verbose_post``
 
 
 
 
 
12
  """
13
 
 
14
  import logging
15
  import time
16
 
 
42
 
43
  logger = logging.getLogger(__name__)
44
 
45
+ # Fallback defaults (used when the parent class doesn't set these).
46
+ _DEFAULT_PAD_COLOR = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float16)
47
+ _DEFAULT_INFER_LENGTH = 1024
48
+ _DEFAULT_MAX_WIDTH = 32
49
+ _DEFAULT_NUM_TOKENS = 858
 
 
 
 
 
50
 
51
 
52
  class NemotronOCRV2(NemotronOCR):
53
  """Batched OCR inference pipeline.
54
 
55
+ Inherits model loading from :class:`NemotronOCR` and adds batched
56
+ detection, chunked recognition, and optional relational grouping.
 
 
 
 
 
57
 
58
  Args:
59
  detector_max_batch_size: Max images per detector forward pass.
60
+ recognizer_chunk_size: Regions per recognizer forward call.
61
+ Padded to this size for consistent CUDA kernel shapes.
62
+ relational_chunk_size: Pad-to multiple for per-image region
63
+ counts inside the relational model.
64
+ use_prefilter: Apply centerness + local-peak filter before NMS
65
+ to prevent O(n^2) slowdowns on dense confidence maps.
66
+ prefilter_peak_kernel: Kernel size for the local-max filter.
67
+ detector_only: Load only the detector; ``__call__`` returns
68
+ bounding boxes without text.
69
+ skip_relational: Skip the relational model; returns per-word
70
+ text without reading-order grouping.
71
+ pad_color: RGB padding color as ``[R, G, B]`` floats in [0, 1].
72
+ pad_how: ``"bottom_right"`` or ``"center"`` padding placement.
73
+ infer_length: Detector input resolution in pixels (default 1024).
74
+ verbose_post: When True, CUDA-syncs each phase and emits
75
+ per-batch timing via ``logger.info``.
76
+ **kwargs: Forwarded to :class:`NemotronOCR` (``model_dir``, etc.).
77
  """
78
 
79
  def __init__(
 
113
  self._pad_color_cpu = torch.tensor(pad_color, dtype=torch.float16)
114
  self._pad_color = None # reset lazy CUDA cache
115
  if not hasattr(self, "_pad_color_cpu"):
116
+ self._pad_color_cpu = _DEFAULT_PAD_COLOR.clone()
117
  if not hasattr(self, "_pad_color"):
118
  self._pad_color = None
119
 
 
127
  if infer_length is not None:
128
  self.infer_length = infer_length
129
  if not hasattr(self, "infer_length"):
130
+ self.infer_length = _DEFAULT_INFER_LENGTH
131
 
132
  # ── recognizer dims (may already be set by a local parent) ───
133
  if not hasattr(self, "max_width"):
134
+ self.max_width = _DEFAULT_MAX_WIDTH
135
  if not hasattr(self, "num_tokens"):
136
+ self.num_tokens = _DEFAULT_NUM_TOKENS
137
 
138
  if hasattr(self, "relational"):
139
  self.relational.chunk_size = relational_chunk_size
quickstart.md CHANGED
@@ -29,45 +29,44 @@ uv run python -c "import nemotron_ocr; import nemotron_ocr_cpp"
29
 
30
  ## Usage
31
 
32
- `nemotron_ocr.inference.pipeline.NemotronOCR` is the main entry point for performing OCR inference; it can be used to iterate over predictions for a given input image:
33
 
34
  ```python
35
- from nemotron_ocr.inference.pipeline import NemotronOCR
36
-
37
- ocr = NemotronOCR()
38
 
 
39
  predictions = ocr("ocr-example-input-1.png")
40
 
41
  for pred in predictions:
42
- print(
43
- f" - Text: '{pred['text']}', "
44
- f"Confidence: {pred['confidence']:.2f}, "
45
- f"Bbox: [left={pred['left']:.4f}, upper={pred['upper']:.4f}, right={pred['right']:.4f}, lower={pred['lower']:.4f}]"
46
- )
47
  ```
48
 
49
- Or predictions can be superimposed on the input image for visualization:
50
 
51
  ```python
52
- ocr(image_path, visualize=True)
 
 
53
  ```
54
 
55
- The level of detection merging can be adjusted by modifying the `merge_level` argument (defaulting to "paragraph"):
56
 
57
  ```python
58
- ocr(image_path, merge_level="word") # leave detected words unmerged
59
- ocr(image_path, merge_level="sentence") # merge detected words into sentences
60
- ```
61
 
62
- An example script `example.py` is provided for convenience:
 
63
 
64
- ```bash
65
- uv run python example.py ocr-example-input-1.png
66
  ```
67
 
68
- Detection merging can be adjusted by modifying the `--merge-level` option:
69
 
70
  ```bash
 
71
  uv run python example.py ocr-example-input-1.png --merge-level word
72
- uv run python example.py ocr-example-input-1.png --merge-level sentence
 
73
  ```
 
29
 
30
  ## Usage
31
 
32
+ `NemotronOCRV2` is the recommended entry point for OCR inference:
33
 
34
  ```python
35
+ from nemotron_ocr.inference.pipeline_v2 import NemotronOCRV2
 
 
36
 
37
+ ocr = NemotronOCRV2()
38
  predictions = ocr("ocr-example-input-1.png")
39
 
40
  for pred in predictions:
41
+ print(f" - Text: '{pred['text']}', Confidence: {pred['confidence']:.2f}")
 
 
 
 
42
  ```
43
 
44
+ The level of detection merging can be adjusted with `merge_level`:
45
 
46
  ```python
47
+ ocr(image_path, merge_level="word") # individual words
48
+ ocr(image_path, merge_level="sentence") # merged into sentences
49
+ ocr(image_path, merge_level="paragraph") # merged into paragraphs (default)
50
  ```
51
 
52
+ ### Inference modes
53
 
54
  ```python
55
+ # Detector only — bounding boxes, no text (fastest, lowest memory)
56
+ ocr_det = NemotronOCRV2(detector_only=True)
 
57
 
58
+ # Skip relational — per-word text, no reading-order grouping
59
+ ocr_fast = NemotronOCRV2(skip_relational=True)
60
 
61
+ # Profiling — per-phase CUDA-synced timing in logs
62
+ ocr_profile = NemotronOCRV2(verbose_post=True)
63
  ```
64
 
65
+ ### Example script
66
 
67
  ```bash
68
+ uv run python example.py ocr-example-input-1.png
69
  uv run python example.py ocr-example-input-1.png --merge-level word
70
+ uv run python example.py ocr-example-input-1.png --detector-only
71
+ uv run python example.py ocr-example-input-1.png --skip-relational
72
  ```