nemotron-ocr-v2 / example.py
emelryan's picture
Clean up docs, make syncs optional, update examples for new modes
4c3050e
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import argparse
from nemotron_ocr.inference.pipeline_v2 import NemotronOCRV2
def main(image_path, merge_level, no_visualize, model_dir, lang,
         detector_only, skip_relational):
    """Run OCR on a single image and print every returned prediction.

    Args:
        image_path: Passed as the first argument to the OCR pipeline call.
        merge_level: Forwarded to the pipeline call as ``merge_level``.
        no_visualize: Accepted for interface compatibility; not used here.
        model_dir: When not ``None``, forwarded to ``NemotronOCRV2`` as
            ``model_dir`` and ``lang`` is ignored.
        lang: Forwarded to ``NemotronOCRV2`` as ``lang`` only when
            ``model_dir`` is ``None`` (it is passed through even if it is
            itself ``None``).
        detector_only: When truthy, ``detector_only=True`` is passed to the
            constructor.
        skip_relational: When truthy, ``skip_relational=True`` is passed to
            the constructor.
    """
    # A local checkpoint directory takes precedence over the `lang` selector.
    init_args = {"lang": lang} if model_dir is None else {"model_dir": model_dir}

    # Optional mode switches are only sent to the constructor when enabled.
    for flag_name, enabled in (
        ("detector_only", detector_only),
        ("skip_relational", skip_relational),
    ):
        if enabled:
            init_args[flag_name] = True

    pipeline = NemotronOCRV2(**init_args)
    regions = pipeline(image_path, merge_level=merge_level)

    print(f"Found {len(regions)} text regions.")
    for region in regions:
        # Predictions without a "text" key are printed with confidence and
        # bounding box only.
        lead = f" - Text: '{region['text']}', " if "text" in region else " - "
        print(
            f"{lead}"
            f"Confidence: {region['confidence']:.2f}, "
            f"Bbox: [left={region['left']:.4f}, upper={region['upper']:.4f}, "
            f"right={region['right']:.4f}, lower={region['lower']:.4f}]"
        )
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, and hand
    # everything to main().
    cli = argparse.ArgumentParser(description="Run OCR inference on an image.")

    cli.add_argument(
        "image_path",
        type=str,
        help="Path to the input image.",
    )
    cli.add_argument(
        "--merge-level",
        type=str,
        choices=["word", "sentence", "paragraph"],
        default="paragraph",
        help="Merge level for OCR output (default: paragraph).",
    )
    cli.add_argument(
        "--no-visualize",
        action="store_true",
        help="(unused, kept for compat)",
    )
    cli.add_argument(
        "--model-dir",
        type=str,
        default=None,
        help="Local checkpoint directory. If omitted, downloads from Hugging Face.",
    )
    cli.add_argument(
        "--lang",
        type=str,
        choices=["en", "multi", "v1"],
        default=None,
        help="Hub checkpoint: en, multi (default), or v1.",
    )
    cli.add_argument(
        "--detector-only",
        action="store_true",
        help="Run detector only — returns boxes without text.",
    )
    cli.add_argument(
        "--skip-relational",
        action="store_true",
        help="Skip relational model — returns per-word text without reading order.",
    )

    # Every argparse destination (image_path, merge_level, no_visualize,
    # model_dir, lang, detector_only, skip_relational) matches a parameter
    # name of main(), so the parsed namespace can be forwarded directly.
    options = dict(vars(cli.parse_args()))
    main(options.pop("image_path"), **options)