| """OCR engine initializers and runners with safer Tesseract handling.""" |
|
|
| import os |
| import sys |
| import tempfile |
| import numpy as np |
|
|
| try: |
| import easyocr |
| except Exception: |
| easyocr = None |
|
|
| try: |
| from doctr.io import DocumentFile |
| from doctr.models import ocr_predictor |
| except Exception: |
| DocumentFile = None |
| ocr_predictor = None |
|
|
| try: |
| from paddleocr import PaddleOCR |
| except Exception: |
| PaddleOCR = None |
|
|
| try: |
| import pytesseract |
| except Exception: |
| pytesseract = None |
|
|
| try: |
| import cv2 |
| except Exception: |
| cv2 = None |
|
|
|
|
def initialize_ocr_models(ocr_models, language_code, device):
    """Create the requested OCR engines and point pytesseract at its binary.

    Args:
        ocr_models: Collection of engine names to set up. The names checked
            are "EasyOCR", "DocTR", "PaddleOCR" and "Tesseract"; any other
            name is ignored.
        language_code: Language identifier handed to EasyOCR and PaddleOCR.
        device: Device label. Only the exact string "GPU (CUDA)" enables GPU
            use for EasyOCR and PaddleOCR.

    Returns:
        dict mapping engine name to the reader/predictor object that was
        created. An engine whose module-level import resolved to ``None`` is
        skipped. Tesseract never gets an entry: it is only configured through
        ``pytesseract.pytesseract.tesseract_cmd``.
    """
    use_gpu = device == "GPU (CUDA)"
    ocr_readers = {}

    if "EasyOCR" in ocr_models and easyocr is not None:
        ocr_readers["EasyOCR"] = easyocr.Reader([language_code], gpu=use_gpu)

    if "DocTR" in ocr_models and ocr_predictor is not None:
        ocr_readers["DocTR"] = ocr_predictor(pretrained=True)

    if "PaddleOCR" in ocr_models and PaddleOCR is not None:
        ocr_readers["PaddleOCR"] = PaddleOCR(lang=language_code, use_gpu=use_gpu)

    if "Tesseract" in ocr_models and pytesseract is not None:
        if sys.platform.startswith("win"):
            candidates = (
                r"C:\Program Files\Tesseract-OCR\tesseract.exe",
                r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
            )
        else:
            candidates = ("/usr/bin/tesseract", "/usr/local/bin/tesseract")

        # Only override tesseract_cmd when a known install location really
        # exists; otherwise keep whatever pytesseract is already configured
        # with instead of forcing a path that would fail at call time.
        found = next((path for path in candidates if os.path.exists(path)), None)
        if found is not None:
            pytesseract.pytesseract.tesseract_cmd = found

    return ocr_readers
|
|
|
|
def _run_easyocr(reader, image):
    """Return the text EasyOCR finds in *image*, one detection per line."""
    detections = reader.readtext(np.array(image))
    # Index 1 of each detection is used as its text.
    return "\n".join(item[1] for item in detections)


def _run_doctr(predictor, image):
    """Run DocTR on *image* via a temporary PNG file and return its text."""
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    file_path = tmp_file.name
    try:
        # Close the handle before DocTR opens the file by path.
        with tmp_file:
            image.save(tmp_file, format="PNG")
        doc = DocumentFile.from_images(file_path)
        result = predictor(doc)

        pages = []
        for page in result.pages:
            blocks = [
                "\n".join(
                    " ".join(word.value for word in line.words)
                    for line in block.lines
                )
                for block in page.blocks
            ]
            pages.append("\n\n".join(blocks))
        return "\n\n".join(pages)
    finally:
        # delete=False means nothing removes the file for us; clean up even
        # when saving or prediction raised.
        try:
            os.unlink(file_path)
        except OSError:
            pass


def _run_paddleocr(reader, image):
    """Return the text PaddleOCR finds in *image*, one line per entry."""
    result = reader.ocr(np.array(image))
    # NOTE(review): assumes the nested layout result[page][line] with
    # line[1] == (text, confidence) -- confirm against the installed
    # PaddleOCR version. An empty or None page yields no text.
    tokens = []
    for page in result or []:
        if not page:
            continue
        for line in page:
            if not isinstance(line, (list, tuple)) or len(line) < 2:
                continue
            recognised = line[1]
            if (
                isinstance(recognised, (list, tuple))
                and recognised
                and isinstance(recognised[0], str)
            ):
                tokens.append(recognised[0])
    return "\n".join(tokens)


def _run_tesseract(image, language_code):
    """Return Tesseract's text for *image*, or an error marker on failure."""
    mode = getattr(image, "mode", None)
    if mode is not None and mode != "RGB":
        try:
            image = image.convert("RGB")
        except Exception:
            # Best effort: fall through with the unconverted image.
            pass

    # The array is passed in RGB order whether or not cv2 is installed, so
    # the input no longer depends on an optional package.
    rgb_array = np.array(image)
    try:
        # NOTE(review): language_code must be a name Tesseract knows
        # (e.g. "eng") -- confirm the caller maps codes per engine.
        return pytesseract.image_to_string(
            rgb_array,
            lang=language_code or None,
            config="--oem 3 --psm 6",
        )
    except Exception as e:
        return f"[Tesseract error: {e}]"


def perform_ocr(model_name, ocr_readers, image, language_code):
    """Run one OCR engine on an image and return the recognised text.

    Args:
        model_name: Engine to use: "EasyOCR", "DocTR", "PaddleOCR" or
            "Tesseract". Any other name yields an empty string.
        ocr_readers: dict of engine name to reader/predictor object; looked
            up with ``.get`` for every engine except Tesseract.
        image: Input image. It is converted with ``np.array``; the DocTR path
            calls ``image.save`` and the Tesseract path reads ``image.mode``
            and may call ``image.convert``.
        language_code: Language passed to Tesseract; the other engines do not
            use it here.

    Returns:
        The recognised text. When the engine is unavailable a bracketed
        marker such as "[EasyOCR not available]" is returned, and a Tesseract
        failure is returned as "[Tesseract error: ...]".
    """
    if model_name == "EasyOCR":
        reader = ocr_readers.get("EasyOCR")
        if reader is None:
            return "[EasyOCR not available]"
        return _run_easyocr(reader, image)

    if model_name == "DocTR":
        predictor = ocr_readers.get("DocTR")
        if predictor is None or DocumentFile is None:
            return "[DocTR not available]"
        return _run_doctr(predictor, image)

    if model_name == "PaddleOCR":
        reader = ocr_readers.get("PaddleOCR")
        if reader is None:
            return "[PaddleOCR not available]"
        return _run_paddleocr(reader, image)

    if model_name == "Tesseract":
        if pytesseract is None:
            return "[pytesseract not available]"
        return _run_tesseract(image, language_code)

    return ""
|
|