ClauseGuard / ocr_engine.py
gaurv007's picture
v4.0: Add ocr_engine.py — OCR + RAG Chatbot + Clause Redlining
b5350d6 verified
raw
history blame
8.26 kB
"""
ClauseGuard β€” OCR Engine v1.0
═════════════════════════════
Smart PDF Router: detects native vs scanned PDFs.
β€’ Native PDF β†’ pdfplumber (fast, existing)
β€’ Scanned PDF β†’ docTR OCR (CPU-friendly, ~150MB models)
Architecture:
PDF uploaded
↓
[detect_if_scanned] β€” pdfplumber gets <50 chars/page?
↓ ↓
Native PDF Scanned PDF
↓ ↓
pdfplumber docTR OCR (CPU)
↓ ↓
Contract text β†’ existing analysis pipeline
"""
import os
import re
# ── docTR (soft-fail) ───────────────────────────────────────────────
# docTR is an optional dependency: when it cannot be imported the module
# still loads and `_HAS_DOCTR` records that OCR is unavailable.
# `_ocr_predictor` caches the predictor instance once it has been built.
_ocr_predictor = None
try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor as _make_predictor
except ImportError:
    _HAS_DOCTR = False
else:
    _HAS_DOCTR = True

# ── pdfplumber (soft-fail) ──────────────────────────────────────────
# pdfplumber is optional as well; `_HAS_PDF` tells the rest of the module
# whether native text extraction can be attempted.
_HAS_PDF = True
try:
    import pdfplumber
except ImportError:
    _HAS_PDF = False
# ═══════════════════════════════════════════════════════════════════════
# OCR MODEL LOADING
# ═══════════════════════════════════════════════════════════════════════
# Textual state of the OCR engine. Starts as "not_loaded" and is rewritten by
# _load_ocr_model() to "loaded", "failed: <reason>" or "unavailable (...)".
_ocr_status = "not_loaded"


def _load_ocr_model():
    """Return the shared docTR predictor, building it on the first call.

    The predictor is cached in the module-level `_ocr_predictor`; later calls
    return the cached object without touching docTR again. `_ocr_status` is
    updated to reflect the outcome of each load attempt.

    Returns:
        The docTR predictor, or None when docTR is not installed or when
        building the predictor raised an exception.
    """
    global _ocr_predictor, _ocr_status

    already_built = _ocr_predictor is not None
    if already_built:
        return _ocr_predictor

    if _HAS_DOCTR:
        model_kwargs = dict(
            det_arch="fast_base",
            reco_arch="crnn_vgg16_bn",
            pretrained=True,
            assume_straight_pages=True,
        )
        try:
            print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
            _ocr_predictor = _make_predictor(**model_kwargs)
            _ocr_status = "loaded"
            print("[ClauseGuard OCR] docTR models loaded successfully")
        except Exception as load_err:
            # Best-effort: any failure is recorded and reported as "no OCR"
            # instead of propagating to the caller.
            _ocr_status = f"failed: {load_err}"
            print(f"[ClauseGuard OCR] docTR load failed: {load_err}")
            return None
        return _ocr_predictor

    _ocr_status = "unavailable (python-doctr not installed)"
    return None
def get_ocr_status():
    """Return a one-line, human-readable description of the OCR engine state.

    Four states are distinguished, based on the module-level `_ocr_predictor`,
    `_HAS_DOCTR` and `_ocr_status` values:

    * predictor already built              -> "loaded"
    * docTR not importable                 -> "unavailable"
    * docTR importable but a load failed   -> "load failed: <reason>"
    * docTR importable, no load attempted  -> "available (not yet loaded)"

    Returns:
        str: status line prefixed with an emoji marker.
    """
    if _ocr_predictor is not None:
        return "✅ OCR: docTR loaded"
    if not _HAS_DOCTR:
        return "❌ OCR: unavailable (python-doctr not installed)"
    if _ocr_status.startswith("failed"):
        # _load_ocr_model() stores "failed: <reason>" after an unsuccessful
        # attempt; surface it instead of claiming the model is merely unloaded.
        return f"❌ OCR: docTR load {_ocr_status}"
    return "⏳ OCR: docTR available (not yet loaded)"
# ═══════════════════════════════════════════════════════════════════════
# SMART PDF ROUTER
# ═══════════════════════════════════════════════════════════════════════
def _is_scanned_pdf(file_path, min_chars_per_page=50, max_pages=5):
    """Heuristically decide whether a PDF is scanned (image-based).

    The first `max_pages` pages are run through pdfplumber; the PDF counts as
    scanned when the average number of extracted (stripped) characters per
    sampled page is below `min_chars_per_page`.

    Args:
        file_path: path of the PDF to inspect.
        min_chars_per_page: average character count below which the PDF is
            treated as scanned.
        max_pages: how many leading pages to sample (values below 1 are
            treated as 1).

    Returns:
        bool: True when the PDF looks scanned. Also True when pdfplumber is
        not installed, the PDF has no pages, or pdfplumber raises, so that
        the caller falls through to OCR.
    """
    if not _HAS_PDF:
        return True  # Can't check with pdfplumber, assume scanned
    try:
        with pdfplumber.open(file_path) as pdf:
            sample = pdf.pages[:max(1, max_pages)]
            if not sample:
                return True
            total_chars = sum(
                len((page.extract_text() or "").strip()) for page in sample
            )
            return total_chars / len(sample) < min_chars_per_page
    except Exception:
        # Deliberate best-effort: if pdfplumber cannot read the file, let the
        # OCR path have a try instead of failing here.
        return True
def _extract_native_pdf(file_path):
    """Extract text from a native (digital) PDF using pdfplumber.

    Pages that yield no text are skipped; the remaining page texts are joined
    with a blank line between them.

    Args:
        file_path: path of the PDF to read.

    Returns:
        tuple: `(text, None)` on success, or `(None, error_message)` when
        pdfplumber is not installed, parsing raises, or no text is found.
    """
    if not _HAS_PDF:
        return None, "pdfplumber not installed"
    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        return None, f"PDF parse error: {e}"

    # join() instead of repeated `+=` keeps concatenation linear in the
    # document size; the result is identical after the final strip().
    text = "\n\n".join(t for t in page_texts if t).strip()
    if not text:
        return None, "No text extracted from PDF"
    return text, None
def _extract_scanned_pdf(file_path):
    """Extract text from a scanned PDF using docTR OCR.

    The OCR result is walked page -> block -> line -> word. Words of a line
    are joined with spaces, every line ends with a newline, every block is
    followed by one blank line and every page by two more newlines. The
    assembled text is then passed through `_clean_ocr_text`.

    Args:
        file_path: PDF input handed to `DocumentFile.from_pdf`.

    Returns:
        tuple: `(text, None)` on success, or `(None, error_message)` when the
        OCR model is unavailable, OCR raises, or no text is recognised.
    """
    predictor = _load_ocr_model()
    if predictor is None:
        if _HAS_DOCTR:
            # docTR is installed but the model could not be built; telling the
            # user to install it would be misleading, so report the real cause.
            return None, f"OCR is not available: docTR model load {_ocr_status}"
        return None, (
            "OCR is not available. Install python-doctr: "
            "`pip install python-doctr[torch]`"
        )
    try:
        doc = DocumentFile.from_pdf(file_path)
        result = predictor(doc)

        # Extract text page by page
        page_chunks = []
        for page in result.pages:
            block_chunks = []
            for block in page.blocks:
                line_texts = [
                    " ".join(word.value for word in line.words)
                    for line in block.lines
                ]
                block_chunks.append("".join(t + "\n" for t in line_texts) + "\n")
            page_chunks.append("".join(block_chunks) + "\n\n")
        full_text = "".join(page_chunks)

        if not full_text.strip():
            return None, "OCR could not extract text from scanned PDF"

        # Clean up OCR artifacts
        full_text = _clean_ocr_text(full_text)
        return full_text.strip(), None
    except Exception as e:
        return None, f"OCR error: {e}"
def _clean_ocr_text(text):
    """Clean common OCR artifacts from recognised text.

    Steps, in order:
      1. Collapse runs of three or more spaces/tabs into a single space.
      2. Replace a word-initial lowercase "l" that is directly followed by an
         uppercase letter with "I" (e.g. "lNDEMNITY" -> "INDEMNITY").
      3. Drop lines consisting of a single character, except ".", "," and
         ";", which are kept.
      4. Collapse runs of four or more newlines down to three.

    Args:
        text: raw OCR output.

    Returns:
        str: the cleaned text.
    """
    # Remove excessive whitespace
    text = re.sub(r'[ \t]{3,}', ' ', text)

    # Fix common OCR substitution: l before capital -> I.
    # The previous pattern `\bl\b(?=[A-Z])` could never match: `\b` after the
    # "l" demands a non-word character next, while the lookahead demands an
    # uppercase letter (a word character).
    text = re.sub(r'\bl(?=[A-Z])', 'I', text)

    # Remove single-char lines (OCR noise); empty lines are preserved.
    keep_single = ('.', ',', ';')
    kept_lines = []
    for line in text.split('\n'):
        stripped = line.strip()
        if len(stripped) == 1 and stripped not in keep_single:
            continue
        kept_lines.append(line)
    text = '\n'.join(kept_lines)

    # Normalize line breaks last, so that gaps left behind by removed noise
    # lines are collapsed as well.
    return re.sub(r'\n{4,}', '\n\n\n', text)
# ═══════════════════════════════════════════════════════════════════════
# PUBLIC API
# ═══════════════════════════════════════════════════════════════════════
def parse_pdf_smart(file_path):
    """
    Smart PDF parser with OCR fallback.

    The PDF is first classified with `_is_scanned_pdf`. Native PDFs are read
    with pdfplumber; scanned PDFs, and native PDFs for which pdfplumber
    returned nothing, are sent through docTR OCR.

    Args:
        file_path: path of the PDF on disk.

    Returns: (text, error, method)
        text: extracted text (or None)
        error: error message (or None)
        method: "native" | "ocr" | None
    """
    # A missing/empty path or a directory is reported up front; otherwise it
    # would raise (None) or surface later as a confusing OCR error (directory).
    if not file_path or not os.path.isfile(file_path):
        return None, "File not found", None

    # Step 1: Check if PDF is scanned
    is_scanned = _is_scanned_pdf(file_path)

    if not is_scanned:
        # Step 2a: Native PDF — use pdfplumber
        text, _ = _extract_native_pdf(file_path)
        if text:
            return text, None, "native"
        # If pdfplumber returns empty, fall through to OCR
        print("[ClauseGuard OCR] pdfplumber returned empty — falling back to OCR")

    # Step 2b: Scanned PDF or pdfplumber failed — use OCR
    pdf_kind = "Scanned" if is_scanned else "Empty native"
    print(f"[ClauseGuard OCR] {pdf_kind} PDF detected — running docTR OCR...")
    text, error = _extract_scanned_pdf(file_path)
    if text:
        return text, None, "ocr"
    return None, error, None
def ocr_extract(file_path):
    """
    Force OCR extraction on a PDF (bypass native text check).
    Useful when user explicitly wants OCR.

    Args:
        file_path: path of the PDF on disk.

    Returns: (text, error)
        text: OCR text (or None)
        error: error message (or None)
    """
    if not file_path:
        return None, "File not found"
    # Same up-front check as parse_pdf_smart, applied only to path-like input
    # so that any other input type is still passed straight to the OCR step.
    if isinstance(file_path, (str, os.PathLike)) and not os.path.isfile(file_path):
        return None, "File not found"
    return _extract_scanned_pdf(file_path)