Spaces:

gaurv007
/

ClauseGuard

Sleeping

App Files Files Community

gaurv007 committed 15 days ago

Commit

b5350d6

verified ·

1 Parent(s): 549ed6e

v4.0: Add ocr_engine.py — OCR + RAG Chatbot + Clause Redlining

Browse files

Files changed (1) hide show

ocr_engine.py +218 -0

ocr_engine.py ADDED Viewed

	@@ -0,0 +1,218 @@

+"""
+ClauseGuard — OCR Engine v1.0
+═════════════════════════════
+Smart PDF Router: detects native vs scanned PDFs.
+  • Native PDF → pdfplumber (fast, existing)
+  • Scanned PDF → docTR OCR (CPU-friendly, ~150MB models)
+Architecture:
+  PDF uploaded
+      ↓
+  [detect_if_scanned] — pdfplumber gets <50 chars/page?
+      ↓                           ↓
+    Native PDF               Scanned PDF
+      ↓                           ↓
+    pdfplumber              docTR OCR (CPU)
+      ↓                           ↓
+    Contract text → existing analysis pipeline
+"""
+import os
+import re
+# ── docTR (soft-fail) ───────────────────────────────────────────────
+_HAS_DOCTR = False
+_ocr_predictor = None
+try:
+    from doctr.io import DocumentFile
+    from doctr.models import ocr_predictor as _make_predictor
+    _HAS_DOCTR = True
+except ImportError:
+    pass
+# ── pdfplumber (soft-fail) ──────────────────────────────────────────
+try:
+    import pdfplumber
+    _HAS_PDF = True
+except ImportError:
+    _HAS_PDF = False
+# ═══════════════════════════════════════════════════════════════════════
+# OCR MODEL LOADING
+# ═══════════════════════════════════════════════════════════════════════
+_ocr_status = "not_loaded"
+def _load_ocr_model():
+    """
+    Return the shared docTR predictor, building it on the first call.
+    The predictor is cached in the module-level `_ocr_predictor`, and the
+    outcome of the attempt is recorded in `_ocr_status`.
+    Returns: the predictor, or None when python-doctr is not installed or
+    building the predictor raises.
+    """
+    global _ocr_predictor, _ocr_status
+    # Already built by an earlier call — reuse it.
+    if _ocr_predictor is not None:
+        return _ocr_predictor
+    if not _HAS_DOCTR:
+        _ocr_status = "unavailable (python-doctr not installed)"
+        return None
+    model_options = {
+        "det_arch": "fast_base",
+        "reco_arch": "crnn_vgg16_bn",
+        "pretrained": True,
+        "assume_straight_pages": True,
+    }
+    try:
+        print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
+        built = _make_predictor(**model_options)
+        _ocr_predictor = built
+        _ocr_status = "loaded"
+        print("[ClauseGuard OCR] docTR models loaded successfully")
+        return built
+    except Exception as exc:
+        # Any failure while building the predictor is recorded, not raised.
+        _ocr_status = f"failed: {exc}"
+        print(f"[ClauseGuard OCR] docTR load failed: {exc}")
+        return None
+def get_ocr_status():
+    """
+    Return human-readable OCR engine status.
+    Four states are reported: predictor loaded, python-doctr missing,
+    a previous load attempt failed (with the recorded reason), or docTR
+    importable but not yet loaded.
+    """
+    if _ocr_predictor is not None:
+        return "✅ OCR: docTR loaded"
+    if not _HAS_DOCTR:
+        return "❌ OCR: unavailable (python-doctr not installed)"
+    # _load_ocr_model() records "failed: <reason>" when building the predictor
+    # raised; report that instead of claiming the model is merely not loaded.
+    if _ocr_status.startswith("failed"):
+        return f"❌ OCR: docTR load {_ocr_status}"
+    return "⏳ OCR: docTR available (not yet loaded)"
+# ═══════════════════════════════════════════════════════════════════════
+# SMART PDF ROUTER
+# ═══════════════════════════════════════════════════════════════════════
+def _is_scanned_pdf(file_path, min_chars_per_page=50, max_pages=5):
+    """
+    Detect if a PDF is scanned (image-based) by checking if pdfplumber
+    extracts fewer than `min_chars_per_page` characters per page on average.
+    Args:
+        file_path: path of the PDF to inspect.
+        min_chars_per_page: average stripped-text length below which the
+            PDF is treated as scanned.
+        max_pages: how many leading pages to sample (default 5, the value
+            that was previously hard-coded). Values below 1 sample one page.
+    Returns: True when the PDF should go through OCR. This includes the
+    cases where pdfplumber is not installed, the PDF has no pages, or
+    pdfplumber raises while reading it.
+    """
+    if not _HAS_PDF:
+        return True  # Can't check with pdfplumber, assume scanned
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            # Only the leading pages are sampled to keep detection cheap.
+            sample = pdf.pages[:max(1, max_pages)]
+            if not sample:
+                return True
+            total_chars = sum(
+                len((page.extract_text() or "").strip()) for page in sample
+            )
+            return total_chars / len(sample) < min_chars_per_page
+    except Exception:
+        return True  # If pdfplumber fails, try OCR
+def _extract_native_pdf(file_path):
+    """
+    Pull the embedded text layer out of a digital PDF with pdfplumber.
+    Returns: (text, error)
+        text: page texts separated by blank lines, stripped (or None)
+        error: error message (or None)
+    """
+    if not _HAS_PDF:
+        return None, "pdfplumber not installed"
+    try:
+        with pdfplumber.open(file_path) as pdf:
+            page_texts = [page.extract_text() for page in pdf.pages]
+        # Pages with no extractable text are skipped entirely.
+        combined = "".join(chunk + "\n\n" for chunk in page_texts if chunk)
+        combined = combined.strip()
+        if combined:
+            return combined, None
+        return None, "No text extracted from PDF"
+    except Exception as exc:
+        return None, f"PDF parse error: {exc}"
+def _extract_scanned_pdf(file_path):
+    """
+    Extract text from a scanned PDF using docTR OCR.
+    Words of each recognised line are joined with spaces, every line goes
+    on its own row, a blank row follows each block, and extra blank rows
+    separate pages. The result is passed through `_clean_ocr_text`.
+    Returns: (text, error)
+        text: cleaned, stripped OCR text (or None)
+        error: error message (or None)
+    """
+    predictor = _load_ocr_model()
+    if predictor is None:
+        if _HAS_DOCTR:
+            # docTR is installed, so an install hint would be misleading;
+            # surface the load failure recorded by _load_ocr_model() instead.
+            return None, f"OCR is not available: docTR model load {_ocr_status}"
+        return None, (
+            "OCR is not available. Install python-doctr: "
+            "`pip install python-doctr[torch]`"
+        )
+    try:
+        doc = DocumentFile.from_pdf(file_path)
+        result = predictor(doc)
+        # Collect fragments and join once rather than growing a string.
+        parts = []
+        for page in result.pages:
+            for block in page.blocks:
+                for line in block.lines:
+                    parts.append(" ".join(word.value for word in line.words))
+                    parts.append("\n")
+                parts.append("\n")  # blank row after each block
+            parts.append("\n\n")  # extra separation between pages
+        # Clean first, then test for emptiness: cleaning can drop every row
+        # (e.g. pure single-character noise), and an empty string must be
+        # reported as an error rather than returned as ("", None).
+        cleaned = _clean_ocr_text("".join(parts)).strip()
+        if not cleaned:
+            return None, "OCR could not extract text from scanned PDF"
+        return cleaned, None
+    except Exception as e:
+        return None, f"OCR error: {e}"
+def _clean_ocr_text(text):
+    """
+    Clean common OCR artifacts.
+    Steps, in order:
+      1. Collapse runs of 3+ spaces/tabs to two spaces.
+      2. Replace a word-initial lowercase "l" that is immediately followed
+         by an uppercase letter with "I" (e.g. "lNDEMNITY" → "INDEMNITY").
+      3. Collapse runs of 4+ newlines to three.
+      4. Drop rows whose stripped content is a single character, except
+         '.', ',' and ';'; empty rows are kept.
+    Returns: the cleaned text (not stripped).
+    """
+    # Remove excessive whitespace
+    text = re.sub(r'[ \t]{3,}', '  ', text)
+    # Fix common OCR substitution: l before capital → I.
+    # The pattern must not require a word boundary after the "l": a boundary
+    # there can never coexist with an uppercase letter in the lookahead, so
+    # such a pattern would never match.
+    text = re.sub(r'\bl(?=[A-Z])', 'I', text)
+    # Normalize line breaks
+    text = re.sub(r'\n{4,}', '\n\n\n', text)
+    # Remove single-char lines (OCR noise); blank rows and lone punctuation stay
+    keep_as_is = ('', '.', ',', ';')
+    kept_rows = [
+        row for row in text.split('\n')
+        if len(row.strip()) > 1 or row.strip() in keep_as_is
+    ]
+    return '\n'.join(kept_rows)
+# ═══════════════════════════════════════════════════════════════════════
+# PUBLIC API
+# ═══════════════════════════════════════════════════════════════════════
+def parse_pdf_smart(file_path):
+    """
+    Route a PDF to native extraction or OCR and return its text.
+    A PDF judged native is read with pdfplumber first; if that yields no
+    text, or the PDF is judged scanned, docTR OCR is used instead.
+    Returns: (text, error, method)
+        text: extracted text (or None)
+        error: error message (or None)
+        method: "native" | "ocr" | None
+    """
+    if not os.path.exists(file_path):
+        return None, "File not found", None
+    scanned = _is_scanned_pdf(file_path)
+    if scanned:
+        label = "Scanned"
+    else:
+        # Native route: a non-empty pdfplumber result ends the search here.
+        native_text, _native_error = _extract_native_pdf(file_path)
+        if native_text:
+            return native_text, None, "native"
+        print("[ClauseGuard OCR] pdfplumber returned empty — falling back to OCR")
+        label = "Empty native"
+    # OCR route: reached for scanned PDFs and for empty native results.
+    print(f"[ClauseGuard OCR] {label} PDF detected — running docTR OCR...")
+    ocr_text, ocr_error = _extract_scanned_pdf(file_path)
+    if not ocr_text:
+        return None, ocr_error, None
+    return ocr_text, None, "ocr"
+def ocr_extract(file_path):
+    """
+    Run docTR OCR on a PDF unconditionally, skipping the native-text check.
+    Intended for callers that explicitly request OCR.
+    Returns: (text, error) exactly as produced by `_extract_scanned_pdf`.
+    """
+    ocr_result = _extract_scanned_pdf(file_path)
+    return ocr_result