"""
ClauseGuard — OCR Engine v1.0
═════════════════════════════
Smart PDF Router: detects native vs scanned PDFs.
  • Native PDF  → pdfplumber (fast, existing)
  • Scanned PDF → docTR OCR (CPU-friendly, ~150MB models)

Architecture:
    PDF uploaded
         ↓
    [detect_if_scanned] — pdfplumber gets <50 chars/page?
         ↓                      ↓
    Native PDF             Scanned PDF
         ↓                      ↓
    pdfplumber             docTR OCR (CPU)
         ↓                      ↓
    Contract text → existing analysis pipeline
"""

import os
import re

# ── docTR (soft-fail) ───────────────────────────────────────────────
# docTR is optional: when it is missing the module still imports and the
# OCR entry points return an error message instead of raising.
_HAS_DOCTR = False
_ocr_predictor = None
try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor as _make_predictor
    _HAS_DOCTR = True
except ImportError:
    pass

# ── pdfplumber (soft-fail) ──────────────────────────────────────────
try:
    import pdfplumber
    _HAS_PDF = True
except ImportError:
    _HAS_PDF = False


# ═══════════════════════════════════════════════════════════════════════
# OCR MODEL LOADING
# ═══════════════════════════════════════════════════════════════════════

# Outcome of the most recent load attempt: "not_loaded", "loaded",
# "unavailable (...)" or "failed: <exception text>".
_ocr_status = "not_loaded"


def _load_ocr_model():
    """Load the docTR OCR predictor lazily, on first use.

    The predictor is cached in the module-level ``_ocr_predictor`` so later
    calls return it directly. A failed attempt is not cached: the next call
    tries to build the predictor again.

    Returns:
        The docTR predictor, or ``None`` when docTR is not installed or the
        predictor could not be built. ``_ocr_status`` records which.
    """
    global _ocr_predictor, _ocr_status
    if _ocr_predictor is not None:
        return _ocr_predictor
    if not _HAS_DOCTR:
        _ocr_status = "unavailable (python-doctr not installed)"
        return None
    try:
        print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
        _ocr_predictor = _make_predictor(
            det_arch="fast_base",
            reco_arch="crnn_vgg16_bn",
            pretrained=True,
            assume_straight_pages=True,
        )
        _ocr_status = "loaded"
        print("[ClauseGuard OCR] docTR models loaded successfully")
        return _ocr_predictor
    except Exception as e:
        # Best-effort: report the failure and let callers degrade gracefully.
        _ocr_status = f"failed: {e}"
        print(f"[ClauseGuard OCR] docTR load failed: {e}")
        return None


def get_ocr_status():
    """Return a human-readable, one-line status of the OCR engine.

    Returns:
        One of:
        - "✅ OCR: docTR loaded" when a predictor is cached;
        - "❌ OCR: unavailable (python-doctr not installed)" when the docTR
          import failed;
        - "❌ OCR: docTR failed: <reason>" when the last load attempt raised;
        - "⏳ OCR: docTR available (not yet loaded)" otherwise.
    """
    if _ocr_predictor is not None:
        return "✅ OCR: docTR loaded"
    if not _HAS_DOCTR:
        return "❌ OCR: unavailable (python-doctr not installed)"
    if _ocr_status.startswith("failed"):
        # Surface the recorded load error instead of claiming the model is
        # merely "not yet loaded".
        return f"❌ OCR: docTR {_ocr_status}"
    return "⏳ OCR: docTR available (not yet loaded)"
# ═══════════════════════════════════════════════════════════════════════
# SMART PDF ROUTER
# ═══════════════════════════════════════════════════════════════════════

def _is_scanned_pdf(file_path, min_chars_per_page=50, max_pages=5):
    """Guess whether a PDF is scanned (image-based) rather than native.

    The first ``max_pages`` pages are read with pdfplumber; the PDF counts
    as scanned when they yield fewer than ``min_chars_per_page`` characters
    per page on average.

    Args:
        file_path: Path of the PDF to inspect.
        min_chars_per_page: Average stripped-text length below which the
            document is treated as scanned.
        max_pages: Number of leading pages to sample (at least one).

    Returns:
        ``True`` when the PDF looks scanned. Also ``True`` when pdfplumber
        is not installed, the PDF has no pages, or pdfplumber raises, so
        that the caller goes on to try OCR.
    """
    if not _HAS_PDF:
        return True  # Can't check with pdfplumber, assume scanned
    try:
        with pdfplumber.open(file_path) as pdf:
            sample = pdf.pages[:max(1, max_pages)]
            if not sample:
                return True
            total_chars = sum(
                len((page.extract_text() or "").strip()) for page in sample
            )
            return total_chars / len(sample) < min_chars_per_page
    except Exception:
        return True  # If pdfplumber fails, try OCR


def _extract_native_pdf(file_path):
    """Extract text from a native (digital) PDF using pdfplumber.

    Returns:
        ``(text, None)`` on success, where pages are separated by a blank
        line, or ``(None, error_message)`` when pdfplumber is missing,
        parsing fails, or no text is found.
    """
    if not _HAS_PDF:
        return None, "pdfplumber not installed"
    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        return None, f"PDF parse error: {e}"

    # Pages with no extractable text (None or "") are skipped.
    text = "\n\n".join(t for t in page_texts if t).strip()
    if not text:
        return None, "No text extracted from PDF"
    return text, None


def _render_ocr_page(page):
    """Flatten one docTR result page: one text line per OCR line, with a
    blank line after each block."""
    rendered_blocks = (
        "".join(
            " ".join(word.value for word in line.words) + "\n"
            for line in block.lines
        )
        for block in page.blocks
    )
    return "".join(block_text + "\n" for block_text in rendered_blocks)


def _extract_scanned_pdf(file_path):
    """Extract text from a scanned PDF using docTR OCR.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` when the
        OCR model is unavailable, OCR raises, or no text is recognised.
    """
    predictor = _load_ocr_model()
    if predictor is None:
        return None, (
            "OCR is not available. Install python-doctr: "
            "`pip install python-doctr[torch]`"
        )
    try:
        doc = DocumentFile.from_pdf(file_path)
        result = predictor(doc)

        # Extract text page by page
        full_text = "".join(
            _render_ocr_page(page) + "\n\n" for page in result.pages
        )
        if not full_text.strip():
            return None, "OCR could not extract text from scanned PDF"

        # Clean up OCR artifacts
        full_text = _clean_ocr_text(full_text)
        return full_text.strip(), None
    except Exception as e:
        return None, f"OCR error: {e}"


# Patterns used by _clean_ocr_text, compiled once at import time.
_OCR_WIDE_GAP_RE = re.compile(r'[ \t]{3,}')
# A word-initial lowercase "l" directly followed by a capital letter
# (e.g. "lN WITNESS") is treated as a misread capital "I".
_OCR_L_FOR_I_RE = re.compile(r'\bl(?=[A-Z])')
_OCR_BLANK_RUN_RE = re.compile(r'\n{4,}')
# One-character lines that are kept rather than treated as noise.
_OCR_KEPT_SINGLE_CHARS = ('', '.', ',', ';')


def _clean_ocr_text(text):
    """Clean common OCR artifacts from ``text`` and return the result.

    Steps, in order:
      1. collapse runs of three or more spaces/tabs into a single space;
      2. replace a word-initial ``l`` that is followed by a capital letter
         with ``I``;
      3. drop lines made of a single character other than ``.``, ``,`` or
         ``;`` (treated as OCR noise);
      4. collapse runs of four or more newlines to three.

    Noise lines are removed before the newline runs are collapsed, so
    removing a line cannot leave an over-long run of blank lines behind.
    """
    # Remove excessive whitespace
    text = _OCR_WIDE_GAP_RE.sub(' ', text)

    # Fix common OCR substitutions: l before capital → I
    text = _OCR_L_FOR_I_RE.sub('I', text)

    # Remove single-char lines (OCR noise)
    kept_lines = []
    for line in text.split('\n'):
        stripped = line.strip()
        if len(stripped) <= 1 and stripped not in _OCR_KEPT_SINGLE_CHARS:
            continue
        kept_lines.append(line)
    text = '\n'.join(kept_lines)

    # Normalize line breaks
    return _OCR_BLANK_RUN_RE.sub('\n\n\n', text)


# ═══════════════════════════════════════════════════════════════════════
# PUBLIC API
# ═══════════════════════════════════════════════════════════════════════

def parse_pdf_smart(file_path):
    """Smart PDF parser with OCR fallback.

    Native PDFs are read with pdfplumber. Scanned PDFs, and native PDFs
    from which pdfplumber extracts nothing, are passed to docTR OCR.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        (text, error, method)
            text: extracted text (or None)
            error: error message (or None)
            method: "native" | "ocr" | None
    """
    # Reject missing paths and directories before any parser is invoked.
    if file_path is None or not os.path.isfile(file_path):
        return None, "File not found", None

    # Step 1: Check if PDF is scanned
    is_scanned = _is_scanned_pdf(file_path)
    native_error = None

    if not is_scanned:
        # Step 2a: Native PDF — use pdfplumber
        text, native_error = _extract_native_pdf(file_path)
        if text:
            return text, None, "native"
        # pdfplumber produced nothing usable — fall through to OCR
        print(
            "[ClauseGuard OCR] pdfplumber extraction failed "
            f"({native_error}) — falling back to OCR"
        )

    # Step 2b: Scanned PDF or pdfplumber failed — use OCR
    print(f"[ClauseGuard OCR] {'Scanned' if is_scanned else 'Empty native'} PDF detected — running docTR OCR...")
    text, error = _extract_scanned_pdf(file_path)
    if text:
        return text, None, "ocr"

    if native_error:
        # Both extractors failed: report both reasons, not only the last.
        error = f"{error} (native extraction also failed: {native_error})"
    return None, error, None


def ocr_extract(file_path):
    """Force OCR extraction on a PDF (bypass the native text check).

    Useful when the user explicitly wants OCR.

    Returns:
        ``(text, error)`` exactly as produced by ``_extract_scanned_pdf``:
        one of the two is ``None``.
    """
    return _extract_scanned_pdf(file_path)