Spaces:

gaurv007
/

ClauseGuard

Sleeping

File size: 8,255 Bytes

b5350d6

"""
ClauseGuard — OCR Engine v1.0
═════════════════════════════
Smart PDF Router: detects native vs scanned PDFs.
  • Native PDF → pdfplumber (fast, existing)
  • Scanned PDF → docTR OCR (CPU-friendly, ~150MB models)

Architecture:
  PDF uploaded
      ↓
  [_is_scanned_pdf] — pdfplumber averages <50 chars/page (first 5 pages)?
      ↓                           ↓
    Native PDF               Scanned PDF
      ↓                           ↓
    pdfplumber              docTR OCR (CPU)
      ↓                           ↓
    Contract text → existing analysis pipeline
"""

import os
import re

# ── docTR (soft-fail) ───────────────────────────────────────────────
# docTR is an optional dependency: if it cannot be imported the module still
# loads, `_HAS_DOCTR` stays False, and `_load_ocr_model()` returns None.
_HAS_DOCTR = False
# Cached predictor instance; populated by `_load_ocr_model()` on first use.
_ocr_predictor = None

try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor as _make_predictor
    _HAS_DOCTR = True
except ImportError:
    # Leave `_HAS_DOCTR` False. `DocumentFile` and `_make_predictor` are then
    # undefined, so code must go through `_load_ocr_model()` before using them.
    pass

# ── pdfplumber (soft-fail) ──────────────────────────────────────────
# pdfplumber is optional too; `_HAS_PDF` records whether the import succeeded.
try:
    import pdfplumber
    _HAS_PDF = True
except ImportError:
    _HAS_PDF = False

# ═══════════════════════════════════════════════════════════════════════
# OCR MODEL LOADING
# ═══════════════════════════════════════════════════════════════════════

# Last known state of the OCR model loader. Values written in this file:
# "not_loaded" (initial), "loaded", "unavailable (python-doctr not installed)",
# or "failed: <error>" — all set by `_load_ocr_model()`.
_ocr_status = "not_loaded"

def _load_ocr_model():
    """Return the shared docTR predictor, building it on the first call.

    The predictor is cached in the module-level ``_ocr_predictor`` so later
    calls reuse it. ``_ocr_status`` is updated to describe the outcome.

    Returns:
        The docTR predictor, or ``None`` when docTR is not installed or the
        model could not be constructed.
    """
    global _ocr_predictor, _ocr_status

    # Reuse the instance created by an earlier call.
    if _ocr_predictor is not None:
        return _ocr_predictor

    if _HAS_DOCTR:
        model_options = {
            "det_arch": "fast_base",
            "reco_arch": "crnn_vgg16_bn",
            "pretrained": True,
            "assume_straight_pages": True,
        }
        try:
            print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
            _ocr_predictor = _make_predictor(**model_options)
            _ocr_status = "loaded"
            print("[ClauseGuard OCR] docTR models loaded successfully")
        except Exception as exc:
            # Best-effort: record the failure and let callers handle `None`.
            _ocr_status = f"failed: {exc}"
            print(f"[ClauseGuard OCR] docTR load failed: {exc}")
            return None
        return _ocr_predictor

    _ocr_status = "unavailable (python-doctr not installed)"
    return None


def get_ocr_status():
    """Return a human-readable, one-line description of the OCR engine state.

    Returns:
        One of:
          * ``"✅ OCR: docTR loaded"`` — a predictor is cached.
          * ``"❌ OCR: unavailable (python-doctr not installed)"`` — the docTR
            import failed.
          * ``"❌ OCR: docTR load failed: <error>"`` — docTR is installed but
            the last attempt to build the predictor failed.
          * ``"⏳ OCR: docTR available (not yet loaded)"`` — docTR is installed
            and no load has failed.
    """
    if _ocr_predictor is not None:
        return "✅ OCR: docTR loaded"
    if not _HAS_DOCTR:
        return "❌ OCR: unavailable (python-doctr not installed)"
    # The loader stores "failed: <error>" in `_ocr_status` when model
    # construction fails; surface that instead of claiming the engine is
    # merely waiting to be loaded.
    if _ocr_status.startswith("failed"):
        return f"❌ OCR: docTR load {_ocr_status}"
    return "⏳ OCR: docTR available (not yet loaded)"


# ═══════════════════════════════════════════════════════════════════════
# SMART PDF ROUTER
# ═══════════════════════════════════════════════════════════════════════

def _is_scanned_pdf(file_path, min_chars_per_page=50, max_pages=5):
    """Guess whether a PDF is scanned (image-based) rather than native text.

    The first ``max_pages`` pages are run through pdfplumber and the average
    number of extracted (stripped) characters per sampled page is compared
    with ``min_chars_per_page``.

    Args:
        file_path: Path of the PDF to inspect.
        min_chars_per_page: Average character count below which the PDF is
            treated as scanned.
        max_pages: Maximum number of leading pages to sample. Values below 1
            are treated as 1.

    Returns:
        ``True`` if the PDF looks scanned. Also ``True`` when pdfplumber is
        not installed, the PDF has no pages, or pdfplumber raises — in those
        cases the caller is expected to try OCR instead.
    """
    if not _HAS_PDF:
        return True  # Can't check with pdfplumber, assume scanned
    try:
        with pdfplumber.open(file_path) as pdf:
            sample = pdf.pages[:max(1, max_pages)]
            if not sample:
                return True
            total_chars = sum(
                len((page.extract_text() or "").strip()) for page in sample
            )
            return total_chars / len(sample) < min_chars_per_page
    except Exception:
        # Deliberate best-effort: any pdfplumber failure routes to OCR.
        return True


def _extract_native_pdf(file_path):
    """Pull the embedded text out of a native (digital) PDF via pdfplumber.

    Pages that yield no text are skipped; the remaining page texts are
    separated by a blank line.

    Returns:
        ``(text, None)`` on success, or ``(None, message)`` when pdfplumber is
        missing, nothing could be extracted, or parsing raised.
    """
    if not _HAS_PDF:
        return None, "pdfplumber not installed"
    try:
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        combined = "".join(chunk + "\n\n" for chunk in page_texts if chunk)
        body = combined.strip()
        if body:
            return body, None
        return None, "No text extracted from PDF"
    except Exception as exc:
        return None, f"PDF parse error: {exc}"


def _extract_scanned_pdf(file_path):
    """Run docTR OCR over a PDF and return the recognised text.

    Words are joined with spaces into lines, lines are newline-terminated,
    each block is followed by a blank line and each page by two more
    newlines. The assembled text is then passed through ``_clean_ocr_text``.

    Returns:
        ``(text, None)`` on success, or ``(None, message)`` when the OCR model
        is unavailable, no text was recognised, or OCR raised.
    """
    ocr_model = _load_ocr_model()
    if ocr_model is None:
        return None, (
            "OCR is not available. Install python-doctr: "
            "`pip install python-doctr[torch]`"
        )
    try:
        document = DocumentFile.from_pdf(file_path)
        recognised = ocr_model(document)

        # Assemble the text bottom-up: words → lines → blocks → pages.
        page_chunks = []
        for page in recognised.pages:
            block_chunks = []
            for block in page.blocks:
                rows = [
                    " ".join(word.value for word in line.words) + "\n"
                    for line in block.lines
                ]
                block_chunks.append("".join(rows) + "\n")
            page_chunks.append("".join(block_chunks) + "\n\n")
        raw_text = "".join(page_chunks)

        if not raw_text.strip():
            return None, "OCR could not extract text from scanned PDF"

        # Strip common OCR artifacts before handing the text back.
        return _clean_ocr_text(raw_text).strip(), None
    except Exception as exc:
        return None, f"OCR error: {exc}"


def _clean_ocr_text(text):
    """Clean common OCR artifacts."""
    # Remove excessive whitespace
    text = re.sub(r'[ \t]{3,}', '  ', text)
    # Fix common OCR substitutions
    text = re.sub(r'\bl\b(?=[A-Z])', 'I', text)  # l before capital → I
    # Normalize line breaks
    text = re.sub(r'\n{4,}', '\n\n\n', text)
    # Remove single-char lines (OCR noise)
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
        stripped = line.strip()
        if len(stripped) <= 1 and stripped not in ('', '.', ',', ';'):
            continue
        cleaned_lines.append(line)
    return '\n'.join(cleaned_lines)


# ═══════════════════════════════════════════════════════════════════════
# PUBLIC API
# ═══════════════════════════════════════════════════════════════════════

def parse_pdf_smart(file_path):
    """
    Extract text from a PDF, choosing between native parsing and OCR.

    The PDF is first classified with ``_is_scanned_pdf``. Native PDFs go
    through pdfplumber; scanned PDFs, and native PDFs that yield no text,
    go through docTR OCR.

    Returns: (text, error, method)
        text: extracted text (or None)
        error: error message (or None)
        method: "native" | "ocr" | None
    """
    if not os.path.exists(file_path):
        return None, "File not found", None

    is_scanned = _is_scanned_pdf(file_path)

    # Native route: only taken when the PDF carries an embedded text layer.
    if not is_scanned:
        native_text, _native_error = _extract_native_pdf(file_path)
        if native_text:
            return native_text, None, "native"
        print("[ClauseGuard OCR] pdfplumber returned empty — falling back to OCR")

    # OCR route: scanned input, or the native route produced nothing.
    print(f"[ClauseGuard OCR] {'Scanned' if is_scanned else 'Empty native'} PDF detected — running docTR OCR...")
    ocr_text, ocr_error = _extract_scanned_pdf(file_path)
    if not ocr_text:
        return None, ocr_error, None
    return ocr_text, None, "ocr"


def ocr_extract(file_path):
    """
    Run OCR on a PDF unconditionally, skipping the native-text detection.

    Intended for callers that explicitly want OCR output.

    Returns: (text, error) exactly as produced by ``_extract_scanned_pdf``.
    """
    text, error = _extract_scanned_pdf(file_path)
    return text, error