Spaces:
Sleeping
Sleeping
File size: 8,255 Bytes
b5350d6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | """
ClauseGuard — OCR Engine v1.0
─────────────────────────────
Smart PDF Router: detects native vs scanned PDFs.
  • Native PDF  → pdfplumber (fast, existing)
  • Scanned PDF → docTR OCR (CPU-friendly, ~150MB models)
Architecture:
    PDF uploaded
         │
    [detect_if_scanned] ← pdfplumber gets <50 chars/page?
         │                        │
    Native PDF               Scanned PDF
         │                        │
    pdfplumber               docTR OCR (CPU)
         │                        │
    Contract text → existing analysis pipeline
"""
import os
import re
# ── docTR (soft-fail) ───────────────────────────────────────────────
# Cached predictor instance; stays None until a model has been loaded.
_ocr_predictor = None
try:
    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor as _make_predictor
except ImportError:
    # docTR is optional: record its absence instead of failing the import.
    _HAS_DOCTR = False
else:
    _HAS_DOCTR = True

# ── pdfplumber (soft-fail) ──────────────────────────────────────────
try:
    import pdfplumber
except ImportError:
    # pdfplumber is optional as well; callers check this flag before use.
    _HAS_PDF = False
else:
    _HAS_PDF = True
# ═══════════════════════════════════════════════════════════════════════
# OCR MODEL LOADING
# ═══════════════════════════════════════════════════════════════════════
# Last known state of the OCR model: "not_loaded", "loaded",
# "unavailable (...)" or "failed: <reason>".
_ocr_status = "not_loaded"


def _load_ocr_model():
    """Return the shared docTR predictor, creating it on first use.

    Returns:
        The cached predictor, or None when docTR is not installed or the
        model could not be built. The outcome is recorded in the
        module-level ``_ocr_status`` string.
    """
    global _ocr_predictor, _ocr_status

    # Fast path: a predictor was already built by an earlier call.
    if _ocr_predictor is not None:
        return _ocr_predictor

    if not _HAS_DOCTR:
        _ocr_status = "unavailable (python-doctr not installed)"
    else:
        try:
            print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
            model_options = {
                "det_arch": "fast_base",
                "reco_arch": "crnn_vgg16_bn",
                "pretrained": True,
                "assume_straight_pages": True,
            }
            _ocr_predictor = _make_predictor(**model_options)
            _ocr_status = "loaded"
            print("[ClauseGuard OCR] docTR models loaded successfully")
            return _ocr_predictor
        except Exception as exc:
            # Best-effort: remember why loading failed and report "no OCR".
            _ocr_status = f"failed: {exc}"
            print(f"[ClauseGuard OCR] docTR load failed: {exc}")
    return None
def get_ocr_status():
    """Return a human-readable, one-line OCR engine status.

    Returns:
        One of four strings:
        * loaded: a predictor is cached and ready;
        * unavailable: python-doctr could not be imported;
        * load failed: docTR is installed but the last load attempt
          raised (the recorded reason from ``_ocr_status`` is included);
        * available: docTR is installed and no load has been attempted.
    """
    if _ocr_predictor is not None:
        return "✅ OCR: docTR loaded"
    if not _HAS_DOCTR:
        return "❌ OCR: unavailable (python-doctr not installed)"
    # _load_ocr_model stores "failed: <reason>" on error; surface it rather
    # than claiming the model simply has not been loaded yet.
    if _ocr_status.startswith("failed"):
        return f"❌ OCR: docTR load {_ocr_status}"
    return "⏳ OCR: docTR available (not yet loaded)"
# ═══════════════════════════════════════════════════════════════════════
# SMART PDF ROUTER
# ═══════════════════════════════════════════════════════════════════════
def _is_scanned_pdf(file_path, min_chars_per_page=50, max_pages=5):
    """Guess whether a PDF is scanned (image-based) rather than native.

    The leading pages are run through pdfplumber; the PDF counts as
    scanned when the average stripped text length per sampled page is
    below ``min_chars_per_page``.

    Args:
        file_path: Path of the PDF to inspect.
        min_chars_per_page: Average character count below which the PDF
            is treated as scanned.
        max_pages: How many leading pages to sample (values below 1 are
            raised to 1). ``None`` samples every page.

    Returns:
        True when the PDF looks scanned. Also True when pdfplumber is not
        installed, the PDF has no pages, or pdfplumber raises, so that
        the caller falls back to OCR.
    """
    if not _HAS_PDF:
        return True  # Can't check with pdfplumber, assume scanned

    sample_limit = None if max_pages is None else max(1, max_pages)
    try:
        with pdfplumber.open(file_path) as pdf:
            sample = pdf.pages[:sample_limit]
            if not sample:
                return True
            total_chars = sum(
                len((page.extract_text() or "").strip()) for page in sample
            )
            return total_chars / len(sample) < min_chars_per_page
    except Exception:
        return True  # If pdfplumber fails, try OCR
def _extract_native_pdf(file_path):
    """Pull the embedded text layer out of a digital PDF via pdfplumber.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, message)`` when
        pdfplumber is missing, no text was found, or parsing raised.
    """
    if not _HAS_PDF:
        return None, "pdfplumber not installed"

    try:
        with pdfplumber.open(file_path) as pdf:
            per_page = [page.extract_text() for page in pdf.pages]
        # Pages without text are skipped; each remaining page is followed
        # by a blank line.
        combined = "".join(chunk + "\n\n" for chunk in per_page if chunk)
        body = combined.strip()
        if body:
            return body, None
        return None, "No text extracted from PDF"
    except Exception as exc:
        return None, f"PDF parse error: {exc}"
def _extract_scanned_pdf(file_path):
    """Extract text from a scanned PDF using docTR OCR.

    The docTR result is walked page -> block -> line -> word. Words are
    joined with spaces, every line ends with a newline, every block is
    followed by a blank line and every page by two more. The assembled
    text is then passed through ``_clean_ocr_text``.

    Args:
        file_path: Path of the PDF to OCR.

    Returns:
        ``(text, None)`` on success, otherwise ``(None, message)`` when
        OCR is unavailable, nothing usable was recognised, or docTR
        raised.
    """
    predictor = _load_ocr_model()
    if predictor is None:
        if _HAS_DOCTR:
            # docTR is installed, so "install python-doctr" would mislead;
            # report the recorded load failure instead.
            return None, f"OCR is not available (docTR model load {_ocr_status})"
        return None, (
            "OCR is not available. Install python-doctr: "
            "`pip install python-doctr[torch]`"
        )

    try:
        doc = DocumentFile.from_pdf(file_path)
        result = predictor(doc)

        # Extract text page by page
        page_texts = []
        for page in result.pages:
            block_texts = []
            for block in page.blocks:
                line_texts = [
                    " ".join(word.value for word in line.words)
                    for line in block.lines
                ]
                block_texts.append("".join(f"{t}\n" for t in line_texts) + "\n")
            page_texts.append("".join(block_texts) + "\n\n")

        # Clean first, then test for emptiness: cleaning can drop every
        # line (e.g. single-character noise), and an empty string must be
        # reported as a failure rather than returned as text.
        cleaned = _clean_ocr_text("".join(page_texts)).strip()
        if not cleaned:
            return None, "OCR could not extract text from scanned PDF"
        return cleaned, None
    except Exception as e:
        return None, f"OCR error: {e}"
def _clean_ocr_text(text):
    """Clean common OCR artifacts from ``text`` and return the result.

    Steps, in order:
      1. Runs of three or more spaces/tabs collapse to a single space.
      2. A lowercase ``l`` that starts a word and is immediately followed
         by a capital letter becomes ``I`` (e.g. ``lN`` -> ``IN``).
      3. Lines whose stripped content is a single character are dropped
         as noise, except ``.``, ``,`` and ``;``; blank lines are kept.
      4. Runs of four or more newlines collapse to three.
    """
    # Remove excessive whitespace
    text = re.sub(r'[ \t]{3,}', ' ', text)

    # Fix common OCR substitutions. There is no trailing \b here: a word
    # boundary cannot sit between "l" and a following capital letter, so a
    # pattern requiring both would never match.
    text = re.sub(r'\bl(?=[A-Z])', 'I', text)  # l before capital -> I

    # Remove single-char lines (OCR noise)
    kept_lines = []
    for line in text.split('\n'):
        stripped = line.strip()
        if len(stripped) == 1 and stripped not in ('.', ',', ';'):
            continue
        kept_lines.append(line)
    text = '\n'.join(kept_lines)

    # Normalize line breaks last, so gaps left by removed noise lines are
    # collapsed as well.
    return re.sub(r'\n{4,}', '\n\n\n', text)
# ═══════════════════════════════════════════════════════════════════════
# PUBLIC API
# ═══════════════════════════════════════════════════════════════════════
def parse_pdf_smart(file_path):
    """Smart PDF parser with OCR fallback.

    The PDF is first classified with ``_is_scanned_pdf``. Native PDFs are
    read with pdfplumber; scanned PDFs, and native PDFs that yield no
    text, are sent through docTR OCR.

    Args:
        file_path: Path of the PDF to parse. ``None``, an empty string, a
            missing path or a directory all yield "File not found".

    Returns:
        A ``(text, error, method)`` tuple:
            text:   extracted text (or None)
            error:  error message (or None); never None when text is None
            method: "native" | "ocr" | None
    """
    # The falsy check keeps None/"" out of os.path (None would raise
    # TypeError); isfile additionally rejects directories.
    if not file_path or not os.path.isfile(file_path):
        return None, "File not found", None

    # Step 1: Check if PDF is scanned
    is_scanned = _is_scanned_pdf(file_path)

    native_error = None
    if not is_scanned:
        # Step 2a: Native PDF -> use pdfplumber
        text, native_error = _extract_native_pdf(file_path)
        if text:
            return text, None, "native"
        # If pdfplumber returns empty, fall through to OCR
        print("[ClauseGuard OCR] pdfplumber returned empty — falling back to OCR")

    # Step 2b: Scanned PDF or pdfplumber failed -> use OCR
    print(f"[ClauseGuard OCR] {'Scanned' if is_scanned else 'Empty native'} PDF detected — running docTR OCR...")
    text, ocr_error = _extract_scanned_pdf(file_path)
    if text:
        return text, None, "ocr"

    # Always report a reason on failure, and keep the native extraction
    # error so it is not hidden behind the OCR one.
    error = ocr_error or "No text could be extracted from PDF"
    if native_error:
        error = f"{error} (native extraction: {native_error})"
    return None, error, None
def ocr_extract(file_path):
    """Run OCR on a PDF unconditionally, skipping the native-text check.

    Intended for callers that explicitly request OCR. Returns the
    ``(text, error)`` pair produced by ``_extract_scanned_pdf``.
    """
    ocr_outcome = _extract_scanned_pdf(file_path)
    return ocr_outcome
|