gaurv007 committed on
Commit
b5350d6
·
verified ·
1 Parent(s): 549ed6e

v4.0: Add ocr_engine.py — OCR + RAG Chatbot + Clause Redlining

Browse files
Files changed (1) hide show
  1. ocr_engine.py +218 -0
ocr_engine.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ClauseGuard — OCR Engine v1.0
3
+ ═════════════════════════════
4
+ Smart PDF Router: detects native vs scanned PDFs.
5
+ • Native PDF → pdfplumber (fast, existing)
6
+ • Scanned PDF → docTR OCR (CPU-friendly, ~150MB models)
7
+
8
+ Architecture:
9
+ PDF uploaded
10
+
11
+ [detect_if_scanned] — pdfplumber gets <50 chars/page?
12
+ ↓ ↓
13
+ Native PDF Scanned PDF
14
+ ↓ ↓
15
+ pdfplumber docTR OCR (CPU)
16
+ ↓ ↓
17
+ Contract text → existing analysis pipeline
18
+ """
19
+
20
+ import os
21
+ import re
22
+
23
+ # ── docTR (soft-fail) ───────────────────────────────────────────────
24
+ _HAS_DOCTR = False
25
+ _ocr_predictor = None
26
+
27
+ try:
28
+ from doctr.io import DocumentFile
29
+ from doctr.models import ocr_predictor as _make_predictor
30
+ _HAS_DOCTR = True
31
+ except ImportError:
32
+ pass
33
+
34
+ # ── pdfplumber (soft-fail) ──────────────────────────────────────────
35
+ try:
36
+ import pdfplumber
37
+ _HAS_PDF = True
38
+ except ImportError:
39
+ _HAS_PDF = False
40
+
41
+ # ═══════════════════════════════════════════════════════════════════════
42
+ # OCR MODEL LOADING
43
+ # ═══════════════════════════════════════════════════════════════════════
44
+
45
+ _ocr_status = "not_loaded"
46
+
47
def _load_ocr_model():
    """Return the shared docTR predictor, building it on first use.

    The predictor is cached in the module-level ``_ocr_predictor`` and the
    outcome of the attempt is written to ``_ocr_status``.

    Returns:
        The cached predictor, or ``None`` when docTR could not be imported
        or when constructing the predictor raised.
    """
    global _ocr_predictor, _ocr_status

    # Only build when nothing has been cached by an earlier call.
    if _ocr_predictor is None:
        if not _HAS_DOCTR:
            _ocr_status = "unavailable (python-doctr not installed)"
            return None

        predictor_options = {
            "det_arch": "fast_base",
            "reco_arch": "crnn_vgg16_bn",
            "pretrained": True,
            "assume_straight_pages": True,
        }
        try:
            print("[ClauseGuard OCR] Loading docTR models (fast_base + crnn_vgg16_bn)...")
            _ocr_predictor = _make_predictor(**predictor_options)
            _ocr_status = "loaded"
            print("[ClauseGuard OCR] docTR models loaded successfully")
        except Exception as exc:
            # Best-effort: record the failure and let callers see None.
            _ocr_status = f"failed: {exc}"
            print(f"[ClauseGuard OCR] docTR load failed: {exc}")
            return None

    return _ocr_predictor
70
+
71
+
72
def get_ocr_status():
    """Return a human-readable, one-line status of the OCR engine.

    Returns:
        str: one of
            * ``"✅ OCR: docTR loaded"`` once a predictor is cached,
            * ``"❌ OCR: unavailable (python-doctr not installed)"`` when
              the docTR import failed,
            * ``"❌ OCR: docTR load failed: <error>"`` when a previous load
              attempt raised,
            * ``"⏳ OCR: docTR available (not yet loaded)"`` otherwise.
    """
    if _ocr_predictor is not None:
        return "✅ OCR: docTR loaded"
    if not _HAS_DOCTR:
        return "❌ OCR: unavailable (python-doctr not installed)"
    # _load_ocr_model stores "failed: <error>" in _ocr_status when building
    # the predictor raises; report that instead of claiming the engine is
    # merely "not yet loaded".
    if _ocr_status.startswith("failed"):
        return f"❌ OCR: docTR load {_ocr_status}"
    return "⏳ OCR: docTR available (not yet loaded)"
80
+
81
+
82
+ # ═══════════════════════════════════════════════════════════════════════
83
+ # SMART PDF ROUTER
84
+ # ═══════════════════════════════════════════════════════════════════════
85
+
86
def _is_scanned_pdf(file_path, min_chars_per_page=50):
    """Guess whether a PDF is image-based (scanned) rather than digital.

    Up to the first five pages are read with pdfplumber; the PDF counts as
    scanned when the average stripped text length per sampled page is below
    ``min_chars_per_page``.

    Returns:
        bool: ``True`` for scanned. Also ``True`` when pdfplumber is not
        installed, the PDF has no pages, or pdfplumber raises, so that the
        caller goes on to try OCR.
    """
    if not _HAS_PDF:
        # Without pdfplumber there is nothing to measure; assume scanned.
        return True

    try:
        with pdfplumber.open(file_path) as pdf:
            sample = pdf.pages[:5]  # Only the first 5 pages are inspected
            if not sample:
                return True
            char_total = sum(
                len((page.extract_text() or "").strip()) for page in sample
            )
            return char_total / len(sample) < min_chars_per_page
    except Exception:
        # pdfplumber could not read the file; let OCR have a go.
        return True
106
+
107
+
108
def _extract_native_pdf(file_path):
    """Pull the text layer out of a native (digital) PDF with pdfplumber.

    Returns:
        tuple: ``(text, None)`` on success, or ``(None, message)`` when
        pdfplumber is missing, no page yields text, or parsing raises.
    """
    if not _HAS_PDF:
        return None, "pdfplumber not installed"

    try:
        with pdfplumber.open(file_path) as pdf:
            # Pages with no text layer yield None/"" and are skipped.
            page_texts = [
                chunk
                for chunk in (page.extract_text() for page in pdf.pages)
                if chunk
            ]
        # Pages are separated by a blank line; outer whitespace is trimmed.
        combined = "\n\n".join(page_texts).strip()
        if combined:
            return combined, None
        return None, "No text extracted from PDF"
    except Exception as exc:
        return None, f"PDF parse error: {exc}"
124
+
125
+
126
def _extract_scanned_pdf(file_path):
    """Run docTR OCR over a PDF and return the recognised text.

    Words are joined with spaces into lines, each line ends with a newline,
    each block is followed by one blank line and each page by two more
    newlines. The assembled text is then passed through ``_clean_ocr_text``.

    Returns:
        tuple: ``(text, None)`` on success, or ``(None, message)`` when the
        OCR model is unavailable, nothing was recognised, or OCR raises.
    """
    predictor = _load_ocr_model()
    if predictor is None:
        unavailable_msg = (
            "OCR is not available. Install python-doctr: "
            "`pip install python-doctr[torch]`"
        )
        return None, unavailable_msg

    try:
        ocr_result = predictor(DocumentFile.from_pdf(file_path))

        # Assemble the text page by page, block by block, line by line.
        page_chunks = []
        for page in ocr_result.pages:
            block_chunks = []
            for block in page.blocks:
                rendered_lines = [
                    " ".join(word.value for word in line.words) + "\n"
                    for line in block.lines
                ]
                block_chunks.append("".join(rendered_lines) + "\n")
            page_chunks.append("".join(block_chunks) + "\n\n")
        raw_text = "".join(page_chunks)

        if not raw_text.strip():
            return None, "OCR could not extract text from scanned PDF"

        # Tidy up common OCR artifacts before handing the text back.
        return _clean_ocr_text(raw_text).strip(), None
    except Exception as exc:
        return None, f"OCR error: {exc}"
157
+
158
+
159
+ def _clean_ocr_text(text):
160
+ """Clean common OCR artifacts."""
161
+ # Remove excessive whitespace
162
+ text = re.sub(r'[ \t]{3,}', ' ', text)
163
+ # Fix common OCR substitutions
164
+ text = re.sub(r'\bl\b(?=[A-Z])', 'I', text) # l before capital → I
165
+ # Normalize line breaks
166
+ text = re.sub(r'\n{4,}', '\n\n\n', text)
167
+ # Remove single-char lines (OCR noise)
168
+ lines = text.split('\n')
169
+ cleaned_lines = []
170
+ for line in lines:
171
+ stripped = line.strip()
172
+ if len(stripped) <= 1 and stripped not in ('', '.', ',', ';'):
173
+ continue
174
+ cleaned_lines.append(line)
175
+ return '\n'.join(cleaned_lines)
176
+
177
+
178
+ # ═══════════════════════════════════════════════════════════════════════
179
+ # PUBLIC API
180
+ # ═══════════════════════════════════════════════════════════════════════
181
+
182
def parse_pdf_smart(file_path):
    """Extract text from a PDF, choosing native parsing or OCR automatically.

    The PDF is first classified by ``_is_scanned_pdf``. Native PDFs go
    through pdfplumber; scanned PDFs, and native PDFs for which pdfplumber
    produced no text, go through docTR OCR.

    Returns:
        tuple: ``(text, error, method)`` where
            text:   extracted text, or ``None`` on failure
            error:  error message, or ``None`` on success
            method: ``"native"``, ``"ocr"``, or ``None`` on failure
    """
    if not os.path.exists(file_path):
        return None, "File not found", None

    # Step 1: classify the PDF
    scanned = _is_scanned_pdf(file_path)

    if scanned:
        reason = "Scanned"
    else:
        # Step 2a: native PDF, so try pdfplumber first
        native_text, _native_error = _extract_native_pdf(file_path)
        if native_text:
            return native_text, None, "native"
        # Nothing came back from pdfplumber; continue to OCR below
        print("[ClauseGuard OCR] pdfplumber returned empty — falling back to OCR")
        reason = "Empty native"

    # Step 2b: scanned PDF, or native extraction came back empty
    print(f"[ClauseGuard OCR] {reason} PDF detected — running docTR OCR...")
    ocr_text, ocr_error = _extract_scanned_pdf(file_path)
    if not ocr_text:
        return None, ocr_error, None
    return ocr_text, None, "ocr"
211
+
212
+
213
def ocr_extract(file_path):
    """Run OCR on a PDF unconditionally, skipping the native-text check.

    Intended for callers that explicitly want OCR output.

    Returns:
        tuple: ``(text, error)`` exactly as produced by
        ``_extract_scanned_pdf``.
    """
    text, error = _extract_scanned_pdf(file_path)
    return text, error