JaydeepR Claude Sonnet 4.6 committed on
Commit
1564d1d
·
1 Parent(s): 61e2cc7

Step 7: OCR pipeline — 3-tier extraction with caching

Browse files

Implements specs/04_ocr_pipeline.md. Tier 1 PyMuPDF text extraction for typed
PDFs; Tier 2 Tesseract for scans/images; Tier 3 DeepSeek Vision LLM when
Tesseract confidence < 0.65. Results cached per-file via MD5 hash. Gracefully
handles Tesseract not installed and LLMUnavailable.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. core/ocr_pipeline.py +130 -1
  2. specs/04_ocr_pipeline.md +97 -0
core/ocr_pipeline.py CHANGED
@@ -1,6 +1,18 @@
 
 
 
 
1
  from pathlib import Path
2
 
 
 
 
 
3
 
 
 
 
 
4
  class ExtractedPage:
5
  page: int
6
  text: str
@@ -9,5 +21,122 @@ class ExtractedPage:
9
  raw_tier_results: dict
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def extract_document(file_path: Path) -> list[ExtractedPage]:
13
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import hashlib
3
+ import io
4
+ import json
5
  from pathlib import Path
6
 
7
+ from core import audit
8
+ from core.config import OCR_CACHE_DIR, OCR_TESSERACT_MIN_CONF
9
+ from core.llm_client import LLM, LLMUnavailable
10
+ from core.prompts import VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER
11
 
12
+ _IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"}
13
+
14
+
15
+ @dataclasses.dataclass
16
  class ExtractedPage:
17
  page: int
18
  text: str
 
21
  raw_tier_results: dict
22
 
23
 
24
def _cache_path(file_path: Path) -> Path:
    """Return the location of the cache entry for *file_path*.

    The entry name is the MD5 hex digest of the file's bytes followed by
    ``.json``, so the key follows the file content rather than its name.
    ``OCR_CACHE_DIR`` is created (including parents) as a side effect.
    """
    hasher = hashlib.md5()
    hasher.update(file_path.read_bytes())
    # Create the directory only after the source file was read successfully.
    OCR_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    entry_name = f"{hasher.hexdigest()}.json"
    return OCR_CACHE_DIR / entry_name
28
+
29
+
30
def _load_cache(file_path: Path) -> list[ExtractedPage] | None:
    """Return the cached pages for *file_path*, or ``None`` on a cache miss.

    A cache entry that exists but cannot be turned back into
    ``ExtractedPage`` objects (truncated write, invalid JSON, unexpected
    fields) is treated as a miss instead of raising. The cache key is a
    content hash, so an unreadable entry would otherwise make every later
    call for the same file fail.
    """
    cache_file = _cache_path(file_path)
    try:
        raw = cache_file.read_text(encoding="utf-8")
    except FileNotFoundError:
        # No entry yet for this content hash.
        return None
    try:
        return [ExtractedPage(**entry) for entry in json.loads(raw)]
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # TypeError covers a payload whose shape does not match ExtractedPage.
        return None
36
+
37
+
38
def _save_cache(file_path: Path, pages: list[ExtractedPage]) -> None:
    """Write *pages* as a JSON array to the cache entry for *file_path*.

    Each page is converted with ``dataclasses.asdict``; non-ASCII text is
    stored verbatim (``ensure_ascii=False``) in a UTF-8 encoded file.
    """
    target = _cache_path(file_path)
    records = [dataclasses.asdict(page) for page in pages]
    payload = json.dumps(records, ensure_ascii=False)
    target.write_text(payload, encoding="utf-8")
44
+
45
+
46
+ def _tesseract_extract(pil_image) -> tuple[str, float]:
47
+ try:
48
+ import pytesseract
49
+ data = pytesseract.image_to_data(
50
+ pil_image, output_type=pytesseract.Output.DATAFRAME
51
+ )
52
+ valid = data[data["conf"] != -1]
53
+ mean_conf = float(valid["conf"].mean()) / 100 if len(valid) > 0 else 0.0
54
+ text = " ".join(str(w) for w in valid["text"] if str(w).strip())
55
+ return text, mean_conf
56
+ except Exception:
57
+ return "", 0.0
58
+
59
+
60
def _vision_extract(pil_image) -> str | None:
    """Send *pil_image* to the vision LLM and return its reply.

    The image is converted to RGB and encoded as PNG bytes in memory before
    being passed to ``LLM().chat_vision`` together with the OCR prompts.
    Returns ``None`` when the client raises ``LLMUnavailable``.
    """
    png_buffer = io.BytesIO()
    pil_image.convert("RGB").save(png_buffer, format="PNG")
    # getvalue() returns the whole buffer regardless of the stream position.
    image_bytes = png_buffer.getvalue()
    try:
        return LLM().chat_vision(
            VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER, image_bytes
        )
    except LLMUnavailable:
        return None
70
+
71
+
72
def _process_image(pil_image, page_no: int) -> ExtractedPage:
    """Extract text from one page image, escalating from Tesseract to the LLM.

    The Tesseract result is accepted when its confidence is at least
    ``OCR_TESSERACT_MIN_CONF`` and its stripped text has 20 or more
    characters. Otherwise the vision LLM is tried; a non-empty reply is
    returned with ``source_type="vision_llm"`` and a fixed confidence of
    0.95, and a ``vision_ocr_invoked`` audit entry is logged. If the LLM
    gives nothing, the Tesseract result is returned unchanged.
    """
    ocr_text, ocr_conf = _tesseract_extract(pil_image)
    tesseract_page = ExtractedPage(
        page=page_no,
        text=ocr_text,
        source_type="tesseract",
        confidence=ocr_conf,
        raw_tier_results={"tesseract_conf": ocr_conf, "vision_used": False},
    )

    tesseract_ok = (
        ocr_conf >= OCR_TESSERACT_MIN_CONF and len(ocr_text.strip()) >= 20
    )
    if tesseract_ok:
        return tesseract_page

    # Tier 3: fall back to the vision LLM.
    llm_text = _vision_extract(pil_image)
    if not llm_text:
        # Tier 3 produced nothing; keep the Tier 2 result as-is.
        return tesseract_page

    audit.log("vision_ocr_invoked", page=page_no,
              tesseract_conf=round(ocr_conf, 3))
    return ExtractedPage(
        page=page_no,
        text=llm_text,
        source_type="vision_llm",
        confidence=0.95,
        raw_tier_results={"tesseract_conf": ocr_conf, "vision_used": True},
    )
102
+
103
+
104
def extract_document(file_path: Path) -> list[ExtractedPage]:
    """Extract the text of *file_path* page by page, using the cache if possible.

    Routing:
      * A cached result for the file's content is returned immediately.
      * Files whose suffix is in ``_IMAGE_SUFFIXES`` are treated as a single
        page (page 1) and sent through ``_process_image``.
      * Any other file is handled as a PDF. If ``is_text_pdf`` reports text,
        the pages from ``extract_pages`` with non-blank text are returned
        with ``source_type="text_pdf"`` and confidence 1.0. Otherwise every
        page (numbered from 1) is rendered to an image and sent through
        ``_process_image``.

    The result is written to the cache only when at least one page contains
    text. A run that produced nothing at all (for example when neither OCR
    tier was available) is not persisted, so a later run can retry instead
    of returning the empty result for this content hash indefinitely.
    """
    cached = _load_cache(file_path)
    if cached is not None:
        return cached

    if file_path.suffix.lower() in _IMAGE_SUFFIXES:
        from PIL import Image

        # convert() returns a separate in-memory image, so the file handle
        # can be released as soon as the conversion is done.
        with Image.open(file_path) as opened:
            img = opened.convert("RGB")
        pages = [_process_image(img, 1)]
    else:
        from core.pdf_utils import extract_pages, is_text_pdf, render_page_to_image

        if is_text_pdf(file_path):
            pages = [
                ExtractedPage(
                    page=p["page"],
                    text=p["text"],
                    source_type="text_pdf",
                    confidence=1.0,
                    raw_tier_results={"tesseract_conf": None, "vision_used": False},
                )
                for p in extract_pages(file_path)
                if p["text"].strip()
            ]
        else:
            import fitz

            # Only the page count is needed here; the context manager closes
            # the document even if reading it fails.
            with fitz.open(str(file_path)) as doc:
                n_pages = doc.page_count
            pages = [
                _process_image(render_page_to_image(file_path, page_no), page_no)
                for page_no in range(1, n_pages + 1)
            ]

    # Do not cache a total extraction failure (see docstring).
    if any(p.text.strip() for p in pages):
        _save_cache(file_path, pages)
    return pages
specs/04_ocr_pipeline.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spec 04 — OCR Pipeline
2
+
3
+ **Step:** 7 of 15
4
+ **Time budget:** ~30 min
5
+ **Checkpoint:** `extract_document(Path("data/bidders/bidder_c/turnover_certificate_scan.png"))` returns a list with `source_type` reflecting the OCR tier used.
6
+
7
+ ---
8
+
9
+ ## Goal
10
+
11
+ Implement `core/ocr_pipeline.py` — the three-tier OCR orchestrator. For each document/image, determines the best extraction method: PyMuPDF text (Tier 1), Tesseract (Tier 2), or DeepSeek Vision LLM (Tier 3). Caches results per file to avoid re-OCR on re-runs.
12
+
13
+ ---
14
+
15
+ ## `ExtractedPage` dataclass
16
+
17
+ ```python
18
+ @dataclasses.dataclass
19
+ class ExtractedPage:
20
+ page: int
21
+ text: str
22
+ source_type: str # "text_pdf" | "tesseract" | "vision_llm"
23
+ confidence: float
24
+ raw_tier_results: dict
25
+ ```
26
+
27
+ ---
28
+
29
+ ## `extract_document(file_path: Path) -> list[ExtractedPage]`
30
+
31
+ ### Cache check
32
+
33
+ - Compute `file_hash = hashlib.md5(file_path.read_bytes()).hexdigest()`.
34
+ - Cache path: `OCR_CACHE_DIR / f"{file_hash}.json"`.
35
+ - If cache exists: deserialize and return `list[ExtractedPage]`.
36
+
37
+ ### Routing
38
+
39
+ **Case A — Image file (PNG/JPG/JPEG/BMP/TIFF):**
40
+ - Treat as single page (page=1).
41
+ - Go directly to Tier 2 (Tesseract).
42
+ - If Tier 2 confidence < `OCR_TESSERACT_MIN_CONF` or the extracted text is shorter than 20 characters: try Tier 3.
43
+
44
+ **Case B — PDF file:**
45
+ - Call `pdf_utils.is_text_pdf(file_path)`.
46
+ - If `True`: Tier 1 — call `pdf_utils.extract_pages(file_path)`, set `source_type="text_pdf"`, `confidence=1.0`.
47
+ - If `False`: for each page, render to image via `pdf_utils.render_page_to_image`, then Tier 2.
48
+
49
+ ### Tier 2 — Tesseract
50
+
51
+ ```python
52
+ import pytesseract
53
+ data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DATAFRAME)
54
+ # Filter rows with conf != -1
55
+ valid = data[data["conf"] != -1]
56
+ mean_conf = float(valid["conf"].mean()) / 100 if len(valid) > 0 else 0.0
57
+ text = " ".join(str(w) for w in valid["text"] if str(w).strip())
58
+ ```
59
+
60
+ If `mean_conf < OCR_TESSERACT_MIN_CONF` OR `len(text.strip()) < 20`: attempt Tier 3.
61
+
62
+ ### Tier 3 — DeepSeek Vision LLM
63
+
64
+ - Convert PIL Image to PNG bytes via `io.BytesIO`.
65
+ - Call `LLM().chat_vision(VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER, image_bytes)`.
66
+ - On success: `source_type="vision_llm"`, `confidence=0.95`.
67
+ - Log `vision_ocr_invoked` audit entry.
68
+ - On `LLMUnavailable`: keep Tier 2 result with its `confidence` (will trigger `needs_review` downstream).
69
+
70
+ ### Cache write
71
+
72
+ After processing all pages, serialize to JSON and save to cache file.
73
+
74
+ ---
75
+
76
+ ## Serialization format for cache
77
+
78
+ ```json
79
+ [
80
+ {
81
+ "page": 1,
82
+ "text": "...",
83
+ "source_type": "text_pdf",
84
+ "confidence": 1.0,
85
+ "raw_tier_results": {"tesseract_conf": null, "vision_used": false}
86
+ }
87
+ ]
88
+ ```
89
+
90
+ ---
91
+
92
+ ## Acceptance Criteria
93
+
94
+ 1. `extract_document(Path("data/bidders/bidder_a/audited_financials.pdf"))` returns pages with `source_type="text_pdf"`.
95
+ 2. `extract_document(Path("data/bidders/bidder_c/turnover_certificate_scan.png"))` — if Tesseract is available and confidence < 0.65, attempts vision LLM (or returns tesseract result with low confidence when LLM unavailable).
96
+ 3. Second call to `extract_document` on same file returns cached result (no re-processing).
97
+ 4. Each returned `ExtractedPage` has non-empty `text`.