Step 7: OCR pipeline — 3-tier extraction with caching
Browse files
Implements specs/04_ocr_pipeline.md. Tier 1 PyMuPDF text extraction for typed
PDFs; Tier 2 Tesseract for scans/images; Tier 3 DeepSeek Vision LLM when
Tesseract confidence < 0.65. Results cached per-file via MD5 hash. Gracefully
handles Tesseract not installed and LLMUnavailable.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- core/ocr_pipeline.py +130 -1
- specs/04_ocr_pipeline.md +97 -0
core/ocr_pipeline.py
CHANGED
|
@@ -1,6 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
class ExtractedPage:
|
| 5 |
page: int
|
| 6 |
text: str
|
|
@@ -9,5 +21,122 @@ class ExtractedPage:
|
|
| 9 |
raw_tier_results: dict
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def extract_document(file_path: Path) -> list[ExtractedPage]:
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dataclasses
|
| 2 |
+
import hashlib
|
| 3 |
+
import io
|
| 4 |
+
import json
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
+
from core import audit
|
| 8 |
+
from core.config import OCR_CACHE_DIR, OCR_TESSERACT_MIN_CONF
|
| 9 |
+
from core.llm_client import LLM, LLMUnavailable
|
| 10 |
+
from core.prompts import VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER
|
| 11 |
|
| 12 |
+
_IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclasses.dataclass
|
| 16 |
class ExtractedPage:
|
| 17 |
page: int
|
| 18 |
text: str
|
|
|
|
| 21 |
raw_tier_results: dict
|
| 22 |
|
| 23 |
|
| 24 |
+
def _cache_path(file_path: Path) -> Path:
    """Return the JSON cache location for *file_path*.

    The cache file is named after the MD5 hex digest of the file's bytes, so
    the key follows the content rather than the file name. The cache
    directory is created if it does not exist yet.
    """
    hasher = hashlib.md5(file_path.read_bytes())
    digest = hasher.hexdigest()
    # Create the directory only after the source file was read successfully.
    OCR_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    return OCR_CACHE_DIR / (digest + ".json")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _load_cache(file_path: Path) -> list[ExtractedPage] | None:
    """Return the cached pages for *file_path*, or ``None`` on a cache miss.

    A cache file that is missing, is not valid UTF-8/JSON, or whose entries
    cannot be passed to ``ExtractedPage(**entry)`` is treated as a miss, so
    the caller re-runs extraction instead of failing on a truncated or
    outdated cache file.
    """
    cp = _cache_path(file_path)
    try:
        raw = cp.read_text(encoding="utf-8")
    except FileNotFoundError:
        # No cache entry for this content hash yet.
        return None
    try:
        return [ExtractedPage(**entry) for entry in json.loads(raw)]
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers entries
        # whose shape or keys do not match the ExtractedPage constructor.
        return None
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _save_cache(file_path: Path, pages: list[ExtractedPage]) -> None:
    """Serialize *pages* to the JSON cache file for *file_path*.

    The payload is written to a sibling ``<name>.tmp`` file first and then
    renamed over the final path, so an interrupted write does not leave a
    truncated JSON document at the location the cache is read from.
    """
    cp = _cache_path(file_path)
    payload = json.dumps(
        [dataclasses.asdict(p) for p in pages], ensure_ascii=False
    )
    tmp = cp.with_name(cp.name + ".tmp")
    tmp.write_text(payload, encoding="utf-8")
    # Path.replace overwrites an existing cache entry for the same hash.
    tmp.replace(cp)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _tesseract_extract(pil_image) -> tuple[str, float]:
|
| 47 |
+
try:
|
| 48 |
+
import pytesseract
|
| 49 |
+
data = pytesseract.image_to_data(
|
| 50 |
+
pil_image, output_type=pytesseract.Output.DATAFRAME
|
| 51 |
+
)
|
| 52 |
+
valid = data[data["conf"] != -1]
|
| 53 |
+
mean_conf = float(valid["conf"].mean()) / 100 if len(valid) > 0 else 0.0
|
| 54 |
+
text = " ".join(str(w) for w in valid["text"] if str(w).strip())
|
| 55 |
+
return text, mean_conf
|
| 56 |
+
except Exception:
|
| 57 |
+
return "", 0.0
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _vision_extract(pil_image) -> str | None:
    """Send *pil_image* to the vision LLM and return whatever it answers.

    The image is converted to RGB and encoded as PNG bytes in memory before
    being passed to ``LLM().chat_vision`` together with the OCR system and
    user prompts. Returns ``None`` when ``LLMUnavailable`` is raised.
    """
    png_buffer = io.BytesIO()
    rgb_image = pil_image.convert("RGB")
    rgb_image.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    try:
        return LLM().chat_vision(
            VISION_OCR_PROMPT_SYSTEM,
            VISION_OCR_USER,
            png_buffer.getvalue(),
        )
    except LLMUnavailable:
        return None
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _process_image(pil_image, page_no: int) -> ExtractedPage:
    """OCR a single page image, escalating from Tesseract to the vision LLM.

    The Tesseract result is accepted when its confidence is at least
    ``OCR_TESSERACT_MIN_CONF`` and its stripped text has 20 or more
    characters. Otherwise the vision LLM is tried; if it yields no text, the
    Tesseract result is returned unchanged with its own confidence.
    """
    tess_text, tess_conf = _tesseract_extract(pil_image)
    tier2_ok = (
        tess_conf >= OCR_TESSERACT_MIN_CONF and len(tess_text.strip()) >= 20
    )

    # Tier 3 is only consulted when Tier 2 was not good enough.
    vision_text = None if tier2_ok else _vision_extract(pil_image)

    if not vision_text:
        # Either Tier 2 sufficed or Tier 3 produced nothing: keep Tier 2.
        return ExtractedPage(
            page=page_no,
            text=tess_text,
            source_type="tesseract",
            confidence=tess_conf,
            raw_tier_results={"tesseract_conf": tess_conf, "vision_used": False},
        )

    audit.log(
        "vision_ocr_invoked",
        page=page_no,
        tesseract_conf=round(tess_conf, 3),
    )
    return ExtractedPage(
        page=page_no,
        text=vision_text,
        source_type="vision_llm",
        confidence=0.95,
        raw_tier_results={"tesseract_conf": tess_conf, "vision_used": True},
    )
|
| 102 |
+
|
| 103 |
+
|
| 104 |
def extract_document(file_path: Path) -> list[ExtractedPage]:
    """Extract text from *file_path*, using the per-file cache when possible.

    Routing:
      * A cached result, if present, is returned as-is.
      * Files whose lower-cased suffix is in ``_IMAGE_SUFFIXES`` are treated
        as one page (page 1) and passed through ``_process_image``.
      * Everything else is handled as a PDF: if ``is_text_pdf`` says so, the
        pages from ``extract_pages`` with non-blank text are returned with
        ``source_type="text_pdf"`` and confidence 1.0; otherwise every page
        is rendered to an image and passed through ``_process_image``.

    The resulting pages are written to the cache before being returned.
    """
    cached = _load_cache(file_path)
    if cached is not None:
        return cached

    suffix = file_path.suffix.lower()

    if suffix in _IMAGE_SUFFIXES:
        from PIL import Image

        # Close the underlying file as soon as the RGB copy exists instead
        # of leaving the handle open for the lifetime of the image object.
        with Image.open(file_path) as source:
            img = source.convert("RGB")
        pages = [_process_image(img, 1)]
    else:
        from core.pdf_utils import extract_pages, is_text_pdf, render_page_to_image

        if is_text_pdf(file_path):
            raw_pages = extract_pages(file_path)
            pages = [
                ExtractedPage(
                    page=p["page"],
                    text=p["text"],
                    source_type="text_pdf",
                    confidence=1.0,
                    raw_tier_results={"tesseract_conf": None, "vision_used": False},
                )
                for p in raw_pages
                if p["text"].strip()
            ]
        else:
            import fitz

            # Only the page count is needed here; the context manager closes
            # the document even if reading it raises.
            with fitz.open(str(file_path)) as doc:
                n_pages = doc.page_count
            pages = [
                _process_image(render_page_to_image(file_path, i), i)
                for i in range(1, n_pages + 1)
            ]

    _save_cache(file_path, pages)
    return pages
|
specs/04_ocr_pipeline.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spec 04 — OCR Pipeline
|
| 2 |
+
|
| 3 |
+
**Step:** 7 of 15
|
| 4 |
+
**Time budget:** ~30 min
|
| 5 |
+
**Checkpoint:** `extract_document(Path("data/bidders/bidder_c/turnover_certificate_scan.png"))` returns a list with `source_type` reflecting the OCR tier used.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Goal
|
| 10 |
+
|
| 11 |
+
Implement `core/ocr_pipeline.py` — the three-tier OCR orchestrator. For each document/image, determines the best extraction method: PyMuPDF text (Tier 1), Tesseract (Tier 2), or DeepSeek Vision LLM (Tier 3). Caches results per file to avoid re-OCR on re-runs.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## `ExtractedPage` dataclass
|
| 16 |
+
|
| 17 |
+
```python
|
| 18 |
+
@dataclasses.dataclass
|
| 19 |
+
class ExtractedPage:
|
| 20 |
+
page: int
|
| 21 |
+
text: str
|
| 22 |
+
source_type: str # "text_pdf" | "tesseract" | "vision_llm"
|
| 23 |
+
confidence: float
|
| 24 |
+
raw_tier_results: dict
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## `extract_document(file_path: Path) -> list[ExtractedPage]`
|
| 30 |
+
|
| 31 |
+
### Cache check
|
| 32 |
+
|
| 33 |
+
- Compute `file_hash = hashlib.md5(file_path.read_bytes()).hexdigest()`.
|
| 34 |
+
- Cache path: `OCR_CACHE_DIR / f"{file_hash}.json"`.
|
| 35 |
+
- If cache exists: deserialize and return `list[ExtractedPage]`.
|
| 36 |
+
|
| 37 |
+
### Routing
|
| 38 |
+
|
| 39 |
+
**Case A — Image file (PNG/JPG/JPEG/BMP/TIFF):**
|
| 40 |
+
- Treat as single page (page=1).
|
| 41 |
+
- Go directly to Tier 2 (Tesseract).
|
| 42 |
+
- If Tier 2 confidence < `OCR_TESSERACT_MIN_CONF` or the extracted text is shorter than 20 characters: try Tier 3.
|
| 43 |
+
|
| 44 |
+
**Case B — PDF file:**
|
| 45 |
+
- Call `pdf_utils.is_text_pdf(file_path)`.
|
| 46 |
+
- If `True`: Tier 1 — call `pdf_utils.extract_pages(file_path)`, set `source_type="text_pdf"`, `confidence=1.0`.
|
| 47 |
+
- If `False`: for each page, render to image via `pdf_utils.render_page_to_image`, then Tier 2 (escalating to Tier 3 under the same conditions as for image files).
|
| 48 |
+
|
| 49 |
+
### Tier 2 — Tesseract
|
| 50 |
+
|
| 51 |
+
```python
|
| 52 |
+
import pytesseract
|
| 53 |
+
data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DATAFRAME)
|
| 54 |
+
# Filter rows with conf != -1
|
| 55 |
+
valid = data[data["conf"] != -1]
|
| 56 |
+
mean_conf = float(valid["conf"].mean()) / 100 if len(valid) > 0 else 0.0
|
| 57 |
+
text = " ".join(str(w) for w in valid["text"] if str(w).strip())
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
If `mean_conf < OCR_TESSERACT_MIN_CONF` OR `len(text.strip()) < 20`: attempt Tier 3.
|
| 61 |
+
|
| 62 |
+
### Tier 3 — DeepSeek Vision LLM
|
| 63 |
+
|
| 64 |
+
- Convert PIL Image to PNG bytes via `io.BytesIO`.
|
| 65 |
+
- Call `LLM().chat_vision(VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER, image_bytes)`.
|
| 66 |
+
- On success: `source_type="vision_llm"`, `confidence=0.95`.
|
| 67 |
+
- Log `vision_ocr_invoked` audit entry.
|
| 68 |
+
- On `LLMUnavailable`: keep Tier 2 result with its `confidence` (will trigger `needs_review` downstream).
|
| 69 |
+
|
| 70 |
+
### Cache write
|
| 71 |
+
|
| 72 |
+
After processing all pages, serialize to JSON and save to cache file.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## Serialization format for cache
|
| 77 |
+
|
| 78 |
+
```json
|
| 79 |
+
[
|
| 80 |
+
{
|
| 81 |
+
"page": 1,
|
| 82 |
+
"text": "...",
|
| 83 |
+
"source_type": "text_pdf",
|
| 84 |
+
"confidence": 1.0,
|
| 85 |
+
"raw_tier_results": {"tesseract_conf": null, "vision_used": false}
|
| 86 |
+
}
|
| 87 |
+
]
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## Acceptance Criteria
|
| 93 |
+
|
| 94 |
+
1. `extract_document(Path("data/bidders/bidder_a/audited_financials.pdf"))` returns pages with `source_type="text_pdf"`.
|
| 95 |
+
2. `extract_document(Path("data/bidders/bidder_c/turnover_certificate_scan.png"))` — if Tesseract is available and confidence < 0.65, attempts vision LLM (or returns tesseract result with low confidence when LLM unavailable).
|
| 96 |
+
3. Second call to `extract_document` on same file returns cached result (no re-processing).
|
| 97 |
+
4. Each returned `ExtractedPage` has non-empty `text`.
|