Spaces:

JaydeepR
/

TenderIQ

Sleeping

JaydeepR Claude Sonnet 4.6 committed 15 days ago

Commit

1564d1d

1 Parent(s): 61e2cc7

Step 7: OCR pipeline — 3-tier extraction with caching

Implements specs/04_ocr_pipeline.md. Tier 1 PyMuPDF text extraction for typed
PDFs; Tier 2 Tesseract for scans/images; Tier 3 DeepSeek Vision LLM when
Tesseract confidence < 0.65. Results cached per-file via MD5 hash. Gracefully
handles Tesseract not installed and LLMUnavailable.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

core/ocr_pipeline.py +130 -1
specs/04_ocr_pipeline.md +97 -0

core/ocr_pipeline.py CHANGED Viewed

@@ -1,6 +1,18 @@
 from pathlib import Path
 class ExtractedPage:
     page: int
     text: str
@@ -9,5 +21,122 @@ class ExtractedPage:
     raw_tier_results: dict
 def extract_document(file_path: Path) -> list[ExtractedPage]:
-    raise NotImplementedError

+import dataclasses
+import hashlib
+import io
+import json
 from pathlib import Path
+from core import audit
+from core.config import OCR_CACHE_DIR, OCR_TESSERACT_MIN_CONF
+from core.llm_client import LLM, LLMUnavailable
+from core.prompts import VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER
+_IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"}
@dataclasses.dataclass
class ExtractedPage:
    """Text extracted from one page, plus which OCR tier produced it.

    Instances are written to the OCR cache with ``dataclasses.asdict`` and
    rebuilt with ``ExtractedPage(**record)``, so the field names are also the
    JSON keys of a cache record.
    """

    # Page number as reported by the extractor; single images use page 1.
    page: int
    # Extracted text for the page.
    text: str
    # Tier that produced ``text``: "text_pdf" | "tesseract" | "vision_llm".
    source_type: str
    # Confidence attached to ``text`` (1.0 for text PDFs, the mean Tesseract
    # word confidence for Tier 2, a fixed 0.95 for vision-LLM output).
    confidence: float
    # Per-tier diagnostics, e.g. {"tesseract_conf": ..., "vision_used": ...}.
    raw_tier_results: dict
def _cache_path(file_path: Path) -> Path:
    """Return the cache file location for *file_path*, keyed by content hash.

    The cache key is the MD5 hex digest of the file's bytes, so renaming or
    moving a document does not invalidate its cached OCR result, while any
    change to its content does. ``OCR_CACHE_DIR`` is created if missing.

    Raises:
        OSError: if *file_path* cannot be opened or read.
    """
    # MD5 is only a content fingerprint here, not a security control; saying
    # so keeps it usable on Python builds that restrict insecure hashes.
    digest = hashlib.md5(usedforsecurity=False)
    with file_path.open("rb") as fh:
        # Stream in 1 MiB chunks so large scans are never held in memory whole.
        while chunk := fh.read(1 << 20):
            digest.update(chunk)
    OCR_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    return OCR_CACHE_DIR / f"{digest.hexdigest()}.json"
def _load_cache(file_path: Path) -> list[ExtractedPage] | None:
    """Return the cached pages for *file_path*, or ``None`` on a cache miss.

    A cache file that is missing, unreadable, not valid JSON, or whose records
    no longer match the ``ExtractedPage`` fields is treated as a miss, so the
    caller re-extracts the document instead of failing on a bad cache entry.

    Raises:
        OSError: if *file_path* itself cannot be read to compute its hash.
    """
    # Deliberately outside the try: a missing source document is a real error.
    cache_file = _cache_path(file_path)
    try:
        records = json.loads(cache_file.read_text(encoding="utf-8"))
        return [ExtractedPage(**record) for record in records]
    except (OSError, ValueError, TypeError):
        # OSError: no cache file (or unreadable). ValueError: truncated or
        # corrupt JSON / bad encoding. TypeError: record shape does not match
        # the dataclass (e.g. written by an older schema).
        return None
def _save_cache(file_path: Path, pages: list[ExtractedPage]) -> None:
    """Persist *pages* as the cached OCR result for *file_path*.

    Pages are serialised with ``dataclasses.asdict`` into a JSON array. The
    payload is written to a sibling ``.tmp`` file and then renamed over the
    final path, so an interrupted run cannot leave a half-written cache file
    behind under the real cache name.

    Raises:
        OSError: if the source file cannot be hashed or the cache cannot be
            written.
    """
    cache_file = _cache_path(file_path)
    payload = json.dumps(
        [dataclasses.asdict(page) for page in pages], ensure_ascii=False
    )
    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    # Path.replace overwrites the destination in a single rename step.
    tmp_file.replace(cache_file)
+def _tesseract_extract(pil_image) -> tuple[str, float]:
+    try:
+        import pytesseract
+        data = pytesseract.image_to_data(
+            pil_image, output_type=pytesseract.Output.DATAFRAME
+        )
+        valid = data[data["conf"] != -1]
+        mean_conf = float(valid["conf"].mean()) / 100 if len(valid) > 0 else 0.0
+        text = " ".join(str(w) for w in valid["text"] if str(w).strip())
+        return text, mean_conf
+    except Exception:
+        return "", 0.0
def _vision_extract(pil_image) -> str | None:
    """Run Tier 3 OCR by sending the image to the vision LLM.

    The image is converted to RGB and encoded as PNG bytes in memory, then
    passed to ``LLM().chat_vision`` together with the vision-OCR system and
    user prompts.

    Returns:
        Whatever ``chat_vision`` returns, or ``None`` when the client raises
        ``LLMUnavailable``.
    """
    with io.BytesIO() as png_buffer:
        pil_image.convert("RGB").save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    try:
        return LLM().chat_vision(
            VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER, png_bytes
        )
    except LLMUnavailable:
        return None
def _process_image(pil_image, page_no: int) -> ExtractedPage:
    """OCR a single page image, escalating from Tesseract to the vision LLM.

    Tesseract (Tier 2) runs first. Its result is accepted when the mean
    confidence reaches ``OCR_TESSERACT_MIN_CONF`` and at least 20 characters
    of stripped text were recognised. Otherwise the vision LLM (Tier 3) is
    tried; if it yields text, that text is returned with a fixed confidence
    of 0.95 and a ``vision_ocr_invoked`` audit entry is logged. If Tier 3
    yields nothing, the Tier 2 result is returned unchanged with its own
    confidence.

    Args:
        pil_image: page image passed to both OCR tiers.
        page_no: page number recorded on the returned ``ExtractedPage``.
    """
    ocr_text, ocr_conf = _tesseract_extract(pil_image)

    def tesseract_page() -> ExtractedPage:
        # Shared by the "Tier 2 is good enough" and "Tier 3 failed" exits.
        return ExtractedPage(
            page=page_no,
            text=ocr_text,
            source_type="tesseract",
            confidence=ocr_conf,
            raw_tier_results={"tesseract_conf": ocr_conf, "vision_used": False},
        )

    confident_enough = ocr_conf >= OCR_TESSERACT_MIN_CONF
    long_enough = len(ocr_text.strip()) >= 20
    if confident_enough and long_enough:
        return tesseract_page()

    # Tier 3
    llm_text = _vision_extract(pil_image)
    if not llm_text:
        # Tier 3 failed — fall back to whatever Tier 2 produced.
        return tesseract_page()

    audit.log("vision_ocr_invoked", page=page_no, tesseract_conf=round(ocr_conf, 3))
    return ExtractedPage(
        page=page_no,
        text=llm_text,
        source_type="vision_llm",
        confidence=0.95,
        raw_tier_results={"tesseract_conf": ocr_conf, "vision_used": True},
    )
 def extract_document(file_path: Path) -> list[ExtractedPage]:
+    cached = _load_cache(file_path)
+    if cached is not None:
+        return cached
+    suffix = file_path.suffix.lower()
+    if suffix in _IMAGE_SUFFIXES:
+        from PIL import Image
+        img = Image.open(file_path).convert("RGB")
+        pages = [_process_image(img, 1)]
+    else:
+        from core.pdf_utils import extract_pages, is_text_pdf, render_page_to_image
+        if is_text_pdf(file_path):
+            raw_pages = extract_pages(file_path)
+            pages = [
+                ExtractedPage(
+                    page=p["page"],
+                    text=p["text"],
+                    source_type="text_pdf",
+                    confidence=1.0,
+                    raw_tier_results={"tesseract_conf": None, "vision_used": False},
+                )
+                for p in raw_pages
+                if p["text"].strip()
+            ]
+        else:
+            import fitz
+            doc = fitz.open(str(file_path))
+            n_pages = doc.page_count
+            doc.close()
+            pages = []
+            for i in range(1, n_pages + 1):
+                img = render_page_to_image(file_path, i)
+                pages.append(_process_image(img, i))
+    _save_cache(file_path, pages)
+    return pages

specs/04_ocr_pipeline.md ADDED Viewed

	@@ -0,0 +1,97 @@

+# Spec 04 — OCR Pipeline
+**Step:** 7 of 15
+**Time budget:** ~30 min
+**Checkpoint:** `extract_document(Path("data/bidders/bidder_c/turnover_certificate_scan.png"))` returns a list with `source_type` reflecting the OCR tier used.
+---
+## Goal
+Implement `core/ocr_pipeline.py` — the three-tier OCR orchestrator. For each document/image, determines the best extraction method: PyMuPDF text (Tier 1), Tesseract (Tier 2), or DeepSeek Vision LLM (Tier 3). Caches results per file to avoid re-OCR on re-runs.
+---
+## `ExtractedPage` dataclass
+```python
+@dataclasses.dataclass
+class ExtractedPage:
+    page: int
+    text: str
+    source_type: str  # "text_pdf" | "tesseract" | "vision_llm"
+    confidence: float
+    raw_tier_results: dict
+```
+---
+## `extract_document(file_path: Path) -> list[ExtractedPage]`
+### Cache check
+- Compute `file_hash = hashlib.md5(file_path.read_bytes()).hexdigest()`.
+- Cache path: `OCR_CACHE_DIR / f"{file_hash}.json"`.
+- If cache exists: deserialize and return `list[ExtractedPage]`.
+### Routing
+**Case A — Image file (PNG/JPG/JPEG/BMP/TIFF):**
+- Treat as single page (page=1).
+- Go directly to Tier 2 (Tesseract).
+- If Tier 2 confidence < `OCR_TESSERACT_MIN_CONF` or the stripped text is shorter than 20 characters: try Tier 3.
+**Case B — PDF file:**
+- Call `pdf_utils.is_text_pdf(file_path)`.
+- If `True`: Tier 1 — call `pdf_utils.extract_pages(file_path)`, set `source_type="text_pdf"`, `confidence=1.0`.
+- If `False`: for each page, render to image via `pdf_utils.render_page_to_image`, then Tier 2.
+### Tier 2 — Tesseract
+```python
+import pytesseract
+data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DATAFRAME)
+# Filter rows with conf != -1
+valid = data[data["conf"] != -1]
+mean_conf = float(valid["conf"].mean()) / 100 if len(valid) > 0 else 0.0
+text = " ".join(str(w) for w in valid["text"] if str(w).strip())
+```
+If `mean_conf < OCR_TESSERACT_MIN_CONF` OR `len(text.strip()) < 20`: attempt Tier 3.
+### Tier 3 — DeepSeek Vision LLM
+- Convert PIL Image to PNG bytes via `io.BytesIO`.
+- Call `LLM().chat_vision(VISION_OCR_PROMPT_SYSTEM, VISION_OCR_USER, image_bytes)`.
+- On success: `source_type="vision_llm"`, `confidence=0.95`.
+- Log `vision_ocr_invoked` audit entry.
+- On `LLMUnavailable`: keep Tier 2 result with its `confidence` (will trigger `needs_review` downstream).
+### Cache write
+After processing all pages, serialize to JSON and save to cache file.
+---
+## Serialization format for cache
+```json
+[
+  {
+    "page": 1,
+    "text": "...",
+    "source_type": "text_pdf",
+    "confidence": 1.0,
+    "raw_tier_results": {"tesseract_conf": null, "vision_used": false}
+  }
+]
+```
+---
+## Acceptance Criteria
+1. `extract_document(Path("data/bidders/bidder_a/audited_financials.pdf"))` returns pages with `source_type="text_pdf"`.
+2. `extract_document(Path("data/bidders/bidder_c/turnover_certificate_scan.png"))` — if Tesseract is available and confidence < 0.65, attempts vision LLM (or returns tesseract result with low confidence when LLM unavailable).
+3. Second call to `extract_document` on same file returns cached result (no re-processing).
+4. Each returned `ExtractedPage` has non-empty `text`.