Step 5: pdf_utils and chunker — PyMuPDF extraction and text chunking
Browse files
Implements specs/03_pdf_utils_and_chunker.md. extract_pages returns per-page
dicts; is_text_pdf uses avg-chars heuristic; render_page_to_image produces PIL
images for OCR. chunk_tender splits on clause headings up to 2000 chars;
chunk_bidder emits one chunk per ExtractedPage with full OCR metadata.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- core/chunker.py +54 -2
- core/pdf_utils.py +24 -3
- specs/03_pdf_utils_and_chunker.md +80 -0
core/chunker.py
CHANGED
|
@@ -1,11 +1,63 @@
|
|
|
|
|
|
|
|
| 1 |
from core.ocr_pipeline import ExtractedPage
|
| 2 |
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def chunk_tender(pages: list[dict], tender_id: str) -> list[dict]:
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def chunk_bidder(
|
| 9 |
pages: list[ExtractedPage], bidder_id: str, doc_name: str
|
| 10 |
) -> list[dict]:
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
from core.ocr_pipeline import ExtractedPage
|
| 4 |
|
| 5 |
+
_MAX_CHUNK_CHARS = 2000

# Zero-width split point in front of a numbered clause heading such as
# "3 Scope" or "4.2.1 Payment". The inner group is deliberately non-capturing:
# re.split() inserts the text of every capturing group (or None when the group
# did not participate) into its result list, which would corrupt the pieces.
_CLAUSE_HEADING_RE = re.compile(r'(?m)(?=^\d+(?:\.\d+)*\s+)')

# Zero-width split point right after a blank line, so the separator stays
# attached to the paragraph that precedes it and no text is lost.
_PARAGRAPH_BREAK_RE = re.compile(r'(?<=\n\n)')


def _split_oversized(section: str, limit: int) -> list[str]:
    """Break *section* into fragments of at most *limit* characters.

    Paragraph boundaries are preferred; a paragraph that is still too long is
    sliced at fixed offsets as a last resort.
    """
    fragments = []
    for paragraph in _PARAGRAPH_BREAK_RE.split(section):
        fragments.extend(
            paragraph[start:start + limit]
            for start in range(0, len(paragraph), limit)
        )
    return fragments


def _pack_fragments(fragments: list[str], limit: int) -> list[str]:
    """Greedily merge consecutive fragments into pieces of at most *limit* chars."""
    pieces = []
    current = ""
    for fragment in fragments:
        if len(current) + len(fragment) <= limit:
            current += fragment
        else:
            if current:
                pieces.append(current)
            current = fragment
    if current:
        pieces.append(current)
    return pieces


def chunk_tender(pages: list[dict], tender_id: str) -> list[dict]:
    """Split tender pages into text chunks of at most ``_MAX_CHUNK_CHARS`` characters.

    Args:
        pages: Dicts with a ``"page"`` number and a ``"text"`` string.
        tender_id: Identifier copied into every chunk and used as the
            ``chunk_id`` prefix.

    Returns:
        A list of ``{"text", "tender_id", "page", "chunk_id"}`` dicts. Pages
        whose text is empty or whitespace-only produce no chunks. Chunks never
        span more than one page.
    """
    chunks = []
    for page_dict in pages:
        page_no = page_dict["page"]
        # Treat a missing (None) text the same as an empty page.
        text = (page_dict["text"] or "").strip()
        if not text:
            continue
        if len(text) <= _MAX_CHUNK_CHARS:
            pieces = [text]
        else:
            # First cut at clause headings; any section that is still too long
            # is cut at paragraph breaks (and hard-sliced if necessary) so that
            # no fragment exceeds the limit before packing.
            fragments = []
            for section in _CLAUSE_HEADING_RE.split(text):
                if len(section) <= _MAX_CHUNK_CHARS:
                    fragments.append(section)
                else:
                    fragments.extend(_split_oversized(section, _MAX_CHUNK_CHARS))
            pieces = _pack_fragments(fragments, _MAX_CHUNK_CHARS)

        for i, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue
            chunks.append({
                "text": piece,
                "tender_id": tender_id,
                "page": page_no,
                "chunk_id": f"{tender_id}_p{page_no}_c{i}",
            })
    return chunks
|
| 43 |
|
| 44 |
|
| 45 |
def chunk_bidder(
    pages: list["ExtractedPage"], bidder_id: str, doc_name: str
) -> list[dict]:
    """Build one chunk per non-empty extracted page of a bidder document.

    Args:
        pages: Extracted pages; each is read for its ``text``, ``page``,
            ``source_type`` and ``confidence`` attributes.
        bidder_id: Identifier copied into every chunk and used as the
            ``chunk_id`` prefix.
        doc_name: Document name stored unchanged in each chunk; a copy with
            ``/`` and ``\\`` replaced by ``_`` is used inside ``chunk_id``.

    Returns:
        A list of chunk dicts with the keys ``text``, ``bidder_id``,
        ``doc_name``, ``page``, ``source_type``, ``ocr_confidence`` and
        ``chunk_id``. Pages whose text is missing or whitespace-only are
        skipped.
    """
    # The sanitised name does not depend on the page, so compute it once.
    safe_doc = doc_name.replace("/", "_").replace("\\", "_")
    chunks = []
    for page in pages:
        text = page.text.strip() if page.text else ""
        if not text:
            continue
        chunks.append({
            "text": text,
            "bidder_id": bidder_id,
            "doc_name": doc_name,
            "page": page.page,
            "source_type": page.source_type,
            "ocr_confidence": page.confidence,
            "chunk_id": f"{bidder_id}_{safe_doc}_p{page.page}",
        })
    return chunks
|
core/pdf_utils.py
CHANGED
|
@@ -1,14 +1,35 @@
|
|
| 1 |
from pathlib import Path
|
|
|
|
|
|
|
| 2 |
import PIL.Image
|
| 3 |
|
| 4 |
|
| 5 |
def extract_pages(path: Path) -> list[dict]:
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def is_text_pdf(path: Path) -> bool:
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image:
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import fitz
|
| 4 |
import PIL.Image
|
| 5 |
|
| 6 |
|
| 7 |
def extract_pages(path: Path) -> list[dict]:
    """Extract the plain text of every page of the PDF at *path*.

    Args:
        path: Location of the PDF file.

    Returns:
        One ``{"page": n, "text": str}`` dict per page, in document order,
        with ``n`` starting at 1.
    """
    # The context manager closes the document even if text extraction raises.
    with fitz.open(str(path)) as doc:
        return [
            {"page": page_no, "text": page.get_text("text")}
            for page_no, page in enumerate(doc, start=1)
        ]
|
| 15 |
|
| 16 |
|
| 17 |
def is_text_pdf(path: Path, min_avg_chars: int = 50) -> bool:
    """Report whether the PDF at *path* carries an extractable text layer.

    Args:
        path: Location of the PDF file.
        min_avg_chars: Minimum average number of extracted characters per
            page for the document to count as a text PDF.

    Returns:
        ``True`` when the average extracted text length per page is at least
        *min_avg_chars*; ``False`` otherwise, including for a document with
        no pages.
    """
    # The context manager closes the document on every exit path.
    with fitz.open(str(path)) as doc:
        page_count = doc.page_count
        if not page_count:
            return False
        total_chars = sum(len(page.get_text("text")) for page in doc)
    return total_chars / page_count >= min_avg_chars
|
| 26 |
|
| 27 |
|
| 28 |
def render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image:
    """Render one page of the PDF at *path* to an RGB PIL image.

    Args:
        path: Location of the PDF file.
        page_no: 1-based page number.
        dpi: Output resolution; the page is scaled by ``dpi / 72``.

    Returns:
        The rendered page as an ``"RGB"`` ``PIL.Image.Image``.

    Raises:
        IndexError: If *page_no* is not between 1 and the page count.
    """
    # The context manager closes the document even if rendering raises.
    with fitz.open(str(path)) as doc:
        # Without this check page_no == 0 would index doc[-1] and silently
        # render the last page instead of failing.
        if not 1 <= page_no <= doc.page_count:
            raise IndexError(
                f"page_no {page_no} out of range 1..{doc.page_count}"
            )
        zoom = dpi / 72
        pix = doc[page_no - 1].get_pixmap(
            matrix=fitz.Matrix(zoom, zoom),
            colorspace=fitz.csRGB,
            alpha=False,  # keep 3 bytes per pixel so the buffer matches "RGB"
        )
        return PIL.Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
|
specs/03_pdf_utils_and_chunker.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spec 03 — PDF Utils and Chunker
|
| 2 |
+
|
| 3 |
+
**Step:** 5 of 15
|
| 4 |
+
**Time budget:** ~15 min
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Goal
|
| 9 |
+
|
| 10 |
+
Implement `core/pdf_utils.py` (PyMuPDF text extraction and page rendering) and `core/chunker.py` (text → chunks with metadata).
|
| 11 |
+
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
## `core/pdf_utils.py`
|
| 15 |
+
|
| 16 |
+
### `extract_pages(path: Path) -> list[dict]`
|
| 17 |
+
|
| 18 |
+
- Opens the PDF with `fitz.open(str(path))`.
|
| 19 |
+
- For each page `i`: extracts text via `page.get_text("text")`.
|
| 20 |
+
- Returns `[{"page": i+1, "text": text}, ...]` (1-indexed pages).
|
| 21 |
+
|
| 22 |
+
### `is_text_pdf(path: Path) -> bool`
|
| 23 |
+
|
| 24 |
+
- Opens the PDF.
|
| 25 |
+
- Computes average characters per page across all pages.
|
| 26 |
+
- Returns `True` if the average is ≥ 50 characters per page (a heuristic for telling text-based PDFs apart from scanned, image-only ones).
|
| 27 |
+
|
| 28 |
+
### `render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image`
|
| 29 |
+
|
| 30 |
+
- Opens the PDF.
|
| 31 |
+
- Gets page at index `page_no - 1` (0-indexed).
|
| 32 |
+
- Creates `fitz.Matrix(dpi/72, dpi/72)` and renders via `page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)`.
|
| 33 |
+
- Converts the pixmap to a PIL Image via `PIL.Image.frombytes("RGB", (pix.width, pix.height), pix.samples)`.
|
| 34 |
+
- Returns the PIL Image.
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## `core/chunker.py`
|
| 39 |
+
|
| 40 |
+
### `chunk_tender(pages: list[dict], tender_id: str) -> list[dict]`
|
| 41 |
+
|
| 42 |
+
Input: list of `{"page": int, "text": str}` dicts.
|
| 43 |
+
|
| 44 |
+
Strategy:
|
| 45 |
+
- Join page text. Split on clause headings detected by regex `r'^\d+(?:\.\d+)*\s+'` (multiline; the group is non-capturing so that `re.split` does not insert group text into its result).
|
| 46 |
+
- Each chunk: up to ~500 tokens (~2000 chars). If a section is longer, split on `\n\n` boundaries.
|
| 47 |
+
- Each chunk dict: `{"text": str, "tender_id": str, "page": int, "chunk_id": str}`.
|
| 48 |
+
- `chunk_id` = `f"{tender_id}_p{page}_c{i}"`.
|
| 49 |
+
|
| 50 |
+
Simpler implementation (sufficient for 5-page mock tender):
|
| 51 |
+
- One chunk per page section: for each page, if text > 2000 chars split into ~2000-char pieces; else one chunk.
|
| 52 |
+
|
| 53 |
+
### `chunk_bidder(pages: list[ExtractedPage], bidder_id: str, doc_name: str) -> list[dict]`
|
| 54 |
+
|
| 55 |
+
Input: list of `ExtractedPage` objects.
|
| 56 |
+
|
| 57 |
+
Strategy: one chunk per page.
|
| 58 |
+
|
| 59 |
+
Each chunk dict:
|
| 60 |
+
```python
|
| 61 |
+
{
|
| 62 |
+
"text": page.text,
|
| 63 |
+
"bidder_id": bidder_id,
|
| 64 |
+
"doc_name": doc_name,
|
| 65 |
+
"page": page.page,
|
| 66 |
+
"source_type": page.source_type,
|
| 67 |
+
"ocr_confidence": page.confidence,
|
| 68 |
+
"chunk_id": f"{bidder_id}_{doc_name}_p{page.page}",
|
| 69 |
+
}
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## Acceptance Criteria
|
| 75 |
+
|
| 76 |
+
1. `extract_pages(Path("data/tender/crpf_construction_tender.pdf"))` returns a list of dicts with non-empty text on most pages.
|
| 77 |
+
2. `is_text_pdf(Path("data/tender/crpf_construction_tender.pdf"))` returns `True`.
|
| 78 |
+
3. `render_page_to_image(Path("data/tender/crpf_construction_tender.pdf"), 1)` returns a PIL Image with width > 0.
|
| 79 |
+
4. `chunk_tender(pages, "tender_001")` returns a non-empty list of dicts each having a "text" key.
|
| 80 |
+
5. Each bidder chunk has all required metadata keys.
|