JaydeepR Claude Sonnet 4.6 committed on
Commit
f42bfb0
·
1 Parent(s): c419ba2

Step 5: pdf_utils and chunker — PyMuPDF extraction and text chunking

Browse files

Implements specs/03_pdf_utils_and_chunker.md. extract_pages returns per-page
dicts; is_text_pdf uses avg-chars heuristic; render_page_to_image produces PIL
images for OCR. chunk_tender splits on clause headings up to 2000 chars;
chunk_bidder emits one chunk per ExtractedPage with full OCR metadata.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

core/chunker.py CHANGED
@@ -1,11 +1,63 @@
 
 
1
  from core.ocr_pipeline import ExtractedPage
2
 
 
 
3
 
4
  def chunk_tender(pages: list[dict], tender_id: str) -> list[dict]:
5
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
 
8
  def chunk_bidder(
9
  pages: list[ExtractedPage], bidder_id: str, doc_name: str
10
  ) -> list[dict]:
11
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
  from core.ocr_pipeline import ExtractedPage
4
 
5
_MAX_CHUNK_CHARS = 2000

# Zero-width boundary in front of a numbered clause heading at the start of a
# line ("3 ", "3.1 ", "10.2.4 "). The inner group is non-capturing on purpose:
# re.split() returns the contents of capturing groups as extra list items
# (None when the group took no part in the match), which would corrupt the
# list of sections.
_CLAUSE_BOUNDARY = re.compile(r'(?m)(?=^\d+(?:\.\d+)*\s+)')

# Zero-width boundary right after a blank line, so the separator stays attached
# to the paragraph that precedes it and no text is lost when splitting.
_PARAGRAPH_BOUNDARY = re.compile(r'(?<=\n\n)')


def _iter_fragments(text: str, limit: int):
    """Yield consecutive fragments of *text*, none longer than *limit*.

    Fragments are cut at clause headings first. A clause section that is
    still too long is cut at blank lines, and a paragraph that is still too
    long is sliced into fixed-size pieces. Concatenating the fragments
    reproduces *text* exactly.
    """
    for section in _CLAUSE_BOUNDARY.split(text):
        if len(section) <= limit:
            yield section
            continue
        for paragraph in _PARAGRAPH_BOUNDARY.split(section):
            # range() yields a single slice when the paragraph already fits.
            for start in range(0, len(paragraph), limit):
                yield paragraph[start:start + limit]


def _pack_fragments(fragments, limit: int) -> list[str]:
    """Greedily merge adjacent fragments into pieces of at most *limit* chars."""
    packed = []
    current = ""
    for fragment in fragments:
        if len(current) + len(fragment) <= limit:
            current += fragment
            continue
        if current:
            packed.append(current)
        current = fragment
    if current:
        packed.append(current)
    return packed


def chunk_tender(pages: list[dict], tender_id: str) -> list[dict]:
    """Split tender pages into chunks of at most ``_MAX_CHUNK_CHARS`` characters.

    Args:
        pages: Dicts with a ``"page"`` number and a ``"text"`` string, one
            per page.
        tender_id: Identifier copied into every chunk and used as the prefix
            of its ``chunk_id``.

    Returns:
        A list of dicts with the keys ``"text"``, ``"tender_id"``, ``"page"``
        and ``"chunk_id"`` (``f"{tender_id}_p{page}_c{i}"``, where ``i``
        counts the chunks emitted for that page from 0). Pages whose text is
        empty after stripping produce no chunks.
    """
    chunks = []
    for page_dict in pages:
        page_no = page_dict["page"]
        text = page_dict["text"].strip()
        if not text:
            continue
        if len(text) <= _MAX_CHUNK_CHARS:
            pieces = [text]
        else:
            pieces = _pack_fragments(
                _iter_fragments(text, _MAX_CHUNK_CHARS), _MAX_CHUNK_CHARS
            )

        # Strip and drop blank pieces before numbering so that chunk indices
        # stay contiguous within the page.
        stripped = (piece.strip() for piece in pieces)
        for i, piece in enumerate(p for p in stripped if p):
            chunks.append({
                "text": piece,
                "tender_id": tender_id,
                "page": page_no,
                "chunk_id": f"{tender_id}_p{page_no}_c{i}",
            })
    return chunks
43
 
44
 
45
  def chunk_bidder(
46
  pages: list[ExtractedPage], bidder_id: str, doc_name: str
47
  ) -> list[dict]:
48
+ chunks = []
49
+ for page in pages:
50
+ text = page.text.strip() if page.text else ""
51
+ if not text:
52
+ continue
53
+ safe_doc = doc_name.replace("/", "_").replace("\\", "_")
54
+ chunks.append({
55
+ "text": text,
56
+ "bidder_id": bidder_id,
57
+ "doc_name": doc_name,
58
+ "page": page.page,
59
+ "source_type": page.source_type,
60
+ "ocr_confidence": page.confidence,
61
+ "chunk_id": f"{bidder_id}_{safe_doc}_p{page.page}",
62
+ })
63
+ return chunks
core/pdf_utils.py CHANGED
@@ -1,14 +1,35 @@
1
  from pathlib import Path
 
 
2
  import PIL.Image
3
 
4
 
5
  def extract_pages(path: Path) -> list[dict]:
6
- raise NotImplementedError
 
 
 
 
 
 
7
 
8
 
9
  def is_text_pdf(path: Path) -> bool:
10
- raise NotImplementedError
 
 
 
 
 
 
 
11
 
12
 
13
  def render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image:
14
- raise NotImplementedError
 
 
 
 
 
 
 
1
  from pathlib import Path
2
+
3
+ import fitz
4
  import PIL.Image
5
 
6
 
7
  def extract_pages(path: Path) -> list[dict]:
8
+ doc = fitz.open(str(path))
9
+ pages = []
10
+ for i, page in enumerate(doc):
11
+ text = page.get_text("text")
12
+ pages.append({"page": i + 1, "text": text})
13
+ doc.close()
14
+ return pages
15
 
16
 
17
  def is_text_pdf(path: Path) -> bool:
18
+ doc = fitz.open(str(path))
19
+ if not doc.page_count:
20
+ doc.close()
21
+ return False
22
+ total_chars = sum(len(page.get_text("text")) for page in doc)
23
+ avg = total_chars / doc.page_count
24
+ doc.close()
25
+ return avg >= 50
26
 
27
 
28
  def render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image:
29
+ doc = fitz.open(str(path))
30
+ page = doc[page_no - 1]
31
+ mat = fitz.Matrix(dpi / 72, dpi / 72)
32
+ pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
33
+ img = PIL.Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
34
+ doc.close()
35
+ return img
specs/03_pdf_utils_and_chunker.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spec 03 — PDF Utils and Chunker
2
+
3
+ **Step:** 5 of 15
4
+ **Time budget:** ~15 min
5
+
6
+ ---
7
+
8
+ ## Goal
9
+
10
+ Implement `core/pdf_utils.py` (PyMuPDF text extraction and page rendering) and `core/chunker.py` (text → chunks with metadata).
11
+
12
+ ---
13
+
14
+ ## `core/pdf_utils.py`
15
+
16
+ ### `extract_pages(path: Path) -> list[dict]`
17
+
18
+ - Opens the PDF with `fitz.open(str(path))`.
19
+ - For each page `i`: extracts text via `page.get_text("text")`.
20
+ - Returns `[{"page": i+1, "text": text}, ...]` (1-indexed pages).
21
+
22
+ ### `is_text_pdf(path: Path) -> bool`
23
+
24
+ - Opens the PDF.
25
+ - Computes average characters per page across all pages.
26
+ - Returns `True` if average ≥ 50 characters per page (heuristic for typed PDF vs scanned blank pages).
27
+
28
+ ### `render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image`
29
+
30
+ - Opens the PDF.
31
+ - Gets page at index `page_no - 1` (0-indexed).
32
+ - Creates `fitz.Matrix(dpi/72, dpi/72)` and renders via `page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)`.
33
+ - Converts pixmap to PIL Image via `Image.frombytes("RGB", [pix.width, pix.height], pix.samples)`.
34
+ - Returns the PIL Image.
35
+
36
+ ---
37
+
38
+ ## `core/chunker.py`
39
+
40
+ ### `chunk_tender(pages: list[dict], tender_id: str) -> list[dict]`
41
+
42
+ Input: list of `{"page": int, "text": str}` dicts.
43
+
44
+ Strategy:
45
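+ - Join page text. Split on clause headings detected by regex `r'^\d+(?:\.\d+)*\s+'` (multiline). The group must be non-capturing: `re.split` inserts the contents of capturing groups (or `None`) into its result.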
46
+ - Each chunk: up to ~500 tokens (~2000 chars). If a section is longer, split on `\n\n` boundaries.
47
+ - Each chunk dict: `{"text": str, "tender_id": str, "page": int, "chunk_id": str}`.
48
+ - `chunk_id` = `f"{tender_id}_p{page}_c{i}"`.
49
+
50
+ Simpler implementation (sufficient for 5-page mock tender):
51
+ - One chunk per page section: for each page, if text > 2000 chars split into ~2000-char pieces; else one chunk.
52
+
53
+ ### `chunk_bidder(pages: list[ExtractedPage], bidder_id: str, doc_name: str) -> list[dict]`
54
+
55
+ Input: list of `ExtractedPage` objects.
56
+
57
+ Strategy: one chunk per page.
58
+
59
+ Each chunk dict:
60
+ ```python
61
+ {
62
+ "text": page.text,
63
+ "bidder_id": bidder_id,
64
+ "doc_name": doc_name,
65
+ "page": page.page,
66
+ "source_type": page.source_type,
67
+ "ocr_confidence": page.confidence,
68
+ "chunk_id": f"{bidder_id}_{doc_name}_p{page.page}",
69
+ }
70
+ ```
71
+
72
+ ---
73
+
74
+ ## Acceptance Criteria
75
+
76
+ 1. `extract_pages(Path("data/tender/crpf_construction_tender.pdf"))` returns a list of dicts with non-empty text on most pages.
77
+ 2. `is_text_pdf(Path("data/tender/crpf_construction_tender.pdf"))` returns `True`.
78
+ 3. `render_page_to_image(Path("data/tender/crpf_construction_tender.pdf"), 1)` returns a PIL Image with width > 0.
79
+ 4. `chunk_tender(pages, "tender_001")` returns a non-empty list of dicts each having a "text" key.
80
+ 5. Each bidder chunk has all required metadata keys.