Spaces:

JaydeepR
/

TenderIQ

Sleeping

TenderIQ / core /chunker.py

Step 5: pdf_utils and chunker — PyMuPDF extraction and text chunking

f42bfb0 15 days ago

1.88 kB

	import re

	from core.ocr_pipeline import ExtractedPage

	_MAX_CHUNK_CHARS = 2000

	# Zero-width boundary in front of a numbered clause heading at the start of a
	# line ("3 ", "3.1 ", "3.1.2 ").  The dotted part MUST be a non-capturing
	# group: re.split() injects every capturing group's text (or None) into its
	# result list, which would corrupt or crash the packing step.
	_CLAUSE_BOUNDARY_RE = re.compile(r"(?=^\d+(?:\.\d+)*\s+)", re.MULTILINE)

	# Zero-width boundary right after a blank line, so the separator stays
	# attached to the preceding paragraph and no text is lost by splitting.
	_PARAGRAPH_BOUNDARY_RE = re.compile(r"(?<=\n\n)")


	def _pack_segments(segments: list[str], limit: int) -> list[str]:
	    """Greedily merge consecutive segments into pieces of at most *limit* chars.

	    A single segment that is longer than *limit* is emitted unmerged; the
	    caller decides how to break it down further.
	    """
	    pieces = []
	    current = ""
	    for segment in segments:
	        if len(current) + len(segment) <= limit:
	            current += segment
	            continue
	        if current:
	            pieces.append(current)
	        current = segment
	    if current:
	        pieces.append(current)
	    return pieces


	def _split_page_text(text: str, limit: int) -> list[str]:
	    """Split one page of text into pieces that are each at most *limit* chars.

	    Boundaries are tried from most to least meaningful: numbered clause
	    headings, then blank lines, then a fixed-size character window.
	    """
	    if len(text) <= limit:
	        return [text]

	    pieces = []
	    for clause_piece in _pack_segments(_CLAUSE_BOUNDARY_RE.split(text), limit):
	        if len(clause_piece) <= limit:
	            pieces.append(clause_piece)
	            continue
	        # The clause itself is too long: fall back to paragraph boundaries.
	        paragraphs = _PARAGRAPH_BOUNDARY_RE.split(clause_piece)
	        for para_piece in _pack_segments(paragraphs, limit):
	            if len(para_piece) <= limit:
	                pieces.append(para_piece)
	                continue
	            # Last resort: no usable boundary, cut into fixed-size windows so
	            # the size limit is always honoured.
	            pieces.extend(
	                para_piece[start:start + limit]
	                for start in range(0, len(para_piece), limit)
	            )
	    return pieces


	def chunk_tender(
	    pages: list[dict],
	    tender_id: str,
	    *,
	    max_chars: int = _MAX_CHUNK_CHARS,
	) -> list[dict]:
	    """Split tender pages into size-bounded text chunks.

	    Args:
	        pages: Page dicts, each read via its ``"page"`` and ``"text"`` keys.
	            A page whose text is ``None``, empty or whitespace-only is skipped.
	        tender_id: Identifier copied into every chunk and used as the prefix
	            of its ``chunk_id``.
	        max_chars: Upper bound on the length of each chunk's text.

	    Returns:
	        A list of dicts with the keys ``"text"``, ``"tender_id"``, ``"page"``
	        and ``"chunk_id"`` (``"<tender_id>_p<page>_c<index>"``).

	    Raises:
	        ValueError: If *max_chars* is smaller than 1.
	    """
	    if max_chars < 1:
	        raise ValueError("max_chars must be at least 1")

	    chunks = []
	    for page_dict in pages:
	        page_no = page_dict["page"]
	        # Treat None like empty text, matching how chunk_bidder handles it.
	        text = (page_dict["text"] or "").strip()
	        if not text:
	            continue

	        # The index comes from the unfiltered piece list, so a skipped
	        # whitespace-only piece leaves a gap in the chunk numbering.
	        for i, piece in enumerate(_split_page_text(text, max_chars)):
	            piece = piece.strip()
	            if not piece:
	                continue
	            chunks.append({
	                "text": piece,
	                "tender_id": tender_id,
	                "page": page_no,
	                "chunk_id": f"{tender_id}_p{page_no}_c{i}",
	            })
	    return chunks


	def chunk_bidder(
	    pages: list[ExtractedPage], bidder_id: str, doc_name: str
	) -> list[dict]:
	    """Build one chunk per non-empty page of a bidder document.

	    Args:
	        pages: Extracted pages; each is read via its ``text``, ``page``,
	            ``source_type`` and ``confidence`` attributes.
	        bidder_id: Identifier copied into every chunk and used as the prefix
	            of its ``chunk_id``.
	        doc_name: Document name stored verbatim in every chunk; a copy with
	            ``/`` and ``\\`` replaced by ``_`` is embedded in the ``chunk_id``.

	    Returns:
	        A list of dicts with the keys ``"text"``, ``"bidder_id"``,
	        ``"doc_name"``, ``"page"``, ``"source_type"``, ``"ocr_confidence"``
	        and ``"chunk_id"`` (``"<bidder_id>_<safe doc name>_p<page>"``).
	        Pages whose text is missing, empty or whitespace-only are skipped.
	    """
	    # Loop-invariant: the sanitised name depends only on doc_name, so compute
	    # it once instead of once per page.
	    safe_doc = doc_name.replace("/", "_").replace("\\", "_")

	    chunks = []
	    for page in pages:
	        text = (page.text or "").strip()
	        if not text:
	            continue
	        chunks.append({
	            "text": text,
	            "bidder_id": bidder_id,
	            "doc_name": doc_name,
	            "page": page.page,
	            "source_type": page.source_type,
	            "ocr_confidence": page.confidence,
	            "chunk_id": f"{bidder_id}_{safe_doc}_p{page.page}",
	        })
	    return chunks