File size: 1,876 Bytes
f42bfb0 661eb14 f42bfb0 661eb14 f42bfb0 661eb14 f42bfb0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | import re
from core.ocr_pipeline import ExtractedPage
_MAX_CHUNK_CHARS = 2000

# Zero-width split point in front of every line that opens with a numbered
# clause heading such as "3 " or "4.2.1 ".  The inner group must stay
# non-capturing: re.split() inserts captured groups (or None when the group
# did not participate in the match) into its result list.
_CLAUSE_HEADING_RE = re.compile(r'(?m)(?=^\d+(?:\.\d+)*\s+)')

# Zero-width split point right after a blank line, so every paragraph keeps
# its trailing separator and joining the parts reproduces the input exactly.
_PARAGRAPH_BREAK_RE = re.compile(r'(?<=\n\n)')


def _split_oversized(segment: str, max_chars: int) -> list[str]:
    """Break *segment* into parts that are each at most *max_chars* long."""
    if len(segment) <= max_chars:
        return [segment]
    parts = []
    for paragraph in _PARAGRAPH_BREAK_RE.split(segment):
        if len(paragraph) <= max_chars:
            parts.append(paragraph)
        else:
            # No natural boundary left: fall back to fixed-width slices.
            parts.extend(
                paragraph[start:start + max_chars]
                for start in range(0, len(paragraph), max_chars)
            )
    return parts


def _pack_segments(segments: list[str], max_chars: int) -> list[str]:
    """Greedily merge consecutive *segments* without exceeding *max_chars*."""
    packed = []
    buffer = []
    size = 0
    for segment in segments:
        if size + len(segment) <= max_chars:
            buffer.append(segment)
            size += len(segment)
            continue
        if size:
            packed.append("".join(buffer))
        buffer = [segment]
        size = len(segment)
    if size:
        packed.append("".join(buffer))
    return packed


def chunk_tender(
    pages: list[dict],
    tender_id: str,
    *,
    max_chars: int = _MAX_CHUNK_CHARS,
) -> list[dict]:
    """Split tender pages into text chunks of at most *max_chars* characters.

    A page whose stripped text already fits is emitted as a single chunk.
    A longer page is cut in front of numbered clause headings; a clause that
    is still too long is cut after blank lines and, as a last resort, into
    fixed-width slices.  Neighbouring parts are then merged greedily as long
    as the merged text stays within the limit.

    Args:
        pages: Mappings that each provide a ``"page"`` and a ``"text"`` entry.
            A ``None`` or whitespace-only text causes the page to be skipped.
        tender_id: Identifier copied into every chunk and used as the prefix
            of its ``chunk_id``.
        max_chars: Upper bound on the length of each chunk's text.

    Returns:
        One dict per chunk with the keys ``"text"``, ``"tender_id"``,
        ``"page"`` and ``"chunk_id"`` (``"<tender_id>_p<page>_c<index>"``).

    Raises:
        ValueError: If *max_chars* is smaller than 1.
        KeyError: If a page mapping lacks ``"page"`` or ``"text"``.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    for page_dict in pages:
        page_no = page_dict["page"]
        text = (page_dict["text"] or "").strip()
        if not text:
            continue

        if len(text) <= max_chars:
            pieces = [text]
        else:
            segments = []
            for clause in _CLAUSE_HEADING_RE.split(text):
                segments.extend(_split_oversized(clause, max_chars))
            pieces = _pack_segments(segments, max_chars)

        # The index is taken before empty pieces are dropped, so chunk ids
        # stay tied to the position of the piece within its page.
        for i, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue
            chunks.append({
                "text": piece,
                "tender_id": tender_id,
                "page": page_no,
                "chunk_id": f"{tender_id}_p{page_no}_c{i}",
            })
    return chunks
def chunk_bidder(
    pages: list[ExtractedPage], bidder_id: str, doc_name: str
) -> list[dict]:
    """Build one chunk per non-empty page of a bidder document.

    Args:
        pages: Extracted pages; each is read for its ``text``, ``page``,
            ``source_type`` and ``confidence`` attributes.  Pages whose text
            is ``None``, empty or whitespace-only are skipped.
        bidder_id: Identifier copied into every chunk and used as the prefix
            of its ``chunk_id``.
        doc_name: Document name stored unchanged in every chunk.  Forward and
            backward slashes are replaced by underscores only inside the
            ``chunk_id``.

    Returns:
        One dict per kept page with the keys ``"text"``, ``"bidder_id"``,
        ``"doc_name"``, ``"page"``, ``"source_type"``, ``"ocr_confidence"``
        and ``"chunk_id"`` (``"<bidder_id>_<safe doc name>_p<page>"``).
    """
    # The sanitised name does not depend on the page, so compute it once.
    safe_doc = doc_name.replace("/", "_").replace("\\", "_")

    chunks = []
    for page in pages:
        text = (page.text or "").strip()
        if not text:
            continue
        chunks.append({
            "text": text,
            "bidder_id": bidder_id,
            "doc_name": doc_name,
            "page": page.page,
            "source_type": page.source_type,
            "ocr_confidence": page.confidence,
            "chunk_id": f"{bidder_id}_{safe_doc}_p{page.page}",
        })
    return chunks
|