"""Turn tender pages and bidder pages into flat lists of chunk records."""

import re

from core.ocr_pipeline import ExtractedPage

_MAX_CHUNK_CHARS = 2000

# Zero-width split points: immediately before a numbered clause heading at the
# start of a line ("3 ", "3.1 ", "3.1.2 ") or immediately after a blank line.
# The dotted-number group is deliberately non-capturing: re.split() inserts the
# text of every capturing group (or None when the group did not participate)
# into its result, which would corrupt the list of segments.
_SPLIT_POINT = re.compile(r"(?=^\d+(?:\.\d+)*\s+)|(?<=\n\n)", re.MULTILINE)


def _split_text(text: str) -> list[str]:
    """Split *text* into pieces of at most ``_MAX_CHUNK_CHARS`` characters.

    Text that already fits is returned as a single piece. Otherwise the text
    is cut at clause headings and blank lines, and neighbouring segments are
    greedily merged back together while they still fit. A single segment that
    is longer than the limit is sliced at the limit so no piece exceeds it.
    Concatenating the returned pieces reproduces *text* exactly.
    """
    if len(text) <= _MAX_CHUNK_CHARS:
        return [text]

    pieces: list[str] = []
    current = ""
    for segment in _SPLIT_POINT.split(text):
        if len(current) + len(segment) <= _MAX_CHUNK_CHARS:
            current += segment
            continue
        if current:
            pieces.append(current)
        # No natural boundary inside this segment: fall back to fixed-size
        # slices so the size limit is still honoured.
        while len(segment) > _MAX_CHUNK_CHARS:
            pieces.append(segment[:_MAX_CHUNK_CHARS])
            segment = segment[_MAX_CHUNK_CHARS:]
        current = segment
    if current:
        pieces.append(current)
    return pieces


def chunk_tender(pages: list[dict], tender_id: str) -> list[dict]:
    """Build chunk records for every non-empty page of a tender.

    Args:
        pages: Page dicts; each must provide the keys ``"page"`` and
            ``"text"``. A ``None`` or whitespace-only text skips the page.
        tender_id: Identifier stored on every chunk and used as the prefix
            of its ``chunk_id``.

    Returns:
        A list of dicts with the keys ``"text"``, ``"tender_id"``, ``"page"``
        and ``"chunk_id"``. The id has the form
        ``"{tender_id}_p{page}_c{index}"`` where ``index`` counts the
        non-empty chunks of that page from 0.

    Raises:
        KeyError: If a page dict lacks ``"page"`` or ``"text"``.
    """
    chunks = []
    for page_dict in pages:
        page_no = page_dict["page"]
        # Tolerate a None text the same way chunk_bidder does.
        text = (page_dict["text"] or "").strip()
        if not text:
            continue

        # Drop whitespace-only pieces before numbering so chunk ids stay
        # contiguous within a page.
        stripped = (piece.strip() for piece in _split_text(text))
        pieces = [piece for piece in stripped if piece]

        for index, piece in enumerate(pieces):
            chunks.append({
                "text": piece,
                "tender_id": tender_id,
                "page": page_no,
                "chunk_id": f"{tender_id}_p{page_no}_c{index}",
            })
    return chunks


def chunk_bidder(
    pages: list[ExtractedPage], bidder_id: str, doc_name: str
) -> list[dict]:
    """Build one chunk record per non-empty page of a bidder document.

    Args:
        pages: Extracted pages; the attributes ``text``, ``page``,
            ``source_type`` and ``confidence`` are read from each one.
            Pages whose text is ``None``, empty or whitespace-only are
            skipped.
        bidder_id: Identifier stored on every chunk and used as the prefix
            of its ``chunk_id``.
        doc_name: Document name stored unchanged on every chunk.

    Returns:
        A list of dicts with the keys ``"text"``, ``"bidder_id"``,
        ``"doc_name"``, ``"page"``, ``"source_type"``, ``"ocr_confidence"``
        and ``"chunk_id"``. The id has the form
        ``"{bidder_id}_{safe_doc}_p{page}"`` where ``safe_doc`` is
        ``doc_name`` with ``/`` and ``\\`` replaced by ``_``.
    """
    # Path separators are replaced once, up front; the value does not depend
    # on the page being processed.
    safe_doc = doc_name.replace("/", "_").replace("\\", "_")

    chunks = []
    for page in pages:
        text = page.text.strip() if page.text else ""
        if not text:
            continue
        chunks.append({
            "text": text,
            "bidder_id": bidder_id,
            "doc_name": doc_name,
            "page": page.page,
            "source_type": page.source_type,
            "ocr_confidence": page.confidence,
            "chunk_id": f"{bidder_id}_{safe_doc}_p{page.page}",
        })
    return chunks