File size: 1,876 Bytes
f42bfb0
 
661eb14
 
f42bfb0
 
661eb14
 
f42bfb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661eb14
 
 
 
 
f42bfb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import re

from core.ocr_pipeline import ExtractedPage

_MAX_CHUNK_CHARS = 2000

# Zero-width split point in front of every line that opens with a clause
# number such as "3" or "3.1.2" followed by whitespace. The inner group is
# deliberately non-capturing: re.split() inserts the text of every capturing
# group (or None when the group did not participate) into its result list,
# which would corrupt the chunk text or crash on len(None).
_CLAUSE_HEADING_RE = re.compile(r"(?m)(?=^\d+(?:\.\d+)*\s+)")


def _pack_clause_segments(text: str) -> list[str]:
    """Cut *text* before each clause heading and greedily merge the segments.

    Adjacent segments are concatenated while the running length stays within
    ``_MAX_CHUNK_CHARS``. A single segment that is longer than the limit is
    emitted whole; it is never cut mid-clause.
    """
    pieces = []
    current = ""
    for segment in _CLAUSE_HEADING_RE.split(text):
        if len(current) + len(segment) <= _MAX_CHUNK_CHARS:
            current += segment
        else:
            if current:
                pieces.append(current)
            current = segment
    if current:
        pieces.append(current)
    return pieces


def chunk_tender(pages: list[dict], tender_id: str) -> list[dict]:
    """Split tender pages into text chunks tagged with their origin.

    Args:
        pages: One dict per page; each must provide a ``"page"`` entry (used
            verbatim as the page label) and a ``"text"`` string.
        tender_id: Identifier copied into every chunk and used as the prefix
            of each ``chunk_id``.

    Returns:
        A list of dicts with the keys ``"text"``, ``"tender_id"``, ``"page"``
        and ``"chunk_id"``. Pages whose text is blank produce no chunks. A
        page of at most ``_MAX_CHUNK_CHARS`` characters becomes one chunk;
        a longer page is split in front of numbered clause headings and the
        resulting segments are packed up to that limit.

    Raises:
        KeyError: If a page dict lacks ``"page"`` or ``"text"``.
    """
    chunks = []
    for page_dict in pages:
        page_no = page_dict["page"]
        text = page_dict["text"].strip()
        if not text:
            continue
        if len(text) <= _MAX_CHUNK_CHARS:
            pieces = [text]
        else:
            pieces = _pack_clause_segments(text)

        # The index comes from the packed list, so it is consumed even when a
        # piece turns out to be whitespace-only and is skipped below.
        for i, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue
            chunks.append({
                "text": piece,
                "tender_id": tender_id,
                "page": page_no,
                "chunk_id": f"{tender_id}_p{page_no}_c{i}",
            })
    return chunks


def chunk_bidder(
    pages: list[ExtractedPage], bidder_id: str, doc_name: str
) -> list[dict]:
    """Build one chunk record for every non-blank page of a bidder document.

    Args:
        pages: Extracted pages; each is read through its ``text``, ``page``,
            ``source_type`` and ``confidence`` attributes.
        bidder_id: Identifier stored on every record and used as the prefix
            of each ``chunk_id``.
        doc_name: Document name stored unchanged on every record; a copy with
            ``/`` and ``\\`` replaced by ``_`` is embedded in ``chunk_id``.

    Returns:
        A list of dicts, in page order, with the keys ``"text"``,
        ``"bidder_id"``, ``"doc_name"``, ``"page"``, ``"source_type"``,
        ``"ocr_confidence"`` and ``"chunk_id"``. Pages whose text is missing
        or only whitespace are left out.
    """
    records = []
    for entry in pages:
        # A falsy text (None or "") is treated the same as a blank page.
        body = (entry.text or "").strip()
        if body:
            # Flatten both path separators so the id contains neither.
            doc_slug = doc_name.translate(str.maketrans({"/": "_", "\\": "_"}))
            record = {
                "text": body,
                "bidder_id": bidder_id,
                "doc_name": doc_name,
                "page": entry.page,
                "source_type": entry.source_type,
                "ocr_confidence": entry.confidence,
            }
            record["chunk_id"] = f"{bidder_id}_{doc_slug}_p{entry.page}"
            records.append(record)
    return records