File size: 5,837 Bytes
dd6303a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
Notice Parser
-------------
Extracts core tender fields from e-GP Notice PDFs.
Handles both compact text PDFs and line-by-line browser print PDFs.
"""

import re
from .pdf_reader import read_pdf_text, clean_text


def parse_notice(pdf_path: str) -> dict:
    """Extract the core tender fields from a Notice PDF.

    The PDF text is read once and then viewed two ways: as a list of
    stripped, non-empty lines (for label/value lookups) and as the output
    of ``clean_text`` (for the regex fallbacks).

    Args:
        pdf_path: Path of the Notice PDF to parse.

    Returns:
        A dict with the keys ``tender_id``, ``invitation_ref_no``,
        ``procuring_entity``, ``project_code``, ``project_name``,
        ``package_no``, ``work_name``, ``publication_date``,
        ``closing_date``, ``document_fee_bdt``, ``location``,
        ``tender_security_amount``, ``start_date``, ``completion_date``,
        ``executive_engineer`` and ``pe_address``.
    """
    raw_text = _read_text_fast(pdf_path)
    lines = [
        stripped
        for stripped in (row.strip() for row in raw_text.splitlines())
        if stripped
    ]
    flat = clean_text(raw_text)

    def labelled(label: str, fallback_pattern: str) -> str:
        # Line-based lookup first; the regex over the flattened text is only
        # consulted when the line-based lookup comes back empty.
        return _after_label(lines, label) or _regex(flat, fallback_pattern)

    result = {
        "tender_id": labelled(
            "Tender/Proposal ID",
            r"Tender[/\s]*Proposal ID\s*[:\-]?\s*(\d+)",
        ),
        "invitation_ref_no": labelled(
            "Invitation Reference No",
            r"Invitation Reference No\.?\s*[:\-]?\s*(.+?)(?:Tender/Proposal Status|App ID)",
        ),
        "procuring_entity": labelled(
            "Procuring Entity Name",
            r"Procuring Entity Name\s*[:\-]?\s*(.+?)(?:Procuring Entity Code|$)",
        ),
        "project_code": labelled(
            "Project Code",
            r"Project Code\s*[:\-]?\s*(\S+)",
        ),
        "project_name": _collect_after(
            lines,
            "Project Name",
            ["Tender/Proposal Package No", "Tender/Proposal Package No. and"],
        ),
    }

    package_no, work_name = _extract_package_and_work(lines)
    result.update({"package_no": package_no, "work_name": work_name})

    result.update({
        "publication_date": _date_after_multiline_label(
            lines, "Scheduled Tender/Proposal Publication"
        ),
        "closing_date": _date_after_multiline_label(
            lines, "Tender/Proposal Closing"
        ),
        # A missing (or zero) fee falls back to 4000.0.
        "document_fee_bdt": _extract_document_fee(lines) or 4000.0,
    })

    lot = _extract_lot_details(lines)
    result.update({
        "location": lot.get("location", ""),
        "tender_security_amount": lot.get("tender_security_amount", 0.0),
        "start_date": lot.get("start_date", ""),
        "completion_date": lot.get("completion_date", ""),
    })

    result.update({
        "executive_engineer": _after_label(lines, "Name of Official Inviting") or "",
        "pe_address": _extract_pe_address(lines),
    })

    return result


def _read_text_fast(pdf_path: str) -> str:
    """Return the text of every page of the PDF, joined with newlines.

    The ``fitz`` module is tried first. If it cannot be imported, or if
    opening or reading the document fails for any reason, the function
    falls back to ``read_pdf_text`` as a best-effort alternative.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, one page after another separated by ``"\\n"``.
    """
    try:
        # Imported lazily so the module still loads when fitz is absent.
        import fitz

        doc = fitz.open(pdf_path)
        try:
            # Pages that yield no text contribute an empty string.
            return "\n".join(page.get_text() or "" for page in doc)
        finally:
            # Always release the document, even when a page fails to read.
            doc.close()
    except Exception:
        # Deliberate best-effort: any fitz failure switches to the fallback.
        return read_pdf_text(pdf_path)


def _after_label(lines: list, label: str) -> str:
    label_low = label.lower()
    for i, line in enumerate(lines):
        if label_low in line.lower():
            if ":" in line and line.split(":", 1)[1].strip():
                return line.split(":", 1)[1].strip()
            for j in range(i + 1, min(i + 5, len(lines))):
                if lines[j] != ":" and not lines[j].endswith(":"):
                    return lines[j].strip()
    return ""


def _collect_after(lines: list, label: str, stop_labels: list) -> str:
    start = None
    for i, line in enumerate(lines):
        if label.lower() in line.lower():
            start = i + 1
            break
    if start is None:
        return ""
    pieces = []
    for line in lines[start:]:
        if any(stop.lower() in line.lower() for stop in stop_labels):
            break
        if line != ":":
            pieces.append(line)
    return " ".join(pieces).strip()


def _extract_package_and_work(lines: list) -> tuple[str, str]:
    for i, line in enumerate(lines):
        if "description" in line.lower() and i > 0 and "package" in " ".join(lines[max(0, i-3):i+1]).lower():
            package_no = ""
            work_parts = []
            cursor = i + 1
            while cursor < len(lines) and lines[cursor] == ":":
                cursor += 1
            if cursor < len(lines):
                package_no = lines[cursor]
                cursor += 1
            while cursor < len(lines):
                current = lines[cursor]
                if current.lower().startswith("category"):
                    break
                work_parts.append(current)
                cursor += 1
            return package_no.strip(), " ".join(work_parts).strip()
    return "", ""


def _date_after_multiline_label(lines: list, label: str) -> str:
    label_low = label.lower()
    date_re = re.compile(r"\d{2}-[A-Za-z]{3}-\d{4}")
    for i, line in enumerate(lines):
        if label_low in line.lower():
            for j in range(i, min(i + 8, len(lines))):
                m = date_re.search(lines[j])
                if m:
                    return m.group(0)
    return ""


def _extract_document_fee(lines: list) -> float:
    for i, line in enumerate(lines):
        if "Tender/Proposal Document Price" in line:
            for j in range(i, min(i + 8, len(lines))):
                if re.fullmatch(r"\d+(?:\.\d+)?", lines[j]):
                    return float(lines[j])
    return 0.0


def _extract_lot_details(lines: list) -> dict:
    date_re = re.compile(r"\d{2}-[A-Za-z]{3}-\d{4}")
    for i in range(len(lines) - 3):
        if re.fullmatch(r"\d{5,}", lines[i]) and date_re.fullmatch(lines[i + 1]) and date_re.fullmatch(lines[i + 2]):
            location = lines[i - 1] if i > 0 else ""
            return {
                "location": location,
                "tender_security_amount": float(lines[i]),
                "start_date": lines[i + 1],
                "completion_date": lines[i + 2],
            }
    return {}


def _extract_pe_address(lines: list) -> str:
    for i, line in enumerate(lines):
        if line.strip().lower() == "address":
            if i + 1 < len(lines) and lines[i + 1].startswith(":"):
                return lines[i + 1].split(":", 1)[1].strip()
            if i + 2 < len(lines) and lines[i + 1] == ":":
                return lines[i + 2]
    return ""


def _regex(text: str, pattern: str) -> str:
    m = re.search(pattern, text, re.IGNORECASE)
    return m.group(1).strip() if m else ""