"""
Notice Parser
-------------
Extracts core tender fields from e-GP Notice PDFs.
Handles both compact text PDFs and line-by-line browser print PDFs.
"""

import re

from .pdf_reader import read_pdf_text, clean_text

# Matches dates shaped like DD-Mon-YYYY (two digits, three letters, four digits).
_DATE_RE = re.compile(r"\d{2}-[A-Za-z]{3}-\d{4}")

# Matches a plain or comma-grouped amount with optional decimals
# (e.g. "4000", "4000.00", "4,000", "1,00,000.50").
_AMOUNT_RE = re.compile(r"(?:\d{1,3}(?:,\d{2,3})+|\d+)(?:\.\d+)?")

# Fee reported when no document price can be read from the notice.
_DEFAULT_DOCUMENT_FEE_BDT = 4000.0


def parse_notice(pdf_path: str) -> dict:
    """Parse Notice PDF and return a dict of extracted fields.

    The PDF text is read once, split into stripped non-empty lines for the
    label-based helpers, and also flattened with ``clean_text`` for the regex
    fallbacks.

    Args:
        pdf_path: Path of the Notice PDF to read.

    Returns:
        A dict with the keys ``tender_id``, ``invitation_ref_no``,
        ``procuring_entity``, ``project_code``, ``project_name``,
        ``package_no``, ``work_name``, ``publication_date``,
        ``closing_date``, ``document_fee_bdt``, ``location``,
        ``tender_security_amount``, ``start_date``, ``completion_date``,
        ``executive_engineer`` and ``pe_address``. Text fields that cannot be
        found are empty strings; ``tender_security_amount`` defaults to 0.0
        and ``document_fee_bdt`` to 4000.0.
    """
    raw = _read_text_fast(pdf_path)
    lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
    flat = clean_text(raw)

    package_no, work_name = _extract_package_and_work(lines)
    lot = _extract_lot_details(lines)

    return {
        # Line-based lookup first; regex over the flattened text as fallback.
        "tender_id": _after_label(lines, "Tender/Proposal ID")
        or _regex(flat, r"Tender[/\s]*Proposal ID\s*[:\-]?\s*(\d+)"),
        "invitation_ref_no": _after_label(lines, "Invitation Reference No")
        or _regex(
            flat,
            r"Invitation Reference No\.?\s*[:\-]?\s*(.+?)(?:Tender/Proposal Status|App ID)",
        ),
        "procuring_entity": _after_label(lines, "Procuring Entity Name")
        or _regex(
            flat,
            r"Procuring Entity Name\s*[:\-]?\s*(.+?)(?:Procuring Entity Code|$)",
        ),
        "project_code": _after_label(lines, "Project Code")
        or _regex(flat, r"Project Code\s*[:\-]?\s*(\S+)"),
        "project_name": _collect_after(
            lines,
            "Project Name",
            ["Tender/Proposal Package No", "Tender/Proposal Package No. and"],
        ),
        "package_no": package_no,
        "work_name": work_name,
        "publication_date": _date_after_multiline_label(
            lines, "Scheduled Tender/Proposal Publication"
        ),
        "closing_date": _date_after_multiline_label(lines, "Tender/Proposal Closing"),
        # A missing (or zero) price falls back to the default fee.
        "document_fee_bdt": _extract_document_fee(lines) or _DEFAULT_DOCUMENT_FEE_BDT,
        "location": lot.get("location", ""),
        "tender_security_amount": lot.get("tender_security_amount", 0.0),
        "start_date": lot.get("start_date", ""),
        "completion_date": lot.get("completion_date", ""),
        "executive_engineer": _after_label(lines, "Name of Official Inviting"),
        "pe_address": _extract_pe_address(lines),
    }


def _read_text_fast(pdf_path: str) -> str:
    """Return the text of all pages, preferring ``fitz`` when it is usable.

    Any failure (``fitz`` not importable, file cannot be opened, a page fails
    to extract) falls back to ``read_pdf_text``. This is deliberately
    best-effort, so the broad ``except`` is intentional.
    """
    try:
        import fitz

        doc = fitz.open(pdf_path)
        try:
            return "\n".join(page.get_text() or "" for page in doc)
        finally:
            # Close the document even when a page raises mid-extraction.
            doc.close()
    except Exception:
        return read_pdf_text(pdf_path)


def _after_label(lines: list[str], label: str) -> str:
    """Return the value printed next to, or just below, ``label``.

    The label is matched case-insensitively as a substring. If the matching
    line has non-empty text after its first colon, that text is the value.
    Otherwise the next four lines are scanned and the first one that does not
    end with a colon is used, with any leading colon removed. If nothing
    suitable follows, the search continues with the next line containing the
    label. Returns "" when no value is found.
    """
    label_low = label.lower()
    for i, line in enumerate(lines):
        if label_low not in line.lower():
            continue
        inline = line.partition(":")[2].strip()
        if inline:
            return inline
        for follower in lines[i + 1:i + 5]:
            # Lines ending in ":" are separators or further labels, not values.
            if follower.endswith(":"):
                continue
            # Some layouts print the value as ": value" on its own line.
            value = follower.lstrip(":").strip()
            if value:
                return value
    return ""


def _collect_after(lines: list[str], label: str, stop_labels: list[str]) -> str:
    """Join the lines that follow ``label`` until a stop label is reached.

    Collection starts on the line after the first (case-insensitive) match of
    ``label`` and ends before the first line containing any of
    ``stop_labels``, or at the end of ``lines`` if none occurs. Lines that
    are exactly ":" are dropped. Returns "" when ``label`` is absent.
    """
    label_low = label.lower()
    stops = [stop.lower() for stop in stop_labels]
    start = next(
        (i + 1 for i, line in enumerate(lines) if label_low in line.lower()),
        None,
    )
    if start is None:
        return ""

    pieces = []
    for line in lines[start:]:
        low = line.lower()
        if any(stop in low for stop in stops):
            break
        if line != ":":
            pieces.append(line)
    return " ".join(pieces).strip()


def _extract_package_and_work(lines: list[str]) -> tuple[str, str]:
    """Return ``(package_no, work_name)`` from the package description block.

    The anchor is the first line (other than the very first) containing
    "description" where "package" appears in that line or the three before
    it. After skipping bare ":" lines, the next line is the package number
    and every following line up to one starting with "category" is joined
    into the work name. Returns ``("", "")`` when no anchor is found.
    """
    for i, line in enumerate(lines):
        if i == 0 or "description" not in line.lower():
            continue
        context = " ".join(lines[max(0, i - 3):i + 1]).lower()
        if "package" not in context:
            continue

        cursor = i + 1
        while cursor < len(lines) and lines[cursor] == ":":
            cursor += 1
        if cursor >= len(lines):
            return "", ""

        package_no = lines[cursor]
        work_parts = []
        for current in lines[cursor + 1:]:
            if current.lower().startswith("category"):
                break
            work_parts.append(current)
        return package_no.strip(), " ".join(work_parts).strip()
    return "", ""


def _date_after_multiline_label(lines: list[str], label: str) -> str:
    """Return the first date found on the label line or the 7 lines after it.

    The label is matched case-insensitively as a substring. Every matching
    line is tried in order. Returns "" when no date is found.
    """
    label_low = label.lower()
    for i, line in enumerate(lines):
        if label_low not in line.lower():
            continue
        for candidate in lines[i:i + 8]:
            found = _DATE_RE.search(candidate)
            if found:
                return found.group(0)
    return ""


def _extract_document_fee(lines: list[str]) -> float:
    """Return the document price, or 0.0 when it cannot be found.

    Looks for a line containing "Tender/Proposal Document Price" (matched
    case-insensitively, like the other label helpers). It then takes the
    first amount among: the text after the last colon on that line, or any
    of the next seven lines consisting solely of an amount (an optional
    leading colon and thousands separators are tolerated).
    """
    label_low = "tender/proposal document price"
    for i, line in enumerate(lines):
        if label_low not in line.lower():
            continue
        for j in range(i, min(i + 8, len(lines))):
            candidate = lines[j]
            if j == i:
                # On the label line only the part after the colon can be a value.
                candidate = candidate.rpartition(":")[2]
            candidate = candidate.lstrip(":").strip()
            if _AMOUNT_RE.fullmatch(candidate):
                return float(candidate.replace(",", ""))
    return 0.0


def _extract_lot_details(lines: list[str]) -> dict:
    """Return location, security amount and dates from the lot row.

    The lot row is recognised as three consecutive lines: a number of at
    least five digits followed by two dates. The line before the number is
    taken as the location. Returns an empty dict when no such row exists.
    """
    # The last valid window starts at len(lines) - 3, hence the "- 2" bound.
    for i in range(len(lines) - 2):
        amount, start, end = lines[i:i + 3]
        if (
            re.fullmatch(r"\d{5,}", amount)
            and _DATE_RE.fullmatch(start)
            and _DATE_RE.fullmatch(end)
        ):
            return {
                "location": lines[i - 1] if i > 0 else "",
                "tender_security_amount": float(amount),
                "start_date": start,
                "completion_date": end,
            }
    return {}


def _extract_pe_address(lines: list[str]) -> str:
    """Return the address that follows a standalone "Address" line.

    Two layouts are handled: the next line is ": value", or the next line is
    a bare ":" and the value is on the line after it. Returns "" when neither
    layout is found.
    """
    for i, line in enumerate(lines):
        if line.strip().lower() != "address" or i + 1 >= len(lines):
            continue
        following = lines[i + 1]
        if not following.startswith(":"):
            continue
        inline = following[1:].strip()
        if inline:
            return inline
        # Bare colon: the value sits on the line after it.
        if i + 2 < len(lines):
            return lines[i + 2]
    return ""


def _regex(text: str, pattern: str) -> str:
    """Return group 1 of the first case-insensitive match, stripped, or ""."""
    m = re.search(pattern, text, re.IGNORECASE)
    return m.group(1).strip() if m else ""