# Boka73's picture
# Deploy Gradio app
# dd6303a verified
"""
Notice Parser
-------------
Extracts core tender fields from e-GP Notice PDFs.
Handles both compact text PDFs and line-by-line browser print PDFs.
"""
import re
from .pdf_reader import read_pdf_text, clean_text
def parse_notice(pdf_path: str) -> dict:
    """Extract the core tender fields from a Notice PDF.

    The PDF text is read once and then used in two forms: a list of
    stripped, non-empty lines for the label/value helpers, and the output
    of ``clean_text`` for the regex fallbacks.

    Args:
        pdf_path: Path of the Notice PDF to read.

    Returns:
        A dict keyed by field name (``tender_id``, ``package_no``,
        ``closing_date``, ...). Text fields that cannot be found are empty
        strings, ``tender_security_amount`` falls back to ``0.0`` and
        ``document_fee_bdt`` falls back to ``4000.0``.
    """
    text = _read_text_fast(pdf_path)
    rows = [row for row in map(str.strip, text.splitlines()) if row]
    flattened = clean_text(text)

    def lookup(label: str, fallback_pattern: str) -> str:
        # Line-based lookup first; the regex over the flattened text is
        # only consulted when the line-based lookup finds nothing.
        return _after_label(rows, label) or _regex(flattened, fallback_pattern)

    fields = {
        "tender_id": lookup(
            "Tender/Proposal ID",
            r"Tender[/\s]*Proposal ID\s*[:\-]?\s*(\d+)",
        ),
        "invitation_ref_no": lookup(
            "Invitation Reference No",
            r"Invitation Reference No\.?\s*[:\-]?\s*(.+?)(?:Tender/Proposal Status|App ID)",
        ),
        "procuring_entity": lookup(
            "Procuring Entity Name",
            r"Procuring Entity Name\s*[:\-]?\s*(.+?)(?:Procuring Entity Code|$)",
        ),
        "project_code": lookup(
            "Project Code",
            r"Project Code\s*[:\-]?\s*(\S+)",
        ),
        "project_name": _collect_after(
            rows,
            "Project Name",
            ["Tender/Proposal Package No", "Tender/Proposal Package No. and"],
        ),
    }

    package_no, work_name = _extract_package_and_work(rows)
    fields.update(
        {
            "package_no": package_no,
            "work_name": work_name,
            "publication_date": _date_after_multiline_label(
                rows, "Scheduled Tender/Proposal Publication"
            ),
            "closing_date": _date_after_multiline_label(rows, "Tender/Proposal Closing"),
            # A fee of 0.0 means "not found", so the default price is used instead.
            "document_fee_bdt": _extract_document_fee(rows) or 4000.0,
        }
    )

    lot = _extract_lot_details(rows)
    fields.update(
        {
            "location": lot.get("location", ""),
            "tender_security_amount": lot.get("tender_security_amount", 0.0),
            "start_date": lot.get("start_date", ""),
            "completion_date": lot.get("completion_date", ""),
            "executive_engineer": _after_label(rows, "Name of Official Inviting") or "",
            "pe_address": _extract_pe_address(rows),
        }
    )
    return fields
def _read_text_fast(pdf_path: str) -> str:
    """Read the text of every page of a PDF, preferring PyMuPDF.

    The ``fitz`` module is imported lazily so that it stays an optional
    dependency. Any failure (import error, unreadable file, extraction
    error) falls back to ``read_pdf_text``.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The page texts joined with newlines, or whatever ``read_pdf_text``
        returns when the fast path fails.
    """
    try:
        import fitz

        doc = fitz.open(pdf_path)
        try:
            return "\n".join(page.get_text() or "" for page in doc)
        finally:
            # Always release the document, even when a page fails to extract.
            doc.close()
    except Exception:
        # Deliberate best-effort: any problem on the fast path uses the fallback reader.
        return read_pdf_text(pdf_path)
def _after_label(lines: list, label: str) -> str:
label_low = label.lower()
for i, line in enumerate(lines):
if label_low in line.lower():
if ":" in line and line.split(":", 1)[1].strip():
return line.split(":", 1)[1].strip()
for j in range(i + 1, min(i + 5, len(lines))):
if lines[j] != ":" and not lines[j].endswith(":"):
return lines[j].strip()
return ""
def _collect_after(lines: list, label: str, stop_labels: list) -> str:
start = None
for i, line in enumerate(lines):
if label.lower() in line.lower():
start = i + 1
break
if start is None:
return ""
pieces = []
for line in lines[start:]:
if any(stop.lower() in line.lower() for stop in stop_labels):
break
if line != ":":
pieces.append(line)
return " ".join(pieces).strip()
def _extract_package_and_work(lines: list) -> tuple[str, str]:
for i, line in enumerate(lines):
if "description" in line.lower() and i > 0 and "package" in " ".join(lines[max(0, i-3):i+1]).lower():
package_no = ""
work_parts = []
cursor = i + 1
while cursor < len(lines) and lines[cursor] == ":":
cursor += 1
if cursor < len(lines):
package_no = lines[cursor]
cursor += 1
while cursor < len(lines):
current = lines[cursor]
if current.lower().startswith("category"):
break
work_parts.append(current)
cursor += 1
return package_no.strip(), " ".join(work_parts).strip()
return "", ""
def _date_after_multiline_label(lines: list, label: str) -> str:
label_low = label.lower()
date_re = re.compile(r"\d{2}-[A-Za-z]{3}-\d{4}")
for i, line in enumerate(lines):
if label_low in line.lower():
for j in range(i, min(i + 8, len(lines))):
m = date_re.search(lines[j])
if m:
return m.group(0)
return ""
def _extract_document_fee(lines: list) -> float:
for i, line in enumerate(lines):
if "Tender/Proposal Document Price" in line:
for j in range(i, min(i + 8, len(lines))):
if re.fullmatch(r"\d+(?:\.\d+)?", lines[j]):
return float(lines[j])
return 0.0
def _extract_lot_details(lines: list) -> dict:
date_re = re.compile(r"\d{2}-[A-Za-z]{3}-\d{4}")
for i in range(len(lines) - 3):
if re.fullmatch(r"\d{5,}", lines[i]) and date_re.fullmatch(lines[i + 1]) and date_re.fullmatch(lines[i + 2]):
location = lines[i - 1] if i > 0 else ""
return {
"location": location,
"tender_security_amount": float(lines[i]),
"start_date": lines[i + 1],
"completion_date": lines[i + 2],
}
return {}
def _extract_pe_address(lines: list) -> str:
for i, line in enumerate(lines):
if line.strip().lower() == "address":
if i + 1 < len(lines) and lines[i + 1].startswith(":"):
return lines[i + 1].split(":", 1)[1].strip()
if i + 2 < len(lines) and lines[i + 1] == ":":
return lines[i + 2]
return ""
def _regex(text: str, pattern: str) -> str:
m = re.search(pattern, text, re.IGNORECASE)
return m.group(1).strip() if m else ""