Spaces:

Boka73
/

Tender-data-automation

Running

App Files Files Community

Tender-data-automation / tender_engine /parser /notice_parser.py

Boka73

Deploy Gradio app

dd6303a verified 7 days ago

raw

history blame contribute delete

5.84 kB

	"""
	Notice Parser
	-------------
	Extracts core tender fields from e-GP Notice PDFs.
	Handles both compact text PDFs and line-by-line browser print PDFs.
	"""

	import re
	from .pdf_reader import read_pdf_text, clean_text


	def parse_notice(pdf_path: str) -> dict:
	    """Parse an e-GP Notice PDF and return a dict of extracted fields.

	    The PDF text is read once and used in two forms: a list of stripped,
	    non-empty lines (for label/value lookups) and the output of
	    ``clean_text`` (searched by the regex fallbacks whenever a label
	    lookup comes back empty).

	    Args:
	        pdf_path: Path of the Notice PDF to parse.

	    Returns:
	        A dict with the keys ``tender_id``, ``invitation_ref_no``,
	        ``procuring_entity``, ``project_code``, ``project_name``,
	        ``package_no``, ``work_name``, ``publication_date``,
	        ``closing_date``, ``document_fee_bdt``, ``location``,
	        ``tender_security_amount``, ``start_date``, ``completion_date``,
	        ``executive_engineer`` and ``pe_address``. Text fields that cannot
	        be found are empty strings; ``document_fee_bdt`` falls back to
	        4000.0 and ``tender_security_amount`` to 0.0.
	    """
	    raw = _read_text_fast(pdf_path)
	    lines = [ln.strip() for ln in raw.splitlines() if ln.strip()]
	    flat = clean_text(raw)
	    result = {}

	    # Header fields: line-based lookup first, regex over the flattened text
	    # second. The fallback patterns use `\s*` around the optional ':' / '-'
	    # separator so they match regardless of spacing, and a real `|`
	    # alternation for the stop labels that end a lazy capture.
	    result["tender_id"] = _after_label(lines, "Tender/Proposal ID") or _regex(
	        flat, r"Tender[/\s]*Proposal ID\s*[:\-]?\s*(\d+)"
	    )
	    result["invitation_ref_no"] = _after_label(lines, "Invitation Reference No") or _regex(
	        flat,
	        r"Invitation Reference No\.?\s*[:\-]?\s*(.+?)(?:Tender/Proposal Status|App ID)",
	    )
	    result["procuring_entity"] = _after_label(lines, "Procuring Entity Name") or _regex(
	        flat,
	        r"Procuring Entity Name\s*[:\-]?\s*(.+?)(?:Procuring Entity Code|$)",
	    )
	    result["project_code"] = _after_label(lines, "Project Code") or _regex(
	        flat, r"Project Code\s*[:\-]?\s*(\S+)"
	    )
	    result["project_name"] = _collect_after(
	        lines,
	        "Project Name",
	        ["Tender/Proposal Package No", "Tender/Proposal Package No. and"],
	    )

	    package_no, work_name = _extract_package_and_work(lines)
	    result["package_no"] = package_no
	    result["work_name"] = work_name

	    result["publication_date"] = _date_after_multiline_label(lines, "Scheduled Tender/Proposal Publication")
	    result["closing_date"] = _date_after_multiline_label(lines, "Tender/Proposal Closing")
	    # A fee of 0.0 means "not found"; the parser then assumes 4000.0.
	    result["document_fee_bdt"] = _extract_document_fee(lines) or 4000.0

	    # Lot details come back as an empty dict when no lot row is recognised.
	    lot = _extract_lot_details(lines)
	    result["location"] = lot.get("location", "")
	    result["tender_security_amount"] = lot.get("tender_security_amount", 0.0)
	    result["start_date"] = lot.get("start_date", "")
	    result["completion_date"] = lot.get("completion_date", "")

	    result["executive_engineer"] = _after_label(lines, "Name of Official Inviting") or ""
	    result["pe_address"] = _extract_pe_address(lines)

	    return result


	def _read_text_fast(pdf_path: str) -> str:
	    """Return the text of every page of the PDF, joined with newlines.

	    The ``fitz`` module is tried first. If importing it, opening the file
	    or extracting text raises, the function falls back to
	    ``read_pdf_text``.

	    Args:
	        pdf_path: Path of the PDF to read.

	    Returns:
	        The extracted text as a single string.
	    """
	    try:
	        import fitz

	        doc = fitz.open(pdf_path)
	        try:
	            return "\n".join(page.get_text() or "" for page in doc)
	        finally:
	            # Release the document handle even if extraction fails part-way.
	            doc.close()
	    except Exception:
	        # Deliberate best-effort: any failure on the fast path (module not
	        # installed, unreadable file, ...) is handled by the slower reader.
	        return read_pdf_text(pdf_path)


	def _after_label(lines: list, label: str) -> str:
	    """Return the value that belongs to ``label``.

	    The label is matched case-insensitively as a substring of a line. The
	    value is taken from the same line when non-empty text follows the
	    first ``:``; otherwise from the first of the next four lines that is
	    not a separator or another label (i.e. does not end with ``:``).

	    Args:
	        lines: Stripped, non-empty text lines.
	        label: Label text to look for.

	    Returns:
	        The value with surrounding whitespace and any leading ``:``
	        removed, or ``""`` when no value is found.
	    """
	    needle = label.lower()
	    for i, line in enumerate(lines):
	        if needle not in line.lower():
	            continue
	        # "Label : value" on a single line.
	        inline = line.partition(":")[2].strip()
	        if inline:
	            return inline
	        for follower in lines[i + 1:i + 5]:
	            if follower.endswith(":"):
	                # A bare ":" separator or the next label; keep looking.
	                continue
	            # Values may arrive as ": value"; drop the separator so the
	            # result matches the inline form.
	            return follower.lstrip(":").strip()
	    return ""


	def _collect_after(lines: list, label: str, stop_labels: list) -> str:
	    """Join the lines that follow ``label`` until a stop label is reached.

	    Collection starts on the line after the first line containing
	    ``label`` and ends just before the first line containing any entry of
	    ``stop_labels`` (or at the end of ``lines``). All matching is
	    case-insensitive substring matching; lines that are exactly ``":"``
	    are skipped.

	    Args:
	        lines: Text lines to scan.
	        label: Label that marks where collection begins.
	        stop_labels: Labels that end collection.

	    Returns:
	        The collected lines joined with single spaces, or ``""`` when the
	        label is not present.
	    """
	    needle = label.lower()
	    begin = next(
	        (pos + 1 for pos, text in enumerate(lines) if needle in text.lower()),
	        None,
	    )
	    if begin is None:
	        return ""

	    stops = [stop.lower() for stop in stop_labels]
	    collected = []
	    for text in lines[begin:]:
	        lowered = text.lower()
	        if any(stop in lowered for stop in stops):
	            break
	        if text == ":":
	            continue
	        collected.append(text)
	    return " ".join(collected).strip()


	def _extract_package_and_work(lines: list) -> tuple[str, str]:
	    """Return ``(package_no, work_name)`` from the package/description block.

	    The anchor is the first line (not at index 0) that contains
	    "description" while "package" occurs in that line or the three lines
	    before it. After the anchor, lines that are exactly ``":"`` are
	    skipped, the next line is the package number, and every line after it
	    up to (not including) the first line starting with "category" forms
	    the work name.

	    Args:
	        lines: Text lines to scan.

	    Returns:
	        A ``(package_no, work_name)`` tuple of stripped strings;
	        ``("", "")`` when no anchor is found or nothing follows it.
	    """
	    total = len(lines)
	    for idx, text in enumerate(lines):
	        if idx == 0 or "description" not in text.lower():
	            continue
	        window = " ".join(lines[max(0, idx - 3):idx + 1]).lower()
	        if "package" not in window:
	            continue

	        # Step over separator-only lines between the label and its value.
	        pos = idx + 1
	        while pos < total and lines[pos] == ":":
	            pos += 1
	        if pos >= total:
	            return "", ""

	        package = lines[pos].strip()
	        description = []
	        for entry in lines[pos + 1:]:
	            if entry.lower().startswith("category"):
	                break
	            description.append(entry)
	        return package, " ".join(description).strip()
	    return "", ""


	def _date_after_multiline_label(lines: list, label: str) -> str:
	    """Return the first ``DD-Mon-YYYY`` date found at or after ``label``.

	    For each line containing ``label`` (case-insensitive substring match),
	    that line and up to seven lines after it are searched for a date of
	    the form two digits, three letters, four digits, separated by hyphens.

	    Args:
	        lines: Text lines to scan.
	        label: Label text whose date is wanted.

	    Returns:
	        The matched date text, or ``""`` when none is found.
	    """
	    wanted = label.lower()
	    pattern = re.compile(r"\d{2}-[A-Za-z]{3}-\d{4}")
	    for pos, text in enumerate(lines):
	        if wanted not in text.lower():
	            continue
	        # The label may wrap across lines, so scan a window of eight lines.
	        for candidate in lines[pos:pos + 8]:
	            hit = pattern.search(candidate)
	            if hit is not None:
	                return hit.group(0)
	    return ""


	def _extract_document_fee(lines: list) -> float:
	    """Return the tender document price found near its label.

	    For each line containing "Tender/Proposal Document Price"
	    (case-insensitive, like the other label lookups in this module), that
	    line and up to seven lines after it are checked for a line that is
	    purely a number. A leading ``:`` separator and comma thousands
	    separators are tolerated, so ``": 4,000.50"`` is read as ``4000.5``.

	    Args:
	        lines: Text lines to scan.

	    Returns:
	        The price as a float, or ``0.0`` when no amount is found.
	    """
	    label = "tender/proposal document price"
	    amount_re = re.compile(r"\d+(?:,\d{3})*(?:\.\d+)?")
	    for i, line in enumerate(lines):
	        if label not in line.lower():
	            continue
	        for candidate in lines[i:i + 8]:
	            text = candidate.lstrip(":").strip()
	            if amount_re.fullmatch(text):
	                # float() does not accept thousands separators.
	                return float(text.replace(",", ""))
	    return 0.0


	def _extract_lot_details(lines: list) -> dict:
	    """Return location, security amount and dates from the first lot row.

	    A lot row is recognised as three consecutive lines: a number of five
	    or more digits followed by two ``DD-Mon-YYYY`` dates. The line just
	    before the number is taken as the location.

	    Args:
	        lines: Text lines to scan.

	    Returns:
	        A dict with ``location``, ``tender_security_amount`` (float),
	        ``start_date`` and ``completion_date``; an empty dict when no lot
	        row is recognised.
	    """
	    date_re = re.compile(r"\d{2}-[A-Za-z]{3}-\d{4}")
	    amount_re = re.compile(r"\d{5,}")
	    # A row spans lines i, i+1 and i+2, so the last valid start index is
	    # len(lines) - 3; range(len(lines) - 2) includes it.
	    for i in range(len(lines) - 2):
	        amount, start, end = lines[i:i + 3]
	        if not (
	            amount_re.fullmatch(amount)
	            and date_re.fullmatch(start)
	            and date_re.fullmatch(end)
	        ):
	            continue
	        return {
	            "location": lines[i - 1] if i > 0 else "",
	            "tender_security_amount": float(amount),
	            "start_date": start,
	            "completion_date": end,
	        }
	    return {}


	def _extract_pe_address(lines: list) -> str:
	    """Return the address that follows a standalone "Address" label line.

	    Two layouts are handled for the line after the label:
	    ``": <address>"`` on one line, or a bare ``":"`` line with the address
	    on the line after it.

	    Args:
	        lines: Text lines to scan.

	    Returns:
	        The address text, or ``""`` when no "Address" label with a value
	        is found.
	    """
	    for i, line in enumerate(lines):
	        if line.strip().lower() != "address" or i + 1 >= len(lines):
	            continue
	        following = lines[i + 1]
	        if not following.startswith(":"):
	            continue
	        inline = following[1:].strip()
	        if inline:
	            return inline
	        # Bare ":" separator: a bare ":" also starts with ":", so it must
	        # be told apart here by its empty remainder; the value is on the
	        # next line.
	        if i + 2 < len(lines):
	            return lines[i + 2]
	    return ""


	def _regex(text: str, pattern: str) -> str:
	    """Return the stripped first capture group of a case-insensitive search.

	    Args:
	        text: Text to search.
	        pattern: Regular expression containing at least one capture group.

	    Returns:
	        Group 1 of the first match with surrounding whitespace removed, or
	        ``""`` when the pattern does not match.
	    """
	    found = re.compile(pattern, flags=re.IGNORECASE).search(text)
	    if found is None:
	        return ""
	    return found.group(1).strip()