Spaces:

lablab-ai-amd-developer-hackathon
/

AndesOps-AI

Running

AndesOps-AI / backend /app /services /tender_detail_extractor.py

Álvaro Valenzuela Valdes

deploy: v10 AMD hardware monitor integration

2da34a9 4 minutes ago

6.34 kB

	"""
	Service to extract and persist tender detail tab information from Mercado Público.
	Uses HTML parsing to extract visible content + attachment URLs.
	"""
	import httpx
	import re
	from typing import List, Optional, Dict, Any
	from html.parser import HTMLParser
	from app.models.tender_detail import TenderDetailTabModel, TenderAttachmentDetailModel


	class AttachmentLinkExtractor(HTMLParser):
	    """Collect attachment links carried by ``<input>`` elements inside table rows.

	    Feed an HTML document with ``feed()``; afterwards ``attachments`` holds one
	    ``{'href': ..., 'name': ...}`` dict per matching element, in document order.
	    An element matches when it is an ``<input>`` seen between a ``<tr>`` start
	    tag and the next ``</tr>``, and its ``href`` attribute contains one of the
	    page names in ``_ATTACHMENT_PAGES``. ``name`` is taken from the element's
	    ``value`` attribute, falling back to ``'Attachment'``.
	    """

	    # Substrings of an href that identify an attachment viewer page.
	    _ATTACHMENT_PAGES = ('VerAntecedentes.aspx', 'ViewAttachment.aspx')

	    def __init__(self):
	        super().__init__()
	        self.attachments = []
	        self.in_row = False
	        # Reset on every <tr>; nothing in this class reads it, but it is kept
	        # because it is a public attribute other code may rely on.
	        self.current_row_data = {}

	    def handle_starttag(self, tag, attrs):
	        """Track row state and record attachment links found on inputs."""
	        tag = tag.lower()
	        if tag == 'tr':
	            self.in_row = True
	            self.current_row_data = {}
	            return
	        if tag != 'input' or not self.in_row:
	            return

	        attrs_dict = dict(attrs)
	        # HTMLParser reports a valueless attribute (``<input href>``) as None;
	        # substring-testing None would raise TypeError and abort the parse.
	        href = attrs_dict.get('href')
	        if not href:
	            return
	        if any(page in href for page in self._ATTACHMENT_PAGES):
	            # ``value`` may be missing, valueless (None) or empty.
	            name = attrs_dict.get('value') or 'Attachment'
	            self.attachments.append({'href': href, 'name': name})

	    def handle_endtag(self, tag):
	        """Leave row state when a ``</tr>`` is seen."""
	        if tag.lower() == 'tr':
	            self.in_row = False


	# (marker substring in the HTML, key in result["tabs"], display name)
	_TAB_MARKERS = (
	    ("imgHistorial", "history", "Historial"),
	    ("imgPreguntasLicitacion", "questions", "Preguntas"),
	    ("imgAperturaTecnica", "opening", "Apertura"),
	)

	# Text that ends a guarantee amount: a line break tag, the end of the table
	# cell, or the following "Beneficiario" label.
	_GUARANTEE_AMOUNT_TAIL = r'.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'


	def _first_int(pattern: str, html: str) -> Optional[int]:
	    """Return group 1 of the first case-insensitive match as an int, else None."""
	    match = re.search(pattern, html, re.IGNORECASE)
	    return int(match.group(1)) if match else None


	def _guarantee_amount(label: str, html: str) -> Optional[str]:
	    """Return the stripped text after "Monto:" following ``label``, else None."""
	    match = re.search(label + _GUARANTEE_AMOUNT_TAIL, html, re.IGNORECASE | re.DOTALL)
	    return match.group(1).strip() if match else None


	def _extract_detail_metadata(html: str) -> Dict[str, Any]:
	    """Build the ``metadata`` dict of the detail result from raw page HTML."""
	    metadata: Dict[str, Any] = {
	        # Attachment groups (Administrative, Technical, Economic): either the
	        # grid control id or the visible label counts as present.
	        "has_administrative_docs": "grvAdministrativo" in html or "Administrativo" in html,
	        "has_technical_docs": "grvTecnico" in html or "Técnico" in html,
	        "has_economic_docs": "grvEconomico" in html or "Económico" in html,
	    }

	    # Question count: prefer the element whose id ends in PreguntasLicitacion,
	    # then fall back to the generic label; 0 when neither yields a number.
	    question_count = _first_int(r'id="[^"]*PreguntasLicitacion"[^>]*>.*?(\d+)', html)
	    if question_count is None:
	        question_count = _first_int(r'Preguntas y Respuestas.*?(\d+)', html)
	    metadata["question_count"] = 0 if question_count is None else question_count

	    # The key is only present when the page mentions adjudication at all.
	    if "adjudic" in html.lower():
	        metadata["has_adjudication"] = True

	    complaints = _first_int(r'Reclamos recibidos por incumplir plazo de pago:\s*(\d+)', html)
	    if complaints is not None:
	        metadata["buyer_complaints"] = complaints

	    # Guarantees (Seriedad de Oferta / Fiel Cumplimiento); always a list.
	    guarantees = []
	    seriedad = _guarantee_amount(r'Garantías de Seriedad de Ofertas', html)
	    if seriedad is not None:
	        guarantees.append({"type": "Seriedad de Oferta", "amount": seriedad})
	    fiel = _guarantee_amount(r'Garantía fiel de Cumplimiento de Contrato', html)
	    if fiel is not None:
	        guarantees.append({"type": "Fiel Cumplimiento", "amount": fiel})
	    metadata["guarantees"] = guarantees

	    # Line items: a "Cod: <digits>" cell followed by a description cell.
	    items = [
	        {"code": m.group(1), "description": m.group(2).strip()}
	        for m in re.finditer(
	            r'Cod:\s*(\d+).*?</td>.*?<td>\s*(.*?)\s*</td>',
	            html,
	            re.IGNORECASE | re.DOTALL,
	        )
	    ]
	    if items:
	        metadata["detailed_items"] = items

	    return metadata


	async def extract_tender_detail_tabs(tender_code: str, qs_param: Optional[str] = None) -> Dict[str, Any]:
	    """Fetch a tender detail page and extract tab, attachment and metadata info.

	    Args:
	        tender_code: Tender code; used as the ``codigo`` query parameter when
	            ``qs_param`` is not given, and echoed back in the result.
	        qs_param: Optional value for the ``qs`` query parameter (the encrypted
	            detail URL token). Takes precedence over ``tender_code`` when truthy.
	            It is inserted into the URL as-is, without additional encoding.

	    Returns:
	        On success, a dict with keys ``tender_code``, ``url`` (final response
	        URL), ``tabs`` (subset of ``history`` / ``questions`` / ``opening``),
	        ``attachments`` (list of ``{'href', 'name'}`` dicts) and ``metadata``.
	        On failure, a dict with ``error`` and ``tender_code``; this function
	        does not raise for HTTP or parsing problems.
	    """
	    headers = {'User-Agent': 'Mozilla/5.0'}
	    base_url = "https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx"
	    if qs_param:
	        url = f"{base_url}?qs={qs_param}"
	    else:
	        url = f"{base_url}?codigo={tender_code}"

	    try:
	        async with httpx.AsyncClient(timeout=30.0) as client:
	            resp = await client.get(url, headers=headers)
	            if resp.status_code != 200:
	                # Include tender_code so both error shapes are consistent.
	                return {"error": f"HTTP {resp.status_code}", "tender_code": tender_code}
	            html = resp.text
	            final_url = str(resp.url)

	        # Attachment links from the table rows of the page.
	        extractor = AttachmentLinkExtractor()
	        extractor.feed(html)

	        # A tab is reported when its marker id appears anywhere in the HTML.
	        tabs = {
	            key: {"name": label, "found": True}
	            for marker, key, label in _TAB_MARKERS
	            if marker in html
	        }

	        return {
	            "tender_code": tender_code,
	            "url": final_url,
	            "tabs": tabs,
	            "attachments": extractor.attachments,
	            "metadata": _extract_detail_metadata(html),
	        }
	    except Exception as e:
	        # Deliberate best-effort boundary: callers receive an error dict
	        # instead of an exception for network or parsing failures.
	        return {"error": str(e), "tender_code": tender_code}


	async def extract_all_attachments_for_tender(tender_code: str, qs_param: Optional[str] = None) -> List[Dict[str, str]]:
	    """Return the attachment entries found on a tender's detail page.

	    Delegates to ``extract_tender_detail_tabs`` with the same arguments and
	    returns the list stored under its ``"attachments"`` key. When that key is
	    absent from the returned dict, an empty list is returned instead.

	    NOTE(review): the original docstring states these URLs can be downloaded
	    without authentication; nothing in this module verifies that.
	    """
	    tender_detail = await extract_tender_detail_tabs(tender_code, qs_param)
	    if "attachments" in tender_detail:
	        return tender_detail["attachments"]
	    return []