"""
Service to extract and persist tender detail tab information from Mercado Público.
Uses HTML parsing to extract visible content + attachment URLs.
"""
import httpx
import re
from typing import List, Optional, Dict, Any
from html.parser import HTMLParser
from app.models.tender_detail import TenderDetailTabModel, TenderAttachmentDetailModel

_DETAIL_URL = "https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx"

# An href must contain one of these page names to count as an attachment link.
_ATTACHMENT_PAGES = ('VerAntecedentes.aspx', 'ViewAttachment.aspx')

# (marker searched in the raw HTML, key in result["tabs"], display name)
_TAB_MARKERS = (
    ('imgHistorial', 'history', 'Historial'),
    ('imgPreguntasLicitacion', 'questions', 'Preguntas'),
    ('imgAperturaTecnica', 'opening', 'Apertura'),
)

# Question counter: the specific control id is tried first, then the
# general label as a fallback.
_QUESTION_COUNT_PATTERNS = (
    re.compile(r'id="[^"]*PreguntasLicitacion"[^>]*>.*?(\d+)', re.IGNORECASE),
    re.compile(r'Preguntas y Respuestas.*?(\d+)', re.IGNORECASE),
)

_COMPLAINTS_RE = re.compile(
    r'Reclamos recibidos por incumplir plazo de pago:\s*(\d+)',
    re.IGNORECASE,
)

# (type stored in the result, heading searched in the HTML)
_GUARANTEE_HEADINGS = (
    ("Seriedad de Oferta", "Garantías de Seriedad de Ofertas"),
    ("Fiel Cumplimiento", "Garantía fiel de Cumplimiento de Contrato"),
)

# After "Monto:" skip whitespace and markup, then capture the text that
# follows, stopping at the next tag, at "Beneficiario", or at end of input.
# The previous look-ahead was `(?=|Beneficiario)`; its empty alternative
# always succeeded, so the lazy group always captured an empty string.
_GUARANTEE_AMOUNT_TAIL = r'.*?Monto:(?:\s|<[^>]*>)*([^<]*?)\s*(?=<|Beneficiario|$)'

# NOTE(review): every element after the code group in this pattern can match
# the empty string, so group(2) (the description) is always "". The markup
# anchors it was meant to contain cannot be recovered from this file, so the
# pattern is kept unchanged — restore it against a real detail page.
_ITEM_RE = re.compile(
    r'Cod:\s*(\d+).*?.*?\s*(.*?)\s*',
    re.IGNORECASE | re.DOTALL,
)


class AttachmentLinkExtractor(HTMLParser):
    """Extract attachment links from HTML tables.

    After ``feed()``, ``attachments`` holds one ``{'href': ..., 'name': ...}``
    dict per ``<input>`` tag that sits inside a ``<tr>`` and whose ``href``
    attribute contains ``VerAntecedentes.aspx`` or ``ViewAttachment.aspx``.
    The href is stored exactly as it appears in the markup (it is not
    resolved against any base URL).
    """

    def __init__(self):
        super().__init__()
        self.attachments = []
        self.in_row = False
        self.current_row_data = {}

    def handle_starttag(self, tag, attrs):
        """Track table rows and record attachment links found inside them."""
        tag = tag.lower()
        if tag == 'tr':
            self.in_row = True
            self.current_row_data = {}
            return
        if tag != 'input' or not self.in_row:
            return

        attrs_dict = dict(attrs)
        # HTMLParser reports a valueless attribute (``<input href>``) as None;
        # a substring test on None would raise TypeError, so skip such tags.
        href = attrs_dict.get('href')
        if not href:
            return
        if any(page in href for page in _ATTACHMENT_PAGES):
            # Fall back to a generic label when ``value`` is missing or empty.
            name = attrs_dict.get('value') or 'Attachment'
            self.attachments.append({'href': href, 'name': name})

    def handle_endtag(self, tag):
        """Leave row context when a ``</tr>`` is seen."""
        if tag.lower() == 'tr':
            self.in_row = False


def _parse_question_count(html: str) -> int:
    """Return the first question counter found in *html*, or 0 if none."""
    for pattern in _QUESTION_COUNT_PATTERNS:
        match = pattern.search(html)
        if match:
            return int(match.group(1))
    return 0


def _extract_guarantee_amount(html: str, heading: str) -> Optional[str]:
    """Return the "Monto:" text that follows *heading*, or None if absent."""
    match = re.search(
        re.escape(heading) + _GUARANTEE_AMOUNT_TAIL,
        html,
        re.IGNORECASE | re.DOTALL,
    )
    if match is None:
        return None
    return match.group(1).strip()


def _parse_guarantees(html: str) -> List[Dict[str, str]]:
    """Return one ``{"type", "amount"}`` entry per guarantee section found."""
    guarantees = []
    for guarantee_type, heading in _GUARANTEE_HEADINGS:
        amount = _extract_guarantee_amount(html, heading)
        if amount is not None:
            guarantees.append({"type": guarantee_type, "amount": amount})
    return guarantees


def _parse_tender_detail_html(html: str, tender_code: str, url: str) -> Dict[str, Any]:
    """Build the detail result dict from an already-downloaded page.

    Pure function (no network access): everything is derived from *html*
    by substring checks, regular expressions and AttachmentLinkExtractor.
    """
    result: Dict[str, Any] = {
        "tender_code": tender_code,
        "url": url,
        "tabs": {},
        "attachments": [],
        "metadata": {},
    }

    # Extract attachments from grv* controls
    extractor = AttachmentLinkExtractor()
    extractor.feed(html)
    extractor.close()
    result["attachments"] = extractor.attachments

    # Tab sections are detected by the presence of their marker in the page.
    for marker, key, label in _TAB_MARKERS:
        if marker in html:
            result["tabs"][key] = {"name": label, "found": True}

    metadata = result["metadata"]

    # Attachment groups (Administrative, Technical, Economic)
    metadata["has_administrative_docs"] = "grvAdministrativo" in html or "Administrativo" in html
    metadata["has_technical_docs"] = "grvTecnico" in html or "Técnico" in html
    metadata["has_economic_docs"] = "grvEconomico" in html or "Económico" in html

    metadata["question_count"] = _parse_question_count(html)

    # Adjudication is flagged on any case-insensitive occurrence of "adjudic".
    if "adjudic" in html.lower():
        metadata["has_adjudication"] = True

    # Complaints received for missing the payment deadline
    complaints_match = _COMPLAINTS_RE.search(html)
    if complaints_match:
        metadata["buyer_complaints"] = int(complaints_match.group(1))

    # Guarantees (Seriedad y Fiel Cumplimiento)
    metadata["guarantees"] = _parse_guarantees(html)

    # Detailed items (lines); see the NOTE(review) on _ITEM_RE.
    items = [
        {"code": match.group(1), "description": match.group(2).strip()}
        for match in _ITEM_RE.finditer(html)
    ]
    if items:
        metadata["detailed_items"] = items

    return result


async def extract_tender_detail_tabs(tender_code: str, qs_param: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch tender detail page and extract tab information.

    Uses qs parameter if provided (encrypted detail URL).
    Falls back to codigo parameter.

    Args:
        tender_code: Tender code; used as the ``codigo`` query value when
            *qs_param* is empty, and always echoed back in the result.
        qs_param: Optional ``qs`` query value. It is interpolated into the
            URL as-is (no additional URL-encoding is applied).

    Returns:
        On success, a dict with keys ``tender_code``, ``url``, ``tabs``,
        ``attachments`` and ``metadata``. On a non-200 response or any
        exception, a dict with ``error`` and ``tender_code``; this function
        does not raise for failures inside its ``try`` block.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    if qs_param:
        url = f"{_DETAIL_URL}?qs={qs_param}"
    else:
        url = f"{_DETAIL_URL}?codigo={tender_code}"

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(url, headers=headers)
            if resp.status_code != 200:
                # Include tender_code so both error shapes carry the same keys.
                return {"error": f"HTTP {resp.status_code}", "tender_code": tender_code}
            return _parse_tender_detail_html(resp.text, tender_code, str(resp.url))
    except Exception as e:
        # Deliberate best-effort boundary: callers receive an error dict
        # instead of an exception.
        return {"error": str(e), "tender_code": tender_code}


async def extract_all_attachments_for_tender(tender_code: str, qs_param: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Extract all attachment link entries for a tender.

    Delegates to extract_tender_detail_tabs and returns its ``attachments``
    list, or an empty list when that call produced an error result.

    NOTE(review): the original docstring states these URLs can be used to
    download documents without authentication; nothing in this module
    verifies that.
    """
    detail_info = await extract_tender_detail_tabs(tender_code, qs_param)
    return detail_info.get("attachments", [])