| """ |
| Service to extract and persist tender detail tab information from Mercado Público. |
| Uses HTML parsing to extract visible content + attachment URLs. |
| """ |
| import httpx |
| import re |
| from typing import List, Optional, Dict, Any |
| from html.parser import HTMLParser |
| from app.models.tender_detail import TenderDetailTabModel, TenderAttachmentDetailModel |
|
|
|
|
class AttachmentLinkExtractor(HTMLParser):
    """Collect attachment links carried by ``<input>`` tags inside table rows.

    After ``feed()``, ``attachments`` holds one ``{'href': ..., 'name': ...}``
    dict for every ``<input>`` element that appears inside a ``<tr>`` and
    whose ``href`` attribute contains ``VerAntecedentes.aspx`` or
    ``ViewAttachment.aspx``.  ``name`` is the element's ``value`` attribute,
    or ``'Attachment'`` when that attribute is missing or has no value.
    """

    # Substrings that mark an href as pointing at an attachment page.
    _ATTACHMENT_MARKERS = ('VerAntecedentes.aspx', 'ViewAttachment.aspx')

    def __init__(self):
        super().__init__()
        self.attachments = []
        self.in_row = False
        # Reset on every <tr>; nothing in this class reads it, but it is kept
        # so the instance's attributes stay the same for existing callers.
        self.current_row_data = {}

    def handle_starttag(self, tag, attrs):
        """Track entry into a row and record attachment links on inputs."""
        # HTMLParser hands over tag and attribute names already lower-cased.
        if tag == 'tr':
            self.in_row = True
            self.current_row_data = {}
            return
        if tag != 'input' or not self.in_row:
            return

        attrs_dict = dict(attrs)
        href = attrs_dict.get('href')
        # A valueless attribute (``<input href>``) is reported as None; the
        # substring checks below would raise TypeError on it, so skip it.
        if not href:
            return

        if any(marker in href for marker in self._ATTACHMENT_MARKERS):
            name = attrs_dict.get('value')
            if name is None:
                # Covers both a missing and a valueless ``value`` attribute.
                name = 'Attachment'
            self.attachments.append({'href': href, 'name': name})

    def handle_endtag(self, tag):
        """Leave row context when a ``</tr>`` is seen."""
        if tag == 'tr':
            self.in_row = False
|
|
|
|
def _count_questions(html: str) -> int:
    """Return the first number following a questions marker in *html*, or 0."""
    patterns = (
        r'id="[^"]*PreguntasLicitacion"[^>]*>.*?(\d+)',
        # Fallback when no element id matches: look after the section title.
        r'Preguntas y Respuestas.*?(\d+)',
    )
    for pattern in patterns:
        match = re.search(pattern, html, re.IGNORECASE)
        if match:
            return int(match.group(1))
    return 0


def _extract_guarantees(html: str) -> List[Dict[str, str]]:
    """Return the ``Monto:`` text found under each known guarantee heading."""
    # Captures whatever follows "Monto:" up to the next <br, </td> or
    # "Beneficiario".
    amount_tail = r'.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'
    headings = (
        ("Seriedad de Oferta", r'Garantías de Seriedad de Ofertas'),
        ("Fiel Cumplimiento", r'Garantía fiel de Cumplimiento de Contrato'),
    )
    guarantees = []
    for label, heading in headings:
        match = re.search(heading + amount_tail, html, re.IGNORECASE | re.DOTALL)
        if match:
            guarantees.append({"type": label, "amount": match.group(1).strip()})
    return guarantees


def _extract_items(html: str) -> List[Dict[str, str]]:
    """Return ``{"code", "description"}`` dicts for every ``Cod: <digits>`` hit."""
    item_pattern = r'Cod:\s*(\d+).*?</td>.*?<td>\s*(.*?)\s*</td>'
    return [
        {"code": match.group(1), "description": match.group(2).strip()}
        for match in re.finditer(item_pattern, html, re.IGNORECASE | re.DOTALL)
    ]


def _parse_tender_detail_html(html: str, tender_code: str, url: str) -> Dict[str, Any]:
    """Build the result dict (tabs, attachments, metadata) from page HTML."""
    extractor = AttachmentLinkExtractor()
    extractor.feed(html)

    # A tab is reported only when its marker string occurs in the page.
    tab_markers = (
        ("history", "imgHistorial", "Historial"),
        ("questions", "imgPreguntasLicitacion", "Preguntas"),
        ("opening", "imgAperturaTecnica", "Apertura"),
    )
    tabs = {
        key: {"name": label, "found": True}
        for key, marker, label in tab_markers
        if marker in html
    }

    # Each flag is True when any of its marker strings occurs in the page.
    doc_markers = (
        ("has_administrative_docs", ("grvAdministrativo", "Administrativo")),
        ("has_technical_docs", ("grvTecnico", "Técnico")),
        ("has_economic_docs", ("grvEconomico", "Económico")),
    )
    metadata: Dict[str, Any] = {
        key: any(marker in html for marker in markers)
        for key, markers in doc_markers
    }

    metadata["question_count"] = _count_questions(html)

    # Only set when positive; the key is absent otherwise.
    if "adjudic" in html.lower():
        metadata["has_adjudication"] = True

    complaints_match = re.search(
        r'Reclamos recibidos por incumplir plazo de pago:\s*(\d+)',
        html,
        re.IGNORECASE,
    )
    if complaints_match:
        metadata["buyer_complaints"] = int(complaints_match.group(1))

    metadata["guarantees"] = _extract_guarantees(html)

    # "detailed_items" is only present when at least one item was matched.
    items = _extract_items(html)
    if items:
        metadata["detailed_items"] = items

    return {
        "tender_code": tender_code,
        "url": url,
        "tabs": tabs,
        "attachments": extractor.attachments,
        "metadata": metadata,
    }


async def extract_tender_detail_tabs(tender_code: str, qs_param: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch the tender detail page and extract tab information.

    The page is requested with ``?qs=<qs_param>`` when ``qs_param`` is truthy,
    otherwise with ``?codigo=<tender_code>``.

    Returns:
        On success, a dict with keys ``tender_code``, ``url`` (the response
        URL), ``tabs``, ``attachments`` and ``metadata``.
        On a non-200 response or any exception, a dict with ``error`` and
        ``tender_code`` instead; this function does not raise.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    base_url = "https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx"

    if qs_param:
        url = f"{base_url}?qs={qs_param}"
    else:
        url = f"{base_url}?codigo={tender_code}"

    try:
        # NOTE(review): redirects are not requested here, so with httpx's
        # current defaults a 3xx answer is reported as "HTTP 3xx" below —
        # confirm that is the intended behavior.
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(url, headers=headers)

        # The body is already loaded, so parsing does not need the client.
        if resp.status_code != 200:
            return {"error": f"HTTP {resp.status_code}", "tender_code": tender_code}

        return _parse_tender_detail_html(resp.text, tender_code, str(resp.url))

    except Exception as e:
        # Deliberate best-effort boundary: callers get an error dict, never
        # an exception.
        return {"error": str(e), "tender_code": tender_code}
|
|
|
|
async def extract_all_attachments_for_tender(tender_code: str, qs_param: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Return the attachment entries found on a tender's detail page.

    Delegates to ``extract_tender_detail_tabs`` and hands back its
    ``"attachments"`` value. When that key is absent from the result (for
    example, when the lookup produced an error dict), an empty list is
    returned instead.

    NOTE(review): the original docstring says these URLs can be used to
    download documents without authentication — not verified here.
    """
    details = await extract_tender_detail_tabs(tender_code, qs_param)
    if "attachments" in details:
        return details["attachments"]
    return []
|
|