File size: 6,343 Bytes
5e52bd7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | """
Service to extract and persist tender detail tab information from Mercado Público.
Uses HTML parsing to extract visible content + attachment URLs.
"""
import httpx
import re
from typing import List, Optional, Dict, Any
from html.parser import HTMLParser
from app.models.tender_detail import TenderDetailTabModel, TenderAttachmentDetailModel
class AttachmentLinkExtractor(HTMLParser):
    """Extract attachment links from the HTML tables of a tender detail page.

    Accumulates ``{'href': ..., 'name': ...}`` dicts in ``self.attachments``
    for every link inside a table row that points at the public attachment
    endpoints (``VerAntecedentes.aspx`` / ``ViewAttachment.aspx``).

    Bug fixed: the original matched ``<input>`` tags carrying an ``href``
    attribute — ``<input>`` elements never have ``href``, so ordinary
    ``<a href=...>`` attachment links were silently ignored. Anchors are now
    handled (name taken from the visible link text); the legacy input/value
    path is kept for backward compatibility.
    """

    # URL fragments that identify downloadable attachment endpoints.
    _ATTACHMENT_MARKERS = ('VerAntecedentes.aspx', 'ViewAttachment.aspx')

    def __init__(self):
        super().__init__()
        self.attachments = []          # accumulated {'href', 'name'} dicts
        self.in_row = False            # True while parsing inside a <tr>
        self.current_row_data = {}     # per-row scratch (kept for interface compat)
        self._pending_anchor = None    # attachment dict awaiting its anchor text

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        tag = tag.lower()
        if tag == 'tr':
            self.in_row = True
            self.current_row_data = {}
            return
        href = attrs_dict.get('href')
        if not (self.in_row and href):
            return
        if not any(marker in href for marker in self._ATTACHMENT_MARKERS):
            return
        if tag == 'a':
            # Anchor text arrives later via handle_data; register a placeholder.
            self._pending_anchor = {'href': href, 'name': 'Attachment'}
            self.attachments.append(self._pending_anchor)
        elif tag == 'input':
            # ASP.NET grids sometimes render input controls; name comes from @value.
            self.attachments.append({'href': href, 'name': attrs_dict.get('value', 'Attachment')})

    def handle_data(self, data):
        # First non-empty text chunk inside a pending <a> becomes its name.
        if self._pending_anchor is not None:
            text = data.strip()
            if text:
                self._pending_anchor['name'] = text
                self._pending_anchor = None

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == 'tr':
            self.in_row = False
        elif tag == 'a':
            # Anchor closed without text: keep the 'Attachment' placeholder.
            self._pending_anchor = None
async def extract_tender_detail_tabs(tender_code: str, qs_param: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch a tender detail page from Mercado Público and extract tab information.

    Uses the ``qs`` parameter (encrypted detail URL) when provided; otherwise
    falls back to the plain ``codigo`` parameter.

    Args:
        tender_code: Public tender code.
        qs_param: Optional encrypted query-string token for the detail page.

    Returns:
        Dict with keys ``tender_code``, ``url``, ``tabs``, ``attachments`` and
        ``metadata`` on success; a dict containing an ``error`` key on any
        HTTP or parsing failure (errors are reported, never raised).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    if qs_param:
        url = f"https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs={qs_param}"
    else:
        url = f"https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?codigo={tender_code}"
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(url, headers=headers)
            if resp.status_code != 200:
                return {"error": f"HTTP {resp.status_code}"}
            html = resp.text
            result: Dict[str, Any] = {
                "tender_code": tender_code,
                "url": str(resp.url),  # final URL after any redirects
                "tabs": {},
                "attachments": [],
                "metadata": {},
            }
            md = result["metadata"]
            # Extract attachments from the grv* grid controls.
            extractor = AttachmentLinkExtractor()
            extractor.feed(html)
            result["attachments"] = extractor.attachments
            # Tab sections are detected via their image control IDs.
            tab_markers = {
                'imgHistorial': ("history", "Historial"),
                'imgPreguntasLicitacion': ("questions", "Preguntas"),
                'imgAperturaTecnica': ("opening", "Apertura"),
            }
            for marker, (key, label) in tab_markers.items():
                if marker in html:
                    result["tabs"][key] = {"name": label, "found": True}
            # Attachment groups (Administrative, Technical, Economic).
            md["has_administrative_docs"] = "grvAdministrativo" in html or "Administrativo" in html
            md["has_technical_docs"] = "grvTecnico" in html or "Técnico" in html
            md["has_economic_docs"] = "grvEconomico" in html or "Económico" in html
            # Question count: prefer the specific control ID, fall back to the
            # general "Preguntas y Respuestas" label, default to 0.
            questions_match = (
                re.search(r'id="[^"]*PreguntasLicitacion"[^>]*>.*?(\d+)', html, re.IGNORECASE)
                or re.search(r'Preguntas y Respuestas.*?(\d+)', html, re.IGNORECASE)
            )
            md["question_count"] = int(questions_match.group(1)) if questions_match else 0
            # Adjudication flag — always set so consumers can index the key
            # directly (the original only set it when True).
            md["has_adjudication"] = "adjudic" in html.lower()
            # Buyer complaints for late payment (new intelligence signal).
            complaints_match = re.search(
                r'Reclamos recibidos por incumplir plazo de pago:\s*(\d+)', html, re.IGNORECASE
            )
            if complaints_match:
                md["buyer_complaints"] = int(complaints_match.group(1))
            # Guarantees (bid seriousness / faithful contract performance);
            # both follow the same "... Monto: <amount>" layout.
            guarantees = []
            guarantee_patterns = [
                ("Seriedad de Oferta",
                 r'Garantías de Seriedad de Ofertas.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
                ("Fiel Cumplimiento",
                 r'Garantía fiel de Cumplimiento de Contrato.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
            ]
            for gtype, pattern in guarantee_patterns:
                m = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
                if m:
                    guarantees.append({"type": gtype, "amount": m.group(1).strip()})
            md["guarantees"] = guarantees
            # Detailed line items: rows pairing a product code with the
            # description in the following table cell.
            items = [
                {"code": m.group(1), "description": m.group(2).strip()}
                for m in re.finditer(
                    r'Cod:\s*(\d+).*?</td>.*?<td>\s*(.*?)\s*</td>', html, re.IGNORECASE | re.DOTALL
                )
            ]
            if items:
                md["detailed_items"] = items
            return result
    except Exception as e:
        # Report network/parse failures instead of raising so callers can
        # continue batch-processing other tenders.
        return {"error": str(e), "tender_code": tender_code}
async def extract_all_attachments_for_tender(tender_code: str, qs_param: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Return every publicly accessible attachment link found for a tender.

    The links come from the tender's detail page and can be used to download
    documents without authentication. Yields an empty list when the detail
    fetch fails or exposes no attachments.

    Args:
        tender_code: Public tender code.
        qs_param: Optional encrypted query-string token for the detail page.
    """
    detail = await extract_tender_detail_tabs(tender_code, qs_param)
    return detail.get("attachments", [])
|