# AndesOps-AI/backend/app/services/tender_detail_extractor.py
# Álvaro Valenzuela Valdes
# deploy: v10 AMD hardware monitor integration
# 2da34a9
"""
Service to extract and persist tender detail tab information from Mercado Público.
Uses HTML parsing to extract visible content + attachment URLs.
"""
import httpx
import re
from typing import List, Optional, Dict, Any
from html.parser import HTMLParser
from app.models.tender_detail import TenderDetailTabModel, TenderAttachmentDetailModel
class AttachmentLinkExtractor(HTMLParser):
    """Collect attachment links found inside HTML table rows.

    While the parser is inside a ``<tr>`` element, every ``<input>`` tag whose
    ``href`` attribute contains ``VerAntecedentes.aspx`` or
    ``ViewAttachment.aspx`` is appended to ``self.attachments`` as
    ``{'href': <href>, 'name': <name>}``. The name is the tag's ``value``
    attribute, or ``'Attachment'`` when that attribute is missing or has no
    value.
    """

    # Page names whose presence in an href marks it as an attachment link.
    _ATTACHMENT_PAGES = ('VerAntecedentes.aspx', 'ViewAttachment.aspx')

    def __init__(self):
        super().__init__()
        self.attachments = []       # collected {'href', 'name'} dicts, in document order
        self.in_row = False         # True between <tr> and </tr>
        self.current_row_data = {}  # reset on every <tr>; not otherwise used by this class

    def handle_starttag(self, tag, attrs):
        """Track row entry and record attachment ``<input>`` tags inside rows."""
        tag = tag.lower()
        if tag == 'tr':
            self.in_row = True
            self.current_row_data = {}
            return
        if tag != 'input' or not self.in_row:
            return

        attrs_dict = dict(attrs)
        href = attrs_dict.get('href')
        # HTMLParser reports a valueless attribute (``<input href>``) as None;
        # a substring test on None would raise TypeError, so skip such tags.
        if not isinstance(href, str):
            return
        if any(page in href for page in self._ATTACHMENT_PAGES):
            name = attrs_dict.get('value')
            if name is None:
                # Covers both a missing ``value`` and a valueless one.
                name = 'Attachment'
            self.attachments.append({'href': href, 'name': name})

    def handle_endtag(self, tag):
        """Leave row context when a ``</tr>`` is seen."""
        if tag.lower() == 'tr':
            self.in_row = False
# Detail page that is queried either by encrypted ``qs`` or by tender ``codigo``.
_DETAIL_PAGE_URL = "https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx"

# (marker searched in the HTML, key in result["tabs"], display name)
_TAB_MARKERS = (
    ("imgHistorial", "history", "Historial"),
    ("imgPreguntasLicitacion", "questions", "Preguntas"),
    ("imgAperturaTecnica", "opening", "Apertura"),
)

# (metadata key, grid control id, plain label) for the attachment groups
_DOC_GROUP_MARKERS = (
    ("has_administrative_docs", "grvAdministrativo", "Administrativo"),
    ("has_technical_docs", "grvTecnico", "Técnico"),
    ("has_economic_docs", "grvEconomico", "Económico"),
)

# (guarantee type reported in the result, regex capturing the text after "Monto:")
_GUARANTEE_PATTERNS = (
    ("Seriedad de Oferta",
     r'Garantías de Seriedad de Ofertas.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
    ("Fiel Cumplimiento",
     r'Garantía fiel de Cumplimiento de Contrato.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
)


def _parse_tender_detail_html(html: str, tender_code: str, url: str) -> Dict[str, Any]:
    """Build the tender detail result dict from the raw detail-page HTML.

    Args:
        html: Full HTML text of the detail page.
        tender_code: Tender code echoed back under ``"tender_code"``.
        url: URL echoed back under ``"url"``.

    Returns:
        A dict with keys ``tender_code``, ``url``, ``tabs``, ``attachments``
        and ``metadata``.
    """
    # Attachment links come from <input href=...> tags inside table rows.
    extractor = AttachmentLinkExtractor()
    extractor.feed(html)

    # A tab is reported only when its marker string occurs in the page.
    tabs = {
        key: {"name": display_name, "found": True}
        for marker, key, display_name in _TAB_MARKERS
        if marker in html
    }

    metadata: Dict[str, Any] = {}

    # Attachment groups: present if either the grid id or the label occurs.
    for key, grid_id, label in _DOC_GROUP_MARKERS:
        metadata[key] = grid_id in html or label in html

    # Question count: prefer the element whose id ends in PreguntasLicitacion,
    # fall back to the generic label, default to 0 when neither matches.
    questions_match = (
        re.search(r'id="[^"]*PreguntasLicitacion"[^>]*>.*?(\d+)', html, re.IGNORECASE)
        or re.search(r'Preguntas y Respuestas.*?(\d+)', html, re.IGNORECASE)
    )
    metadata["question_count"] = int(questions_match.group(1)) if questions_match else 0

    # The key is only set when the page mentions adjudication at all.
    if "adjudic" in html.lower():
        metadata["has_adjudication"] = True

    complaints_match = re.search(
        r'Reclamos recibidos por incumplir plazo de pago:\s*(\d+)', html, re.IGNORECASE
    )
    if complaints_match:
        metadata["buyer_complaints"] = int(complaints_match.group(1))

    # Guarantees: always present, possibly empty. The amount is the raw text
    # captured after "Monto:", stripped of surrounding whitespace.
    guarantees = []
    for guarantee_type, pattern in _GUARANTEE_PATTERNS:
        match = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
        if match:
            guarantees.append({"type": guarantee_type, "amount": match.group(1).strip()})
    metadata["guarantees"] = guarantees

    # Line items: a "Cod: <digits>" cell followed by a <td> holding the description.
    items = [
        {"code": m.group(1), "description": m.group(2).strip()}
        for m in re.finditer(
            r'Cod:\s*(\d+).*?</td>.*?<td>\s*(.*?)\s*</td>', html, re.IGNORECASE | re.DOTALL
        )
    ]
    if items:
        metadata["detailed_items"] = items

    return {
        "tender_code": tender_code,
        "url": url,
        "tabs": tabs,
        "attachments": extractor.attachments,
        "metadata": metadata,
    }


async def extract_tender_detail_tabs(tender_code: str, qs_param: Optional[str] = None) -> Dict[str, Any]:
    """Fetch a tender's detail page and extract tab, attachment and metadata info.

    The page is requested with ``?qs=<qs_param>`` when ``qs_param`` is given
    and with ``?codigo=<tender_code>`` otherwise.

    Args:
        tender_code: Tender code; used for the ``codigo`` query and echoed in
            the result.
        qs_param: Optional value for the ``qs`` query parameter. It is
            inserted into the URL as-is.
            NOTE(review): presumably already URL-safe; confirm with callers.

    Returns:
        On success, a dict with ``tender_code``, ``url``, ``tabs``,
        ``attachments`` and ``metadata``. On any failure, a dict with
        ``error`` and ``tender_code``; this function does not raise.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    if qs_param:
        url = f"{_DETAIL_PAGE_URL}?qs={qs_param}"
    else:
        url = f"{_DETAIL_PAGE_URL}?codigo={tender_code}"

    try:
        # NOTE(review): httpx only follows redirects when follow_redirects=True,
        # so a 3xx reply is reported as an HTTP error below. Confirm intended.
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(url, headers=headers)
            status = resp.status_code
            html = resp.text
            final_url = str(resp.url)

        if status != 200:
            # Include tender_code so both error shapes carry the same keys.
            return {"error": f"HTTP {status}", "tender_code": tender_code}

        return _parse_tender_detail_html(html, tender_code, final_url)
    except Exception as e:
        # Deliberate catch-all: callers receive an error dict, never an exception.
        return {"error": str(e), "tender_code": tender_code}
async def extract_all_attachments_for_tender(tender_code: str, qs_param: Optional[str] = None) -> List[Dict[str, str]]:
    """Return the attachment entries found on a tender's detail page.

    Delegates to ``extract_tender_detail_tabs`` with the same arguments and
    hands back its ``"attachments"`` list. When the detail result has no
    ``"attachments"`` key, an empty list is returned.
    """
    tender_details = await extract_tender_detail_tabs(tender_code, qs_param)
    if "attachments" in tender_details:
        return tender_details["attachments"]
    return []