File size: 6,343 Bytes
5e52bd7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | """
Service to extract and persist tender detail tab information from Mercado Público.
Uses HTML parsing to extract visible content + attachment URLs.
"""
import httpx
import re
from typing import List, Optional, Dict, Any
from html.parser import HTMLParser
from app.models.tender_detail import TenderDetailTabModel, TenderAttachmentDetailModel
class AttachmentLinkExtractor(HTMLParser):
    """Extract attachment links from the HTML tables of a tender detail page.

    Accumulates ``{'href': ..., 'name': ...}`` dicts in ``self.attachments``
    for every link inside a table row that points at the public attachment
    endpoints (``VerAntecedentes.aspx`` / ``ViewAttachment.aspx``).

    Bug fixed: the original matched ``<input>`` tags carrying an ``href``
    attribute — ``<input>`` elements never have ``href``, so ordinary
    ``<a href=...>`` attachment links were silently ignored. Anchors are now
    handled (name taken from the visible link text); the legacy input/value
    path is kept for backward compatibility.
    """

    # URL fragments that identify downloadable attachment endpoints.
    _ATTACHMENT_MARKERS = ('VerAntecedentes.aspx', 'ViewAttachment.aspx')

    def __init__(self):
        super().__init__()
        self.attachments = []          # accumulated {'href', 'name'} dicts
        self.in_row = False            # True while parsing inside a <tr>
        self.current_row_data = {}     # per-row scratch (kept for interface compat)
        self._pending_anchor = None    # attachment dict awaiting its anchor text

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        tag = tag.lower()
        if tag == 'tr':
            self.in_row = True
            self.current_row_data = {}
            return
        href = attrs_dict.get('href')
        if not (self.in_row and href):
            return
        if not any(marker in href for marker in self._ATTACHMENT_MARKERS):
            return
        if tag == 'a':
            # Anchor text arrives later via handle_data; register a placeholder.
            self._pending_anchor = {'href': href, 'name': 'Attachment'}
            self.attachments.append(self._pending_anchor)
        elif tag == 'input':
            # ASP.NET grids sometimes render input controls; name comes from @value.
            self.attachments.append({'href': href, 'name': attrs_dict.get('value', 'Attachment')})

    def handle_data(self, data):
        # First non-empty text chunk inside a pending <a> becomes its name.
        if self._pending_anchor is not None:
            text = data.strip()
            if text:
                self._pending_anchor['name'] = text
                self._pending_anchor = None

    def handle_endtag(self, tag):
        tag = tag.lower()
        if tag == 'tr':
            self.in_row = False
        elif tag == 'a':
            # Anchor closed without text: keep the 'Attachment' placeholder.
            self._pending_anchor = None
async def extract_tender_detail_tabs(tender_code: str, qs_param: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch a tender detail page from Mercado Público and extract tab information.

    Uses the ``qs`` parameter (encrypted detail URL) when provided; otherwise
    falls back to the plain ``codigo`` parameter.

    Args:
        tender_code: Public tender code.
        qs_param: Optional encrypted query-string token for the detail page.

    Returns:
        Dict with keys ``tender_code``, ``url``, ``tabs``, ``attachments`` and
        ``metadata`` on success; a dict containing an ``error`` key on any
        HTTP or parsing failure (errors are reported, never raised).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    if qs_param:
        url = f"https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs={qs_param}"
    else:
        url = f"https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?codigo={tender_code}"
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(url, headers=headers)
            if resp.status_code != 200:
                return {"error": f"HTTP {resp.status_code}"}
            html = resp.text
            result: Dict[str, Any] = {
                "tender_code": tender_code,
                "url": str(resp.url),  # final URL after any redirects
                "tabs": {},
                "attachments": [],
                "metadata": {},
            }
            md = result["metadata"]
            # Extract attachments from the grv* grid controls.
            extractor = AttachmentLinkExtractor()
            extractor.feed(html)
            result["attachments"] = extractor.attachments
            # Tab sections are detected via their image control IDs.
            tab_markers = {
                'imgHistorial': ("history", "Historial"),
                'imgPreguntasLicitacion': ("questions", "Preguntas"),
                'imgAperturaTecnica': ("opening", "Apertura"),
            }
            for marker, (key, label) in tab_markers.items():
                if marker in html:
                    result["tabs"][key] = {"name": label, "found": True}
            # Attachment groups (Administrative, Technical, Economic).
            md["has_administrative_docs"] = "grvAdministrativo" in html or "Administrativo" in html
            md["has_technical_docs"] = "grvTecnico" in html or "Técnico" in html
            md["has_economic_docs"] = "grvEconomico" in html or "Económico" in html
            # Question count: prefer the specific control ID, fall back to the
            # general "Preguntas y Respuestas" label, default to 0.
            questions_match = (
                re.search(r'id="[^"]*PreguntasLicitacion"[^>]*>.*?(\d+)', html, re.IGNORECASE)
                or re.search(r'Preguntas y Respuestas.*?(\d+)', html, re.IGNORECASE)
            )
            md["question_count"] = int(questions_match.group(1)) if questions_match else 0
            # Adjudication flag — always set so consumers can index the key
            # directly (the original only set it when True).
            md["has_adjudication"] = "adjudic" in html.lower()
            # Buyer complaints for late payment (new intelligence signal).
            complaints_match = re.search(
                r'Reclamos recibidos por incumplir plazo de pago:\s*(\d+)', html, re.IGNORECASE
            )
            if complaints_match:
                md["buyer_complaints"] = int(complaints_match.group(1))
            # Guarantees (bid seriousness / faithful contract performance);
            # both follow the same "... Monto: <amount>" layout.
            guarantees = []
            guarantee_patterns = [
                ("Seriedad de Oferta",
                 r'Garantías de Seriedad de Ofertas.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
                ("Fiel Cumplimiento",
                 r'Garantía fiel de Cumplimiento de Contrato.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
            ]
            for gtype, pattern in guarantee_patterns:
                m = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
                if m:
                    guarantees.append({"type": gtype, "amount": m.group(1).strip()})
            md["guarantees"] = guarantees
            # Detailed line items: rows pairing a product code with the
            # description in the following table cell.
            items = [
                {"code": m.group(1), "description": m.group(2).strip()}
                for m in re.finditer(
                    r'Cod:\s*(\d+).*?</td>.*?<td>\s*(.*?)\s*</td>', html, re.IGNORECASE | re.DOTALL
                )
            ]
            if items:
                md["detailed_items"] = items
            return result
    except Exception as e:
        # Report network/parse failures instead of raising so callers can
        # continue batch-processing other tenders.
        return {"error": str(e), "tender_code": tender_code}
async def extract_all_attachments_for_tender(tender_code: str, qs_param: Optional[str] = None) -> List[Dict[str, str]]:
    """
    Return every publicly accessible attachment link found for a tender.

    The links come from the tender's detail page and can be used to download
    documents without authentication. Yields an empty list when the detail
    fetch fails or exposes no attachments.

    Args:
        tender_code: Public tender code.
        qs_param: Optional encrypted query-string token for the detail page.
    """
    detail = await extract_tender_detail_tabs(tender_code, qs_param)
    return detail.get("attachments", [])
|