File size: 6,343 Bytes
5e52bd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
Service to extract and persist tender detail tab information from Mercado Público.
Uses HTML parsing to extract visible content + attachment URLs.
"""
import httpx
import re
from typing import List, Optional, Dict, Any
from html.parser import HTMLParser
from app.models.tender_detail import TenderDetailTabModel, TenderAttachmentDetailModel


class AttachmentLinkExtractor(HTMLParser):
    """Collect attachment download links from the detail page's HTML tables.

    Attachment rows are rendered inside ``<tr>`` elements; the download
    control may be an ``<a href="...">`` anchor or an ASP.NET ``<input>``
    carrying an href-like attribute.  Only links pointing at the known
    attachment endpoints (VerAntecedentes.aspx / ViewAttachment.aspx) are
    collected, as ``{'href': ..., 'name': ...}`` dicts in ``attachments``.
    """

    # Substrings that identify attachment download endpoints.
    _ATTACHMENT_MARKERS = ('VerAntecedentes.aspx', 'ViewAttachment.aspx')

    def __init__(self):
        super().__init__()
        self.attachments: List[Dict[str, str]] = []
        self.in_row = False          # True while inside a <tr> element
        self.current_row_data = {}   # kept for interface compatibility

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        tag = tag.lower()
        if tag == 'tr':
            self.in_row = True
            self.current_row_data = {}
        elif tag in ('a', 'input') and self.in_row and 'href' in attrs_dict:
            # Bug fix: the original only inspected <input> tags, but href is
            # an anchor attribute — download links rendered as <a> were
            # silently dropped.  Accept both element kinds.
            # `or ''` guards against a bare `href` attribute (value None),
            # which would make the `in` test below raise TypeError.
            href = attrs_dict.get('href') or ''
            if any(marker in href for marker in self._ATTACHMENT_MARKERS):
                # <input> controls carry the label in `value`; anchors may
                # use `title`.  Fall back to a generic name.
                name = attrs_dict.get('value') or attrs_dict.get('title') or 'Attachment'
                self.attachments.append({'href': href, 'name': name})

    def handle_endtag(self, tag):
        if tag.lower() == 'tr':
            self.in_row = False


def _parse_detail_html(html: str, tender_code: str, url: str) -> Dict[str, Any]:
    """Parse a fetched detail page into the result dict (pure, no I/O)."""
    result: Dict[str, Any] = {
        "tender_code": tender_code,
        "url": url,
        "tabs": {},
        "attachments": [],
        "metadata": {},
    }

    # Attachment links from the grv* grid controls.
    extractor = AttachmentLinkExtractor()
    extractor.feed(html)
    result["attachments"] = extractor.attachments

    # Tab sections are detected by the presence of their image controls
    # (hidden controls that track tab state on the ASP.NET page).
    tab_markers = (
        ('imgHistorial', 'history', 'Historial'),
        ('imgPreguntasLicitacion', 'questions', 'Preguntas'),
        ('imgAperturaTecnica', 'opening', 'Apertura'),
    )
    for marker, key, label in tab_markers:
        if marker in html:
            result["tabs"][key] = {"name": label, "found": True}

    metadata = result["metadata"]

    # Attachment groups (Administrative, Technical, Economic): either the
    # grid-control id or the plain Spanish label counts as presence.
    metadata["has_administrative_docs"] = "grvAdministrativo" in html or "Administrativo" in html
    metadata["has_technical_docs"] = "grvTecnico" in html or "Técnico" in html
    metadata["has_economic_docs"] = "grvEconomico" in html or "Económico" in html

    # Question count: prefer the specific tab-label id, fall back to the
    # generic "Preguntas y Respuestas" label, else 0.
    metadata["question_count"] = 0
    for pattern in (
        r'id="[^"]*PreguntasLicitacion"[^>]*>.*?(\d+)',
        r'Preguntas y Respuestas.*?(\d+)',
    ):
        q_match = re.search(pattern, html, re.IGNORECASE)
        if q_match:
            metadata["question_count"] = int(q_match.group(1))
            break

    # Consistency fix: always expose the key so the metadata schema is
    # uniform (the original only set it when True).
    metadata["has_adjudication"] = "adjudic" in html.lower()

    # Complaints received for late payment (buyer-side intelligence).
    complaints_match = re.search(
        r'Reclamos recibidos por incumplir plazo de pago:\s*(\d+)', html, re.IGNORECASE
    )
    if complaints_match:
        metadata["buyer_complaints"] = int(complaints_match.group(1))

    # Guarantees: bid-seriousness ("Seriedad") and contract-fulfilment
    # ("Fiel Cumplimiento").  Amount runs until the next <br>/</td>/label.
    guarantees = []
    guarantee_patterns = (
        ("Seriedad de Oferta",
         r'Garantías de Seriedad de Ofertas.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
        ("Fiel Cumplimiento",
         r'Garantía fiel de Cumplimiento de Contrato.*?Monto:\s*(.*?)(?=<br|</td>|Beneficiario)'),
    )
    for g_type, pattern in guarantee_patterns:
        g_match = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
        if g_match:
            guarantees.append({"type": g_type, "amount": g_match.group(1).strip()})
    metadata["guarantees"] = guarantees

    # Detailed line items: rows carrying a product code and a description.
    items = [
        {"code": m.group(1), "description": m.group(2).strip()}
        for m in re.finditer(
            r'Cod:\s*(\d+).*?</td>.*?<td>\s*(.*?)\s*</td>', html, re.IGNORECASE | re.DOTALL
        )
    ]
    if items:
        metadata["detailed_items"] = items

    return result


async def extract_tender_detail_tabs(tender_code: str, qs_param: Optional[str] = None) -> Dict[str, Any]:
    """
    Fetch a tender's detail page and extract tab/attachment information.

    Args:
        tender_code: Public tender code (used in the fallback URL and
            echoed into the result).
        qs_param: Encrypted detail-URL query string; preferred over the
            plain ``codigo`` parameter when provided.

    Returns:
        On success, a dict with keys ``tender_code``, ``url``, ``tabs``,
        ``attachments`` and ``metadata``.  On a non-200 response, a dict
        with only an ``error`` key; on exception, ``error`` plus
        ``tender_code``.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    if qs_param:
        url = f"https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?qs={qs_param}"
    else:
        url = f"https://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?codigo={tender_code}"

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.get(url, headers=headers)
            if resp.status_code != 200:
                return {"error": f"HTTP {resp.status_code}"}
            # Parsing is delegated to the pure helper so it can be exercised
            # without network access.
            return _parse_detail_html(resp.text, tender_code, str(resp.url))
    except Exception as e:
        # Boundary handler: callers receive an error dict rather than an
        # exception so batch processing can continue.
        return {"error": str(e), "tender_code": tender_code}


async def extract_all_attachments_for_tender(tender_code: str, qs_param: Optional[str] = None) -> List[Dict[str, str]]:
    """Return every publicly reachable attachment URL for a tender.

    Delegates to ``extract_tender_detail_tabs`` and returns only its
    ``attachments`` list, so callers can download documents without
    authentication.  Yields an empty list when extraction produced no
    attachments (including the error-dict case).
    """
    info = await extract_tender_detail_tabs(tender_code, qs_param)
    return info.get("attachments", [])