Álvaro Valenzuela Valdes committed on
Commit
b54d3e2
·
1 Parent(s): aa38f40

feat: Scraper 2.0 with Deep Text Scan and regex code detection

Browse files
Files changed (1) hide show
  1. backend/app/services/scraper.py +64 -49
backend/app/services/scraper.py CHANGED
@@ -4,73 +4,88 @@ from typing import List
4
  from app.schemas.tender import Tender
5
  from datetime import datetime
6
  import re
 
7
 
8
async def scrape_compra_agil(keywords: str) -> List[Tender]:
    """Scrape the Mercado Público "Compra Ágil" search results page.

    Args:
        keywords: Free-text search terms entered by the user.

    Returns:
        Up to 20 ``Tender`` objects parsed from the result cards, or an
        empty list on any network or parsing failure (errors are printed,
        never raised to the caller).
    """
    # Stdlib-only local import so the module's import block stays unchanged.
    from urllib.parse import quote_plus

    # Fix: percent-encode the user input — raw spaces, '&' or '#' in
    # `keywords` would otherwise corrupt the query string.
    url = (
        "https://buscador.mercadopublico.cl/compra-agil"
        f"?keywords={quote_plus(keywords)}&status=2&order_by=recent"
    )

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    try:
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            print(f"[Scraper] Navigating to: {url}")
            response = await client.get(url, headers=headers)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            tenders = []

            # Result-card selectors based on the standard Mercado Público
            # searcher markup; the class names may drift over time.
            cards = soup.select('.card-tender, .item-busqueda, .search-result-item')

            if not cards:
                # Fallback: scan the raw page text for Compra Ágil codes
                # (NNN-NNN-COT26). Diagnostic only — a bare code carries too
                # little information to build a full Tender object.
                all_text = soup.get_text()
                codes = re.findall(r'[0-9]+-[0-9]+-COT26', all_text)
                print(f"[Scraper] Found {len(codes)} codes via regex fallback.")

            for card in cards[:20]:  # Limit for performance
                try:
                    name_elem = card.select_one('.title-tender, h3, .name')
                    # NOTE(review): ':contains(...)' is a deprecated soupsieve
                    # alias for ':-soup-contains(...)' — confirm the installed
                    # bs4/soupsieve version still accepts it.
                    code_elem = card.select_one('.code-tender, .code, span:contains("COT26")')
                    buyer_elem = card.select_one('.buyer-name, .organismo')

                    # A card missing either the name or the code is unusable.
                    if not name_elem or not code_elem:
                        continue

                    name = name_elem.get_text(strip=True)
                    code = code_elem.get_text(strip=True)
                    buyer = buyer_elem.get_text(strip=True) if buyer_elem else "Unknown"

                    tenders.append(Tender(
                        code=code,
                        name=name,
                        description=name,  # card exposes no separate description
                        buyer=buyer,
                        status="Publicada",
                        closing_date=datetime.now().strftime("%Y-%m-%d"),  # placeholder: real date not in card
                        estimated_amount=0,
                        source="Mercado Público (Scraped)",
                        region="Nacional",
                        sector="Compra Ágil",
                        items=[],
                        attachments=[]
                    ))
                except Exception as e:
                    # Best-effort scrape: one malformed card must not abort the rest.
                    print(f"Error parsing card: {e}")

            print(f"[Scraper] Successfully scraped {len(tenders)} tenders.")
            return tenders

    except Exception as e:
        # Top-level guard: any network/HTTP/parse failure degrades to "no results".
        print(f"❌ Scraper error: {e}")
        return []
 
4
  from app.schemas.tender import Tender
5
  from datetime import datetime
6
  import re
7
+ import json
8
 
9
async def scrape_compra_agil(keywords: str) -> List[Tender]:
    """Enhanced scraper for Mercado Público Compra Ágil.

    Strategy 1 harvests anchor tags whose ``href`` carries a ``code=``
    query parameter (pattern: ``ficha?code=XXXX-XXXX-XXXX``). Strategy 2
    falls back to a raw-text regex scan for tender-code patterns when
    Strategy 1 yields nothing.

    Args:
        keywords: Free-text search terms entered by the user.

    Returns:
        Up to 30 ``Tender`` objects, or an empty list on any failure
        (errors are printed, never raised to the caller).
    """
    # Stdlib-only local import so the module's import block stays unchanged.
    from urllib.parse import quote_plus

    # Fix: percent-encode the user input — raw spaces, '&' or '#' in
    # `keywords` would otherwise corrupt the query string.
    url = (
        "https://buscador.mercadopublico.cl/compra-agil"
        f"?keywords={quote_plus(keywords)}&status=2&order_by=recent"
    )

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "es-ES,es;q=0.9,en;q=0.8",
        "Referer": "https://buscador.mercadopublico.cl/"
    }

    try:
        async with httpx.AsyncClient(timeout=45.0, follow_redirects=True) as client:
            print(f"[Scraper] Launching deep scan on: {url}")
            response = await client.get(url, headers=headers)
            response.raise_for_status()

            html = response.text
            soup = BeautifulSoup(html, 'html.parser')

            tenders = []
            seen_codes = set()

            # Strategy 1: links whose href contains a tender code.
            links = soup.find_all('a', href=re.compile(r'code='))

            for link in links:
                href = link.get('href')
                code_match = re.search(r'code=([0-9a-zA-Z-]+)', href)
                if not code_match:
                    continue
                # Fix: normalize BEFORE the dedup check — the original
                # stripped the code only after testing `seen_codes`, so a
                # whitespace-variant of an already-seen code slipped through.
                code = code_match.group(1).strip()
                if code in seen_codes:
                    continue

                # The anchor text is usually the tender title; fall back to a
                # generic label when the link has no text.
                name = link.get_text(strip=True) or "Licitación Compra Ágil"

                tenders.append(Tender(
                    code=code,
                    name=name if len(name) > 5 else f"Compra Ágil {code}",
                    description=name,
                    buyer="Mercado Público",  # buyer not derivable from the link alone
                    status="Publicada",
                    closing_date=datetime.now().strftime("%Y-%m-%d"),  # placeholder: real date not in link
                    estimated_amount=0,
                    source="MP Web Live",
                    region="Nacional",
                    sector="Agile",
                    items=[],
                    attachments=[]
                ))
                seen_codes.add(code)

            # Strategy 2: raw-text scan, only when Strategy 1 found nothing.
            if not tenders:
                codes = re.findall(r'[0-9]+-[0-9]+-[a-zA-Z0-9]+', html)
                for code in set(codes):
                    # Accept explicit Compra Ágil codes (COT26 suffix) or any
                    # match long enough to plausibly be a tender code.
                    if 'COT26' in code.upper() or len(code) > 10:
                        tenders.append(Tender(
                            code=code,
                            name=f"Oportunidad Detectada: {code}",
                            description="Detectada vía escaneo de texto profundo.",
                            buyer="Chile Compra",
                            status="Activa",
                            closing_date="TBD",
                            # Fix: match Strategy 1 — the original passed
                            # estimated_amount=None and omitted region/items/
                            # attachments, which Strategy 1 always supplies and
                            # which may fail Tender schema validation.
                            estimated_amount=0,
                            source="MP Text Scan",
                            region="Nacional",
                            sector="Compra Ágil",
                            items=[],
                            attachments=[]
                        ))

            print(f"[Scraper] Scan finished. Found {len(tenders)} opportunities.")
            return tenders[:30]  # Limit to top 30

    except Exception as e:
        # Top-level guard: any network/HTTP/parse failure degrades to "no results".
        print(f"❌ Scraper critical failure: {e}")
        return []