File size: 1,848 Bytes
21c7db9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""Optional web fallback ingestion via Scrapling and Playwright."""

from __future__ import annotations

import logging
from urllib.parse import urlparse

import requests


def _allowed(url: str, allow_domains: list[str]) -> bool:
    """Return True when the host of *url* is covered by *allow_domains*.

    An entry such as ``"example.com"`` allows that exact host and any of its
    subdomains (``"api.example.com"``), but not look-alikes that merely end
    with the same characters (``"evilexample.com"``). An entry written with a
    leading dot (``".example.com"``) allows subdomains only.

    Args:
        url: Absolute URL to check.
        allow_domains: Domain names permitted for scraping. Matching is
            case-insensitive; blank entries are ignored.

    Returns:
        True if the URL's host is allowed, otherwise False. URLs without a
        host, or that cannot be parsed, are never allowed.
    """
    try:
        # Use ``hostname`` rather than ``netloc`` so that userinfo and port
        # (``user@host:8443``) do not take part in the comparison.
        host = (urlparse(url).hostname or "").lower().rstrip(".")
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6 brackets).
        return False
    if not host:
        return False

    for entry in allow_domains:
        domain = entry.strip().lower().rstrip(".")
        if not domain:
            # An empty entry would otherwise match every host.
            continue
        if domain.startswith("."):
            # Leading dot already anchors the match on a label boundary.
            if host.endswith(domain):
                return True
        elif host == domain or host.endswith("." + domain):
            return True
    return False


def _scrape_with_scrapling(url: str) -> str:
    """Fetch *url* through Scrapling's ``Fetcher`` and return its ``text``.

    Scrapling is imported lazily so the module still loads when the optional
    dependency is absent. The API differs between Scrapling versions, so this
    is best-effort: a response without a ``text`` attribute, or with a falsy
    one, yields an empty string.
    """
    from scrapling import Fetcher  # type: ignore

    response = Fetcher().get(url)
    body = getattr(response, "text", "")
    return body if body else ""


def _scrape_with_playwright(url: str) -> str:
    """Render *url* in headless Chromium and return the page's HTML content.

    Playwright is imported lazily so the module still loads when the optional
    dependency is absent.

    Args:
        url: Address to navigate to.

    Returns:
        The value of ``page.content()`` after navigation.

    Raises:
        Whatever Playwright raises on launch or navigation failure (including
        a navigation timeout); the caller decides how to handle it.
    """
    from playwright.sync_api import sync_playwright  # type: ignore

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, timeout=30_000)
            return page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            browser.close()


def scrape_with_fallback(url: str, allow_domains: list[str]) -> dict[str, str]:
    """Fetch *url* with the first backend that produces content.

    Backends are tried in order: Scrapling, Playwright, then a plain
    ``requests.get``. The first two are optional; a failure or an empty result
    from either is logged and the next backend is tried.

    Args:
        url: Address to fetch.
        allow_domains: Domains the caller permits; any other host is refused
            before a request is made.

    Returns:
        A dict with ``status``, ``url`` and ``backend`` keys, plus:
        * ``text`` when ``status`` is ``"ok"``;
        * ``error`` when ``status`` is ``"error"`` (the final ``requests``
          failure);
        * nothing extra when ``status`` is ``"blocked"``.
    """
    if not _allowed(url, allow_domains):
        return {"status": "blocked", "url": url, "backend": "allowlist"}

    log = logging.getLogger(__name__)

    optional_backends = (
        ("scrapling", _scrape_with_scrapling),
        ("playwright", _scrape_with_playwright),
    )
    for backend, scrape in optional_backends:
        try:
            text = scrape(url)
        except ImportError:
            # The optional dependency is not installed; this is expected.
            log.debug("%s backend unavailable; skipping", backend)
            continue
        except Exception:  # noqa: BLE001
            # Best-effort path: record the failure and fall through.
            log.warning(
                "%s backend failed for %s", backend, url, exc_info=True
            )
            continue
        if text:
            return {"status": "ok", "url": url, "backend": backend, "text": text}
        log.debug("%s backend returned no content for %s", backend, url)

    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        return {"status": "ok", "url": url, "backend": "requests", "text": response.text}
    except Exception as exc:  # noqa: BLE001
        log.warning("requests backend failed for %s: %s", url, exc)
        return {"status": "error", "url": url, "backend": "none", "error": str(exc)}