| """Optional web fallback ingestion via Scrapling and Playwright.""" |
|
|
| from __future__ import annotations |
|
|
| from urllib.parse import urlparse |
|
|
| import requests |
|
|
|
|
| def _allowed(url: str, allow_domains: list[str]) -> bool: |
| host = urlparse(url).netloc.lower() |
| return any(host.endswith(domain) for domain in allow_domains) |
|
|
|
|
def _scrape_with_scrapling(url: str) -> str:
    """Fetch *url* with Scrapling and return its text, or "" if none."""
    # Imported lazily so the module still loads when scrapling is absent;
    # callers treat an ImportError like any other backend failure.
    from scrapling import Fetcher

    response = Fetcher().get(url)
    text = getattr(response, "text", "")
    return text if text else ""
|
|
|
|
def _scrape_with_playwright(url: str) -> str:
    """Render *url* in headless Chromium and return the full page HTML.

    Propagates whatever playwright raises on launch/navigation failure.
    The browser is closed in ``finally`` so a failed ``goto`` cannot leak
    a Chromium process (previously ``close()`` ran only on success).
    """
    # Imported lazily so the module still loads when playwright is absent.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, timeout=30_000)  # 30 s navigation timeout
            return page.content()
        finally:
            browser.close()
|
|
|
|
def scrape_with_fallback(url: str, allow_domains: list[str]) -> dict[str, str]:
    """Scrape *url*, trying Scrapling, then Playwright, then plain requests.

    Parameters:
        url: page to fetch.
        allow_domains: domain allowlist enforced by ``_allowed``.

    Returns a dict with ``status`` ("ok" | "blocked" | "error"), the ``url``,
    the ``backend`` that produced the result, plus ``text`` on success or
    ``error`` (the requests failure message) when every backend failed.
    """
    if not _allowed(url, allow_domains):
        return {"status": "blocked", "url": url, "backend": "allowlist"}

    # Best-effort fallback chain: a backend that raises (e.g. its optional
    # dependency is not installed) or returns empty text yields to the next.
    for backend, scraper in (
        ("scrapling", _scrape_with_scrapling),
        ("playwright", _scrape_with_playwright),
    ):
        try:
            text = scraper(url)
        except Exception:  # deliberate best-effort swallow; next backend tries
            continue
        if text:
            return {"status": "ok", "url": url, "backend": backend, "text": text}

    # Last resort: a plain HTTP GET; only its failure is reported to the caller.
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        return {"status": "ok", "url": url, "backend": "requests", "text": response.text}
    except Exception as exc:
        return {"status": "error", "url": url, "backend": "none", "error": str(exc)}
|
|