"""Optional web fallback ingestion via Scrapling and Playwright.""" from __future__ import annotations from urllib.parse import urlparse import requests def _allowed(url: str, allow_domains: list[str]) -> bool: host = urlparse(url).netloc.lower() return any(host.endswith(domain) for domain in allow_domains) def _scrape_with_scrapling(url: str) -> str: # Scrapling API compatibility may vary by version, so this path is best-effort. from scrapling import Fetcher # type: ignore fetcher = Fetcher() page = fetcher.get(url) return getattr(page, "text", "") or "" def _scrape_with_playwright(url: str) -> str: from playwright.sync_api import sync_playwright # type: ignore with sync_playwright() as p: browser = p.chromium.launch(headless=True) page = browser.new_page() page.goto(url, timeout=30_000) content = page.content() browser.close() return content def scrape_with_fallback(url: str, allow_domains: list[str]) -> dict[str, str]: if not _allowed(url, allow_domains): return {"status": "blocked", "url": url, "backend": "allowlist"} try: text = _scrape_with_scrapling(url) if text: return {"status": "ok", "url": url, "backend": "scrapling", "text": text} except Exception: pass try: text = _scrape_with_playwright(url) if text: return {"status": "ok", "url": url, "backend": "playwright", "text": text} except Exception: pass try: response = requests.get(url, timeout=20) response.raise_for_status() return {"status": "ok", "url": url, "backend": "requests", "text": response.text} except Exception as exc: # noqa: BLE001 return {"status": "error", "url": url, "backend": "none", "error": str(exc)}