Spaces:
Running
Running
| """Optional web fallback ingestion via Scrapling and Playwright.""" | |
| from __future__ import annotations | |
| from urllib.parse import urlparse | |
| import requests | |
| def _allowed(url: str, allow_domains: list[str]) -> bool: | |
| host = urlparse(url).netloc.lower() | |
| return any(host.endswith(domain) for domain in allow_domains) | |
def _scrape_with_scrapling(url: str) -> str:
    """Fetch *url* with Scrapling and return its text content ("" if absent)."""
    # Scrapling API compatibility may vary by version, so this path is best-effort;
    # import lazily so the module loads even when scrapling is not installed.
    from scrapling import Fetcher  # type: ignore

    response = Fetcher().get(url)
    text = getattr(response, "text", "")
    return text if text else ""
def _scrape_with_playwright(url: str) -> str:
    """Render *url* in headless Chromium and return the full page HTML.

    Import is lazy so the module loads without Playwright installed. Any
    launch/navigation error propagates to the caller, but the browser is
    always closed — the original leaked it when ``goto``/``content`` raised.
    """
    from playwright.sync_api import sync_playwright  # type: ignore

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, timeout=30_000)  # 30-second navigation timeout
            return page.content()
        finally:
            # Ensure the browser process is torn down even on failure.
            browser.close()
def scrape_with_fallback(url: str, allow_domains: list[str]) -> dict[str, str]:
    """Scrape *url*, trying Scrapling, then Playwright, then plain requests.

    Returns a result dict with ``status`` of ``"blocked"`` (host not in
    *allow_domains*), ``"ok"`` (some backend produced text), or ``"error"``
    (every backend failed; includes the final exception message).
    """
    if not _allowed(url, allow_domains):
        return {"status": "blocked", "url": url, "backend": "allowlist"}

    # Best-effort chain: a backend that raises or returns empty text simply
    # hands off to the next one.
    browser_backends = (
        ("scrapling", _scrape_with_scrapling),
        ("playwright", _scrape_with_playwright),
    )
    for backend_name, scraper in browser_backends:
        try:
            text = scraper(url)
        except Exception:
            continue
        if text:
            return {"status": "ok", "url": url, "backend": backend_name, "text": text}

    # Last resort: a plain HTTP GET. This is the only backend whose failure
    # is surfaced to the caller.
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        return {"status": "ok", "url": url, "backend": "requests", "text": response.text}
    except Exception as exc:  # noqa: BLE001
        return {"status": "error", "url": url, "backend": "none", "error": str(exc)}