# Provenance: adithya9903 — "Deploy PolyGuard HF training Space" (commit fd0c71a, verified).
# NOTE(review): these lines were scraped page residue, not Python; commented out
# so the module parses.
"""Optional web fallback ingestion via Scrapling and Playwright."""
from __future__ import annotations
from urllib.parse import urlparse
import requests
def _allowed(url: str, allow_domains: list[str]) -> bool:
host = urlparse(url).netloc.lower()
return any(host.endswith(domain) for domain in allow_domains)
def _scrape_with_scrapling(url: str) -> str:
    """Fetch *url* through Scrapling; return its text, or "" when absent."""
    # Scrapling's public API has shifted between releases, so this path is
    # best-effort: the import stays local and attribute access is defensive.
    from scrapling import Fetcher  # type: ignore

    page = Fetcher().get(url)
    text = getattr(page, "text", "")
    return text if text else ""
def _scrape_with_playwright(url: str) -> str:
    """Render *url* in headless Chromium and return the full page HTML.

    Propagates Playwright's exceptions (import errors, launch failures,
    navigation timeouts); the caller treats this backend as best-effort.
    """
    from playwright.sync_api import sync_playwright  # type: ignore

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # 30 s navigation cap keeps a hung page from stalling ingestion.
            page.goto(url, timeout=30_000)
            return page.content()
        finally:
            # Close the browser even when goto()/content() raise; previously a
            # timeout here leaked the Chromium instance until driver shutdown.
            browser.close()
def scrape_with_fallback(url: str, allow_domains: list[str]) -> dict[str, str]:
    """Scrape *url*, trying Scrapling, then Playwright, then plain requests.

    Returns a dict with "status" ("blocked" | "ok" | "error"), the "url",
    the "backend" that produced the result, plus "text" on success or
    "error" on final failure.
    """
    if not _allowed(url, allow_domains):
        return {"status": "blocked", "url": url, "backend": "allowlist"}

    # Best-effort cascade: any exception or empty result from one backend
    # silently falls through to the next.
    browser_backends = (
        ("scrapling", _scrape_with_scrapling),
        ("playwright", _scrape_with_playwright),
    )
    for backend, scraper in browser_backends:
        try:
            body = scraper(url)
        except Exception:
            continue
        if body:
            return {"status": "ok", "url": url, "backend": backend, "text": body}

    # Last resort: plain HTTP GET — the only backend whose failure is
    # reported to the caller rather than swallowed.
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
    except Exception as exc:  # noqa: BLE001
        return {"status": "error", "url": url, "backend": "none", "error": str(exc)}
    return {"status": "ok", "url": url, "backend": "requests", "text": response.text}