Spaces:
Running
Running
| """Optional web fallback ingestion via Scrapling and Playwright.""" | |
| from __future__ import annotations | |
| from urllib.parse import urlparse | |
| import requests | |
| def _allowed(url: str, allow_domains: list[str]) -> bool: | |
| host = urlparse(url).netloc.lower() | |
| return any(host.endswith(domain) for domain in allow_domains) | |
def _scrape_with_scrapling(url: str) -> str:
    """Fetch *url* with Scrapling and return its text content ("" if absent)."""
    # Scrapling API compatibility may vary by version, so this path is best-effort;
    # import lazily so the module loads even when scrapling is not installed.
    from scrapling import Fetcher  # type: ignore

    response = Fetcher().get(url)
    text = getattr(response, "text", "")
    return text if text else ""
def _scrape_with_playwright(url: str) -> str:
    """Render *url* in headless Chromium and return the full page HTML.

    Import is lazy so the module loads without Playwright installed. Any
    launch/navigation error propagates to the caller, but the browser is
    always closed — the original leaked it when ``goto``/``content`` raised.
    """
    from playwright.sync_api import sync_playwright  # type: ignore

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, timeout=30_000)  # 30-second navigation timeout
            return page.content()
        finally:
            # Ensure the browser process is torn down even on failure.
            browser.close()
def scrape_with_fallback(url: str, allow_domains: list[str]) -> dict[str, str]:
    """Scrape *url*, trying Scrapling, then Playwright, then plain requests.

    Returns a result dict with ``status`` of ``"blocked"`` (host not in
    *allow_domains*), ``"ok"`` (some backend produced text), or ``"error"``
    (every backend failed; includes the final exception message).
    """
    if not _allowed(url, allow_domains):
        return {"status": "blocked", "url": url, "backend": "allowlist"}

    # Best-effort chain: a backend that raises or returns empty text simply
    # hands off to the next one.
    browser_backends = (
        ("scrapling", _scrape_with_scrapling),
        ("playwright", _scrape_with_playwright),
    )
    for backend_name, scraper in browser_backends:
        try:
            text = scraper(url)
        except Exception:
            continue
        if text:
            return {"status": "ok", "url": url, "backend": backend_name, "text": text}

    # Last resort: a plain HTTP GET. This is the only backend whose failure
    # is surfaced to the caller.
    try:
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        return {"status": "ok", "url": url, "backend": "requests", "text": response.text}
    except Exception as exc:  # noqa: BLE001
        return {"status": "error", "url": url, "backend": "none", "error": str(exc)}