# customeragent-api / server / scripts / test_advanced_scraping.py
# Provenance: anasraza526's Hugging Face Space — commit ac90985
# ("Clean deploy to Hugging Face")
import asyncio
import sys
import os

# Add parent directory to path to import app services
# (this script lives in server/scripts/, so two dirname() calls reach server/,
# making the `app` package importable when run directly).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.services.scraper import WebScraper
async def run_test(urls):
    """Smoke-test the scraper against each URL and print a short report.

    For every URL: report the type returned by
    ``WebScraper.detect_website_type``, run ``WebScraper.scrape_page``, and
    preview up to the first two extracted chunks (content truncated to 300
    characters). Each URL is handled independently — an exception while
    processing one URL is printed and does not abort the remaining URLs.

    Args:
        urls: iterable of URL strings to scrape.
    """
    async with WebScraper() as scraper:
        for url in urls:
            print(f"\n{'='*20} Testing: {url} {'='*20}")
            try:
                # Detect the site type first so failures here are reported too.
                site_type = await scraper.detect_website_type(url)
                print(f"Detected Type: {site_type}")

                # Perform the actual scrape.
                results = await scraper.scrape_page(url)
                if results:
                    print(f"✓ Successfully extracted {len(results)} content chunks.")
                    # Show at most the first 2 chunks as a sample.
                    for i, res in enumerate(results[:2]):
                        text = res['content']
                        # Truncate long chunks so the report stays readable.
                        content_snippet = text[:300] + "..." if len(text) > 300 else text
                        print(f"\n[Chunk {i+1}] Title: {res.get('title')}")
                        print(f"Content: {content_snippet}")
                else:
                    print(f"✗ Failed to extract content from {url}")
            except Exception as e:
                # Broad catch is deliberate: this is a diagnostic script and
                # one bad URL must not stop the rest of the batch.
                print(f"❌ Error during scrape of {url}: {e}")
if __name__ == "__main__":
# Test URLs:
# 1. React/JS based site
# 2. Complex Medical Fact Sheet (Highly unstructured but detailed)
test_urls = [
"https://react.dev",
"https://www.who.int/news-room/fact-sheets/detail/diabetes"
]
print("πŸš€ Starting Advanced Scraping Test (JS + Complex HTML)...")
asyncio.run(run_test(test_urls))
print("\nβœ… Test Complete.")