# customeragent-api / server / scripts / test_advanced_scraping.py
# Provenance: anasraza526's Hugging Face Space — commit ac90985
# ("Clean deploy to Hugging Face")
import asyncio
import sys
import os

# Add parent directory to path to import app services
# (this script lives in server/scripts/, so two dirname() calls reach server/,
# making the `app` package importable when run directly).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.services.scraper import WebScraper
async def run_test(urls):
    """Smoke-test the scraper against each URL and print a short report.

    For every URL: report the type returned by
    ``WebScraper.detect_website_type``, run ``WebScraper.scrape_page``, and
    preview up to the first two extracted chunks (content truncated to 300
    characters). Each URL is handled independently — an exception while
    processing one URL is printed and does not abort the remaining URLs.

    Args:
        urls: iterable of URL strings to scrape.
    """
    async with WebScraper() as scraper:
        for url in urls:
            print(f"\n{'='*20} Testing: {url} {'='*20}")
            try:
                # Detect the site type first so failures here are reported too.
                site_type = await scraper.detect_website_type(url)
                print(f"Detected Type: {site_type}")

                # Perform the actual scrape.
                results = await scraper.scrape_page(url)
                if results:
                    print(f"✓ Successfully extracted {len(results)} content chunks.")
                    # Show at most the first 2 chunks as a sample.
                    for i, res in enumerate(results[:2]):
                        text = res['content']
                        # Truncate long chunks so the report stays readable.
                        content_snippet = text[:300] + "..." if len(text) > 300 else text
                        print(f"\n[Chunk {i+1}] Title: {res.get('title')}")
                        print(f"Content: {content_snippet}")
                else:
                    print(f"✗ Failed to extract content from {url}")
            except Exception as e:
                # Broad catch is deliberate: this is a diagnostic script and
                # one bad URL must not stop the rest of the batch.
                print(f"❌ Error during scrape of {url}: {e}")
if __name__ == "__main__":
# Test URLs:
# 1. React/JS based site
# 2. Complex Medical Fact Sheet (Highly unstructured but detailed)
test_urls = [
"https://react.dev",
"https://www.who.int/news-room/fact-sheets/detail/diabetes"
]
print("πŸš€ Starting Advanced Scraping Test (JS + Complex HTML)...")
asyncio.run(run_test(test_urls))
print("\nβœ… Test Complete.")