import asyncio
import os
import sys

# Make the `app` package importable when this script is run directly
# from inside the project tree.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.services.scraper import WebScraper


async def run_test(urls):
    """Manually exercise the scraper against each URL.

    For every URL: report the detected website type, run a full scrape,
    and print the first two extracted content chunks as a sample.
    Errors are caught per-URL so one failure does not stop the run.
    """
    async with WebScraper() as scraper:
        for url in urls:
            print(f"\n{'='*20} Testing: {url} {'='*20}")
            try:
                # Classify the site before scraping it.
                site_type = await scraper.detect_website_type(url)
                print(f"Detected Type: {site_type}")

                chunks = await scraper.scrape_page(url)
                if not chunks:
                    print(f"āœ— Failed to extract content from {url}")
                    continue

                print(f"āœ“ Successfully extracted {len(chunks)} content chunks.")
                # Preview only the first two chunks, truncated to 300 chars.
                for idx, chunk in enumerate(chunks[:2]):
                    body = chunk['content']
                    if len(body) > 300:
                        snippet = body[:300] + "..."
                    else:
                        snippet = body
                    print(f"\n[Chunk {idx+1}] Title: {chunk.get('title')}")
                    print(f"Content: {snippet}")
            except Exception as e:
                print(f"āŒ Error during scrape of {url}: {e}")


if __name__ == "__main__":
    # Test URLs:
    # 1. React/JS based site
    # 2. Complex Medical Fact Sheet (Highly unstructured but detailed)
    test_urls = [
        "https://react.dev",
        "https://www.who.int/news-room/fact-sheets/detail/diabetes",
    ]
    print("šŸš€ Starting Advanced Scraping Test (JS + Complex HTML)...")
    asyncio.run(run_test(test_urls))
    print("\nāœ… Test Complete.")