Spaces:
Runtime error
Runtime error
| import asyncio | |
| import sys | |
| import os | |
| # Add parent directory to path to import app services | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from app.services.scraper import WebScraper | |
def _preview(text, limit=300):
    """Return *text* cut to *limit* characters, with "..." appended if it was cut."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


async def run_test(urls):
    """Scrape each URL in *urls* and print a short report to stdout.

    For every URL the detected website type is printed first, then the page
    is scraped. When the scrape yields a non-empty result, the number of
    chunks and a preview (title plus up to 300 characters of content) of the
    first two chunks are printed; otherwise a failure line is printed.

    Args:
        urls: Iterable of URL strings to test, processed in order.

    Returns:
        None. All output goes to stdout.

    Note:
        Any exception raised while handling one URL is reported and the loop
        moves on to the next URL, so one bad site does not abort the run.
        Chunks are presumably dict-like with ``title`` and ``content`` keys
        (both are read with ``.get``) -- confirm against ``WebScraper``.
    """
    async with WebScraper() as scraper:
        for url in urls:
            print(f"\n{'='*20} Testing: {url} {'='*20}")
            try:
                # Detect type first
                site_type = await scraper.detect_website_type(url)
                print(f"Detected Type: {site_type}")

                # Perform scrape
                results = await scraper.scrape_page(url)
                if not results:
                    print(f"❌ Failed to extract content from {url}")
                    continue

                print(f"✅ Successfully extracted {len(results)} content chunks.")

                # Show first 2 chunks as sample
                for i, res in enumerate(results[:2], start=1):
                    # A chunk without content should still show its title
                    # instead of aborting the whole sample with a KeyError.
                    content = res.get('content') or ""
                    print(f"\n[Chunk {i}] Title: {res.get('title')}")
                    print(f"Content: {_preview(content)}")
            except Exception as e:
                # Deliberate catch-all: this is a diagnostic script and each
                # URL is reported independently.
                print(f"❌ Error during scrape of {url}: {e}")
if __name__ == "__main__":
    # Test URLs:
    #   1. React/JS based site
    #   2. Complex medical fact sheet (highly unstructured but detailed)
    test_urls = [
        "https://react.dev",
        "https://www.who.int/news-room/fact-sheets/detail/diabetes",
    ]

    print("🚀 Starting Advanced Scraping Test (JS + Complex HTML)...")
    # run_test is a coroutine function; asyncio.run drives it to completion.
    asyncio.run(run_test(test_urls))
    print("\n✅ Test Complete.")