scrapeRL / backend /test_scraper.py
NeerajCodz's picture
test: add end-to-end scraper test script with successful validation
8512126
"""Test script to verify scraper functionality end-to-end."""
import asyncio
import json
import logging
from typing import Any
import httpx
# Emit INFO-and-above records so the script's progress messages are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Root of the HTTP API that the episode requests in this script are sent to.
BASE_URL = "http://localhost:8000/api"
def _log_failure(message: str, exc: Exception) -> None:
    """Log a failure line, plus the HTTP response body when the error has one.

    Args:
        message: Short description of what failed (no trailing colon).
        exc: The exception that was caught.
    """
    logger.error(f"✗ {message}: {exc}")
    # Only some exceptions carry a ``response`` (e.g. the one raised by
    # ``raise_for_status()``); guard against it being absent or None.
    response = getattr(exc, "response", None)
    if response is not None:
        logger.error(f" Response: {response.text}")


async def test_scraper() -> None:
    """Drive one scraping episode against the API and log the outcome.

    The flow is:

    1. POST ``{BASE_URL}/episode/reset`` to create an episode.
    2. POST ``{BASE_URL}/episode/step`` for a fixed list of actions
       (navigate, extract_field, done), stopping early when the server
       reports the episode as terminated or truncated, or when a step fails.
    3. GET ``{BASE_URL}/episode/state/{episode_id}`` and log the final state.

    This is a best-effort smoke script: every failure is logged rather than
    raised. If the episode cannot be created the function returns immediately;
    a failed step only stops the action loop, and the final state is still
    fetched afterwards.
    """
    start_url = "http://quotes.toscrape.com"

    async with httpx.AsyncClient(timeout=60.0) as client:
        # Step 1: Create a scraping task
        logger.info("Creating scraping episode...")
        reset_payload = {
            "task_id": "test-scrape-quotes",
            "seed": 42,
            "config": {
                "start_url": start_url,
                "target_fields": {
                    "quotes": {
                        "text": "quote text",
                        "author": "quote author",
                        "tags": "quote tags",
                    }
                },
                "max_steps": 20,
                "timeout": 300,
            },
        }
        try:
            response = await client.post(f"{BASE_URL}/episode/reset", json=reset_payload)
            response.raise_for_status()
            reset_data = response.json()
            episode_id = reset_data["episode_id"]
            logger.info(f"✓ Episode created: {episode_id}")
            logger.info(f" Initial observation: {reset_data['observation']['current_url']}")
        except Exception as e:
            # Without an episode id none of the later requests can be made.
            _log_failure("Failed to create episode", e)
            return

        # Step 2: Execute a few actions
        actions = [
            {
                "action_type": "navigate",
                "parameters": {"url": start_url},
                "reasoning": "Navigate to the quotes website to start scraping",
            },
            {
                "action_type": "extract_field",
                "parameters": {
                    "field_name": "quotes",
                    "css_selector": ".quote .text",
                },
                "reasoning": "Extract all quotes from the page",
            },
            {
                "action_type": "done",
                "parameters": {"success": True},
                "reasoning": "Extraction complete",
            },
        ]
        for i, action_data in enumerate(actions, 1):
            logger.info(f"\nStep {i}: {action_data['action_type']}")
            step_payload = {
                "episode_id": episode_id,
                "action": action_data,
            }
            try:
                response = await client.post(f"{BASE_URL}/episode/step", json=step_payload)
                response.raise_for_status()
                step_data = response.json()
                logger.info("✓ Action executed successfully")
                logger.info(f" Reward: {step_data['reward']:.2f}")
                logger.info(f" Progress: {step_data['observation'].get('extraction_progress', 0):.1f}%")
                logger.info(f" Terminated: {step_data['terminated']}")
                if step_data.get('reward_breakdown'):
                    logger.info(" Reward breakdown:")
                    for key, value in step_data['reward_breakdown'].items():
                        # Non-numeric entries cannot take the .2f format; skip them.
                        if isinstance(value, (int, float)):
                            logger.info(f" {key}: {value:.2f}")
                if step_data['terminated'] or step_data['truncated']:
                    logger.info("\nEpisode finished!")
                    break
            except Exception as e:
                # Stop issuing actions, but still report the final state below.
                _log_failure(f"Step {i} failed", e)
                break

        # Step 3: Get final state
        logger.info("\n" + "="*60)
        logger.info("Fetching final episode state...")
        try:
            response = await client.get(f"{BASE_URL}/episode/state/{episode_id}")
            response.raise_for_status()
            state_data = response.json()
            logger.info("✓ Final state retrieved")
            logger.info(f" Episode ID: {state_data.get('episode_id', 'N/A')}")
            logger.info(f" Steps: {state_data.get('step_number', 0)}")
            logger.info(f" Total reward: {state_data.get('total_reward', 0.0):.2f}")
            logger.info(f" Terminal: {state_data.get('is_terminal', False)}")
            logger.info(f" Extracted data: {json.dumps(state_data.get('extracted_data', {}), indent=2)}")
        except Exception as e:
            _log_failure("Failed to get state", e)
async def test_websocket() -> None:
    """Check that the API server responds by fetching its docs page.

    Despite the name, no WebSocket connection is opened: the function only
    issues a GET to ``http://localhost:8000/docs`` and, on HTTP 200, logs the
    WebSocket URL pattern for reference. Any other status is logged as a
    warning, and any exception is logged as an error; nothing is raised.
    """
    docs_url = "http://localhost:8000/docs"

    logger.info("\n" + "="*60)
    logger.info("Testing WebSocket endpoint...")
    try:
        # Just verify the endpoint exists
        async with httpx.AsyncClient() as client:
            response = await client.get(docs_url)
        if response.status_code == 200:
            logger.info(f"✓ API docs accessible at {docs_url}")
            # Plain string on purpose: {episode_id} is a literal placeholder here.
            logger.info(" WebSocket endpoint: ws://localhost:8000/ws/episode/{episode_id}")
        else:
            # Previously a non-200 reply produced no output at all.
            logger.warning(f"✗ API docs returned HTTP {response.status_code} at {docs_url}")
    except Exception as e:
        logger.error(f"✗ Failed to check docs: {e}")
async def main():
    """Run test_scraper() and then test_websocket(), framed by banner log lines."""
    rule = "=" * 60

    # Opening banner.
    for line in (rule, "ScrapeRL End-to-End Test", rule):
        logger.info(line)

    await test_scraper()
    await test_websocket()

    # Closing banner; the first rule is preceded by a newline.
    for line in ("\n" + rule, "Testing complete!", rule):
        logger.info(line)
if __name__ == "__main__":
    # Start the event loop only when executed as a script, not on import.
    asyncio.run(main())