Spaces:
Running
Running
| """Test script to verify scraper functionality end-to-end.""" | |
| import asyncio | |
| import json | |
| import logging | |
| from typing import Any | |
| import httpx | |
# Show INFO-level (and above) records so each test step is visible on the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Root of the REST API exercised by this script (a locally running server).
BASE_URL = "http://localhost:8000/api"
def _log_failure(message, exc):
    """Log a failure line for *exc*, plus the HTTP response body when it has one.

    Exceptions raised by ``response.raise_for_status()`` carry a ``response``
    attribute; connection errors and JSON/key errors do not, so the body line
    is only emitted when the attribute is present.
    """
    logger.error(f"✗ {message}: {exc}")
    response = getattr(exc, "response", None)
    if response is not None:
        logger.error(f" Response: {response.text}")


async def test_scraper():
    """Run one scraping episode against the local API and log the outcome.

    The flow is: reset (create) an episode, post a fixed list of actions to
    the step endpoint, then fetch and log the final episode state. Failures
    are logged rather than raised; if the episode cannot be created the
    function returns early, and a failed step stops the action loop but the
    final state is still fetched.
    """
    async with httpx.AsyncClient(timeout=60.0) as client:
        # Step 1: Create a scraping task
        logger.info("Creating scraping episode...")
        reset_payload = {
            "task_id": "test-scrape-quotes",
            "seed": 42,
            "config": {
                "start_url": "http://quotes.toscrape.com",
                "target_fields": {
                    "quotes": {
                        "text": "quote text",
                        "author": "quote author",
                        "tags": "quote tags"
                    }
                },
                "max_steps": 20,
                "timeout": 300
            }
        }
        try:
            response = await client.post(f"{BASE_URL}/episode/reset", json=reset_payload)
            response.raise_for_status()
            reset_data = response.json()
            episode_id = reset_data["episode_id"]
            logger.info(f"✓ Episode created: {episode_id}")
            logger.info(f" Initial observation: {reset_data['observation']['current_url']}")
        except Exception as e:
            # Broad catch is deliberate: this is a script-level boundary and
            # any failure (network, HTTP status, malformed payload) is reported.
            _log_failure("Failed to create episode", e)
            return

        # Step 2: Execute a few actions
        actions = [
            {
                "action_type": "navigate",
                "parameters": {"url": "http://quotes.toscrape.com"},
                "reasoning": "Navigate to the quotes website to start scraping",
            },
            {
                "action_type": "extract_field",
                "parameters": {
                    "field_name": "quotes",
                    "css_selector": ".quote .text"
                },
                "reasoning": "Extract all quotes from the page",
            },
            {
                "action_type": "done",
                "parameters": {"success": True},
                "reasoning": "Extraction complete",
            }
        ]
        for i, action_data in enumerate(actions, 1):
            logger.info(f"\nStep {i}: {action_data['action_type']}")
            step_payload = {
                "episode_id": episode_id,
                "action": action_data
            }
            try:
                response = await client.post(f"{BASE_URL}/episode/step", json=step_payload)
                response.raise_for_status()
                step_data = response.json()
                logger.info("✓ Action executed successfully")
                logger.info(f" Reward: {step_data['reward']:.2f}")
                logger.info(f" Progress: {step_data['observation'].get('extraction_progress', 0):.1f}%")
                logger.info(f" Terminated: {step_data['terminated']}")
                if step_data.get('reward_breakdown'):
                    logger.info(" Reward breakdown:")
                    for key, value in step_data['reward_breakdown'].items():
                        # Only numeric components can be formatted with :.2f.
                        if isinstance(value, (int, float)):
                            logger.info(f" {key}: {value:.2f}")
                # 'truncated' is read leniently so that a response without the
                # field is not misreported as a failed step.
                if step_data['terminated'] or step_data.get('truncated', False):
                    logger.info("\nEpisode finished!")
                    break
            except Exception as e:
                _log_failure(f"Step {i} failed", e)
                break

        # Step 3: Get final state
        logger.info("\n" + "="*60)
        logger.info("Fetching final episode state...")
        try:
            response = await client.get(f"{BASE_URL}/episode/state/{episode_id}")
            response.raise_for_status()
            state_data = response.json()
            logger.info("✓ Final state retrieved")
            logger.info(f" Episode ID: {state_data.get('episode_id', 'N/A')}")
            logger.info(f" Steps: {state_data.get('step_number', 0)}")
            logger.info(f" Total reward: {state_data.get('total_reward', 0.0):.2f}")
            logger.info(f" Terminal: {state_data.get('is_terminal', False)}")
            logger.info(f" Extracted data: {json.dumps(state_data.get('extracted_data', {}), indent=2)}")
        except Exception as e:
            _log_failure("Failed to get state", e)
async def test_websocket():
    """Check that the API server is reachable by fetching its docs page.

    No WebSocket connection is opened here: the function only requests
    ``/docs`` over HTTP and, on a 200 response, logs the WebSocket endpoint
    pattern for reference. Any other status or a request error is logged as
    a failure; nothing is raised.
    """
    logger.info("\n" + "="*60)
    logger.info("Testing WebSocket endpoint...")
    try:
        # Just verify the endpoint exists
        async with httpx.AsyncClient() as client:
            response = await client.get("http://localhost:8000/docs")
    except Exception as e:
        logger.error(f"✗ Failed to check docs: {e}")
        return

    if response.status_code == 200:
        logger.info("✓ API docs accessible at http://localhost:8000/docs")
        # Plain string on purpose: {episode_id} is a literal placeholder.
        logger.info(" WebSocket endpoint: ws://localhost:8000/ws/episode/{episode_id}")
    else:
        # Previously a non-200 answer produced no output at all.
        logger.error(f"✗ API docs returned HTTP {response.status_code}")
async def main():
    """Print a banner, run the scraper and WebSocket checks in order, then a footer."""
    rule = "="*60

    logger.info(rule)
    logger.info("ScrapeRL End-to-End Test")
    logger.info(rule)

    # The checks run sequentially; each one logs its own failures.
    await test_scraper()
    await test_websocket()

    logger.info("\n" + rule)
    logger.info("Testing complete!")
    logger.info(rule)
# Script entry point: drive the async test run only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())