Add comprehensive /api-docs endpoint with examples and usage guide
api.py
ADDED
@@ -0,0 +1,286 @@
import os
import asyncio
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict
from uuid import uuid4
from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig

app = FastAPI()

# Store active sessions
sessions: Dict[str, WebExtractor] = {}

class ScrapeRequest(BaseModel):
    url: str
    query: str
    model_name: Optional[str] = "alias-fast"

class SessionCreateRequest(BaseModel):
    model_name: Optional[str] = "alias-fast"

@app.get("/health")
async def health():
    return {"status": "ok", "message": "CyberScraper 2077 API is running"}
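A quick reachability check against the deployed Space (not part of api.py; a minimal sketch using the base_url advertised below in /api-docs):

import requests

# Expect {"status": "ok", "message": "CyberScraper 2077 API is running"}
r = requests.get("https://grazieprego-scrapling.hf.space/health", timeout=10)
print(r.json())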

@app.get("/api-docs")
async def api_docs():
    """Comprehensive API documentation with examples"""
    return {
        "title": "CyberScraper 2077 API Documentation",
        "version": "1.0.0",
        "description": "Advanced web scraping API with session management and AI-powered content extraction",
        "base_url": "https://grazieprego-scrapling.hf.space",
        "endpoints": {
            "health": {
                "method": "GET",
                "path": "/health",
                "description": "Check if the API is running",
                "response": {
                    "status": "ok",
                    "message": "CyberScraper 2077 API is running"
                },
                "example": "curl https://grazieprego-scrapling.hf.space/health"
            },
            "scrape": {
                "method": "POST",
                "path": "/api/scrape",
                "description": "Stateless scrape request - creates a new extractor for each request",
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query/instruction",
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com\", \"query\": \"Extract all product prices\"}'",
                    "python": "import requests\nresponse = requests.post('https://grazieprego-scrapling.hf.space/api/scrape', json={'url': 'https://example.com', 'query': 'Extract prices'})\nprint(response.json())"
                }
            },
            "create_session": {
                "method": "POST",
                "path": "/api/session",
                "description": "Create a persistent scraping session for multiple requests",
                "request_body": {
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "session_id": "string - UUID of the created session",
                    "message": "string - Confirmation message",
                    "model": "string - Model used"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session -H 'Content-Type: application/json' -d '{\"model_name\": \"alias-fast\"}'",
                    "python": "import requests\nsession = requests.post('https://grazieprego-scrapling.hf.space/api/session', json={'model_name': 'alias-fast'})\nsession_id = session.json()['session_id']"
                }
            },
            "session_scrape": {
                "method": "POST",
                "path": "/api/session/{session_id}/scrape",
                "description": "Scrape using an existing session context (more efficient for multiple requests)",
                "path_parameters": {
                    "session_id": "string - UUID of the session"
                },
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query",
                    "model_name": "string (optional)"
                },
                "response": {
                    "session_id": "string - The session ID",
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session/uuid-here/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com/page1\", \"query\": \"Extract titles\"}'",
                    "python": "import requests\nresponse = requests.post(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape', json={'url': 'https://example.com', 'query': 'Extract data'})\nprint(response.json())"
                }
            },
            "close_session": {
                "method": "DELETE",
                "path": "/api/session/{session_id}",
                "description": "Close a session and release resources",
                "path_parameters": {
                    "session_id": "string - UUID of the session to close"
                },
                "response": {
                    "message": "string - Confirmation message",
                    "session_id": "string - The closed session ID"
                },
                "example": {
                    "curl": "curl -X DELETE https://grazieprego-scrapling.hf.space/api/session/uuid-here",
                    "python": "import requests\nresponse = requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')\nprint(response.json())"
                }
            }
        },
        "usage_guide": {
            "quick_start": [
                "1. Make a simple scrape request to /api/scrape",
                "2. For multiple requests, create a session first",
                "3. Use the session ID for subsequent requests",
                "4. Close sessions when done to free resources"
            ],
            "best_practices": [
                "Use stateless /api/scrape for one-off requests",
                "Use sessions for batch processing multiple URLs",
                "Always close sessions when finished",
                "Handle errors gracefully (500 errors may occur on complex sites)",
                "Set appropriate timeouts for slow-loading pages"
            ],
            "error_handling": {
                "404": "Session not found (for session endpoints)",
                "500": "Internal server error - check the detail message",
                "common_issues": [
                    "URL unreachable or timeout",
                    "JavaScript-heavy sites may require different approaches",
                    "Bot protection may block requests"
                ]
            }
        },
        "integration_examples": {
            "python_script": """
import requests

# Stateless scrape
response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/scrape',
    json={
        'url': 'https://example.com',
        'query': 'Extract all headings and prices'
    }
)
print("Result:", response.json())

# Session-based workflow
session_response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/session',
    json={'model_name': 'alias-fast'}
)
session_id = session_response.json()['session_id']

try:
    # Multiple requests using the same session
    for url in ['https://example.com/page1', 'https://example.com/page2']:
        result = requests.post(
            f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape',
            json={'url': url, 'query': 'Extract product data'}
        )
        print(f"Scraped {url}:", result.json())
finally:
    # Always close the session
    requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')
""",
            "javascript": """
// Fetch API example
async function scrapeUrl(url, query) {
    const response = await fetch('https://grazieprego-scrapling.hf.space/api/scrape', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url, query })
    });
    return await response.json();
}

// Usage
scrapeUrl('https://example.com', 'Extract all links').then(console.log);
"""
        },
        "rate_limits": {
            "note": "Rate limits may apply. Please use responsibly.",
            "recommendation": "For high-volume scraping, use session-based approach and implement retry logic"
        }
    }
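Because the documentation is served as plain JSON, it can be pulled and inspected programmatically (a sketch, not part of api.py):

import json
import requests

docs = requests.get("https://grazieprego-scrapling.hf.space/api-docs", timeout=10).json()
print(json.dumps(docs["endpoints"], indent=2))  # the documented routes and their examples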

@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
    """Stateless scrape request (creates a new extractor for each request)"""
    scraper_config = ScraperConfig(
        headless=True,
        max_retries=3,
        delay_after_load=5
    )

    extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
    try:
        # Construct the query by combining URL and the specific request
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)

        # If response is a tuple (csv/excel), extract the first part
        if isinstance(response, tuple):
            response = response[0]

        # Clean up
        if hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()

        return {
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        # Try to clean up on error
        if hasattr(extractor, 'playwright_scraper') and hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
        raise HTTPException(status_code=500, detail=str(e))
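The usage guide above advises handling errors gracefully and setting timeouts; a client-side sketch for this stateless endpoint (not part of api.py; the URL and query are placeholders):

import requests

try:
    r = requests.post(
        "https://grazieprego-scrapling.hf.space/api/scrape",
        json={"url": "https://example.com", "query": "Extract all headings"},
        timeout=120,  # scrapes of slow-loading pages can take a while
    )
    r.raise_for_status()
    print(r.json()["response"])
except requests.HTTPError as exc:
    # 500 responses carry the failure reason in the "detail" field
    print("Scrape failed:", exc.response.json().get("detail"))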

@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
    """Create a persistent scraping session"""
    session_id = str(uuid4())
    try:
        scraper_config = ScraperConfig(
            headless=True,
            max_retries=3,
            delay_after_load=5
        )
        extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
        sessions[session_id] = extractor
        return {"session_id": session_id, "message": "Session created", "model": request.model_name}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to create session: {str(e)}")

@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
    """Scrape using an existing session context"""
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    extractor = sessions[session_id]
    try:
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)

        if isinstance(response, tuple):
            response = response[0]

        return {
            "session_id": session_id,
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
    """Close a session and release resources"""
    if session_id in sessions:
        extractor = sessions[session_id]
        if hasattr(extractor.playwright_scraper, 'close'):
            await extractor.playwright_scraper.close()
        del sessions[session_id]
        return {"message": "Session closed", "session_id": session_id}
    raise HTTPException(status_code=404, detail="Session not found")
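The rate_limits section of the docs recommends retry logic for high-volume session-based scraping; one possible shape (not part of api.py; the backoff values are arbitrary):

import time
import requests

BASE = "https://grazieprego-scrapling.hf.space"

def scrape_with_retry(session_id: str, url: str, query: str, attempts: int = 3):
    # Retry transient 500s with linear backoff; surface anything else immediately
    for attempt in range(1, attempts + 1):
        r = requests.post(
            f"{BASE}/api/session/{session_id}/scrape",
            json={"url": url, "query": query},
            timeout=120,
        )
        if r.status_code != 500:
            r.raise_for_status()  # raises on 404 if the session was already closed
            return r.json()["response"]
        time.sleep(5 * attempt)
    raise RuntimeError(f"Giving up on {url} after {attempts} attempts")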

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
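For deployment, the same app can be started with the uvicorn CLI instead of the __main__ block, e.g. uvicorn api:app --host 0.0.0.0 --port 8000.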