GraziePrego committed on
Commit
ce40d2a
·
verified ·
1 Parent(s): 4202124

Add comprehensive /api-docs endpoint with examples and usage guide

Browse files
Files changed (1) hide show
  1. api.py +286 -0
api.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import asyncio
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Dict
from uuid import uuid4
from src.web_extractor import WebExtractor
from src.scrapers.playwright_scraper import ScraperConfig

app = FastAPI()

# Store active sessions.
# Maps session UUID (string) -> the WebExtractor owning that session's browser
# state. Entries are added by POST /api/session and removed by
# DELETE /api/session/{session_id}. NOTE(review): plain module-level dict, no
# locking — fine for a single worker; confirm before running multiple workers.
sessions: Dict[str, WebExtractor] = {}
14
+
15
class ScrapeRequest(BaseModel):
    """Request body for POST /api/scrape and POST /api/session/{id}/scrape."""

    # Target page URL; it is prepended to the query before being handed to the extractor
    url: str
    # Natural-language extraction instruction (e.g. "Extract all product prices")
    query: str
    # AI model alias to use; "alias-fast" when omitted
    model_name: Optional[str] = "alias-fast"
19
+
20
class SessionCreateRequest(BaseModel):
    """Request body for POST /api/session."""

    # AI model alias to bind to the session; "alias-fast" when omitted
    model_name: Optional[str] = "alias-fast"
22
+
23
@app.get("/health")
async def health():
    """Liveness probe: confirm the API process is up and serving requests."""
    payload = dict(status="ok", message="CyberScraper 2077 API is running")
    return payload
26
+
27
@app.get("/api-docs")
async def api_docs():
    """Comprehensive API documentation with examples.

    Returns a static JSON document describing every endpoint this service
    exposes, with curl/Python/JavaScript examples, a quick-start guide,
    error-handling notes and rate-limit guidance. The payload is a plain
    literal — nothing is computed per request.
    """
    return {
        "title": "CyberScraper 2077 API Documentation",
        "version": "1.0.0",
        "description": "Advanced web scraping API with session management and AI-powered content extraction",
        "base_url": "https://grazieprego-scrapling.hf.space",
        "endpoints": {
            "health": {
                "method": "GET",
                "path": "/health",
                "description": "Check if the API is running",
                "response": {
                    "status": "ok",
                    "message": "CyberScraper 2077 API is running"
                },
                "example": "curl https://grazieprego-scrapling.hf.space/health"
            },
            "scrape": {
                "method": "POST",
                "path": "/api/scrape",
                "description": "Stateless scrape request - creates a new extractor for each request",
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query/instruction",
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com\", \"query\": \"Extract all product prices\"}'",
                    "python": "import requests\nresponse = requests.post('https://grazieprego-scrapling.hf.space/api/scrape', json={'url': 'https://example.com', 'query': 'Extract prices'})\nprint(response.json())"
                }
            },
            "create_session": {
                "method": "POST",
                "path": "/api/session",
                "description": "Create a persistent scraping session for multiple requests",
                "request_body": {
                    "model_name": "string (optional) - AI model to use (default: 'alias-fast')"
                },
                "response": {
                    "session_id": "string - UUID of the created session",
                    "message": "string - Confirmation message",
                    "model": "string - Model used"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session -H 'Content-Type: application/json' -d '{\"model_name\": \"alias-fast\"}'",
                    "python": "import requests\nsession = requests.post('https://grazieprego-scrapling.hf.space/api/session', json={'model_name': 'alias-fast'})\nsession_id = session.json()['session_id']"
                }
            },
            "session_scrape": {
                "method": "POST",
                "path": "/api/session/{session_id}/scrape",
                "description": "Scrape using an existing session context (more efficient for multiple requests)",
                "path_parameters": {
                    "session_id": "string - UUID of the session"
                },
                "request_body": {
                    "url": "string - The URL to scrape",
                    "query": "string - The extraction query",
                    "model_name": "string (optional)"
                },
                "response": {
                    "session_id": "string - The session ID",
                    "url": "string - The scraped URL",
                    "query": "string - The query used",
                    "response": "any - The extracted content"
                },
                "example": {
                    "curl": "curl -X POST https://grazieprego-scrapling.hf.space/api/session/uuid-here/scrape -H 'Content-Type: application/json' -d '{\"url\": \"https://example.com/page1\", \"query\": \"Extract titles\"}'",
                    "python": "import requests\nresponse = requests.post(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape', json={'url': 'https://example.com', 'query': 'Extract data'})\nprint(response.json())"
                }
            },
            "close_session": {
                "method": "DELETE",
                "path": "/api/session/{session_id}",
                "description": "Close a session and release resources",
                "path_parameters": {
                    "session_id": "string - UUID of the session to close"
                },
                "response": {
                    "message": "string - Confirmation message",
                    "session_id": "string - The closed session ID"
                },
                "example": {
                    "curl": "curl -X DELETE https://grazieprego-scrapling.hf.space/api/session/uuid-here",
                    "python": "import requests\nresponse = requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')\nprint(response.json())"
                }
            }
        },
        "usage_guide": {
            "quick_start": [
                "1. Make a simple scrape request to /api/scrape",
                "2. For multiple requests, create a session first",
                "3. Use the session ID for subsequent requests",
                "4. Close sessions when done to free resources"
            ],
            "best_practices": [
                "Use stateless /api/scrape for one-off requests",
                "Use sessions for batch processing multiple URLs",
                "Always close sessions when finished",
                "Handle errors gracefully (500 errors may occur on complex sites)",
                "Set appropriate timeouts for slow-loading pages"
            ],
            "error_handling": {
                "404": "Session not found (for session endpoints)",
                "500": "Internal server error - check the detail message",
                "Common issues": [
                    "URL unreachable or timeout",
                    "JavaScript-heavy sites may require different approaches",
                    "Bot protection may block requests"
                ]
            }
        },
        "integration_examples": {
            "python_script": """
import requests

# Stateless scrape
response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/scrape',
    json={
        'url': 'https://example.com',
        'query': 'Extract all headings and prices'
    }
)
print("Result:", response.json())

# Session-based workflow
session_response = requests.post(
    'https://grazieprego-scrapling.hf.space/api/session',
    json={'model_name': 'alias-fast'}
)
session_id = session_response.json()['session_id']

try:
    # Multiple requests using the same session
    for url in ['https://example.com/page1', 'https://example.com/page2']:
        result = requests.post(
            f'https://grazieprego-scrapling.hf.space/api/session/{session_id}/scrape',
            json={'url': url, 'query': 'Extract product data'}
        )
        print(f"Scraped {url}:", result.json())
finally:
    # Always close the session
    requests.delete(f'https://grazieprego-scrapling.hf.space/api/session/{session_id}')
""",
            "javascript": """
// Fetch API example
async function scrapeUrl(url, query) {
    const response = await fetch('https://grazieprego-scrapling.hf.space/api/scrape', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ url, query })
    });
    return await response.json();
}

// Usage
scrapeUrl('https://example.com', 'Extract all links').then(console.log);
"""
        },
        "rate_limits": {
            "note": "Rate limits may apply. Please use responsibly.",
            "recommendation": "For high-volume scraping, use session-based approach and implement retry logic"
        }
    }
199
+
200
@app.post("/api/scrape")
async def scrape(request: ScrapeRequest):
    """Stateless scrape request (creates a new extractor for each request).

    Builds a throwaway WebExtractor, runs the query against the URL, and
    guarantees the underlying browser is released on both success and failure.

    Raises:
        HTTPException(500): any failure during extraction; the original
            exception is chained for easier debugging.
    """
    scraper_config = ScraperConfig(
        headless=True,
        max_retries=3,
        delay_after_load=5
    )

    extractor = WebExtractor(model_name=request.model_name, scraper_config=scraper_config)
    try:
        # The extractor takes one free-form query string: URL first, then the instruction.
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)

        # If response is a tuple (csv/excel), extract the first part
        if isinstance(response, tuple):
            response = response[0]

        return {
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        # Chain the cause so the server log shows the real failure.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Release browser resources exactly once, on every path. The original
        # duplicated this cleanup in the success and except branches, and a
        # failing close() in the except branch would mask the real exception.
        close = getattr(getattr(extractor, 'playwright_scraper', None), 'close', None)
        if close is not None:
            try:
                await close()
            except Exception:
                pass
233
+
234
@app.post("/api/session")
async def create_session(request: SessionCreateRequest):
    """Create a persistent scraping session.

    Registers a fresh WebExtractor under a new UUID so subsequent
    /api/session/{id}/scrape calls can reuse its browser context.
    """
    session_id = str(uuid4())
    try:
        config = ScraperConfig(headless=True, max_retries=3, delay_after_load=5)
        sessions[session_id] = WebExtractor(
            model_name=request.model_name,
            scraper_config=config,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to create session: {str(e)}")
    return {"session_id": session_id, "message": "Session created", "model": request.model_name}
249
+
250
@app.post("/api/session/{session_id}/scrape")
async def session_scrape(session_id: str, request: ScrapeRequest):
    """Scrape using an existing session context.

    Reuses the WebExtractor created by POST /api/session, so repeated calls
    share state instead of paying per-request startup cost.

    Raises:
        HTTPException(404): unknown session_id.
        HTTPException(500): any extraction failure (original exception chained).
    """
    # Single dict lookup instead of a membership test followed by a subscript.
    extractor = sessions.get(session_id)
    if extractor is None:
        raise HTTPException(status_code=404, detail="Session not found")

    try:
        full_query = f"{request.url} {request.query}"
        response = await extractor.process_query(full_query)

        # csv/excel extractions come back as a tuple; keep only the content part.
        if isinstance(response, tuple):
            response = response[0]

        return {
            "session_id": session_id,
            "url": request.url,
            "query": request.query,
            "response": response
        }
    except Exception as e:
        # Chain the cause so the server log shows the real failure.
        raise HTTPException(status_code=500, detail=str(e)) from e
272
+
273
@app.delete("/api/session/{session_id}")
async def close_session(session_id: str):
    """Close a session and release resources.

    Raises:
        HTTPException(404): unknown session_id.
    """
    # Remove the registry entry first: in the original, a failing close()
    # left the entry in `sessions` (leaking it) and returned a 500 even
    # though the session was effectively unusable.
    extractor = sessions.pop(session_id, None)
    if extractor is None:
        raise HTTPException(status_code=404, detail="Session not found")

    close = getattr(getattr(extractor, 'playwright_scraper', None), 'close', None)
    if close is not None:
        try:
            await close()
        except Exception:
            # Best-effort cleanup: the session is already deregistered.
            pass
    return {"message": "Session closed", "session_id": session_id}
283
+
284
if __name__ == "__main__":
    import uvicorn

    # Default stays 8000 (original hard-coded value), but container platforms
    # can now override it via the conventional $PORT environment variable.
    port = int(os.environ.get("PORT", "8000"))
    uvicorn.run(app, host="0.0.0.0", port=port)