Paijo commited on
Commit
ced16e1
·
verified ·
1 Parent(s): 404d1c0

update app/grabber/scraping_utils.py

Browse files
Files changed (1) hide show
  1. app/grabber/scraping_utils.py +374 -0
app/grabber/scraping_utils.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scraping Utilities
3
+
4
+ Helper functions for enhanced scraping with rate limiting,
5
+ error handling, and performance optimization.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import time
11
+ import random
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ from dataclasses import dataclass, field
14
+ from datetime import datetime, timedelta
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@dataclass
class ProxyAgent:
    """One proxy endpoint plus the bookkeeping used when rotating proxies.

    The class is a passive record: nothing here updates the counters, callers
    are expected to maintain them.
    """

    # Address of the proxy; the only required field.
    proxy_url: str
    # Starts at 1.0 (presumably a 0..1 fraction -- confirm against callers).
    success_rate: float = 1.0
    # Number of failures recorded for this proxy so far.
    failure_count: int = 0
    # Timestamp of the most recent use; None until the proxy has been used.
    last_used: Optional[datetime] = None
27
+
28
+
29
@dataclass
class ScrapingSession:
    """Counters and timings collected for one scraping session.

    ``session_id`` and ``start_time`` are required; every other field has a
    default so a session can be created at the moment scraping starts and
    filled in as requests complete.
    """

    session_id: str
    # Start timestamp as returned by time.time().
    start_time: float
    requests_made: int = 0
    successful_requests: int = 0
    failed_requests: int = 0
    # Each instance gets its own list; the previous ``None`` default broke the
    # annotation and made ``session.proxies_used.append(...)`` raise.
    proxies_used: List[str] = field(default_factory=list)
    avg_response_time: float = 0.0
    total_data_bytes: int = 0
    # Filled in when the session is closed. Declared here with defaults so
    # that reading them on a still-running session does not raise
    # AttributeError.
    end_time: Optional[float] = None
    duration: float = 0.0
    success_rate: float = 0.0
41
+
42
+
43
@dataclass
class RequestQueue:
    """Bookkeeping record for limiting how many requests run concurrently.

    The class only stores state; enforcing ``max_concurrent`` is left to the
    code that owns the queue.
    """

    # Upper bound on simultaneously running requests.
    max_concurrent: int
    # Request id -> timestamp for requests currently in flight.
    active_requests: Dict[str, datetime] = field(default_factory=dict)
    # Request id -> timestamp for requests that have finished.
    completed_requests: Dict[str, datetime] = field(default_factory=dict)
50
+
51
+
52
class RateLimiter:
    """Sliding-window rate limiter keyed by domain, by IP and globally.

    Windows are one minute per domain, five minutes per IP and one minute
    globally. The limits default to the values that used to be hard-coded
    (10 / 50 / 100) and can be overridden per instance.
    """

    def __init__(
        self,
        domain_limit: int = 10,
        ip_limit: int = 50,
        global_limit: int = 100,
    ):
        """Create an empty limiter.

        Args:
            domain_limit: Max requests per domain within one minute.
            ip_limit: Max requests per IP within five minutes.
            global_limit: Max requests overall within one minute.
        """
        self.domain_limit = domain_limit
        self.ip_limit = ip_limit
        self.global_limit = global_limit
        self.domain_requests: Dict[str, List[datetime]] = {}
        self.ip_requests: Dict[str, List[datetime]] = {}
        self.global_requests: List[datetime] = []
        # Written by record_success(); previously never initialised, which
        # made record_success() raise AttributeError on first use.
        self.active_requests: Dict[str, Dict[str, Any]] = {}

    @staticmethod
    def _recent(timestamps: List[datetime], cutoff: datetime) -> List[datetime]:
        """Return the timestamps that are strictly newer than ``cutoff``."""
        return [stamp for stamp in timestamps if stamp > cutoff]

    def can_make_request(
        self, domain: Optional[str], ip: Optional[str]
    ) -> Tuple[bool, Optional[float]]:
        """Check whether a request may be issued right now.

        Expired timestamps are dropped from the stored lists while checking,
        so the lists stay bounded by the window size instead of growing for
        the lifetime of the process.

        Args:
            domain: Target domain, or None/"" to skip the per-domain check.
            ip: Outgoing IP, or None/"" to skip the per-IP check.

        Returns:
            ``(True, 0.0)`` when allowed, otherwise ``(False, seconds)`` where
            ``seconds`` is a fixed retry hint (60 or 300).
        """
        now = datetime.now()
        one_minute_ago = now - timedelta(minutes=1)

        # Per-domain window: one minute.
        if domain and domain in self.domain_requests:
            recent = self._recent(self.domain_requests[domain], one_minute_ago)
            self.domain_requests[domain] = recent
            if len(recent) >= self.domain_limit:
                return False, 60.0

        # Per-IP window: five minutes.
        if ip and ip in self.ip_requests:
            recent = self._recent(
                self.ip_requests[ip], now - timedelta(minutes=5)
            )
            self.ip_requests[ip] = recent
            if len(recent) >= self.ip_limit:
                return False, 300.0

        # Global window: one minute.
        self.global_requests = self._recent(self.global_requests, one_minute_ago)
        if len(self.global_requests) >= self.global_limit:
            return False, 60.0

        return True, 0.0

    def record_request(self, domain: Optional[str], ip: Optional[str]):
        """Record that a request was issued at the current time.

        Args:
            domain: Target domain; skipped when falsy.
            ip: Outgoing IP; skipped when falsy.
        """
        now = datetime.now()

        if domain:
            self.domain_requests.setdefault(domain, []).append(now)

        if ip:
            self.ip_requests.setdefault(ip, []).append(now)

        self.global_requests.append(now)

    def record_success(
        self,
        domain: Optional[str],
        ip: Optional[str],
        response_time: float,
        data_size: int,
    ):
        """Store the details of a successful request under a fresh UUID key.

        Entries are kept in ``self.active_requests`` and are never evicted by
        this class.

        Args:
            domain: Domain the request went to.
            ip: IP the request was sent from.
            response_time: Response time as measured by the caller.
            data_size: Payload size as measured by the caller.
        """
        # Local import: uuid is not imported at module level.
        import uuid

        self.active_requests[str(uuid.uuid4())] = {
            "timestamp": datetime.now(),
            "domain": domain,
            "ip": ip,
            "response_time": response_time,
            "data_size": data_size,
            "success": True,
        }
128
+
129
+
130
class ExponentialBackoff:
    """Exponential backoff with optional jitter for retry loops."""

    @staticmethod
    def get_delay(
        attempt: int,
        base_delay: float = 1.0,
        max_delay: float = 60.0,
        jitter: bool = True,
    ) -> float:
        """Return the delay to wait before retry number ``attempt``.

        The raw delay is ``base_delay * 2**attempt``. With ``jitter`` enabled
        it is scaled by a random factor in [0.75, 1.25] before being capped,
        so the result never exceeds ``max_delay``.

        Args:
            attempt: Zero-based retry counter.
            base_delay: Delay for attempt 0; expected to be positive.
            max_delay: Upper bound for the returned delay.
            jitter: Whether to randomise the delay by +/-25%.

        Returns:
            The delay, at most ``max_delay``.
        """
        try:
            delay = base_delay * (2**attempt)
        except OverflowError:
            # 2**attempt no longer fits in a float (attempt >= ~1024, which a
            # long-running retry loop can reach). The true delay is far past
            # the cap, so answer with the cap instead of crashing.
            return max_delay if base_delay > 0 else 0.0

        if jitter:
            # Spread retries out so many clients do not retry in lock-step.
            delay *= random.uniform(0.75, 1.25)

        return min(delay, max_delay)
149
+
150
+
151
class ProxyRotator:
    """Round-robin proxy selection with failure-based demotion.

    A proxy that accumulates ``_FAILURE_THRESHOLD`` recorded failures is moved
    to the end of the pool and its failure counter is reset.
    """

    # Recorded failures after which a proxy is demoted.
    _FAILURE_THRESHOLD = 5

    def __init__(self, proxies: Optional[List[str]] = None):
        """Create a rotator over ``proxies`` (an empty pool when omitted)."""
        self.proxies = proxies or []
        self.index = 0
        self.failure_count: Dict[str, int] = {}
        self.last_rotated = time.time()
        # Number of demotions performed. The failure counters are reset on
        # every demotion, so this cannot be derived from them afterwards.
        self.rotation_count = 0

    async def get_next_proxy(
        self, exclude: Optional[List[str]] = None
    ) -> Optional["ProxyAgent"]:
        """Return the next proxy in round-robin order.

        Args:
            exclude: Proxy URLs that must not be returned.

        Returns:
            A ProxyAgent wrapping the chosen URL, or None when no proxy is
            available after applying ``exclude``.
        """
        excluded = set(exclude) if exclude else set()
        candidates = [p for p in self.proxies if p not in excluded]

        if not candidates:
            return None

        chosen = candidates[self.index % len(candidates)]
        self.index = (self.index + 1) % len(candidates)

        return ProxyAgent(proxy_url=chosen)

    def record_failure(self, proxy: str):
        """Count a failure for ``proxy`` and demote it at the threshold.

        Args:
            proxy: URL of the proxy that failed.
        """
        failures = self.failure_count.get(proxy, 0) + 1
        self.failure_count[proxy] = failures

        if failures < self._FAILURE_THRESHOLD:
            return

        logging.getLogger(__name__).warning(
            f"Proxy {proxy} memiliki {failures} kegagalan, dirotasi"
        )
        # Move the proxy to the end of the pool. The previous code filtered it
        # out without re-adding it, which silently shrank the pool for good.
        if proxy in self.proxies:
            self.proxies = [p for p in self.proxies if p != proxy] + [proxy]
        self.failure_count[proxy] = 0
        self.rotation_count += 1
        self.last_rotated = time.time()

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of the pool size, failure counters and rotations."""
        return {
            "total_proxies": len(self.proxies),
            "failure_counts": dict(self.failure_count),
            "last_rotated": datetime.fromtimestamp(self.last_rotated).isoformat(),
            "rotation_count": self.rotation_count,
        }
202
+
203
+
204
class PerformanceMonitor:
    """Track scraping sessions and aggregate their statistics.

    Sessions stay in ``session_stats`` after they are ended so that
    ``get_overall_stats`` can report on finished work as well as running
    sessions.
    """

    def __init__(self):
        self.session_stats: List["ScrapingSession"] = []
        self.start_time = time.time()

    def start_session(self) -> "ScrapingSession":
        """Create a new session, register it and return it."""
        started = time.time()
        session = ScrapingSession(
            session_id=f"session_{int(started)}", start_time=started
        )
        self.session_stats.append(session)
        return session

    def end_session(self, session: "ScrapingSession"):
        """Close ``session`` and compute its derived metrics.

        Sets ``end_time``, ``duration`` and ``success_rate`` (a percentage) on
        the session, and divides ``avg_response_time`` by ``requests_made``,
        i.e. the field is treated as an accumulated total up to this point.
        Ending an already-ended session is a no-op so the average is not
        divided twice.

        Args:
            session: The session to close.

        Returns:
            The same session object.
        """
        if getattr(session, "end_time", None) is not None:
            return session

        session.end_time = time.time()
        session.duration = session.end_time - session.start_time

        made = session.requests_made
        if made > 0:
            session.success_rate = (session.successful_requests / made) * 100
            session.avg_response_time = session.avg_response_time / made
        else:
            session.success_rate = 0
            session.avg_response_time = 0

        # The session is intentionally left in session_stats: removing it here
        # meant get_overall_stats() only ever saw un-ended sessions, which have
        # no duration, and raised AttributeError.
        return session

    @staticmethod
    def _elapsed(session: "ScrapingSession", now: float) -> float:
        """Return the session's duration, or its running time if still open."""
        ended = getattr(session, "end_time", None)
        return (ended if ended is not None else now) - session.start_time

    def get_overall_stats(self) -> Dict[str, Any]:
        """Aggregate statistics across all tracked sessions.

        Returns:
            An empty dict when no session has been started; otherwise totals,
            the overall success rate as a percentage, the mean session
            duration, and request/byte throughput relative to that mean
            duration (durations under one second are treated as one second).
        """
        sessions = self.session_stats
        if not sessions:
            return {}

        now = time.time()
        total_sessions = len(sessions)
        total_requests = sum(s.requests_made for s in sessions)
        total_successful = sum(s.successful_requests for s in sessions)
        total_data_bytes = sum(s.total_data_bytes for s in sessions)
        avg_duration = sum(self._elapsed(s, now) for s in sessions) / total_sessions

        has_duration = avg_duration > 0
        throughput_base = max(avg_duration, 1)

        return {
            "total_sessions": total_sessions,
            "total_requests": total_requests,
            "total_successful": total_successful,
            "success_rate": (total_successful / total_requests) * 100
            if total_requests > 0
            else 0,
            "total_data_bytes": total_data_bytes,
            "avg_session_duration": avg_duration,
            "requests_per_second": total_requests / throughput_base
            if has_duration
            else 0,
            "bytes_per_second": total_data_bytes / throughput_base
            if has_duration
            else 0,
        }
269
+
270
+
271
+ # Helper functions
272
def calculate_proxy_score(
    latency: int, can_access_google: bool, proxy_type: str, country_code: str
) -> int:
    """Score a proxy from 0 to 100 from four independent components.

    Components: latency (up to 40), anonymity level (up to 30), ability to
    reach Google (15) and a bonus for any non-empty country code other than
    "US" (15).

    Args:
        latency: Measured latency; compared against 200/500/1000/2000.
        can_access_google: Whether the proxy could reach Google.
        proxy_type: "elite", "anonymous" or anything else (scores 0).
        country_code: Country code of the proxy; may be empty.

    Returns:
        The total score, capped at 100.
    """
    # (upper bound, points) pairs, checked from fastest to slowest.
    latency_bands = ((200, 40), (500, 30), (1000, 20), (2000, 10))
    latency_points = 0
    for ceiling, points in latency_bands:
        if latency <= ceiling:
            latency_points = points
            break

    # Transparent and unknown proxy types earn nothing.
    anonymity_points = {"elite": 30, "anonymous": 20}.get(proxy_type, 0)

    google_points = 15 if can_access_google else 0

    # The original heuristic treats every non-US proxy as residential.
    residential_points = 15 if country_code and country_code != "US" else 0

    total = latency_points + anonymity_points + google_points + residential_points
    return min(total, 100)
305
+
306
+
307
def extract_domain(url: str) -> str:
    """Return the network-location part (host[:port]) of ``url``.

    Args:
        url: The URL to inspect.

    Returns:
        The netloc of the URL (empty when the URL has none), or "unknown"
        when the URL cannot be parsed.
    """
    # Local import: urlparse is not imported at module level. Together with
    # the old bare ``except:`` this made the function swallow a NameError and
    # return "unknown" for every input.
    from urllib.parse import urlparse

    try:
        return urlparse(url).netloc
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. unbalanced IPv6 brackets);
        # TypeError/AttributeError: non-string input.
        return "unknown"
313
+
314
+
315
def validate_url(url: str) -> bool:
    """Return True when ``url`` has both a scheme and a network location.

    Args:
        url: The URL to validate.

    Returns:
        True for URLs such as "https://example.com"; False for relative
        paths, bare host names and unparsable input.
    """
    # Local import: urlparse is not imported at module level. Together with
    # the old bare ``except:`` this made the function swallow a NameError and
    # return False for every input.
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        return False
    return bool(parsed.scheme and parsed.netloc)
322
+
323
+
324
def clean_text(text: str) -> str:
    """Strip HTML tags and URLs from ``text`` and normalise whitespace.

    Args:
        text: Raw text, possibly containing markup.

    Returns:
        The text with tags replaced by spaces, http(s) URLs removed, runs of
        whitespace collapsed to a single space and the ends trimmed.
    """
    import re

    # Tags go first so a URL inside an attribute disappears with its tag and
    # the URL pattern cannot swallow the markup that follows it.
    text = re.sub(r"<[^>]+>", " ", text)

    # Drop bare URLs.
    text = re.sub(r"https?://[^\s]+", "", text)

    # Collapse whitespace last: removing a URL leaves the spaces on both sides
    # of it behind, which previously survived as a double space.
    text = re.sub(r"\s+", " ", text)

    return text.strip()
338
+
339
+
340
def format_bytes(bytes_size: int) -> str:
    """Format a byte count as a human-readable string.

    Values below 1024 are shown as given with the unit "B"; larger values are
    divided by 1024 per unit step and shown with one decimal. "GB" is the
    largest unit, so bigger values are expressed in GB.

    Args:
        bytes_size: Size in bytes.

    Returns:
        A string such as "512 B", "2.0 KB" or "5.0 GB".
    """
    # The previous loop divided up to twice per iteration without advancing
    # the unit, so e.g. 2048 came out as "2.0 B".
    if abs(bytes_size) < 1024:
        return f"{bytes_size} B"

    size = bytes_size / 1024
    for unit in ("KB", "MB"):
        if abs(size) < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024

    return f"{size:.1f} GB"
353
+
354
+
355
def generate_session_id() -> str:
    """Return a new random identifier: a version-4 UUID in string form."""
    from uuid import uuid4

    identifier = uuid4()
    return str(identifier)
360
+
361
+
362
# Constants
# General request defaults. Units are not stated in this file; the timeout is
# presumably in seconds -- TODO confirm against the code that consumes it.
DEFAULT_TIMEOUT = 30
DEFAULT_MAX_RETRIES = 3
DEFAULT_BATCH_SIZE = 100

# Rate limiting constants
# NOTE(review): RateLimiter in this file uses the same numbers as literals and
# does not read these names.
DOMAIN_RATE_LIMIT = 10  # requests per minute per domain
IP_RATE_LIMIT = 50  # requests per 5 minutes per IP
GLOBAL_RATE_LIMIT = 100  # requests per minute globally

# Quality thresholds
# Presumably compared against calculate_proxy_score() results (0-100) -- confirm.
MIN_QUALITY_SCORE = 30
SUCCESS_RATE_THRESHOLD = 0.3  # 30% success rate for proxy rotation