| """
|
| Scraping Configuration Management
|
|
|
| Manages dynamic scraping settings for different grabber modules
|
| including retry policies, timeouts, and performance tuning.
|
| """
|
|
|
| import asyncio
|
| from typing import Dict, Any, Optional
|
| from pydantic import BaseModel, Field
|
|
|
|
|
class ScrapingConfig(BaseModel):
    """Typed model of the scraper's tunable settings.

    Every field declares a default, so ``ScrapingConfig()`` can be built
    without arguments. Field order is kept as declared.
    """

    # General request behaviour: concurrency, timeout and retry policy.
    max_concurrent_requests: int = Field(
        default=50,
        description="Maximum concurrent HTTP requests",
    )
    default_timeout: int = Field(
        default=30,
        description="Default timeout in seconds",
    )
    max_retries: int = Field(
        default=3,
        description="Maximum retry attempts",
    )
    retry_delay: float = Field(
        default=1.0,
        description="Delay between retries in seconds",
    )

    # Timeouts and retry counts specific to GitHub and subscription fetches.
    github_timeout: int = Field(
        default=60,
        description="GitHub fetch timeout",
    )
    github_max_retries: int = Field(
        default=5,
        description="GitHub max retries",
    )
    subscription_timeout: int = Field(
        default=45,
        description="Subscription URL timeout",
    )
    subscription_max_retries: int = Field(
        default=3,
        description="Subscription max retries",
    )

    # Batch processing.
    enable_batching: bool = Field(
        default=True,
        description="Enable batch processing",
    )
    batch_size: int = Field(
        default=100,
        description="Batch size for bulk operations",
    )

    # Filtering of scraped proxies.
    min_proxy_quality: int = Field(
        default=30,
        description="Minimum quality score to accept",
    )
    enable_duplicate_filtering: bool = Field(
        default=True,
        description="Enable duplicate proxy filtering",
    )

    # Rotation features; all disabled by default.
    enable_user_agent_rotation: bool = Field(
        default=False,
        description="Rotate user agents for each request",
    )
    enable_proxy_rotation: bool = Field(
        default=False,
        description="Use proxy rotation for scraping",
    )
    # NOTE(review): typed as a single optional string although the description
    # says "list" - presumably a delimited string; confirm the expected format.
    proxy_rotation_list: Optional[str] = Field(
        default=None,
        description="List of proxies to rotate through",
    )
|
|
|
|
|
class ScrapingSettingsManager:
    """Hold per-module scraping settings in memory.

    Settings live in ``self.config``, a dict keyed by module name
    (``"global"``, ``"github_grabber"``, ``"subscription_grabber"``) whose
    values are dicts mapping option name to value. Persistence is not
    implemented yet: ``save_config`` is currently a placeholder.
    """

    def __init__(self) -> None:
        """Create a manager pre-populated with the built-in defaults."""
        self.config: Dict[str, Any] = {}
        self._load_default_config()

    def _load_default_config(self) -> None:
        """Replace ``self.config`` with the built-in default settings."""
        self.config = {
            "global": {
                "max_concurrent_requests": 50,
                "default_timeout": 30,
                "max_retries": 3,
                "retry_delay": 1.0,
                "enable_batching": True,
                "batch_size": 100,
                "min_proxy_quality": 30,
                "enable_duplicate_filtering": True,
            },
            "github_grabber": {
                "timeout": 60,
                "max_retries": 5,
                "enable_rate_limiting": True,
                "github_token_required": False,
                "respect_robots_txt": True,
            },
            "subscription_grabber": {
                "timeout": 45,
                "max_retries": 3,
                "enable_base64_padding_fix": True,
                # 1024 * 1024; presumably a size in bytes (1 MiB) - confirm.
                "max_subscription_size": 1048576,
                "supported_formats": ["text", "base64", "json"],
            },
        }

    async def get_config(self, module_name: str) -> Dict[str, Any]:
        """Return a snapshot of the settings stored for ``module_name``.

        Args:
            module_name: Key of the section to read, e.g. ``"global"``.

        Returns:
            A deep copy of the section, or an empty dict when no section
            with that name exists.
        """
        # Imported here so the method does not depend on a new file-level
        # import.
        from copy import deepcopy

        # Hand out a copy rather than the live dict: otherwise a caller that
        # mutates the result (or a nested list such as "supported_formats")
        # would silently rewrite the manager's state without going through
        # update_config().
        return deepcopy(self.config.get(module_name, {}))

    async def update_config(self, module_name: str, settings: Dict[str, Any]) -> bool:
        """Merge ``settings`` into the existing section for ``module_name``.

        Args:
            module_name: Key of the section to update.
            settings: Option names and their new values; keys not already
                present in the section are added.

        Returns:
            True when the section exists and was updated, False when
            ``module_name`` is unknown (in which case nothing is stored).
        """
        if module_name not in self.config:
            return False
        self.config[module_name].update(settings)
        return True

    async def save_config(self) -> None:
        """Save configuration to storage (database or file).

        Not implemented yet: this is a no-op placeholder.
        """
        pass

    def get_global_config(self) -> "ScrapingConfig":
        """Build a ``ScrapingConfig`` from the ``"global"`` section.

        Returns:
            A ``ScrapingConfig`` constructed with the ``"global"`` settings as
            keyword arguments. Fields not present in that section keep the
            defaults declared on ``ScrapingConfig``.
        """
        return ScrapingConfig(**self.config.get("global", {}))
|
|
|