File size: 4,098 Bytes
8d739ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
Scraping Configuration Management

Manages dynamic scraping settings for different grabber modules,
including retry policies, timeouts, and performance tuning.
"""

import asyncio
import copy
from typing import Dict, Any, Optional

from pydantic import BaseModel, Field


class ScrapingConfig(BaseModel):
    """Settings model for the scraping subsystem.

    Every field declares a default, so ``ScrapingConfig()`` with no arguments
    is already a complete configuration; keyword arguments override
    individual values.
    """

    # --- General request behaviour ---
    max_concurrent_requests: int = Field(
        50,
        description="Maximum concurrent HTTP requests",
    )
    default_timeout: int = Field(
        30,
        description="Default timeout in seconds",
    )
    max_retries: int = Field(
        3,
        description="Maximum retry attempts",
    )
    retry_delay: float = Field(
        1.0,
        description="Delay between retries in seconds",
    )

    # --- Overrides for individual grabbers ---
    github_timeout: int = Field(
        60,
        description="GitHub fetch timeout",
    )
    github_max_retries: int = Field(
        5,
        description="GitHub max retries",
    )
    subscription_timeout: int = Field(
        45,
        description="Subscription URL timeout",
    )
    subscription_max_retries: int = Field(
        3,
        description="Subscription max retries",
    )

    # --- Throughput tuning ---
    enable_batching: bool = Field(
        True,
        description="Enable batch processing",
    )
    batch_size: int = Field(
        100,
        description="Batch size for bulk operations",
    )

    # --- Result quality filters ---
    min_proxy_quality: int = Field(
        30,
        description="Minimum quality score to accept",
    )
    enable_duplicate_filtering: bool = Field(
        True,
        description="Enable duplicate proxy filtering",
    )

    # --- Advanced / opt-in features (all off by default) ---
    enable_user_agent_rotation: bool = Field(
        False,
        description="Rotate user agents for each request",
    )
    enable_proxy_rotation: bool = Field(
        False,
        description="Use proxy rotation for scraping",
    )
    # NOTE(review): typed as a single optional string although the description
    # says "list" -- the expected encoding/separator is not visible here.
    proxy_rotation_list: Optional[str] = Field(
        None,
        description="List of proxies to rotate through",
    )


class ScrapingSettingsManager:
    """Holds per-module scraping settings in memory.

    Settings live in ``self.config``, a dict keyed by module name
    (``"global"``, ``"github_grabber"``, ``"subscription_grabber"``) whose
    values are plain settings dicts. Persistence is not implemented yet:
    ``save_config`` is currently a placeholder that does nothing.
    """

    def __init__(self) -> None:
        """Create a manager pre-populated with the built-in defaults."""
        self.config: Dict[str, Any] = {}
        self._load_default_config()

    def _load_default_config(self) -> None:
        """Reset ``self.config`` to the built-in defaults for every module.

        A fresh dict is built on every call, so instances never share
        default state with each other.
        """
        self.config = {
            "global": {
                "max_concurrent_requests": 50,
                "default_timeout": 30,
                "max_retries": 3,
                "retry_delay": 1.0,
                "enable_batching": True,
                "batch_size": 100,
                "min_proxy_quality": 30,
                "enable_duplicate_filtering": True,
            },
            "github_grabber": {
                "timeout": 60,
                "max_retries": 5,
                "enable_rate_limiting": True,
                "github_token_required": False,
                "respect_robots_txt": True,
            },
            "subscription_grabber": {
                "timeout": 45,
                "max_retries": 3,
                "enable_base64_padding_fix": True,
                "max_subscription_size": 1048576,  # 1 MiB (1024 * 1024 bytes)
                "supported_formats": ["text", "base64", "json"],
            },
        }

    async def get_config(self, module_name: str) -> Dict[str, Any]:
        """Return a snapshot of the settings for ``module_name``.

        Args:
            module_name: Key of the module section to read.

        Returns:
            An independent copy of the module's settings, or an empty dict
            when the module is unknown. Mutating the result does not affect
            the manager; use ``update_config`` to change settings.
        """
        # Deep copy so nested values (e.g. "supported_formats") are isolated
        # too; handing out the live dict would let callers alter the stored
        # configuration without going through update_config().
        return copy.deepcopy(self.config.get(module_name, {}))

    async def update_config(
        self, module_name: str, settings: Dict[str, Any]
    ) -> bool:
        """Merge ``settings`` into the section for ``module_name``.

        Args:
            module_name: Key of an existing module section.
            settings: Keys to add or overwrite in that section.

        Returns:
            True when the module exists and was updated; False when the
            module is unknown (nothing is created or changed in that case).
        """
        if module_name not in self.config:
            return False
        self.config[module_name].update(settings)
        return True

    async def save_config(self) -> None:
        """Persist the configuration (not implemented yet).

        Currently a no-op: nothing is written to a database or file, so
        changes made through ``update_config`` last only as long as this
        instance.
        """
        # TODO: implement persistence (database or file).
        return None

    def get_global_config(self) -> "ScrapingConfig":
        """Build a ``ScrapingConfig`` from the ``"global"`` section.

        Fields that the section does not provide fall back to the defaults
        declared on ``ScrapingConfig``.
        """
        return ScrapingConfig(**self.config.get("global", {}))