| """ |
| ⚙️ Configuration settings for AI Dataset Studio with Perplexity integration |
| """ |
|
|
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional
|
|
@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration.

    All fields have sensible defaults; construct with keyword arguments to
    override any of them. ``search_templates`` is filled with the built-in
    prompt templates in ``__post_init__`` when the caller does not supply one.
    """

    # API access. The key is read at *instance creation* time via
    # default_factory (the original evaluated os.getenv at import time, so a
    # key exported after this module was imported was silently ignored).
    api_key: Optional[str] = field(
        default_factory=lambda: os.getenv('PERPLEXITY_API_KEY'))
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"

    # Rate limiting and retry behaviour
    requests_per_minute: int = 30
    request_timeout: int = 30           # seconds per request
    max_retries: int = 3
    min_request_interval: float = 1.0   # seconds between consecutive requests

    # Bounds on the number of sources requested per discovery call
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5

    # Content-quality thresholds applied to discovered sources
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000

    # Prompt templates keyed by task type ("sentiment_analysis", ...).
    # Optional[...] because the default is None and the real mapping is
    # assigned in __post_init__ (avoids a mutable default on the field).
    search_templates: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Initialize search templates after creation."""
        if self.search_templates is None:
            self.search_templates = {
                "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)

SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",

                "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content

SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",

                "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:

PROJECT: {project_description}

REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context

SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",

                "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:

PROJECT: {project_description}

REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content

SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",

                "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:

PROJECT: {project_description}

REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects

SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",

                "translation": """
Find {max_sources} parallel or multilingual content for translation training:

PROJECT: {project_description}

REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification

SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
"""
            }
|
@dataclass
class ScrapingConfig:
    """Configuration for web scraping.

    List-valued fields default to ``None`` and are populated with the
    built-in defaults in ``__post_init__`` (a dataclass field cannot safely
    default to a mutable list).
    """

    # HTTP request behaviour
    timeout: int = 15          # per-request timeout, seconds
    max_retries: int = 3
    retry_delay: float = 1.0   # seconds between retries

    # Rate limiting
    requests_per_second: float = 0.5
    burst_requests: int = 5

    # Accepted content size bounds
    min_content_length: int = 100
    max_content_length: int = 1_000_000

    # Rotated User-Agent strings; annotation fixed to Optional[...] to match
    # the None default (original declared List[str] = None).
    user_agents: Optional[List[str]] = None

    # Substrings identifying hosts that must never be scraped (local,
    # private-network, and non-production hostname prefixes).
    blocked_domains: Optional[List[str]] = None

    # Content extraction options
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False

    def __post_init__(self):
        """Initialize default values."""
        if self.user_agents is None:
            self.user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            ]

        if self.blocked_domains is None:
            self.blocked_domains = [
                'localhost',
                '127.0.0.1',
                '0.0.0.0',
                '10.',
                '172.',
                '192.168.',
                'internal.',
                'staging.',
                'test.',
                'dev.'
            ]
|
|
@dataclass
class ModelConfig:
    """Configuration for AI models"""

    # Primary Hugging Face model identifiers per task
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # Smaller fallback models, presumably used when the primary model fails
    # to load or exceeds resources — TODO confirm against the loading code.
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"

    # Hardware selection; "auto" presumably means pick GPU when available
    device: str = "auto"
    use_gpu: bool = True
    max_memory_mb: int = 4000

    # Inference parameters
    max_sequence_length: int = 512  # tokens
    batch_size: int = 8
    confidence_threshold: float = 0.7

    # Local model caching
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"
|
|
@dataclass
class ExportConfig:
    """Configuration for dataset export."""

    # General output limits and encoding
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"

    # Format-specific options
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # value of csv.QUOTE_ALL

    # Hugging Face Hub upload settings. The token is read at *instance
    # creation* time via default_factory (the original called os.getenv at
    # import time, ignoring tokens exported after the module was loaded).
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
    hf_private: bool = True
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv('HF_TOKEN'))

    # Optional metadata columns included in exported records
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True
|
|
@dataclass
class SecurityConfig:
    """Security and safety configuration"""

    # URL validation policy (consumed by Config.validate_url)
    allow_local_urls: bool = False    # permit localhost/127.0.0.1/0.0.0.0
    allow_private_ips: bool = False   # permit 10.x / 172.x / 192.168.x hosts
    max_redirects: int = 5

    # Content filtering
    filter_adult_content: bool = True
    filter_spam: bool = True
    # Presumably a similarity ratio above which content counts as a
    # duplicate — TODO confirm against the deduplication code.
    max_duplicate_content: float = 0.8

    # Politeness / compliance
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True

    # Transport safety
    scan_for_malware: bool = False
    validate_ssl: bool = True
|
|
@dataclass
class UIConfig:
    """User interface configuration"""

    # Appearance
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False

    # Dataset preview display
    max_preview_items: int = 10
    preview_text_length: int = 200  # characters shown per preview item
    show_progress_bars: bool = True

    # Feature toggles
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True
|
|
| |
class Config:
    """Main configuration class combining all settings."""

    def __init__(self):
        # Section configs, one per concern
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()

        # Application metadata
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'

        # Logging
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'

    def is_perplexity_enabled(self) -> bool:
        """Return True if a Perplexity API key is configured."""
        return bool(self.perplexity.api_key)

    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Return the search template for *template_type* formatted with
        *kwargs*, or "" when no such template exists."""
        template = self.perplexity.search_templates.get(template_type, "")
        if template:
            return template.format(**kwargs)
        return ""

    def validate_url(self, url: str) -> bool:
        """Validate *url* against the security settings.

        Returns False for non-http(s) schemes, hosts matching the blocked
        domain list, and (unless explicitly allowed) local or private-network
        hosts. Any parsing error also yields False.
        """
        from urllib.parse import urlparse

        try:
            parsed = urlparse(url)

            # Only plain web URLs are scrapeable
            if parsed.scheme not in ['http', 'https']:
                return False

            netloc = parsed.netloc.lower()
            # BUG FIX: blocked_domains lives on ScrapingConfig; the original
            # read self.security.blocked_domains, which raised AttributeError
            # on every call. NOTE(review): these are substring matches, so
            # e.g. 'test.' also blocks 'latest.example.com' — confirm intended.
            for blocked in self.scraping.blocked_domains:
                if blocked in netloc:
                    return False

            # Local hosts (loopback etc.)
            if not self.security.allow_local_urls:
                if any(local in netloc for local in ['localhost', '127.0.0.1', '0.0.0.0']):
                    return False

            # Private-network address prefixes
            if not self.security.allow_private_ips:
                if any(private in netloc for private in ['10.', '172.', '192.168.']):
                    return False

            return True

        except Exception:
            # Malformed URLs are simply rejected
            return False
|
|
| |
# Module-level singleton: the shared configuration instance used throughout
# the application (note: instantiating at import time snapshots env-derived
# defaults when the module is first imported).
config = Config()


# Convenience aliases so callers can import an individual section directly,
# e.g. `from config import SCRAPING_CONFIG`.
PERPLEXITY_CONFIG = config.perplexity
SCRAPING_CONFIG = config.scraping
MODEL_CONFIG = config.models
EXPORT_CONFIG = config.export
SECURITY_CONFIG = config.security
UI_CONFIG = config.ui