| """ |
| ⚙️ Configuration settings for AI Dataset Studio with Perplexity integration |
| """ |
|
|
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional
|
|
@dataclass
class PerplexityConfig:
    """Configuration for Perplexity AI integration.

    All fields have sensible defaults; construct with keyword arguments to
    override any of them. ``search_templates`` is filled with the built-in
    prompt templates in ``__post_init__`` when the caller does not supply one.
    """

    # API access. The key is read at *instance creation* time via
    # default_factory (the original evaluated os.getenv at import time, so a
    # key exported after this module was imported was silently ignored).
    api_key: Optional[str] = field(
        default_factory=lambda: os.getenv('PERPLEXITY_API_KEY'))
    base_url: str = "https://api.perplexity.ai"
    model: str = "llama-3.1-sonar-large-128k-online"

    # Rate limiting and retry behaviour
    requests_per_minute: int = 30
    request_timeout: int = 30           # seconds per request
    max_retries: int = 3
    min_request_interval: float = 1.0   # seconds between consecutive requests

    # Bounds on the number of sources requested per discovery call
    default_max_sources: int = 20
    max_sources_limit: int = 50
    min_sources: int = 5

    # Content-quality thresholds applied to discovered sources
    min_relevance_score: float = 3.0
    min_content_length: int = 100
    max_content_length: int = 10_000_000

    # Prompt templates keyed by task type ("sentiment_analysis", ...).
    # Optional[...] because the default is None and the real mapping is
    # assigned in __post_init__ (avoids a mutable default on the field).
    search_templates: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Initialize search templates after creation."""
        if self.search_templates is None:
            self.search_templates = {
                "sentiment_analysis": """
Find {max_sources} high-quality sources containing text with clear emotional sentiment for machine learning training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear positive, negative, or neutral sentiment
- Text suitable for sentiment classification training
- Diverse content types (reviews, social media, news, forums)
- Avoid heavily biased or extreme content
- Include metadata when possible (ratings, timestamps, etc.)

SEARCH FOCUS:
- Product reviews and customer feedback
- Social media posts and comments
- News articles with opinion content
- Blog posts with clear sentiment
- Forum discussions and community posts

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Why this source is good for sentiment analysis
4. **Content Type**: [review/social/news/blog/forum]
5. **Expected Sentiment Distribution**: Estimate of positive/negative/neutral content
6. **Quality Score**: 1-10 rating for ML training suitability
""",

                "text_classification": """
Find {max_sources} diverse, well-categorized sources for text classification training:

PROJECT: {project_description}

REQUIREMENTS:
- Sources with clear, distinct categories or topics
- Consistent content structure within categories
- Sufficient variety within each category
- Professional or semi-professional content quality
- Avoid overly niche or specialized content

SEARCH FOCUS:
- News articles with clear sections (politics, sports, technology, etc.)
- Academic papers with subject classifications
- E-commerce product descriptions with categories
- Blog posts with clear topical focus
- Government documents with departmental classifications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content type and classification scheme
4. **Categories Available**: List of categories/classes present
5. **Content Volume**: Estimated amount of data per category
6. **Quality Score**: 1-10 rating for classification training
""",

                "named_entity_recognition": """
Find {max_sources} text-rich sources with clear named entities for NER training:

PROJECT: {project_description}

REQUIREMENTS:
- Rich in named entities (people, places, organizations, dates, etc.)
- Clear, well-written text (not fragmented or poorly formatted)
- Diverse entity types and contexts
- Professional writing quality
- Entities are clearly identifiable in context

SEARCH FOCUS:
- News articles and press releases
- Biographical content and profiles
- Business and financial reports
- Historical documents and articles
- Academic papers and research
- Government publications

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Types of entities commonly found
4. **Entity Density**: Expected frequency of named entities
5. **Text Quality**: Assessment of writing clarity
6. **Quality Score**: 1-10 rating for NER training
""",

                "question_answering": """
Find {max_sources} sources with clear question-answer patterns for QA training:

PROJECT: {project_description}

REQUIREMENTS:
- Explicit Q&A format OR clear factual content suitable for QA generation
- Questions and answers are clearly delineated
- Factual, verifiable information
- Diverse question types (factual, definitional, procedural, etc.)
- Professional quality content

SEARCH FOCUS:
- FAQ pages and help documentation
- Interview transcripts and Q&A sessions
- Educational content with questions
- Technical documentation with examples
- Customer support knowledge bases
- Stack Overflow and similar Q&A platforms

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Q&A format type and subject matter
4. **Question Types**: Types of questions typically found
5. **Answer Quality**: Assessment of answer completeness
6. **Quality Score**: 1-10 rating for QA training
""",

                "text_summarization": """
Find {max_sources} sources with substantial, well-structured content for summarization training:

PROJECT: {project_description}

REQUIREMENTS:
- Long-form content (articles, reports, papers)
- Clear structure with main points
- Professional writing quality
- Self-contained content (doesn't rely heavily on external references)
- Diverse content types and subjects

SEARCH FOCUS:
- News articles and investigative reports
- Research papers and academic articles
- Long-form blog posts and essays
- Government reports and white papers
- Industry analysis and market reports
- Review articles and meta-analyses

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Content length and structure
4. **Main Topics**: Key subjects covered
5. **Summarization Potential**: How well-suited for summary generation
6. **Quality Score**: 1-10 rating for summarization training
""",

                "translation": """
Find {max_sources} parallel or multilingual content for translation training:

PROJECT: {project_description}

REQUIREMENTS:
- Content available in multiple languages
- High translation quality (professional or native-level)
- Parallel content alignment when possible
- Diverse domains and text types
- Clear source and target language identification

SEARCH FOCUS:
- Multilingual news websites
- International organization publications
- Government documents in multiple languages
- Educational content with translations
- Software documentation with localization
- Cultural and literary translations

OUTPUT FORMAT:
For each source provide:
1. **URL**: Direct link to content
2. **Title**: Clear, descriptive title
3. **Description**: Languages available and content type
4. **Language Pairs**: Specific language combinations
5. **Translation Quality**: Assessment of translation accuracy
6. **Quality Score**: 1-10 rating for translation training
"""
            }
|
@dataclass
class ScrapingConfig:
    """Configuration for web scraping.

    List-valued fields default to ``None`` and are populated with the
    built-in defaults in ``__post_init__`` (a dataclass field cannot safely
    default to a mutable list).
    """

    # HTTP request behaviour
    timeout: int = 15          # per-request timeout, seconds
    max_retries: int = 3
    retry_delay: float = 1.0   # seconds between retries

    # Rate limiting
    requests_per_second: float = 0.5
    burst_requests: int = 5

    # Accepted content size bounds
    min_content_length: int = 100
    max_content_length: int = 1_000_000

    # Rotated User-Agent strings; annotation fixed to Optional[...] to match
    # the None default (original declared List[str] = None).
    user_agents: Optional[List[str]] = None

    # Substrings identifying hosts that must never be scraped (local,
    # private-network, and non-production hostname prefixes).
    blocked_domains: Optional[List[str]] = None

    # Content extraction options
    extract_metadata: bool = True
    clean_html: bool = True
    preserve_structure: bool = False

    def __post_init__(self):
        """Initialize default values."""
        if self.user_agents is None:
            self.user_agents = [
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            ]

        if self.blocked_domains is None:
            self.blocked_domains = [
                'localhost',
                '127.0.0.1',
                '0.0.0.0',
                '10.',
                '172.',
                '192.168.',
                'internal.',
                'staging.',
                'test.',
                'dev.'
            ]
|
|
@dataclass
class ModelConfig:
    """Configuration for AI models"""

    # Primary Hugging Face model identifiers per task
    sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    summarization_model: str = "facebook/bart-large-cnn"
    ner_model: str = "dbmdz/bert-large-cased-finetuned-conll03-english"

    # Smaller fallback models, presumably used when the primary model fails
    # to load or exceeds resources — TODO confirm against the loading code.
    sentiment_fallback: str = "distilbert-base-uncased-finetuned-sst-2-english"
    summarization_fallback: str = "sshleifer/distilbart-cnn-12-6"
    ner_fallback: str = "distilbert-base-cased"

    # Hardware selection; "auto" presumably means pick GPU when available
    device: str = "auto"
    use_gpu: bool = True
    max_memory_mb: int = 4000

    # Inference parameters
    max_sequence_length: int = 512  # tokens
    batch_size: int = 8
    confidence_threshold: float = 0.7

    # Local model caching
    cache_models: bool = True
    model_cache_dir: str = "./model_cache"
|
|
@dataclass
class ExportConfig:
    """Configuration for dataset export."""

    # General output limits and encoding
    max_file_size_mb: int = 100
    compression: bool = True
    encoding: str = "utf-8"

    # Format-specific options
    json_indent: int = 2
    csv_delimiter: str = ","
    csv_quoting: int = 1  # value of csv.QUOTE_ALL

    # Hugging Face Hub upload settings. The token is read at *instance
    # creation* time via default_factory (the original called os.getenv at
    # import time, ignoring tokens exported after the module was loaded).
    hf_dataset_name_template: str = "ai-dataset-studio-{timestamp}"
    hf_private: bool = True
    hf_token: Optional[str] = field(default_factory=lambda: os.getenv('HF_TOKEN'))

    # Optional metadata columns included in exported records
    include_source_urls: bool = True
    include_timestamps: bool = True
    include_processing_info: bool = True
    include_confidence_scores: bool = True
|
|
@dataclass
class SecurityConfig:
    """Security and safety configuration"""

    # URL validation policy (consumed by Config.validate_url)
    allow_local_urls: bool = False    # permit localhost/127.0.0.1/0.0.0.0
    allow_private_ips: bool = False   # permit 10.x / 172.x / 192.168.x hosts
    max_redirects: int = 5

    # Content filtering
    filter_adult_content: bool = True
    filter_spam: bool = True
    # Presumably a similarity ratio above which content counts as a
    # duplicate — TODO confirm against the deduplication code.
    max_duplicate_content: float = 0.8

    # Politeness / compliance
    enforce_rate_limits: bool = True
    respect_robots_txt: bool = True

    # Transport safety
    scan_for_malware: bool = False
    validate_ssl: bool = True
|
|
@dataclass
class UIConfig:
    """User interface configuration"""

    # Appearance
    theme: str = "soft"
    custom_css: bool = True
    dark_mode: bool = False

    # Dataset preview display
    max_preview_items: int = 10
    preview_text_length: int = 200  # characters shown per preview item
    show_progress_bars: bool = True

    # Feature toggles
    enable_debug_mode: bool = False
    show_model_info: bool = True
    enable_export_preview: bool = True
|
|
| |
class Config:
    """Main configuration class combining all settings."""

    def __init__(self):
        # Section configs, one per concern
        self.perplexity = PerplexityConfig()
        self.scraping = ScrapingConfig()
        self.models = ModelConfig()
        self.export = ExportConfig()
        self.security = SecurityConfig()
        self.ui = UIConfig()

        # Application metadata
        self.app_name = "AI Dataset Studio"
        self.version = "2.0.0"
        self.debug = os.getenv('DEBUG', 'false').lower() == 'true'

        # Logging
        self.log_level = os.getenv('LOG_LEVEL', 'INFO')
        self.log_format = '%(asctime)s - %(levelname)s - %(message)s'

    def is_perplexity_enabled(self) -> bool:
        """Return True if a Perplexity API key is configured."""
        return bool(self.perplexity.api_key)

    def get_search_template(self, template_type: str, **kwargs) -> str:
        """Return the search template for *template_type* formatted with
        *kwargs*, or "" when no such template exists."""
        template = self.perplexity.search_templates.get(template_type, "")
        if template:
            return template.format(**kwargs)
        return ""

    def validate_url(self, url: str) -> bool:
        """Validate *url* against the security settings.

        Returns False for non-http(s) schemes, hosts matching the blocked
        domain list, and (unless explicitly allowed) local or private-network
        hosts. Any parsing error also yields False.
        """
        from urllib.parse import urlparse

        try:
            parsed = urlparse(url)

            # Only plain web URLs are scrapeable
            if parsed.scheme not in ['http', 'https']:
                return False

            netloc = parsed.netloc.lower()
            # BUG FIX: blocked_domains lives on ScrapingConfig; the original
            # read self.security.blocked_domains, which raised AttributeError
            # on every call. NOTE(review): these are substring matches, so
            # e.g. 'test.' also blocks 'latest.example.com' — confirm intended.
            for blocked in self.scraping.blocked_domains:
                if blocked in netloc:
                    return False

            # Local hosts (loopback etc.)
            if not self.security.allow_local_urls:
                if any(local in netloc for local in ['localhost', '127.0.0.1', '0.0.0.0']):
                    return False

            # Private-network address prefixes
            if not self.security.allow_private_ips:
                if any(private in netloc for private in ['10.', '172.', '192.168.']):
                    return False

            return True

        except Exception:
            # Malformed URLs are simply rejected
            return False
|
|
| |
# Module-level singleton: the shared configuration instance used throughout
# the application (note: instantiating at import time snapshots env-derived
# defaults when the module is first imported).
config = Config()


# Convenience aliases so callers can import an individual section directly,
# e.g. `from config import SCRAPING_CONFIG`.
PERPLEXITY_CONFIG = config.perplexity
SCRAPING_CONFIG = config.scraping
MODEL_CONFIG = config.models
EXPORT_CONFIG = config.export
SECURITY_CONFIG = config.security
UI_CONFIG = config.ui