| """ |
| π AI Dataset Studio with Perplexity AI Integration |
| A comprehensive platform for creating high-quality training datasets using AI-powered source discovery |
| """ |
|
|
| import gradio as gr |
| import pandas as pd |
| import requests |
| import json |
| import logging |
| import os |
| import sys |
import tempfile
import time
| import re |
| from datetime import datetime |
| from typing import List, Dict, Optional, Tuple, Any |
| from urllib.parse import urlparse, urljoin |
| from dataclasses import dataclass, asdict |
| import traceback |
|
|
| |
| logging.basicConfig( |
| level=logging.INFO, |
| format='%(asctime)s - %(levelname)s - %(message)s' |
| ) |
| logger = logging.getLogger(__name__) |
|
|
| |
| try: |
| from bs4 import BeautifulSoup |
| logger.info("β
BeautifulSoup imported successfully") |
| except ImportError as e: |
| logger.error("β Failed to import BeautifulSoup: %s", e) |
| sys.exit(1) |
|
|
| try: |
| import nltk |
| from nltk.corpus import stopwords |
| from nltk.tokenize import word_tokenize, sent_tokenize |
| logger.info("β
NLTK imported successfully") |
| HAS_NLTK = True |
| except ImportError: |
| logger.warning("β οΈ NLTK not available - using basic text processing") |
| HAS_NLTK = False |
|
|
| try: |
| from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification |
| import torch |
| logger.info("β
Transformers imported successfully") |
| HAS_TRANSFORMERS = True |
| except ImportError: |
| logger.warning("β οΈ Transformers not available - using extractive summaries") |
| HAS_TRANSFORMERS = False |
|
|
| |
| try: |
| from perplexity_client import PerplexityClient, SearchType, SourceResult, SearchResults |
| logger.info("β
Perplexity client imported successfully") |
| HAS_PERPLEXITY = True |
| except ImportError: |
| logger.warning("β οΈ Perplexity client not available - manual source entry only") |
| HAS_PERPLEXITY = False |
|
|
| |
| DATASET_TEMPLATES = { |
| "sentiment_analysis": { |
| "name": "π Sentiment Analysis", |
| "description": "Classify text as positive, negative, or neutral", |
| "fields": ["text", "sentiment"], |
| "example": {"text": "This product is amazing!", "sentiment": "positive"}, |
| "search_queries": ["product reviews", "customer feedback", "social media posts", "movie reviews"] |
| }, |
| "text_classification": { |
| "name": "π Text Classification", |
| "description": "Categorize text into predefined classes", |
| "fields": ["text", "category"], |
| "example": {"text": "Breaking: Stock market reaches new high", "category": "finance"}, |
| "search_queries": ["news articles", "blog posts", "academic papers", "forum discussions"] |
| }, |
| "named_entity_recognition": { |
| "name": "π·οΈ Named Entity Recognition", |
| "description": "Identify people, places, organizations in text", |
| "fields": ["text", "entities"], |
| "example": {"text": "Apple Inc. was founded by Steve Jobs in California", |
| "entities": [{"text": "Apple Inc.", "label": "ORG"}, {"text": "Steve Jobs", "label": "PERSON"}]}, |
| "search_queries": ["news articles", "biographies", "company reports", "wikipedia articles"] |
| }, |
| "question_answering": { |
| "name": "β Question Answering", |
| "description": "Extract answers from context passages", |
| "fields": ["context", "question", "answer"], |
| "example": {"context": "The capital of France is Paris", "question": "What is the capital of France?", "answer": "Paris"}, |
| "search_queries": ["FAQ pages", "educational content", "interview transcripts", "knowledge bases"] |
| }, |
| "text_summarization": { |
| "name": "π Text Summarization", |
| "description": "Generate concise summaries of longer texts", |
| "fields": ["text", "summary"], |
| "example": {"text": "Long article content...", "summary": "Brief summary of key points"}, |
| "search_queries": ["news articles", "research papers", "blog posts", "reports"] |
| }, |
| "translation": { |
| "name": "π Translation", |
| "description": "Translate text between languages", |
| "fields": ["source_text", "target_text", "source_lang", "target_lang"], |
| "example": {"source_text": "Hello world", "target_text": "Hola mundo", "source_lang": "en", "target_lang": "es"}, |
| "search_queries": ["multilingual websites", "international news", "translation datasets", "parallel corpora"] |
| } |
| } |
|
|
| class DatasetStudio: |
| """ |
    🎯 Main Dataset Studio Class
| Handles all core functionality for dataset creation |
| """ |
| |
| def __init__(self): |
| """Initialize the Dataset Studio""" |
| logger.info("π Initializing AI Dataset Studio...") |
| |
| |
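        # In-memory project state; nothing is persisted across restarts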
| self.projects = {} |
| self.current_project = None |
| self.scraped_data = [] |
| self.processed_data = [] |
| |
| |
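        # Optional NLP pipelines, populated by _load_models() when transformers is available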
| self.sentiment_analyzer = None |
| self.summarizer = None |
| self.ner_model = None |
| |
| |
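        # Optional Perplexity AI client for AI-powered source discovery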
| self.perplexity_client = None |
| if HAS_PERPLEXITY: |
| try: |
| api_key = os.getenv('PERPLEXITY_API_KEY') |
| if api_key: |
| self.perplexity_client = PerplexityClient(api_key) |
| logger.info("β
Perplexity AI client initialized") |
| else: |
| logger.warning("β οΈ PERPLEXITY_API_KEY not found - manual source entry only") |
| except Exception as e: |
| logger.error(f"β Failed to initialize Perplexity client: {e}") |
| |
| self._load_models() |
| logger.info("β
Dataset Studio initialized successfully") |
| |
| def _load_models(self): |
| """Load AI models for processing""" |
| if not HAS_TRANSFORMERS: |
| logger.info("β οΈ Skipping model loading - transformers not available") |
| return |
| |
| try: |
| |
| logger.info("π¦ Loading sentiment analysis model...") |
| self.sentiment_analyzer = pipeline( |
| "sentiment-analysis", |
| model="cardiffnlp/twitter-roberta-base-sentiment-latest", |
| return_all_scores=True |
| ) |
| logger.info("β
Sentiment analyzer loaded") |
| |
| except Exception as e: |
| logger.warning(f"β οΈ Could not load sentiment analyzer: {e}") |
| |
| try: |
| |
| logger.info("π¦ Loading summarization model...") |
| self.summarizer = pipeline( |
| "summarization", |
| model="facebook/bart-large-cnn", |
| max_length=150, |
| min_length=30, |
| do_sample=False |
| ) |
| logger.info("β
Summarizer loaded") |
| |
| except Exception as e: |
| logger.warning(f"β οΈ Could not load summarizer: {e}") |
| |
| try: |
| |
| logger.info("π¦ Loading NER model...") |
| self.ner_model = pipeline( |
| "ner", |
| model="dbmdz/bert-large-cased-finetuned-conll03-english", |
| aggregation_strategy="simple" |
| ) |
| logger.info("β
NER model loaded") |
| |
| except Exception as e: |
| logger.warning(f"β οΈ Could not load NER model: {e}") |
| |
| def discover_sources_with_ai( |
| self, |
| project_description: str, |
| max_sources: int = 20, |
| search_type: str = "general", |
| include_academic: bool = True, |
| include_news: bool = True |
| ) -> Tuple[str, str]: |
| """ |
        🧠 Discover sources using Perplexity AI
| |
| Args: |
| project_description: Description of the dataset project |
| max_sources: Maximum number of sources to find |
| search_type: Type of search (general, academic, news, etc.) |
| include_academic: Include academic sources |
| include_news: Include news sources |
| |
| Returns: |
| Tuple of (status_message, sources_json) |
| """ |
| if not self.perplexity_client: |
| return "β Perplexity AI not available. Please set PERPLEXITY_API_KEY environment variable.", "[]" |
| |
| try: |
| logger.info(f"π Discovering sources for: {project_description}") |
| |
| |
| search_type_enum = getattr(SearchType, search_type.upper(), SearchType.GENERAL) |
| |
| |
| results = self.perplexity_client.discover_sources( |
| project_description=project_description, |
| search_type=search_type_enum, |
| max_sources=max_sources, |
| include_academic=include_academic, |
| include_news=include_news |
| ) |
| |
| if not results.sources: |
| return "β οΈ No sources found. Try adjusting your search terms.", "[]" |
| |
| |
| sources_data = [] |
| for source in results.sources: |
| sources_data.append({ |
| "URL": source.url, |
| "Title": source.title, |
| "Description": source.description, |
| "Type": source.source_type, |
| "Domain": source.domain, |
| "Quality Score": f"{source.relevance_score:.1f}/10" |
| }) |
| |
| status = f"β
Found {len(results.sources)} sources in {results.search_time:.1f}s" |
| if results.suggestions: |
| status += f"\nπ‘ Suggestions: {', '.join(results.suggestions[:3])}" |
| |
| return status, json.dumps(sources_data, indent=2) |
| |
| except Exception as e: |
| logger.error(f"β Error discovering sources: {e}") |
| return f"β Error: {str(e)}", "[]" |
| |
| def extract_urls_from_sources(self, sources_json: str) -> List[str]: |
| """Extract URLs from discovered sources JSON""" |
| try: |
| sources = json.loads(sources_json) |
| if isinstance(sources, list): |
| return [source.get("URL", "") for source in sources if source.get("URL")] |
| return [] |
        except (json.JSONDecodeError, TypeError):
            return []
| |
| def create_project(self, name: str, template: str, description: str) -> str: |
| """Create a new dataset project""" |
| if not name.strip(): |
| return "β Please provide a project name" |
| |
| project_id = f"project_{int(time.time())}" |
| self.projects[project_id] = { |
| "name": name, |
| "template": template, |
| "description": description, |
| "created_at": datetime.now().isoformat(), |
| "urls": [], |
| "data": [], |
| "processed_data": [] |
| } |
| |
| self.current_project = project_id |
| |
| template_info = DATASET_TEMPLATES.get(template, {}) |
| status = f"β
Project '{name}' created successfully!\n" |
| status += f"π Template: {template_info.get('name', template)}\n" |
| status += f"π Description: {description}\n" |
| status += f"π Project ID: {project_id}" |
| |
| return status |
| |
| def scrape_urls(self, urls_text: str, progress=gr.Progress()) -> Tuple[str, str]: |
| """Scrape content from provided URLs""" |
| if not self.current_project: |
| return "β Please create a project first", "" |
| |
| |
| urls = [] |
| for line in urls_text.strip().split('\n'): |
| url = line.strip() |
| if url and self._is_valid_url(url): |
| urls.append(url) |
| |
| if not urls: |
| return "β No valid URLs found", "" |
| |
| scraped_data = [] |
| failed_urls = [] |
| |
| progress(0, desc="Starting scraping...") |
| |
| for i, url in enumerate(urls): |
| try: |
| progress((i + 1) / len(urls), desc=f"Scraping {i + 1}/{len(urls)}") |
| |
| logger.info(f"π Scraping: {url}") |
| |
| |
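                # A desktop User-Agent; some sites block the default python-requests UA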
| headers = { |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' |
| } |
| |
| response = requests.get(url, headers=headers, timeout=10) |
| response.raise_for_status() |
| |
| |
| soup = BeautifulSoup(response.content, 'html.parser') |
| |
| |
| title = self._extract_title(soup) |
| content = self._extract_content(soup) |
| |
| if content: |
| scraped_data.append({ |
| 'url': url, |
| 'title': title, |
| 'content': content, |
| 'length': len(content), |
| 'scraped_at': datetime.now().isoformat() |
| }) |
| logger.info(f"β
Scraped {len(content)} characters from {url}") |
| else: |
| failed_urls.append(url) |
| logger.warning(f"β οΈ No content extracted from {url}") |
| |
| |
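                # Politeness delay between requests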
| time.sleep(0.5) |
| |
| except Exception as e: |
| failed_urls.append(url) |
| logger.error(f"β Failed to scrape {url}: {e}") |
| |
| |
| self.projects[self.current_project]['urls'] = urls |
| self.projects[self.current_project]['data'] = scraped_data |
| self.scraped_data = scraped_data |
| |
| |
| status = f"β
Scraping completed!\n" |
| status += f"π Successfully scraped: {len(scraped_data)} URLs\n" |
| status += f"β Failed: {len(failed_urls)} URLs\n" |
| status += f"π Total content: {sum(item['length'] for item in scraped_data):,} characters" |
| |
| if failed_urls: |
| status += f"\n\nFailed URLs:\n" + "\n".join(f"β’ {url}" for url in failed_urls[:5]) |
| if len(failed_urls) > 5: |
| status += f"\n... and {len(failed_urls) - 5} more" |
| |
| |
| preview_data = [] |
| for item in scraped_data[:10]: |
| preview_data.append({ |
| "Title": item['title'][:50] + "..." if len(item['title']) > 50 else item['title'], |
| "URL": item['url'], |
| "Length": f"{item['length']:,} chars", |
| "Preview": item['content'][:100] + "..." if len(item['content']) > 100 else item['content'] |
| }) |
| |
| return status, json.dumps(preview_data, indent=2) |
| |
| def process_data(self, template: str, progress=gr.Progress()) -> Tuple[str, str]: |
| """Process scraped data according to template""" |
| if not self.scraped_data: |
| return "β No scraped data available. Please scrape URLs first.", "" |
| |
| template_config = DATASET_TEMPLATES.get(template, {}) |
| if not template_config: |
| return f"β Unknown template: {template}", "" |
| |
| processed_data = [] |
| |
| progress(0, desc="Starting data processing...") |
| |
| for i, item in enumerate(self.scraped_data): |
| try: |
| progress((i + 1) / len(self.scraped_data), desc=f"Processing {i + 1}/{len(self.scraped_data)}") |
| |
| content = item['content'] |
| |
| |
| if template == "sentiment_analysis": |
| processed_item = self._process_sentiment_analysis(item) |
| elif template == "text_classification": |
| processed_item = self._process_text_classification(item) |
| elif template == "named_entity_recognition": |
| processed_item = self._process_ner(item) |
| elif template == "question_answering": |
| processed_item = self._process_qa(item) |
| elif template == "text_summarization": |
| processed_item = self._process_summarization(item) |
| elif template == "translation": |
| processed_item = self._process_translation(item) |
| else: |
| processed_item = self._process_generic(item) |
| |
| if processed_item: |
| processed_data.extend(processed_item) |
| |
| except Exception as e: |
| logger.error(f"β Error processing item {i}: {e}") |
| continue |
| |
| |
| self.processed_data = processed_data |
| if self.current_project: |
| self.projects[self.current_project]['processed_data'] = processed_data |
| |
| |
| status = f"β
Processing completed!\n" |
| status += f"π Generated {len(processed_data)} training examples\n" |
| status += f"π Template: {template_config['name']}\n" |
| status += f"π·οΈ Fields: {', '.join(template_config['fields'])}" |
| |
| |
| preview_data = processed_data[:10] if processed_data else [] |
| |
| return status, json.dumps(preview_data, indent=2) |
| |
| def _process_sentiment_analysis(self, item: Dict) -> List[Dict]: |
| """Process item for sentiment analysis""" |
| content = item['content'] |
| |
| |
| if HAS_NLTK: |
| try: |
| sentences = sent_tokenize(content) |
            except LookupError:  # NLTK punkt data not downloaded
| sentences = content.split('. ') |
| else: |
| sentences = content.split('. ') |
| |
| results = [] |
| |
| for sentence in sentences: |
| sentence = sentence.strip() |
| if len(sentence) < 10 or len(sentence) > 500: |
| continue |
| |
| |
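            # Prefer the transformer pipeline when loaded; otherwise fall back to keywords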
| if self.sentiment_analyzer: |
| try: |
                    # return_all_scores=True yields every label's score; keep the best one
                    scores = self.sentiment_analyzer(sentence)[0]
                    best = max(scores, key=lambda s: s['score'])
                    sentiment = best['label'].lower()
                    if sentiment not in ('positive', 'negative', 'neutral'):
                        sentiment = 'neutral'
                    confidence = best['score']
| |
| |
| if confidence > 0.7: |
| results.append({ |
| 'text': sentence, |
| 'sentiment': sentiment, |
| 'confidence': confidence, |
| 'source_url': item['url'] |
| }) |
| except Exception as e: |
| logger.debug(f"Sentiment analysis failed: {e}") |
| continue |
| else: |
| |
| sentiment = self._keyword_sentiment(sentence) |
| results.append({ |
| 'text': sentence, |
| 'sentiment': sentiment, |
| 'source_url': item['url'] |
| }) |
| |
| return results[:20] |
| |
| def _process_text_classification(self, item: Dict) -> List[Dict]: |
| """Process item for text classification""" |
| content = item['content'] |
| |
| |
| url = item['url'] |
| category = self._extract_category_from_url(url) |
| |
| |
| paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 50] |
| |
| results = [] |
| for paragraph in paragraphs[:10]: |
| results.append({ |
| 'text': paragraph, |
| 'category': category, |
| 'source_url': url |
| }) |
| |
| return results |
| |
| def _process_ner(self, item: Dict) -> List[Dict]: |
| """Process item for Named Entity Recognition""" |
| content = item['content'] |
| |
| if HAS_NLTK: |
| try: |
| sentences = sent_tokenize(content) |
            except LookupError:  # NLTK punkt data not downloaded
| sentences = content.split('. ') |
| else: |
| sentences = content.split('. ') |
| |
| results = [] |
| |
| for sentence in sentences[:20]: |
| sentence = sentence.strip() |
| if len(sentence) < 20: |
| continue |
| |
| entities = [] |
| |
| if self.ner_model: |
| try: |
| ner_results = self.ner_model(sentence) |
| for entity in ner_results: |
| entities.append({ |
| 'text': entity['word'], |
| 'label': entity['entity_group'], |
| 'confidence': entity['score'] |
| }) |
| except Exception as e: |
| logger.debug(f"NER failed: {e}") |
| |
| |
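            # Fall back to simple pattern-based NER when the model is unavailable or finds nothing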
| if not entities: |
| entities = self._simple_ner(sentence) |
| |
| if entities: |
| results.append({ |
| 'text': sentence, |
| 'entities': entities, |
| 'source_url': item['url'] |
| }) |
| |
| return results |
| |
| def _process_qa(self, item: Dict) -> List[Dict]: |
| """Process item for Question Answering""" |
| content = item['content'] |
| |
| |
| results = [] |
| |
| |
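        # Heuristic extraction: explicit "Q:/A:" and "Question:/Answer:" markers, plus a
        # loose question-mark fallback; the length checks below filter out noise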
| qa_patterns = [ |
| (r'Q:\s*(.+?)\s*A:\s*(.+?)(?=Q:|$)', 'qa'), |
| (r'Question:\s*(.+?)\s*Answer:\s*(.+?)(?=Question:|$)', 'qa'), |
| (r'(.+\?)\s*(.+?)(?=.+\?|$)', 'simple') |
| ] |
| |
| for pattern, style in qa_patterns: |
| matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE) |
| |
| for match in matches[:10]: |
| if len(match) == 2: |
| question = match[0].strip() |
| answer = match[1].strip() |
| |
| if len(question) > 10 and len(answer) > 10: |
| results.append({ |
| 'context': content[:500], |
| 'question': question, |
| 'answer': answer, |
| 'source_url': item['url'] |
| }) |
| |
| return results |
| |
| def _process_summarization(self, item: Dict) -> List[Dict]: |
| """Process item for summarization""" |
| content = item['content'] |
| |
| |
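        # Naive fixed-size character chunks (may split mid-sentence) to keep inputs
        # within the summarizer's limits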
| chunk_size = 1000 |
| chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)] |
| |
| results = [] |
| |
| for chunk in chunks[:5]: |
| if len(chunk) < 100: |
| continue |
| |
| summary = "" |
| |
| if self.summarizer and len(chunk) > 100: |
| try: |
| summary_result = self.summarizer(chunk, max_length=100, min_length=30) |
| summary = summary_result[0]['summary_text'] |
| except Exception as e: |
| logger.debug(f"Summarization failed: {e}") |
| |
| |
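                # Fall back to a simple extractive summary if the model is unavailable or failed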
| if not summary: |
| summary = self._extractive_summary(chunk) |
| |
| if summary: |
| results.append({ |
| 'text': chunk, |
| 'summary': summary, |
| 'source_url': item['url'] |
| }) |
| |
| return results |
| |
| def _process_translation(self, item: Dict) -> List[Dict]: |
| """Process item for translation (placeholder)""" |
| |
| |
| return [] |
| |
| def _process_generic(self, item: Dict) -> List[Dict]: |
| """Generic processing for unknown templates""" |
| content = item['content'] |
| |
| |
| paragraphs = [p.strip() for p in content.split('\n\n') if len(p.strip()) > 50] |
| |
| results = [] |
| for paragraph in paragraphs[:10]: |
| results.append({ |
| 'text': paragraph, |
| 'source_url': item['url'] |
| }) |
| |
| return results |
| |
| def export_dataset(self, format_type: str) -> Tuple[str, str]: |
| """Export processed dataset""" |
| if not self.processed_data: |
| return "β No processed data available", "" |
| |
| try: |
| if format_type == "JSON": |
| data = json.dumps(self.processed_data, indent=2) |
| filename = f"dataset_{int(time.time())}.json" |
| |
| elif format_type == "CSV": |
| df = pd.DataFrame(self.processed_data) |
| data = df.to_csv(index=False) |
| filename = f"dataset_{int(time.time())}.csv" |
| |
| elif format_type == "HuggingFace Dataset": |
| |
| hf_data = { |
| "data": self.processed_data, |
| "info": { |
| "description": "AI Dataset Studio generated dataset", |
| "created_at": datetime.now().isoformat(), |
| "size": len(self.processed_data) |
| } |
| } |
| data = json.dumps(hf_data, indent=2) |
| filename = f"hf_dataset_{int(time.time())}.json" |
| |
| elif format_type == "JSONL": |
| lines = [json.dumps(item) for item in self.processed_data] |
| data = '\n'.join(lines) |
| filename = f"dataset_{int(time.time())}.jsonl" |
| |
| else: |
| return "β Unsupported format", "" |
| |
| |
| temp_path = f"/tmp/{filename}" |
| with open(temp_path, 'w', encoding='utf-8') as f: |
| f.write(data) |
| |
| status = f"β
Dataset exported successfully!\n" |
| status += f"π Records: {len(self.processed_data)}\n" |
| status += f"π Format: {format_type}\n" |
| status += f"π Size: {len(data):,} characters" |
| |
| return status, temp_path |
| |
| except Exception as e: |
| logger.error(f"Export failed: {e}") |
| return f"β Export failed: {str(e)}", "" |
| |
| |
| def _is_valid_url(self, url: str) -> bool: |
| """Validate URL format""" |
| try: |
| result = urlparse(url) |
| return all([result.scheme, result.netloc]) |
        except ValueError:
| return False |
| |
| def _extract_title(self, soup: BeautifulSoup) -> str: |
| """Extract title from HTML""" |
| title_tag = soup.find('title') |
| if title_tag: |
| return title_tag.get_text().strip() |
| |
| h1_tag = soup.find('h1') |
| if h1_tag: |
| return h1_tag.get_text().strip() |
| |
| return "Untitled" |
| |
| def _extract_content(self, soup: BeautifulSoup) -> str: |
| """Extract main content from HTML""" |
| |
| for script in soup(["script", "style", "nav", "footer", "header"]): |
| script.decompose() |
| |
| |
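        # Prefer semantic containers (main/article/content-like divs); fall back to full-page text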
| main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|article')) |
| |
| if main_content: |
| text = main_content.get_text() |
| else: |
| text = soup.get_text() |
| |
| |
| lines = (line.strip() for line in text.splitlines()) |
| chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) |
| text = ' '.join(chunk for chunk in chunks if chunk) |
| |
| return text |
| |
| def _keyword_sentiment(self, text: str) -> str: |
| """Simple keyword-based sentiment analysis""" |
| positive_words = ['good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic', 'love', 'like'] |
| negative_words = ['bad', 'terrible', 'awful', 'hate', 'dislike', 'horrible', 'worst'] |
| |
| text_lower = text.lower() |
| |
| pos_count = sum(1 for word in positive_words if word in text_lower) |
| neg_count = sum(1 for word in negative_words if word in text_lower) |
| |
| if pos_count > neg_count: |
| return 'positive' |
| elif neg_count > pos_count: |
| return 'negative' |
| else: |
| return 'neutral' |
| |
| def _extract_category_from_url(self, url: str) -> str: |
| """Extract category based on URL domain/path""" |
| domain = urlparse(url).netloc.lower() |
| |
| if any(news in domain for news in ['cnn', 'bbc', 'reuters', 'news']): |
| return 'news' |
| elif any(tech in domain for tech in ['techcrunch', 'wired', 'tech']): |
| return 'technology' |
| elif any(biz in domain for biz in ['bloomberg', 'forbes', 'business']): |
| return 'business' |
| elif any(sport in domain for sport in ['espn', 'sport']): |
| return 'sports' |
| else: |
| return 'general' |
| |
| def _simple_ner(self, text: str) -> List[Dict]: |
| """Simple pattern-based NER""" |
| entities = [] |
| |
| |
| cap_words = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text) |
| |
| for word in cap_words: |
| if len(word) > 2: |
| entities.append({ |
| 'text': word, |
| 'label': 'MISC', |
| 'confidence': 0.5 |
| }) |
| |
| return entities[:5] |
| |
| def _extractive_summary(self, text: str) -> str: |
| """Simple extractive summarization""" |
| sentences = text.split('. ') |
| |
| if len(sentences) <= 2: |
| return text |
| |
| |
| summary = f"{sentences[0]}. {sentences[-1]}" |
| |
| return summary |
|
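# Minimal programmatic usage sketch (bypasses the UI; mirrors what the Gradio
# events below wire up). The URL and template key here are placeholders:
#
#   studio = DatasetStudio()
#   print(studio.create_project("Demo", "sentiment_analysis", "Product review sentiment"))
#   status, preview = studio.scrape_urls("https://example.com/reviews")
#   status, preview = studio.process_data("sentiment_analysis")
#   status, file_path = studio.export_dataset("JSONL")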
|
| def create_modern_interface(): |
| """Create the modern Gradio interface""" |
| logger.info("π¨ Creating modern interface...") |
| |
| |
| studio = DatasetStudio() |
| |
| |
| custom_css = """ |
| .gradio-container { |
| font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; |
| } |
| |
| .main-header { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; |
| padding: 2rem; |
| border-radius: 10px; |
| margin-bottom: 2rem; |
| text-align: center; |
| } |
| |
| .step-header { |
| background: linear-gradient(90deg, #4facfe 0%, #00f2fe 100%); |
| color: white; |
| padding: 1rem; |
| border-radius: 8px; |
| margin: 1rem 0; |
| font-weight: bold; |
| } |
| |
| .template-card { |
| border: 2px solid #e1e5e9; |
| border-radius: 10px; |
| padding: 1rem; |
| margin: 0.5rem; |
| transition: all 0.3s ease; |
| } |
| |
| .template-card:hover { |
| border-color: #4facfe; |
| box-shadow: 0 4px 12px rgba(79, 172, 254, 0.3); |
| } |
| |
| .status-success { |
| background-color: #d4edda; |
| border-color: #c3e6cb; |
| color: #155724; |
| padding: 1rem; |
| border-radius: 5px; |
| border-left: 4px solid #28a745; |
| } |
| |
| .status-error { |
| background-color: #f8d7da; |
| border-color: #f5c6cb; |
| color: #721c24; |
| padding: 1rem; |
| border-radius: 5px; |
| border-left: 4px solid #dc3545; |
| } |
| """ |
| |
    with gr.Blocks(css=custom_css, title="🚀 AI Dataset Studio", theme=gr.themes.Soft()) as interface:
| |
| gr.HTML(""" |
| <div class="main-header"> |
            <h1>🚀 AI Dataset Studio</h1>
            <p>Create high-quality training datasets with AI-powered source discovery</p>
            <p><strong>🧠 Powered by Perplexity AI • 🤖 Advanced NLP • 📊 Professional Export</strong></p>
| </div> |
| """) |
| |
| with gr.Tabs() as tabs: |
| |
| with gr.TabItem("1οΈβ£ Project Setup", id=0): |
| gr.HTML('<div class="step-header">π Step 1: Create Your Dataset Project</div>') |
| |
| with gr.Row(): |
| with gr.Column(scale=2): |
| project_name = gr.Textbox( |
| label="π·οΈ Project Name", |
| placeholder="e.g., Customer Review Sentiment Analysis", |
| info="Give your dataset project a descriptive name" |
| ) |
| |
| project_description = gr.Textbox( |
| label="π Project Description", |
| lines=3, |
| placeholder="Describe what kind of dataset you want to create...", |
| info="This will be used by AI to discover relevant sources" |
| ) |
| |
| with gr.Column(scale=1): |
| |
| template_choices = list(DATASET_TEMPLATES.keys()) |
| template_labels = [DATASET_TEMPLATES[t]["name"] for t in template_choices] |
| |
                        # Dropdown value must be one of the choice *values*, not a (label, value) tuple
                        template_selector = gr.Dropdown(
                            choices=list(zip(template_labels, template_choices)),
                            label="📊 Dataset Template",
                            value=template_choices[0],
                            info="Choose the type of ML task"
                        )
| |
| |
| template_info = gr.Markdown("Select a template to see details") |
| |
| create_project_btn = gr.Button("π― Create Project", variant="primary", size="lg") |
| project_status = gr.Textbox(label="π Project Status", interactive=False) |
| |
| |
                # Gradio passes the dropdown's selected *value* (the template key string)
                def update_template_info(template_key):
                    template = DATASET_TEMPLATES.get(template_key)
                    if not template:
                        return "Select a template to see details"
                    info = f"**{template.get('name', '')}**\n\n"
                    info += f"📝 {template.get('description', '')}\n\n"
                    info += f"🏷️ **Fields:** {', '.join(template.get('fields', []))}\n\n"
                    info += f"💡 **Example:** `{template.get('example', {})}`"
                    return info
| |
| template_selector.change( |
| fn=update_template_info, |
| inputs=[template_selector], |
| outputs=[template_info] |
| ) |
| |
| |
| with gr.TabItem("2οΈβ£ AI Source Discovery", id=1): |
| gr.HTML('<div class="step-header">π§ Step 2: Discover Sources with Perplexity AI</div>') |
| |
| if HAS_PERPLEXITY: |
| gr.Markdown(""" |
                    ✨ **AI-Powered Source Discovery** - Let Perplexity AI find the best sources for your dataset!
| |
| Just describe your project and AI will discover relevant, high-quality sources automatically. |
| """) |
| |
| with gr.Row(): |
| with gr.Column(): |
| ai_search_description = gr.Textbox( |
| label="π― Project Description for AI Search", |
| lines=3, |
| placeholder="e.g., I need product reviews for sentiment analysis training data...", |
| info="Describe what sources you need - be specific!" |
| ) |
| |
| with gr.Row(): |
| search_type = gr.Dropdown( |
| choices=["general", "academic", "news", "technical"], |
| value="general", |
| label="π Search Type" |
| ) |
| |
| max_sources = gr.Slider( |
| minimum=5, |
| maximum=50, |
| value=20, |
| step=5, |
| label="π Max Sources" |
| ) |
| |
| with gr.Row(): |
| include_academic = gr.Checkbox(label="π Include Academic Sources", value=True) |
| include_news = gr.Checkbox(label="π° Include News Sources", value=True) |
| |
| discover_btn = gr.Button("π§ Discover Sources with AI", variant="primary", size="lg") |
| |
| ai_search_status = gr.Textbox(label="π Discovery Status", interactive=False) |
| discovered_sources = gr.Code(label="π Discovered Sources", language="json", interactive=False) |
| |
| |
| use_ai_sources_btn = gr.Button("β
Use These Sources", variant="secondary") |
| |
| else: |
| gr.Markdown(""" |
                    ⚠️ **Perplexity AI Not Available**
| |
| To enable AI-powered source discovery, set your `PERPLEXITY_API_KEY` environment variable. |
| For now, you can manually enter URLs below. |
| """) |
| |
| discovered_sources = gr.Code(value="[]", visible=False) |
| |
| gr.HTML('<div class="step-header">π Manual URL Entry</div>') |
| |
| urls_input = gr.Textbox( |
| label="π URLs to Scrape", |
| lines=10, |
| placeholder="https://example.com/article1\nhttps://example.com/article2\n...", |
| info="Enter one URL per line" |
| ) |
| |
| scrape_btn = gr.Button("π·οΈ Start Scraping", variant="primary", size="lg") |
| scrape_status = gr.Textbox(label="π Scraping Status", interactive=False) |
| scraped_preview = gr.Code(label="π Scraped Data Preview", language="json", interactive=False) |
| |
| |
| with gr.TabItem("3οΈβ£ Data Processing", id=2): |
| gr.HTML('<div class="step-header">βοΈ Step 3: Process Data with AI</div>') |
| |
                # Same fix as above: the default value is a template key, not a tuple
                processing_template = gr.Dropdown(
                    choices=list(zip(template_labels, template_choices)),
                    label="📊 Processing Template",
                    value=template_choices[0],
                    info="How should the data be processed?"
                )
| |
| process_btn = gr.Button("βοΈ Process Data", variant="primary", size="lg") |
| process_status = gr.Textbox(label="π Processing Status", interactive=False) |
| processed_preview = gr.Code(label="π― Processed Data Preview", language="json", interactive=False) |
| |
| |
| with gr.TabItem("4οΈβ£ Export Dataset", id=3): |
| gr.HTML('<div class="step-header">π¦ Step 4: Export Your Dataset</div>') |
| |
| export_format = gr.Dropdown( |
| choices=["JSON", "CSV", "HuggingFace Dataset", "JSONL"], |
| value="JSON", |
| label="π Export Format", |
| info="Choose format for your dataset" |
| ) |
| |
| export_btn = gr.Button("π¦ Export Dataset", variant="primary", size="lg") |
| export_status = gr.Textbox(label="π Export Status", interactive=False) |
| download_file = gr.File(label="πΎ Download Dataset", interactive=False) |
| |
| |
        # The dropdown event delivers the template key directly (see update_template_info)
        create_project_btn.click(
            fn=lambda name, desc, template: studio.create_project(name, template or "", desc),
            inputs=[project_name, project_description, template_selector],
            outputs=[project_status]
        )
| |
| if HAS_PERPLEXITY: |
| discover_btn.click( |
| fn=studio.discover_sources_with_ai, |
| inputs=[ai_search_description, max_sources, search_type, include_academic, include_news], |
| outputs=[ai_search_status, discovered_sources] |
| ) |
| |
| use_ai_sources_btn.click( |
| fn=lambda sources_json: '\n'.join(studio.extract_urls_from_sources(sources_json)), |
| inputs=[discovered_sources], |
| outputs=[urls_input] |
| ) |
| |
| scrape_btn.click( |
| fn=studio.scrape_urls, |
| inputs=[urls_input], |
| outputs=[scrape_status, scraped_preview] |
| ) |
| |
        process_btn.click(
            fn=lambda template: studio.process_data(template or ""),
            inputs=[processing_template],
            outputs=[process_status, processed_preview]
        )
| |
| export_btn.click( |
| fn=studio.export_dataset, |
| inputs=[export_format], |
| outputs=[export_status, download_file] |
| ) |
| |
| logger.info("β
Interface created successfully") |
| return interface |
|
|
| |
| try: |
| logger.info("π Starting AI Dataset Studio...") |
| logger.info("π Features: β
AI Models | β
Advanced NLP | β
HuggingFace Integration") |
| |
| interface = create_modern_interface() |
| |
| logger.info("β
Application startup successful") |
| |
| if __name__ == "__main__": |
| interface.launch( |
| server_name="0.0.0.0", |
| server_port=7860, |
| share=False, |
| show_error=True |
| ) |
|
|
| except Exception as e: |
| logger.error(f"β Failed to launch application: {e}") |
| logger.error(f"Traceback: {traceback.format_exc()}") |
| sys.exit(1) |