| """ |
| AI Dataset Studio - Minimal Version |
| Guaranteed to work with basic dependencies only |
| """ |
|
|
| import gradio as gr |
| import pandas as pd |
| import json |
| import re |
| import requests |
| from bs4 import BeautifulSoup |
| from urllib.parse import urlparse |
| from datetime import datetime |
| import logging |
| from typing import Dict, List, Tuple, Optional, Any |
| from dataclasses import dataclass, asdict |
| import uuid |
| import time |
|
|
| |
| logging.basicConfig(level=logging.INFO) |
| logger = logging.getLogger(__name__) |
|
|
| @dataclass |
| class SimpleScrapedItem: |
| """Simplified scraped content structure""" |
| id: str |
| url: str |
| title: str |
| content: str |
| word_count: int |
| scraped_at: str |
| quality_score: float = 0.0 |
|
|
| class SimpleWebScraper: |
| """Simplified web scraper with basic functionality""" |
| |
| def __init__(self): |
| self.session = requests.Session() |
| self.session.headers.update({ |
| 'User-Agent': 'Mozilla/5.0 (compatible; AI-DatasetStudio/1.0)', |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' |
| }) |
| |
| def scrape_url(self, url: str) -> Optional[SimpleScrapedItem]: |
| """Scrape a single URL""" |
| try: |
| if not self._validate_url(url): |
| return None |
| |
| response = self.session.get(url, timeout=10) |
| response.raise_for_status() |
| |
| soup = BeautifulSoup(response.content, 'html.parser') |
| |
| |
            # Extract the page title
            title_tag = soup.find('title')
            title = title_tag.get_text().strip() if title_tag else "Untitled"
| |
| |
| |
            # Drop non-content elements before extracting text
            for element in soup(['script', 'style', 'nav', 'header', 'footer']):
                element.decompose()
| |
| |
            # Prefer semantic containers for the main content, falling back to <body>
            content_element = (soup.find('article') or
                               soup.find('main') or
                               soup.find(class_='content') or
                               soup.find('body'))
| |
| if content_element: |
| content = content_element.get_text(separator=' ', strip=True) |
| else: |
| content = soup.get_text(separator=' ', strip=True) |
| |
| |
            # Collapse runs of whitespace
            content = re.sub(r'\s+', ' ', content).strip()
| |
| |
            # Crude quality score: scales with length and saturates at 100 words
            word_count = len(content.split())
            quality_score = min(1.0, word_count / 100) if word_count > 0 else 0.0
| |
| return SimpleScrapedItem( |
| id=str(uuid.uuid4()), |
| url=url, |
| title=title, |
| content=content, |
| word_count=word_count, |
| scraped_at=datetime.now().isoformat(), |
| quality_score=quality_score |
| ) |
| |
| except Exception as e: |
| logger.error(f"Failed to scrape {url}: {e}") |
| return None |
| |
| def _validate_url(self, url: str) -> bool: |
| """Basic URL validation""" |
| try: |
| parsed = urlparse(url) |
            return parsed.scheme in ('http', 'https') and bool(parsed.netloc)
        except ValueError:
            return False
| |
| def batch_scrape(self, urls: List[str], progress_callback=None) -> List[SimpleScrapedItem]: |
| """Scrape multiple URLs""" |
| results = [] |
| total = len(urls) |
| |
| for i, url in enumerate(urls): |
| if progress_callback: |
| progress_callback((i + 1) / total, f"Scraping {i+1}/{total}") |
| |
| item = self.scrape_url(url) |
| if item: |
| results.append(item) |
| |
            # Be polite: pause between requests
            time.sleep(1)
| |
| return results |
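

# Example usage (an illustrative sketch; the URL below is a placeholder):
#
#     scraper = SimpleWebScraper()
#     item = scraper.scrape_url("https://example.com/article")
#     if item:
#         print(item.title, item.word_count, item.quality_score)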
|
|
| class SimpleDataProcessor: |
| """Basic data processing""" |
| |
| def process_items(self, items: List[SimpleScrapedItem], options: Dict[str, bool]) -> List[SimpleScrapedItem]: |
| """Process scraped items""" |
| processed = [] |
| |
| for item in items: |
| |
            # Drop items that fall below the quality threshold
            if options.get('quality_filter', True) and item.quality_score < 0.3:
                continue
| |
| |
            # Optionally clean the text content in place
            if options.get('clean_text', True):
                item.content = self._clean_text(item.content)
| |
| processed.append(item) |
| |
| return processed |
| |
| def _clean_text(self, text: str) -> str: |
| """Basic text cleaning""" |
| |
        # Strip URLs
        text = re.sub(r'http\S+', '', text)
| |
        # Collapse whitespace
        text = re.sub(r'\s+', ' ', text)
| |
        # Remove common boilerplate phrases
        text = re.sub(r'(Click here|Read more|Subscribe|Advertisement)', '', text, flags=re.IGNORECASE)
| return text.strip() |
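

# Example usage (an illustrative sketch; `items` is a list of SimpleScrapedItem):
#
#     processor = SimpleDataProcessor()
#     kept = processor.process_items(items, {'clean_text': True, 'quality_filter': True})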
|
|
| class SimpleExporter: |
| """Basic export functionality""" |
| |
| def export_dataset(self, items: List[SimpleScrapedItem], format_type: str) -> str: |
| """Export dataset""" |
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| |
| if format_type == "json": |
| filename = f"dataset_{timestamp}.json" |
| data = [asdict(item) for item in items] |
| with open(filename, 'w', encoding='utf-8') as f: |
| json.dump(data, f, indent=2, ensure_ascii=False) |
| return filename |
| |
| elif format_type == "csv": |
| filename = f"dataset_{timestamp}.csv" |
| data = [asdict(item) for item in items] |
| df = pd.DataFrame(data) |
| df.to_csv(filename, index=False) |
| return filename |
| |
| else: |
| raise ValueError(f"Unsupported format: {format_type}") |
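

# Example usage (an illustrative sketch; writes dataset_<timestamp>.json to
# the current working directory):
#
#     exporter = SimpleExporter()
#     path = exporter.export_dataset(items, "json")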
|
|
| class SimpleDatasetStudio: |
| """Simplified main application""" |
| |
| def __init__(self): |
| self.scraper = SimpleWebScraper() |
| self.processor = SimpleDataProcessor() |
| self.exporter = SimpleExporter() |
| |
| self.scraped_items = [] |
| self.processed_items = [] |
| self.current_project = None |
| |
| def create_project(self, name: str) -> Dict[str, Any]: |
| """Create a new project""" |
| self.current_project = { |
| 'name': name, |
| 'id': str(uuid.uuid4()), |
| 'created_at': datetime.now().isoformat() |
| } |
| self.scraped_items = [] |
| self.processed_items = [] |
| return self.current_project |
| |
| def scrape_urls(self, urls: List[str], progress_callback=None) -> Tuple[int, List[str]]: |
| """Scrape URLs""" |
| url_list = [url.strip() for url in urls if url.strip()] |
| if not url_list: |
| return 0, ["No valid URLs provided"] |
| |
| self.scraped_items = self.scraper.batch_scrape(url_list, progress_callback) |
| success_count = len(self.scraped_items) |
| failed_count = len(url_list) - success_count |
| |
| errors = [] |
| if failed_count > 0: |
| errors.append(f"{failed_count} URLs failed") |
| |
| return success_count, errors |
| |
| def process_data(self, options: Dict[str, bool]) -> int: |
| """Process scraped data""" |
| if not self.scraped_items: |
| return 0 |
| |
| self.processed_items = self.processor.process_items(self.scraped_items, options) |
| return len(self.processed_items) |
| |
| def get_preview(self) -> List[Dict[str, Any]]: |
| """Get data preview""" |
| items = self.processed_items or self.scraped_items |
| preview = [] |
| |
| for item in items[:5]: |
| preview.append({ |
| 'Title': item.title[:50] + "..." if len(item.title) > 50 else item.title, |
| 'Content Preview': item.content[:100] + "..." if len(item.content) > 100 else item.content, |
| 'Word Count': item.word_count, |
| 'Quality Score': round(item.quality_score, 2), |
| 'URL': item.url[:50] + "..." if len(item.url) > 50 else item.url |
| }) |
| |
| return preview |
| |
| def get_stats(self) -> Dict[str, Any]: |
| """Get dataset statistics""" |
| items = self.processed_items or self.scraped_items |
| if not items: |
| return {} |
| |
| word_counts = [item.word_count for item in items] |
| quality_scores = [item.quality_score for item in items] |
| |
| return { |
| 'total_items': len(items), |
| 'avg_word_count': round(sum(word_counts) / len(word_counts)), |
| 'avg_quality': round(sum(quality_scores) / len(quality_scores), 2), |
| 'min_words': min(word_counts), |
| 'max_words': max(word_counts) |
| } |
| |
| def export_data(self, format_type: str) -> str: |
| """Export dataset""" |
| items = self.processed_items or self.scraped_items |
| if not items: |
| raise ValueError("No data to export") |
| |
| return self.exporter.export_dataset(items, format_type) |
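

# End-to-end usage without the Gradio UI (an illustrative sketch; the URL is
# a placeholder):
#
#     studio = SimpleDatasetStudio()
#     studio.create_project("Demo")
#     studio.scrape_urls(["https://example.com/article"])
#     studio.process_data({'clean_text': True, 'quality_filter': True})
#     print(studio.get_stats())
#     path = studio.export_data("json")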
|
|
| def create_simple_interface(): |
| """Create simplified Gradio interface""" |
| |
| studio = SimpleDatasetStudio() |
| |
| |
| css = """ |
| .container { max-width: 1200px; margin: auto; } |
| .header { |
| background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| color: white; padding: 2rem; border-radius: 10px; |
| text-align: center; margin-bottom: 2rem; |
| } |
| .step-box { |
| background: #f8f9ff; border: 1px solid #e1e5ff; |
| border-radius: 8px; padding: 1.5rem; margin: 1rem 0; |
| } |
| """ |
| |
| with gr.Blocks(css=css, title="AI Dataset Studio - Simple") as interface: |
| |
| |
| gr.HTML(""" |
| <div class="header"> |
            <h1>🚀 AI Dataset Studio - Simple Version</h1>
| <p>Create datasets from web content - No complex setup required!</p> |
| </div> |
| """) |
| |
| |
        # Project metadata shared across tabs
        project_state = gr.State({})
| |
| with gr.Tabs(): |
| |
| |
| with gr.Tab("π Project Setup"): |
| gr.HTML('<div class="step-box"><h3>Step 1: Create Your Project</h3></div>') |
| |
| project_name = gr.Textbox( |
| label="Project Name", |
| placeholder="e.g., News Articles Dataset", |
| value="My Dataset" |
| ) |
| |
| create_btn = gr.Button("Create Project", variant="primary") |
| project_status = gr.Markdown("") |
| |
| def create_project_handler(name): |
| if not name.strip(): |
| return "β Please enter a project name", {} |
| |
| project = studio.create_project(name.strip()) |
| status = f""" |
| β
**Project Created!** |
| |
| **Name:** {project['name']} |
| **ID:** {project['id'][:8]}... |
| **Created:** {project['created_at'][:19]} |
| |
| π Next: Go to Data Collection tab |
| """ |
| return status, project |
| |
| create_btn.click( |
| fn=create_project_handler, |
| inputs=[project_name], |
| outputs=[project_status, project_state] |
| ) |
| |
| |
| with gr.Tab("π·οΈ Data Collection"): |
| gr.HTML('<div class="step-box"><h3>Step 2: Scrape Web Content</h3></div>') |
| |
| urls_input = gr.Textbox( |
| label="URLs to Scrape (one per line)", |
| placeholder="https://example.com/article1\nhttps://example.com/article2", |
| lines=6 |
| ) |
| |
| scrape_btn = gr.Button("Start Scraping", variant="primary") |
| scrape_status = gr.Markdown("") |
| |
| def scrape_handler(urls_text, project, progress=gr.Progress()): |
| if not project: |
| return "β Create a project first" |
| |
| urls = [url.strip() for url in urls_text.split('\n') if url.strip()] |
| if not urls: |
| return "β No URLs provided" |
| |
| def progress_callback(pct, msg): |
| progress(pct, desc=msg) |
| |
| success_count, errors = studio.scrape_urls(urls, progress_callback) |
| |
| if success_count > 0: |
| return f""" |
| β
**Scraping Complete!** |
| |
| **Success:** {success_count} URLs |
| **Failed:** {len(urls) - success_count} URLs |
| |
| π Next: Go to Data Processing tab |
| """ |
| else: |
| return f"β Scraping failed: {', '.join(errors)}" |
| |
| scrape_btn.click( |
| fn=scrape_handler, |
| inputs=[urls_input, project_state], |
| outputs=[scrape_status] |
| ) |
| |
| |
| with gr.Tab("βοΈ Data Processing"): |
| gr.HTML('<div class="step-box"><h3>Step 3: Clean and Process Data</h3></div>') |
| |
| with gr.Row(): |
| clean_text = gr.Checkbox(label="Clean Text", value=True) |
| quality_filter = gr.Checkbox(label="Quality Filter", value=True) |
| |
| process_btn = gr.Button("Process Data", variant="primary") |
| process_status = gr.Markdown("") |
| |
| def process_handler(clean, quality, project): |
| if not project: |
| return "β Create a project first" |
| |
| options = { |
| 'clean_text': clean, |
| 'quality_filter': quality |
| } |
| |
| processed_count = studio.process_data(options) |
| |
| if processed_count > 0: |
| return f""" |
| β
**Processing Complete!** |
| |
| **Processed:** {processed_count} items |
| |
| π Next: Check Data Preview tab |
| """ |
| else: |
| return "β No items passed processing filters" |
| |
| process_btn.click( |
| fn=process_handler, |
| inputs=[clean_text, quality_filter, project_state], |
| outputs=[process_status] |
| ) |
| |
| |
| with gr.Tab("π Data Preview"): |
| gr.HTML('<div class="step-box"><h3>Step 4: Review Your Dataset</h3></div>') |
| |
| refresh_btn = gr.Button("Refresh Preview") |
| preview_table = gr.DataFrame(label="Dataset Preview") |
| stats_display = gr.JSON(label="Statistics") |
| |
| def refresh_handler(project): |
| if not project: |
| return None, {} |
| |
                    preview = studio.get_preview()
                    stats = studio.get_stats()
                    # Wrap the preview rows in a DataFrame so the component renders them reliably
                    return pd.DataFrame(preview), stats
| |
| refresh_btn.click( |
| fn=refresh_handler, |
| inputs=[project_state], |
| outputs=[preview_table, stats_display] |
| ) |
| |
| |
| with gr.Tab("π€ Export Dataset"): |
| gr.HTML('<div class="step-box"><h3>Step 5: Export Your Dataset</h3></div>') |
| |
| export_format = gr.Radio( |
| choices=["JSON", "CSV"], |
| label="Export Format", |
| value="JSON" |
| ) |
| |
| export_btn = gr.Button("Export Dataset", variant="primary") |
| export_status = gr.Markdown("") |
| export_file = gr.File(label="Download", visible=False) |
| |
| def export_handler(format_type, project): |
| if not project: |
| return "β Create a project first", None |
| |
| try: |
| filename = studio.export_data(format_type.lower()) |
| return f"β
Export successful! File: {filename}", filename |
| except Exception as e: |
| return f"β Export failed: {str(e)}", None |
| |
| export_btn.click( |
| fn=export_handler, |
| inputs=[export_format, project_state], |
| outputs=[export_status, export_file] |
| ) |
| |
| |
| with gr.Accordion("π Quick Guide", open=False): |
| gr.Markdown(""" |
| ## How to Use |
| |
| 1. **Create Project** - Give your dataset a name |
| 2. **Add URLs** - Paste URLs of web pages to scrape |
| 3. **Process Data** - Clean and filter the content |
| 4. **Review** - Check the quality of your dataset |
| 5. **Export** - Download in JSON or CSV format |
| |
| ## Features |
            - ✅ Smart content extraction
            - ✅ Quality filtering
            - ✅ Text cleaning
            - ✅ JSON/CSV export
            - ✅ Preview and statistics
| |
| ## Tips |
| - Use high-quality source URLs |
| - Enable quality filtering for better results |
| - Review your data before exporting |
| - Start with 5-10 URLs to test |
| """) |
| |
| return interface |
|
|
| |
| if __name__ == "__main__": |
| logger.info("π Starting AI Dataset Studio (Simple Version)") |
| |
| try: |
| interface = create_simple_interface() |
| logger.info("β
Simple interface created successfully") |
| |
| interface.launch( |
| server_name="0.0.0.0", |
| server_port=7860, |
| share=False, |
| show_error=True |
| ) |
| |
| except Exception as e: |
| logger.error(f"β Failed to launch: {e}") |
| print("\nπ‘ If you see import errors, try installing:") |
| print("pip install gradio pandas requests beautifulsoup4") |
| raise |