| """ |
| π§ͺ Testing utilities for Perplexity AI integration |
| Run comprehensive tests to validate your AI Dataset Studio deployment |
| """ |
|
|
| import os |
| import json |
| import time |
| import logging |
| from typing import Dict, List, Tuple, Optional |
| from datetime import datetime |
|
|
| |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
| logger = logging.getLogger(__name__) |
|
|
| def test_environment_setup() -> Dict[str, bool]: |
| """ |
| π Test environment setup and dependencies |
| |
| Returns: |
| Dict with test results for each component |
| """ |
| results = {} |
| |
| print("π Testing Environment Setup...") |
| print("=" * 50) |
| |
| |
| try: |
| import sys |
| python_version = sys.version_info |
| if python_version >= (3, 8): |
| print(f"β
Python version: {python_version.major}.{python_version.minor}") |
| results['python_version'] = True |
| else: |
| print(f"β Python version too old: {python_version.major}.{python_version.minor} (need 3.8+)") |
| results['python_version'] = False |
| except Exception as e: |
| print(f"β Python version check failed: {e}") |
| results['python_version'] = False |
| |
| |
| required_packages = [ |
| ('gradio', 'Gradio'), |
| ('requests', 'Requests'), |
| ('pandas', 'Pandas'), |
| ('beautifulsoup4', 'BeautifulSoup'), |
| ('transformers', 'Transformers'), |
| ('torch', 'PyTorch'), |
| ('nltk', 'NLTK') |
| ] |
| |
| for package, name in required_packages: |
| try: |
| __import__(package) |
| print(f"β
{name} imported successfully") |
| results[f'package_{package}'] = True |
| except ImportError: |
| print(f"β οΈ {name} not available (optional for some features)") |
| results[f'package_{package}'] = False |
| |
| |
| env_vars = ['PERPLEXITY_API_KEY', 'HF_TOKEN'] |
| for var in env_vars: |
| if os.getenv(var): |
| print(f"β
{var} is set") |
| results[f'env_{var.lower()}'] = True |
| else: |
| status = "β" if var == 'PERPLEXITY_API_KEY' else "β οΈ" |
| required = "required" if var == 'PERPLEXITY_API_KEY' else "optional" |
| print(f"{status} {var} not set ({required})") |
| results[f'env_{var.lower()}'] = bool(os.getenv(var)) |
| |
| |
| required_files = ['app.py', 'perplexity_client.py', 'config.py', 'requirements.txt'] |
| for file in required_files: |
| if os.path.exists(file): |
| print(f"β
{file} found") |
| results[f'file_{file}'] = True |
| else: |
| print(f"β {file} missing") |
| results[f'file_{file}'] = False |
| |
| print("\n" + "=" * 50) |
| return results |
|
|
| def test_perplexity_api() -> Dict[str, any]: |
| """ |
| π§ Test Perplexity API connectivity and functionality |
| |
| Returns: |
| Dict with API test results |
| """ |
| results = { |
| 'api_key_valid': False, |
| 'connection_successful': False, |
| 'response_quality': False, |
| 'rate_limiting': False, |
| 'error_handling': False |
| } |
| |
| print("π§ Testing Perplexity API...") |
| print("=" * 50) |
| |
| try: |
| from perplexity_client import PerplexityClient, SearchType |
| |
| |
| client = PerplexityClient() |
| if client._validate_api_key(): |
| print("β
API key is valid") |
| results['api_key_valid'] = True |
| else: |
| print("β API key validation failed") |
| return results |
| |
| |
| try: |
| test_results = client.discover_sources( |
| project_description="Test query for API connectivity", |
| search_type=SearchType.GENERAL, |
| max_sources=5 |
| ) |
| |
| if test_results.sources or test_results.perplexity_response: |
| print("β
API connection successful") |
| results['connection_successful'] = True |
| else: |
| print("β οΈ API connected but no results returned") |
| results['connection_successful'] = True |
| |
| except Exception as e: |
| print(f"β API connection failed: {e}") |
| return results |
| |
| |
| try: |
| quality_test = client.discover_sources( |
| project_description="Find product reviews for sentiment analysis machine learning training", |
| search_type=SearchType.GENERAL, |
| max_sources=10 |
| ) |
| |
| if len(quality_test.sources) >= 3: |
| avg_score = sum(s.relevance_score for s in quality_test.sources) / len(quality_test.sources) |
| if avg_score >= 5.0: |
| print(f"β
Response quality good (avg score: {avg_score:.1f})") |
| results['response_quality'] = True |
| else: |
| print(f"β οΈ Response quality moderate (avg score: {avg_score:.1f})") |
| results['response_quality'] = True |
| else: |
| print("β οΈ Limited response quality (few sources found)") |
| |
| except Exception as e: |
| print(f"β οΈ Response quality test failed: {e}") |
| |
| |
| try: |
| start_time = time.time() |
| |
| |
| client.discover_sources("Test query 1", max_sources=3) |
| time.sleep(0.1) |
| client.discover_sources("Test query 2", max_sources=3) |
| |
| elapsed = time.time() - start_time |
| if elapsed >= 1.0: |
| print("β
Rate limiting is working") |
| results['rate_limiting'] = True |
| else: |
| print("β οΈ Rate limiting may not be active") |
| |
| except Exception as e: |
| print(f"β οΈ Rate limiting test inconclusive: {e}") |
| |
| |
| try: |
| |
| error_test = client.discover_sources("", max_sources=1) |
| print("β
Error handling works (handled empty query)") |
| results['error_handling'] = True |
| |
| except Exception as e: |
| print(f"β
Error handling works (caught exception: {type(e).__name__})") |
| results['error_handling'] = True |
| |
| except ImportError: |
| print("β Cannot import perplexity_client module") |
| except Exception as e: |
| print(f"β Unexpected error in Perplexity tests: {e}") |
| |
| print("\n" + "=" * 50) |
| return results |
|
|
| def test_ai_models() -> Dict[str, bool]: |
| """ |
| π€ Test AI model loading and functionality |
| |
| Returns: |
| Dict with model test results |
| """ |
| results = {} |
| |
| print("π€ Testing AI Models...") |
| print("=" * 50) |
| |
| try: |
| from transformers import pipeline |
| import torch |
| |
| |
| gpu_available = torch.cuda.is_available() |
| print(f"π§ GPU available: {gpu_available}") |
| results['gpu_available'] = gpu_available |
| |
| |
| try: |
| sentiment_analyzer = pipeline( |
| "sentiment-analysis", |
| model="cardiffnlp/twitter-roberta-base-sentiment-latest", |
| return_all_scores=True |
| ) |
| |
| test_text = "This is a great product!" |
| result = sentiment_analyzer(test_text) |
| |
| if result and len(result[0]) > 0: |
| print("β
Sentiment analysis model loaded and working") |
| results['sentiment_model'] = True |
| else: |
| print("β Sentiment analysis model not working properly") |
| results['sentiment_model'] = False |
| |
| except Exception as e: |
| print(f"β οΈ Sentiment analysis model failed: {e}") |
| results['sentiment_model'] = False |
| |
| |
| try: |
| summarizer = pipeline( |
| "summarization", |
| model="facebook/bart-large-cnn", |
| max_length=100, |
| min_length=30 |
| ) |
| |
| test_text = """ |
| Artificial intelligence has become increasingly important in modern technology. |
| Machine learning algorithms are being used across various industries to solve |
| complex problems and improve efficiency. Natural language processing, computer |
| vision, and robotics are some of the key areas where AI is making significant |
| contributions to society and business. |
| """ |
| |
| result = summarizer(test_text) |
| |
| if result and len(result[0]['summary_text']) > 10: |
| print("β
Summarization model loaded and working") |
| results['summarization_model'] = True |
| else: |
| print("β Summarization model not working properly") |
| results['summarization_model'] = False |
| |
| except Exception as e: |
| print(f"β οΈ Summarization model failed: {e}") |
| results['summarization_model'] = False |
| |
| |
| try: |
| ner_model = pipeline( |
| "ner", |
| model="dbmdz/bert-large-cased-finetuned-conll03-english", |
| aggregation_strategy="simple" |
| ) |
| |
| test_text = "Apple Inc. was founded by Steve Jobs in California." |
| result = ner_model(test_text) |
| |
| if result and len(result) > 0: |
| print("β
NER model loaded and working") |
| results['ner_model'] = True |
| else: |
| print("β NER model not working properly") |
| results['ner_model'] = False |
| |
| except Exception as e: |
| print(f"β οΈ NER model failed: {e}") |
| results['ner_model'] = False |
| |
| except ImportError: |
| print("β Transformers not available - AI models cannot be tested") |
| results = {'transformers_available': False} |
| |
| print("\n" + "=" * 50) |
| return results |
|
|
| def test_web_scraping() -> Dict[str, bool]: |
| """ |
| π·οΈ Test web scraping functionality |
| |
| Returns: |
| Dict with scraping test results |
| """ |
| results = {} |
| |
| print("π·οΈ Testing Web Scraping...") |
| print("=" * 50) |
| |
| try: |
| import requests |
| from bs4 import BeautifulSoup |
| |
| |
| test_urls = [ |
| "https://httpbin.org/html", |
| "https://example.com", |
| "https://httpbin.org/json" |
| ] |
| |
| successful_scrapes = 0 |
| |
| for url in test_urls: |
| try: |
| headers = { |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' |
| } |
| |
| response = requests.get(url, headers=headers, timeout=10) |
| |
| if response.status_code == 200: |
| |
| if 'html' in url: |
| soup = BeautifulSoup(response.content, 'html.parser') |
| text = soup.get_text() |
| if len(text) > 10: |
| successful_scrapes += 1 |
| print(f"β
Successfully scraped HTML from {url}") |
| else: |
| if len(response.text) > 10: |
| successful_scrapes += 1 |
| print(f"β
Successfully retrieved content from {url}") |
| else: |
| print(f"β οΈ HTTP {response.status_code} from {url}") |
| |
| except Exception as e: |
| print(f"β Failed to scrape {url}: {e}") |
| |
| if successful_scrapes >= 2: |
| print("β
Web scraping functionality working") |
| results['scraping_works'] = True |
| else: |
| print("β Web scraping has issues") |
| results['scraping_works'] = False |
| |
| results['successful_scrapes'] = successful_scrapes |
| results['total_tests'] = len(test_urls) |
| |
| except ImportError as e: |
| print(f"β Required packages not available: {e}") |
| results['scraping_works'] = False |
| |
| print("\n" + "=" * 50) |
| return results |
|
|
| def test_complete_workflow() -> Dict[str, any]: |
| """ |
| π Test complete dataset creation workflow |
| |
| Returns: |
| Dict with workflow test results |
| """ |
| results = { |
| 'project_creation': False, |
| 'source_discovery': False, |
| 'data_scraping': False, |
| 'data_processing': False, |
| 'data_export': False, |
| 'total_time': 0 |
| } |
| |
| print("π Testing Complete Workflow...") |
| print("=" * 50) |
| |
| start_time = time.time() |
| |
| try: |
| |
| from app import DatasetStudio |
| |
| |
| studio = DatasetStudio() |
| print("β
Dataset Studio initialized") |
| |
| |
| project_status = studio.create_project( |
| name="Test Project", |
| template="sentiment_analysis", |
| description="Test project for workflow validation" |
| ) |
| |
| if "β
" in project_status: |
| print("β
Project creation successful") |
| results['project_creation'] = True |
| else: |
| print("β Project creation failed") |
| return results |
| |
| |
| if studio.perplexity_client: |
| discovery_status, sources_json = studio.discover_sources_with_ai( |
| project_description="Product reviews for sentiment analysis testing", |
| max_sources=5, |
| search_type="general" |
| ) |
| |
| if "β
" in discovery_status and sources_json != "[]": |
| print("β
AI source discovery successful") |
| results['source_discovery'] = True |
| |
| |
| test_urls = studio.extract_urls_from_sources(sources_json) |
| if test_urls: |
| test_urls = test_urls[:2] |
| else: |
| print("β οΈ AI source discovery didn't find sources, using fallback") |
| test_urls = ["https://httpbin.org/html"] |
| else: |
| print("β οΈ Perplexity not available, using test URLs") |
| test_urls = ["https://httpbin.org/html"] |
| |
| |
| if test_urls: |
| scrape_status, scraped_data = studio.scrape_urls('\n'.join(test_urls)) |
| |
| if "β
" in scrape_status: |
| print("β
Data scraping successful") |
| results['data_scraping'] = True |
| else: |
| print("β Data scraping failed") |
| return results |
| |
| |
| if studio.scraped_data: |
| process_status, processed_data = studio.process_data("sentiment_analysis") |
| |
| if "β
" in process_status: |
| print("β
Data processing successful") |
| results['data_processing'] = True |
| else: |
| print("β οΈ Data processing had issues but continued") |
| results['data_processing'] = True |
| |
| |
| if studio.processed_data: |
| export_status, file_path = studio.export_dataset("JSON") |
| |
| if "β
" in export_status and file_path: |
| print("β
Data export successful") |
| results['data_export'] = True |
| else: |
| print("β Data export failed") |
| |
| except Exception as e: |
| print(f"β Workflow test failed: {e}") |
| logger.exception("Workflow test error") |
| |
| results['total_time'] = time.time() - start_time |
| print(f"β±οΈ Total workflow time: {results['total_time']:.1f} seconds") |
| |
| print("\n" + "=" * 50) |
| return results |
|
|
| def run_performance_benchmark() -> Dict[str, float]: |
| """ |
| β‘ Run performance benchmarks |
| |
| Returns: |
| Dict with performance metrics |
| """ |
| results = {} |
| |
| print("β‘ Running Performance Benchmarks...") |
| print("=" * 50) |
| |
| try: |
| |
| if os.getenv('PERPLEXITY_API_KEY'): |
| from perplexity_client import PerplexityClient |
| |
| client = PerplexityClient() |
| start_time = time.time() |
| |
| test_result = client.discover_sources( |
| "Performance test query for machine learning", |
| max_sources=5 |
| ) |
| |
| api_time = time.time() - start_time |
| results['api_response_time'] = api_time |
| print(f"π§ Perplexity API response time: {api_time:.2f}s") |
| |
| |
| try: |
| from transformers import pipeline |
| |
| start_time = time.time() |
| sentiment_analyzer = pipeline("sentiment-analysis") |
| model_load_time = time.time() - start_time |
| |
| results['model_load_time'] = model_load_time |
| print(f"π€ Model loading time: {model_load_time:.2f}s") |
| |
| |
| test_texts = [ |
| "This is a great product!", |
| "I really don't like this item.", |
| "This product is okay, nothing special.", |
| "Amazing quality and fast delivery!", |
| "Terrible experience, would not recommend." |
| ] |
| |
| start_time = time.time() |
| for text in test_texts: |
| sentiment_analyzer(text) |
| processing_time = time.time() - start_time |
| |
| results['processing_speed'] = len(test_texts) / processing_time |
| print(f"π Processing speed: {results['processing_speed']:.1f} items/second") |
| |
| except ImportError: |
| print("β οΈ Cannot test model performance - transformers not available") |
| |
| |
| import psutil |
| import os |
| |
| process = psutil.Process(os.getpid()) |
| memory_mb = process.memory_info().rss / 1024 / 1024 |
| results['memory_usage_mb'] = memory_mb |
| print(f"πΎ Current memory usage: {memory_mb:.1f} MB") |
| |
| except Exception as e: |
| print(f"β οΈ Performance benchmark error: {e}") |
| |
| print("\n" + "=" * 50) |
| return results |
|
|
| def generate_test_report( |
| env_results: Dict, |
| api_results: Dict, |
| model_results: Dict, |
| scraping_results: Dict, |
| workflow_results: Dict, |
| performance_results: Dict |
| ) -> str: |
| """ |
| π Generate comprehensive test report |
| |
| Returns: |
| Formatted test report as string |
| """ |
| report = [] |
| report.append("π AI Dataset Studio - Test Report") |
| report.append("=" * 60) |
| report.append(f"π
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") |
| report.append("") |
| |
| |
| report.append("π ENVIRONMENT SETUP") |
| report.append("-" * 30) |
| |
| env_score = sum(1 for v in env_results.values() if v) / len(env_results) * 100 |
| report.append(f"Overall Score: {env_score:.0f}%") |
| |
| if env_results.get('env_perplexity_api_key'): |
| report.append("β
Perplexity API configured") |
| else: |
| report.append("β Perplexity API not configured") |
| |
| required_packages = ['package_gradio', 'package_requests', 'package_pandas', 'package_beautifulsoup4'] |
| missing_required = [p for p in required_packages if not env_results.get(p)] |
| |
| if not missing_required: |
| report.append("β
All required packages available") |
| else: |
| report.append(f"β Missing required packages: {missing_required}") |
| |
| report.append("") |
| |
| |
| report.append("π§ PERPLEXITY AI INTEGRATION") |
| report.append("-" * 30) |
| |
| if api_results.get('api_key_valid'): |
| report.append("β
API key valid and working") |
| |
| if api_results.get('connection_successful'): |
| report.append("β
API connection successful") |
| |
| if api_results.get('response_quality'): |
| report.append("β
Response quality good") |
| |
| if api_results.get('rate_limiting'): |
| report.append("β
Rate limiting active") |
| else: |
| report.append("β API integration not working") |
| |
| report.append("") |
| |
| |
| report.append("π€ AI MODELS") |
| report.append("-" * 30) |
| |
| if model_results.get('transformers_available', True): |
| working_models = sum(1 for k, v in model_results.items() if k.endswith('_model') and v) |
| total_models = sum(1 for k in model_results.keys() if k.endswith('_model')) |
| |
| report.append(f"Working Models: {working_models}/{total_models}") |
| |
| if model_results.get('gpu_available'): |
| report.append("β
GPU acceleration available") |
| else: |
| report.append("β οΈ CPU-only processing") |
| else: |
| report.append("β AI models not available") |
| |
| report.append("") |
| |
| |
| report.append("π COMPLETE WORKFLOW") |
| report.append("-" * 30) |
| |
| workflow_steps = ['project_creation', 'source_discovery', 'data_scraping', 'data_processing', 'data_export'] |
| working_steps = sum(1 for step in workflow_steps if workflow_results.get(step)) |
| |
| report.append(f"Working Steps: {working_steps}/{len(workflow_steps)}") |
| report.append(f"Total Time: {workflow_results.get('total_time', 0):.1f} seconds") |
| |
| if working_steps >= 4: |
| report.append("β
Workflow fully functional") |
| elif working_steps >= 2: |
| report.append("β οΈ Workflow partially functional") |
| else: |
| report.append("β Workflow has major issues") |
| |
| report.append("") |
| |
| |
| report.append("β‘ PERFORMANCE METRICS") |
| report.append("-" * 30) |
| |
| if 'api_response_time' in performance_results: |
| api_time = performance_results['api_response_time'] |
| if api_time < 10: |
| report.append(f"β
API response time: {api_time:.1f}s (good)") |
| elif api_time < 20: |
| report.append(f"β οΈ API response time: {api_time:.1f}s (acceptable)") |
| else: |
| report.append(f"β API response time: {api_time:.1f}s (slow)") |
| |
| if 'processing_speed' in performance_results: |
| speed = performance_results['processing_speed'] |
| if speed > 2: |
| report.append(f"β
Processing speed: {speed:.1f} items/sec (good)") |
| elif speed > 0.5: |
| report.append(f"β οΈ Processing speed: {speed:.1f} items/sec (acceptable)") |
| else: |
| report.append(f"β Processing speed: {speed:.1f} items/sec (slow)") |
| |
| if 'memory_usage_mb' in performance_results: |
| memory = performance_results['memory_usage_mb'] |
| report.append(f"πΎ Memory usage: {memory:.0f} MB") |
| |
| report.append("") |
| |
| |
| report.append("π― OVERALL ASSESSMENT") |
| report.append("-" * 30) |
| |
| total_score = 0 |
| max_score = 0 |
| |
| |
| if env_results.get('env_perplexity_api_key') and env_results.get('package_gradio'): |
| total_score += 25 |
| max_score += 25 |
| |
| if api_results.get('api_key_valid') and api_results.get('connection_successful'): |
| total_score += 25 |
| max_score += 25 |
| |
| if working_steps >= 3: |
| total_score += 25 |
| max_score += 25 |
| |
| if model_results.get('sentiment_model', False) or not model_results.get('transformers_available', True): |
| total_score += 25 |
| max_score += 25 |
| |
| overall_score = (total_score / max_score) * 100 if max_score > 0 else 0 |
| |
| if overall_score >= 80: |
| status = "β
EXCELLENT - Ready for production use" |
| elif overall_score >= 60: |
| status = "β οΈ GOOD - Minor issues to address" |
| elif overall_score >= 40: |
| status = "π§ FAIR - Several issues need fixing" |
| else: |
| status = "β POOR - Major setup problems" |
| |
| report.append(f"Overall Score: {overall_score:.0f}%") |
| report.append(f"Status: {status}") |
| |
| report.append("") |
| report.append("π§ NEXT STEPS") |
| report.append("-" * 30) |
| |
| if not env_results.get('env_perplexity_api_key'): |
| report.append("1. Set PERPLEXITY_API_KEY environment variable") |
| |
| if not api_results.get('api_key_valid'): |
| report.append("2. Verify Perplexity API key is correct") |
| |
| if working_steps < 3: |
| report.append("3. Check error logs for workflow issues") |
| |
| if not model_results.get('gpu_available', False) and model_results.get('transformers_available', True): |
| report.append("4. Consider upgrading to GPU hardware for better performance") |
| |
| if overall_score >= 80: |
| report.append("π Your AI Dataset Studio is ready to create amazing datasets!") |
| |
| return "\n".join(report) |
|
|
| def main(): |
| """ |
| π§ͺ Run complete test suite |
| """ |
| print("π§ͺ AI Dataset Studio - Complete Test Suite") |
| print("=" * 60) |
| print("This will test all components of your deployment") |
| print("Please wait while tests are running...\n") |
| |
| |
| env_results = test_environment_setup() |
| api_results = test_perplexity_api() |
| model_results = test_ai_models() |
| scraping_results = test_web_scraping() |
| workflow_results = test_complete_workflow() |
| performance_results = run_performance_benchmark() |
| |
| |
| report = generate_test_report( |
| env_results, api_results, model_results, |
| scraping_results, workflow_results, performance_results |
| ) |
| |
| |
| timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') |
| report_filename = f"test_report_{timestamp}.txt" |
| |
| try: |
| with open(report_filename, 'w', encoding='utf-8') as f: |
| f.write(report) |
| print(f"π Test report saved to: {report_filename}") |
| except Exception as e: |
| print(f"β οΈ Could not save report to file: {e}") |
| |
| print("\n" + "=" * 60) |
| print(report) |
| print("=" * 60) |
| |
| return { |
| 'environment': env_results, |
| 'api': api_results, |
| 'models': model_results, |
| 'scraping': scraping_results, |
| 'workflow': workflow_results, |
| 'performance': performance_results |
| } |
|
|
| if __name__ == "__main__": |
| |
| test_results = main() |