Spaces:
Running
Running
| import gradio as gr | |
| from datetime import datetime | |
| from groq import Groq | |
| import json | |
| import os | |
| import traceback | |
| import re | |
| # --- 1. API KEY --- | |
# Read the Groq credential from the environment and fail fast at import time
# when it is missing or empty, so the app never starts without a usable key.
api_key_coder = os.getenv('fristapi')
if api_key_coder is None or api_key_coder == "":
    raise ValueError("Groq API key not found. Set fristapi environment variable.")
| # --- 2. LLM CLIENT --- | |
class GroqLLM:
    """Thin wrapper around the Groq chat-completions client.

    Args:
        api_key: Groq API key passed to the ``Groq`` client.
        model: Name of the chat model to query.
        temperature: Sampling temperature sent with every request.
        max_tokens: Upper bound on generated tokens per request.
    """

    def __init__(self, api_key, model="llama-3.3-70b-versatile", temperature=0.7, max_tokens=4000):
        self.client = Groq(api_key=api_key)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    def invoke(self, prompt):
        """Send *prompt* as a single user message and return the reply text.

        This method never raises: any failure is reported as a string that
        starts with ``"LLM Error:"`` so callers can show it in the UI.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            content = response.choices[0].message.content
        except Exception as e:
            # Deliberate best-effort boundary: network, auth and quota errors
            # are all surfaced as text instead of crashing the UI handler.
            return f"LLM Error: {str(e)}"
        if not content:
            # A missing/empty completion would otherwise reach callers as
            # None and break their string handling.
            return "LLM Error: empty response from model"
        return content
# Module-level LLM client built from the API key read from the environment;
# it uses GroqLLM's default model and temperature.
llm = GroqLLM(api_key=api_key_coder)
class SyntheticDataGenerator:
    """Generates synthetic test data in multiple formats.

    The generator only builds prompts; the actual text comes from ``llm``,
    any object exposing ``invoke(prompt) -> str``.
    """

    # Human-readable description of each supported output format, as it is
    # presented to the model inside the prompt.
    FORMAT_DESCRIPTIONS = {
        "json": "JSON (array of objects)",
        "csv": "CSV (comma-separated values with headers)",
        "sql": "SQL INSERT statements",
        "xml": "XML document",
        "python_dict": "Python list of dictionaries",
        "yaml": "YAML format",
        "excel_style": "Table format (markdown)",
    }

    # Record counts used by generate_bulk_data when none are supplied.
    DEFAULT_BULK_COUNTS = (3, 5, 10)

    def __init__(self, llm):
        self.llm = llm

    def generate_data(self, schema_description, data_type="json", language="Python", record_count=5):
        """Generate synthetic test data based on a schema description.

        Args:
            schema_description: Schema as text, or a dict (serialized to
                indented JSON before being embedded in the prompt).
            data_type: Output format key; unknown keys are described to the
                model as plain "JSON".
            language: Programming language mentioned in the prompt.
            record_count: Number of records requested from the model.

        Returns:
            Whatever ``self.llm.invoke`` returns for the built prompt.
        """
        if isinstance(schema_description, dict):
            schema_description_str = json.dumps(schema_description, indent=2)
        else:
            schema_description_str = schema_description

        output_format = self.FORMAT_DESCRIPTIONS.get(data_type, "JSON")
        # Label of the fenced block the model is asked to emit; the caller
        # looks for this fence when extracting the data.
        fence_label = "markdown" if data_type == "excel_style" else data_type

        prompt = f"""
You are a TEST DATA ENGINEER specializing in synthetic data generation.
Generate realistic, diverse, and comprehensive test data.
=====================
CONFIGURATION
=====================
Data Schema: {schema_description_str}
Output Format: {output_format}
Number of Records: {record_count}
Programming Context: {language}
=====================
REQUIREMENTS
=====================
1. Generate {record_count} unique, realistic records
2. Include edge cases and boundary values
3. Ensure data variety (different types, values)
4. Make data look production-like
5. Include at least one record with null/empty values (if applicable)
6. Include at least one record with extreme values
=====================
OUTPUT STRUCTURE
=====================
Generate the response with:
## 📊 Data Generation Summary
- **Schema**: [brief description]
- **Format**: {output_format}
- **Records**: {record_count}
- **Language Context**: {language}
## 📝 Generated Test Data
```{fence_label}
[generated data here]
```
## 🔍 Data Quality Notes
[Special cases included]
[Edge values]
[Validation notes]
## 💡 Usage Example
[How to use this data in {language} tests]
"""
        return self.llm.invoke(prompt)

    def generate_bulk_data(self, schema_description, formats=None, record_counts=None, language="Python"):
        """Generate data in multiple formats at once.

        Args:
            schema_description: Schema forwarded to ``generate_data``.
            formats: Format keys to generate; defaults to json, csv and sql.
            record_counts: Record count per format, matched by position. When
                there are fewer counts than formats the last count is reused
                (so a single count applies to every format).
            language: Programming language forwarded to ``generate_data``.

        Returns:
            Dict mapping each format key to its generated output.
        """
        if formats is None:
            formats = ["json", "csv", "sql"]
        if not record_counts:
            record_counts = list(self.DEFAULT_BULK_COUNTS)

        last_index = len(record_counts) - 1
        results = {}
        # Pair by position instead of formats.index(), which raised
        # IndexError for short count lists and mis-resolved duplicates.
        for position, format_type in enumerate(formats):
            count = record_counts[min(position, last_index)]
            results[format_type] = self.generate_data(
                schema_description=schema_description,
                data_type=format_type,
                language=language,
                record_count=count,
            )
        return results
# Initialize the synthetic data generator
# Module-level instance wired to the `llm` client; generate_synthetic_data
# calls its generate_data method.
synthetic_generator = SyntheticDataGenerator(llm)
| # ============================================ | |
| # Template Functions | |
| # ============================================ | |
def load_schema_template(template_name):
    """Look up the schema text of a named template.

    Unknown names resolve to the ``custom`` template.
    """
    custom_schema = '''{
    "user_id": "uuid",
    "full_name": "name",
    "email": "email",
    "age": {"type": "int", "min": 18, "max": 99},
    "is_active": "boolean",
    "country": "country"
}'''

    user_profile_schema = '''{
    "user_id": "integer",
    "username": "string (3-20 chars)",
    "email": "email format",
    "age": "integer (18-80)",
    "country": "ISO country code",
    "is_premium": "boolean",
    "registration_date": "date (last 2 years)",
    "last_login": "datetime"
}'''

    ecommerce_order_schema = '''{
    "order_id": "string (format: ORD-XXXX)",
    "customer_name": "string",
    "email": "email",
    "items": [
        {
            "product_id": "string",
            "quantity": "integer (1-10)",
            "price": "decimal (10.99-999.99)"
        }
    ],
    "total_amount": "decimal",
    "status": "enum(pending,shipped,delivered,cancelled)",
    "order_date": "datetime"
}'''

    api_request_log_schema = '''{
    "request_id": "uuid",
    "endpoint": "string",
    "method": "enum(GET,POST,PUT,DELETE)",
    "status_code": "integer (200,400,401,404,500)",
    "response_time_ms": "integer (50-5000)",
    "timestamp": "datetime",
    "user_agent": "string",
    "ip_address": "ipv4"
}'''

    contact_info_schema = '''{
    "contact_id": "integer",
    "first_name": "string",
    "last_name": "string",
    "email": "email",
    "phone": "phone number",
    "address": {
        "street": "string",
        "city": "string",
        "state": "string",
        "zip_code": "string",
        "country": "string"
    },
    "company": "string",
    "notes": "string (optional)"
}'''

    financial_transaction_schema = '''{
    "transaction_id": "uuid",
    "account_number": "string (format: ****1234)",
    "transaction_type": "enum(deposit,withdrawal,transfer,payment)",
    "amount": "decimal (0.01-10000.00)",
    "currency": "enum(USD,EUR,GBP,JPY)",
    "timestamp": "datetime",
    "status": "enum(pending,completed,failed,reversed)",
    "description": "string"
}'''

    healthcare_patient_schema = '''{
    "patient_id": "string (format: PAT-XXXX)",
    "first_name": "string",
    "last_name": "string",
    "date_of_birth": "date (1950-2020)",
    "gender": "enum(M,F,Other)",
    "blood_type": "enum(A+,A-,B+,B-,AB+,AB-,O+,O-)",
    "allergies": "array of strings",
    "medications": "array of strings",
    "last_visit": "datetime",
    "insurance_provider": "string"
}'''

    schemas_by_name = {
        "custom": custom_schema,
        "user_profile": user_profile_schema,
        "ecommerce_order": ecommerce_order_schema,
        "api_request_log": api_request_log_schema,
        "contact_info": contact_info_schema,
        "financial_transaction": financial_transaction_schema,
        "healthcare_patient": healthcare_patient_schema,
    }

    # Anything that is not a known template name falls back to 'custom'.
    resolved_name = template_name if template_name in schemas_by_name else "custom"
    return schemas_by_name[resolved_name]
def get_output_format_description(output_type):
    """Describe an output format key in one human-readable sentence.

    Keys without a dedicated description get a generic sentence.
    """
    known_formats = {
        "json": "📄 JSON format - Array of objects, ideal for API testing and data interchange",
        "csv": "📊 CSV format - Comma-separated values, perfect for data import and spreadsheet analysis",
        "sql": "🗄️ SQL format - INSERT statements with CREATE TABLE, ready for database seeding",
        "xml": "📋 XML format - Structured document format for enterprise systems",
        "python_dict": "🐍 Python format - List of dictionaries, ready to use in Python tests",
        "yaml": "⚙️ YAML format - Human-readable configuration format",
        "excel_style": "📑 Table format - Markdown table, easy to read and document",
    }
    try:
        return known_formats[output_type]
    except KeyError:
        # Unrecognized key: generic fallback sentence.
        return "📄 Standard format for test data"
| # ============================================ | |
| # Gradio UI Functions | |
| # ============================================ | |
def _clamp_record_count(record_count, default=5, low=1, high=100):
    """Coerce *record_count* to an int within [low, high]; *default* if not numeric."""
    try:
        count = int(record_count)
    except (TypeError, ValueError, OverflowError):
        return default
    return max(low, min(high, count))


def _extract_fenced_block(text, data_format):
    """Return the body of the fenced code block holding the data, or None.

    The fence labelled with the requested format is preferred; if the model
    used a different label, the first fenced block of any kind is used.
    """
    label = "markdown" if data_format == "excel_style" else str(data_format)
    labelled = re.search(
        rf"```[ \t]*{re.escape(label)}[^\n]*\n(.*?)\n?```",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    found = labelled or re.search(r"```[^\n]*\n(.*?)\n?```", text, re.DOTALL)
    if not found:
        return None
    body = found.group(1).strip("\n")
    return body or None


def _build_usage_example(language, data_format, generated_data):
    """Build the Python usage snippet shown next to the generated data."""
    preview = generated_data[:500]
    literal = preview + '...' if len(generated_data) > 500 else generated_data
    header = f"# Generated test data for {language}"

    if data_format == "json":
        lines = [
            header,
            "# Created with Synthetic Data Generator",
            "import json",
            "",
            f"test_data = {literal}",
            "",
            "for record in test_data:",
            "    # Your test logic here",
            '    print(f"Processing: {record}")',
            "",
            "    # Example validation",
            "    assert 'user_id' in record",
            "    assert 'email' in record",
        ]
    elif data_format == "python_dict":
        lines = [
            header,
            "# Test data as Python dictionary/list",
            f"test_data = {literal}",
            "",
            "for record in test_data:",
            "    print(f\"Testing with: {record.get('name', 'N/A')}\")",
        ]
    elif data_format == "csv":
        lines = [
            header,
            "import csv",
            "from io import StringIO",
            "",
            f'csv_data = """{preview}"""',
            "",
            "reader = csv.DictReader(StringIO(csv_data))",
            "for row in reader:",
            '    print(f"Processing row: {row}")',
        ]
    elif data_format == "sql":
        lines = [
            header,
            "import sqlite3",
            "",
            f'sql_statements = """{preview}"""',
            "",
            "conn = sqlite3.connect(':memory:')",
            "cursor = conn.cursor()",
            "for statement in sql_statements.split(';'):",
            "    if statement.strip():",
            "        cursor.execute(statement)",
            "conn.commit()",
            'print("Database seeded successfully!")',
        ]
    elif data_format == "xml":
        lines = [
            header,
            "import xml.etree.ElementTree as ET",
            "",
            f'xml_data = """{preview}"""',
            "",
            "root = ET.fromstring(xml_data)",
            "for record in root:",
            '    print(f"Record: {record.tag}")',
        ]
    elif data_format == "yaml":
        lines = [
            header,
            "import yaml",
            "",
            f'yaml_data = """{preview}"""',
            "",
            "test_data = yaml.safe_load(yaml_data)",
            "for record in test_data:",
            '    print(f"Testing with: {record}")',
        ]
    else:
        lines = [
            header,
            "# Generated Data:",
            f'generated_data = """{preview}"""',
            "# Use this data in your tests as needed",
        ]
    return "\n".join(lines) + "\n"


def _build_stats(count, language, data_format, template_name):
    """Build the Markdown statistics panel for a successful generation."""
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # A real Markdown table and bullet lists: consecutive plain lines would
    # be merged into a single paragraph when rendered.
    return f"""### 📊 Generation Statistics

| Metric | Value |
|--------|-------|
| Records Generated | {count} |
| Language | {language} |
| Format | {str(data_format).upper()} |
| Template | {template_name} |
| Status | ✅ Success |
| Generated | {timestamp} |

**Data Quality Notes:**
- ✅ Realistic test data created
- ✅ Includes edge cases and variations
- ✅ Ready for immediate use in tests
- ✅ Follows schema specifications

**Special Cases Included:**
- Boundary values
- Edge cases
- Null/empty values (where applicable)
- Extreme values
"""


def generate_synthetic_data(language, schema_json, record_count, data_format, template_name, generator=None):
    """Main generation function for Gradio.

    Args:
        language: Target language mentioned in the prompt and code template.
        schema_json: Schema text entered by the user.
        record_count: Requested number of records; clamped to 1-100, and
            replaced by 5 when it is not numeric.
        data_format: Output format key (e.g. "json", "csv", "excel_style").
        template_name: Name of the selected template (shown in the stats).
        generator: Optional object with a ``generate_data`` method; defaults
            to the module-level ``synthetic_generator``.

    Returns:
        Tuple ``(data, code_template, stats_markdown, schema_json)``. Errors
        are reported inside the tuple rather than raised, so the UI always
        receives four values.
    """
    try:
        if not schema_json or not schema_json.strip():
            return (
                "### ⚠️ No Data Generated\n\nPlease provide a schema definition.",
                "# No data generated\nPlease provide a schema definition.",
                "### 📊 Generation Statistics\nNo data generated yet",
                schema_json
            )

        count = _clamp_record_count(record_count)
        if generator is None:
            generator = synthetic_generator

        result = generator.generate_data(
            schema_description=schema_json,
            data_type=data_format,
            language=language,
            record_count=count
        )
        result = "" if result is None else str(result)

        # The LLM client reports failures as text; do not present them as data.
        if result.startswith("LLM Error:"):
            return (
                result,
                f"# Error generating data\n\n{result}",
                f"### ❌ Generation Failed\n\nError: {result}",
                schema_json
            )

        generated_data = _extract_fenced_block(result, data_format)
        if generated_data is None:
            # No fenced block found: show the raw response, marking the
            # truncation only when something was actually cut off.
            raw_preview = result[:2000]
            if len(result) > 2000:
                raw_preview += "\n\n... (truncated)"
            return (
                raw_preview,
                f"# Generated Output\n\n{result}",
                "### ⚠️ Generation completed but format may need review\n\nThe data may not be in the expected format. Please check the output above.",
                schema_json
            )

        return (
            generated_data,
            _build_usage_example(language, data_format, generated_data),
            _build_stats(count, language, data_format, template_name),
            schema_json
        )
    except Exception as e:
        # UI boundary: report any unexpected failure in the outputs.
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}\n"
        return (
            error_msg,
            f"# Error generating data\n\n{str(e)}",
            f"### ❌ Generation Failed\n\nError: {str(e)}",
            schema_json
        )
def preview_schema(schema_json):
    """Preview and validate a schema, returning Markdown for display.

    Valid JSON is pretty-printed inside a fenced block (so its layout
    survives Markdown rendering) together with a field count. Anything else
    is shown as a free-text description. Long content is truncated with a
    visible marker. Errors are reported in the returned text, never raised.
    """
    try:
        if not schema_json or not schema_json.strip():
            return "⚠️ No schema provided"

        try:
            parsed = json.loads(schema_json)
        except json.JSONDecodeError:
            excerpt = schema_json[:300]
            if len(schema_json) > 300:
                excerpt += "..."
            return (
                "### 📝 Schema Description (Text Format)\n\n"
                f"{excerpt}\n\n"
                "Note: The schema is in text format. The AI will interpret and generate appropriate test data.\n"
            )

        pretty = json.dumps(parsed, indent=2, ensure_ascii=False)
        preview = pretty[:500]
        if len(pretty) > 500:
            preview += "\n..."
        # Only a JSON object has named fields to count.
        field_count = len(parsed) if isinstance(parsed, dict) else 'Multiple records'
        return (
            "### ✅ Valid JSON Schema\n\n"
            "Structure Preview:\n\n"
            f"```json\n{preview}\n```\n\n"
            f"Fields Detected: {field_count}\n\n"
            "This schema will generate realistic test data based on the field definitions.\n"
        )
    except Exception as e:
        # UI boundary: e.g. a non-string input ends up here.
        return f"❌ Error previewing schema: {str(e)}"
| # ============================================ | |
| # Gradio UI (Standalone with Home Tab) | |
| # ============================================ | |
with gr.Blocks(title="Synthetic Test Data Generator") as demo:
    gr.Markdown("# 🎲 Synthetic Test Data Generator")
    gr.Markdown("### Create Realistic Test Data for Your Applications with AI")

    # An explicit Tabs container lets the Home button select the Generator
    # tab from Python instead of depending on the page's DOM attributes.
    with gr.Tabs() as main_tabs:
        # ============================================
        # Tab 0: Home
        # ============================================
        with gr.Tab("🏠 Home"):
            gr.Markdown("""
🎯 Welcome to Synthetic Test Data Generator
(المحتوى)
""")
            # Stats and Use Cases
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📊 Quick Stats")
                    gr.Markdown("""
Formats Supported: 7
Max Records: 100 per generation
Templates: 7 pre-built
Languages: 4
""")
                with gr.Column():
                    gr.Markdown("### 🎯 Use Cases")
                    gr.Markdown("""
API Testing
Database Seeding
Load Testing
UI Testing
Integration Tests
Demo Data Creation
""")
            # Quick Action Button
            gr.Markdown("---")
            gr.Markdown("### 🔧 Quick Action")
            quick_generate_btn = gr.Button("🚀 Go to Generator Tab", variant="primary")
            quick_generate_btn.click(
                fn=lambda: gr.Tabs(selected="generator"),
                inputs=None,
                outputs=main_tabs
            )

        # ============================================
        # Tab 1: Generator (Main) - must live inside the demo block
        # ============================================
        with gr.Tab("Generator", id="generator") as generator_tab:
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Configuration")
                    # Language Selection
                    data_language = gr.Dropdown(
                        choices=["python", "javascript", "java", "csharp"],
                        value="python",
                        label="🎯 Target Language"
                    )
                    # Format Selection
                    data_format = gr.Dropdown(
                        choices=["json", "csv", "sql", "xml", "python_dict", "yaml", "excel_style"],
                        value="json",
                        label="📄 Output Format"
                    )
                    # Format description
                    format_desc = gr.Markdown("Format Info: " + get_output_format_description("json"))
                    # Record count
                    data_count = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="📊 Number of Records"
                    )
                    gr.Markdown("### 📝 Schema Definition")
                    # Template selection
                    data_template = gr.Dropdown(
                        choices=[
                            "custom", "user_profile", "ecommerce_order", "api_request_log",
                            "contact_info", "financial_transaction", "healthcare_patient"
                        ],
                        value="custom",
                        label="📁 Schema Template"
                    )
                    # Schema input
                    schema_input = gr.Textbox(
                        value='''{
    "user_id": "uuid",
    "full_name": "name",
    "email": "email",
    "age": {"type": "int", "min": 18, "max": 99},
    "is_active": "boolean",
    "country": "country"
}''',
                        lines=15,
                        label="Schema Definition"
                    )
                    # Schema preview
                    preview_btn = gr.Button("🔍 Preview Schema", size="sm")
                    schema_preview = gr.Markdown("")
                    # Actions
                    generate_btn = gr.Button("✨ Generate Test Data", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### 📊 Generation Statistics")
                    data_stats = gr.Markdown("No data generated yet")
                    gr.Markdown("### 📝 Generated Data")
                    data_output = gr.Code(label="Test Data", language="json", lines=15)
                    gr.Markdown("### 💻 Ready-to-use Code Template")
                    code_output = gr.Code(label="Code Template", language="python", lines=12)

            # Helper functions
            def update_code_language(format_type):
                """Switch the data viewer's highlighting to suit the output format."""
                # Only languages listed by gr.Code are used: CSV gets no
                # highlighting and XML borrows the HTML highlighter.
                lang_map = {
                    "json": "json",
                    "csv": None,
                    "sql": "sql",
                    "xml": "html",
                    "python_dict": "python",
                    "yaml": "yaml",
                    "excel_style": "markdown"
                }
                return gr.update(language=lang_map.get(format_type))

            def describe_selected_format(format_type):
                """Format description with the same prefix as the initial text."""
                return "Format Info: " + get_output_format_description(format_type)

            # Event Handlers
            data_template.change(
                fn=load_schema_template,
                inputs=[data_template],
                outputs=[schema_input]
            )
            preview_btn.click(
                fn=preview_schema,
                inputs=[schema_input],
                outputs=[schema_preview]
            )
            generate_btn.click(
                fn=generate_synthetic_data,
                inputs=[data_language, schema_input, data_count, data_format, data_template],
                outputs=[data_output, code_output, data_stats, schema_input]
            )
            data_format.change(
                fn=describe_selected_format,
                inputs=[data_format],
                outputs=[format_desc]
            )
            data_format.change(
                fn=update_code_language,
                inputs=[data_format],
                outputs=[data_output]
            )

        # ============================================
        # Tab 2: Templates Library
        # ============================================
        with gr.Tab("📚 Templates Library"):
            gr.Markdown("# 📚 Schema Templates Library")
            gr.Markdown("Browse and learn from these example schemas")

            def show_template_details(template_name):
                """Return ``(schema_text, description)`` for a library template.

                Names without a description fall back to ``user_profile`` for
                both values, so schema and description always match.
                """
                descriptions = {
                    "user_profile": "User Profile Schema\n\nPerfect for testing user management systems, authentication, and profile features.\n\nUse Cases:\n- User registration tests\n- Profile management\n- User search/filtering\n- Data export features",
                    "ecommerce_order": "E-commerce Order Schema\n\nIdeal for testing shopping carts, checkout flows, and order management systems.\n\nUse Cases:\n- Order creation tests\n- Cart calculations\n- Inventory management\n- Order history features",
                    "api_request_log": "API Request Log Schema\n\nGreat for testing logging systems, analytics, and monitoring tools.\n\nUse Cases:\n- API monitoring tests\n- Log analysis\n- Performance metrics\n- Error tracking",
                    "contact_info": "Contact Information Schema\n\nUseful for CRM systems, address books, and contact management features.\n\nUse Cases:\n- Contact import/export\n- Address validation\n- Phone number formatting\n- Data enrichment tests",
                    "financial_transaction": "Financial Transaction Schema\n\nEssential for banking, payment systems, and financial applications.\n\nUse Cases:\n- Payment processing tests\n- Transaction history\n- Fraud detection\n- Statement generation",
                    "healthcare_patient": "Healthcare Patient Schema\n\nDesigned for healthcare systems, patient management, and medical records.\n\nUse Cases:\n- Patient registration\n- Medical records\n- Appointment scheduling\n- Insurance verification"
                }
                if template_name not in descriptions:
                    template_name = "user_profile"
                return load_schema_template(template_name), descriptions[template_name]

            # Initial content is passed to the constructors below rather
            # than assigned to `.value` afterwards.
            initial_schema, initial_desc = show_template_details("user_profile")

            template_selector = gr.Dropdown(
                choices=[
                    "user_profile", "ecommerce_order", "api_request_log",
                    "contact_info", "financial_transaction", "healthcare_patient"
                ],
                value="user_profile",
                label="Select Template to View"
            )
            template_display = gr.Code(
                value=initial_schema,
                label="Template Schema",
                language="json",
                lines=20
            )
            template_description = gr.Markdown(initial_desc)
            template_selector.change(
                fn=show_template_details,
                inputs=[template_selector],
                outputs=[template_display, template_description]
            )
            gr.Markdown("---")
            gr.Markdown("### 💡 Tips for Creating Custom Schemas")
            gr.Markdown("""
- Use descriptive field names - The AI understands context (e.g., 'email' vs 'e')
- Specify data types - Include hints like 'string', 'integer', 'decimal', 'date'
- Add constraints - Use ranges like 'age: 18-80' or patterns like 'email format'
- Include enums - For fields with limited options (e.g., 'status: pending,active,closed')
- Structure nested data - Use JSON objects or arrays for complex relationships
""")

        # ============================================
        # Tab 3: About & Help
        # ============================================
        with gr.Tab("ℹ️ About & Help"):
            gr.Markdown("""
ℹ️ **About Synthetic Test Data Generator**
**What is this tool?**
This tool uses AI (Groq's LLM) to generate realistic, production-like test data based on your schema definitions. It helps QA engineers and developers create comprehensive test datasets quickly and efficiently.
(باقي المحتوى)
""")
| # ============================================ | |
| # Launch Application | |
| # ============================================ | |
if __name__ == "__main__":
    # Startup banner: each message is framed by a 60-character rule.
    rule = "=" * 60
    banner_lines = (
        rule,
        "🎲 Synthetic Test Data Generator",
        rule,
        "✅ All components loaded successfully!",
        rule,
        "🚀 Launching application...",
        rule,
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Listen on all interfaces at port 7860 and request a share link.
    launch_options = {
        "share": True,
        "debug": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": gr.themes.Soft(),
    }
    demo.launch(**launch_options)