# TestForge2AI / app.py
# SalwaM's picture
# Update app.py
# e2a6267 verified
import gradio as gr
from datetime import datetime
from groq import Groq
import json
import os
import traceback
import re
# --- 1. API KEY ---
# The Groq credential is read from the `fristapi` environment variable.
# A missing or empty value aborts start-up right away with a ValueError.
api_key_coder = os.getenv('fristapi')
if api_key_coder is None or api_key_coder == "":
    raise ValueError("Groq API key not found. Set fristapi environment variable.")
# --- 2. LLM CLIENT ---
class GroqLLM:
    """Small wrapper around the Groq chat-completions client.

    The wrapper sends a single user message per call and returns the text of
    the first choice.
    """

    def __init__(self, api_key, model="llama-3.3-70b-versatile", temperature=0.7, max_tokens=4000):
        """Create the underlying Groq client and remember the call settings.

        Args:
            api_key: Key passed to ``Groq(api_key=...)``.
            model: Model name sent with every completion request.
            temperature: Sampling temperature sent with every request.
            max_tokens: Completion length limit sent with every request
                (previously a hard-coded 4000 inside ``invoke``).
        """
        self.client = Groq(api_key=api_key)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    def invoke(self, prompt):
        """Send ``prompt`` as one user message and return the reply text.

        Returns:
            The content of the first choice, or a string starting with
            ``"LLM Error: "`` when any exception is raised along the way.
            Errors are reported in-band (not raised) so callers can display
            them directly.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"LLM Error: {str(e)}"
# Module-level LLM client built from the key read out of the environment;
# model and temperature are left at the GroqLLM constructor defaults.
llm = GroqLLM(api_key=api_key_coder)
class SyntheticDataGenerator:
    """Generates synthetic test data in multiple formats"""

    def __init__(self, llm):
        """Store the LLM client; it must expose ``invoke(prompt)``."""
        self.llm = llm

    def generate_data(self, schema_description, data_type="json", language="Python", record_count=5):
        """Build the generation prompt and return the LLM's reply.

        Args:
            schema_description: Schema as text, or a dict (serialized to
                indented JSON before being embedded in the prompt).
            data_type: Output format key; unknown keys are described to the
                model as plain "JSON".
            language: Programming context mentioned in the prompt.
            record_count: Number of records requested in the prompt.

        Returns:
            Whatever ``self.llm.invoke(prompt)`` returns.
        """
        # Ensure schema_description is safely embedded
        if isinstance(schema_description, dict):
            schema_description_str = json.dumps(schema_description, indent=2)
        else:
            schema_description_str = schema_description

        # Map data types to formats
        format_map = {
            "json": "JSON (array of objects)",
            "csv": "CSV (comma-separated values with headers)",
            "sql": "SQL INSERT statements",
            "xml": "XML document",
            "python_dict": "Python list of dictionaries",
            "yaml": "YAML format",
            "excel_style": "Table format (markdown)"
        }
        output_format = format_map.get(data_type, "JSON")

        # Language tag of the fenced block the model is asked to emit.
        fence_language = 'markdown' if data_type == 'excel_style' else data_type

        # The data block is explicitly closed with ``` so the model is shown a
        # complete fenced block that downstream code can locate.
        prompt = f"""
You are a TEST DATA ENGINEER specializing in synthetic data generation.
Generate realistic, diverse, and comprehensive test data.
=====================
CONFIGURATION
=====================
Data Schema: {schema_description_str}
Output Format: {output_format}
Number of Records: {record_count}
Programming Context: {language}
=====================
REQUIREMENTS
=====================
1. Generate {record_count} unique, realistic records
2. Include edge cases and boundary values
3. Ensure data variety (different types, values)
4. Make data look production-like
5. Include at least one record with null/empty values (if applicable)
6. Include at least one record with extreme values
=====================
OUTPUT STRUCTURE
=====================
Generate the response with:
## 📊 Data Generation Summary
- **Schema**: [brief description]
- **Format**: {output_format}
- **Records**: {record_count}
- **Language Context**: {language}
## 📝 Generated Test Data
```{fence_language}
[generated data here]
```
## 🔍 Data Quality Notes
[Special cases included]
[Edge values]
[Validation notes]
## 💡 Usage Example
[How to use this data in {language} tests]
"""
        # Invoke LLM
        return self.llm.invoke(prompt)

    # ============================================
    # Synthetic Data Generator: Bulk Function
    # ============================================
    def generate_bulk_data(self, schema_description, formats=None, record_counts=None):
        """Generate data in multiple formats at once.

        Args:
            schema_description: Schema forwarded to ``generate_data``.
            formats: Format keys to generate; defaults to json, csv and sql.
            record_counts: Record count per format, matched by position.
                When there are fewer counts than formats the last count is
                reused (so a single count applies to every format); an empty
                list falls back to 5 records.

        Returns:
            Dict mapping each format key to the ``generate_data`` result.
        """
        # Default formats
        if formats is None:
            formats = ["json", "csv", "sql"]
        # Default record counts
        if record_counts is None:
            record_counts = [3, 5, 10]

        results = {}
        # Pair by position instead of formats.index(), which raised IndexError
        # for short count lists and mis-paired duplicate format names.
        for position, format_type in enumerate(formats):
            if not record_counts:
                count = 5
            elif position < len(record_counts):
                count = record_counts[position]
            else:
                count = record_counts[-1]
            # Generate data using the main generator
            results[format_type] = self.generate_data(
                schema_description=schema_description,
                data_type=format_type,
                record_count=count
            )
        return results
# Initialize the synthetic data generator
# Single module-level instance wired to the global `llm` client; the Gradio
# handler generate_synthetic_data calls its generate_data method.
synthetic_generator = SyntheticDataGenerator(llm)
# ============================================
# Template Functions
# ============================================
def load_schema_template(template_name):
    """Look up the schema text registered under ``template_name``.

    Names that are not registered resolve to the "custom" schema.
    """
    custom_schema = '''{
"user_id": "uuid",
"full_name": "name",
"email": "email",
"age": {"type": "int", "min": 18, "max": 99},
"is_active": "boolean",
"country": "country"
}'''
    user_profile_schema = '''{
"user_id": "integer",
"username": "string (3-20 chars)",
"email": "email format",
"age": "integer (18-80)",
"country": "ISO country code",
"is_premium": "boolean",
"registration_date": "date (last 2 years)",
"last_login": "datetime"
}'''
    ecommerce_order_schema = '''{
"order_id": "string (format: ORD-XXXX)",
"customer_name": "string",
"email": "email",
"items": [
{
"product_id": "string",
"quantity": "integer (1-10)",
"price": "decimal (10.99-999.99)"
}
],
"total_amount": "decimal",
"status": "enum(pending,shipped,delivered,cancelled)",
"order_date": "datetime"
}'''
    api_request_log_schema = '''{
"request_id": "uuid",
"endpoint": "string",
"method": "enum(GET,POST,PUT,DELETE)",
"status_code": "integer (200,400,401,404,500)",
"response_time_ms": "integer (50-5000)",
"timestamp": "datetime",
"user_agent": "string",
"ip_address": "ipv4"
}'''
    contact_info_schema = '''{
"contact_id": "integer",
"first_name": "string",
"last_name": "string",
"email": "email",
"phone": "phone number",
"address": {
"street": "string",
"city": "string",
"state": "string",
"zip_code": "string",
"country": "string"
},
"company": "string",
"notes": "string (optional)"
}'''
    financial_transaction_schema = '''{
"transaction_id": "uuid",
"account_number": "string (format: ****1234)",
"transaction_type": "enum(deposit,withdrawal,transfer,payment)",
"amount": "decimal (0.01-10000.00)",
"currency": "enum(USD,EUR,GBP,JPY)",
"timestamp": "datetime",
"status": "enum(pending,completed,failed,reversed)",
"description": "string"
}'''
    healthcare_patient_schema = '''{
"patient_id": "string (format: PAT-XXXX)",
"first_name": "string",
"last_name": "string",
"date_of_birth": "date (1950-2020)",
"gender": "enum(M,F,Other)",
"blood_type": "enum(A+,A-,B+,B-,AB+,AB-,O+,O-)",
"allergies": "array of strings",
"medications": "array of strings",
"last_visit": "datetime",
"insurance_provider": "string"
}'''

    registry = dict(
        custom=custom_schema,
        user_profile=user_profile_schema,
        ecommerce_order=ecommerce_order_schema,
        api_request_log=api_request_log_schema,
        contact_info=contact_info_schema,
        financial_transaction=financial_transaction_schema,
        healthcare_patient=healthcare_patient_schema,
    )

    # Unregistered names resolve to the 'custom' schema.
    try:
        return registry[template_name]
    except KeyError:
        return registry["custom"]
def get_output_format_description(output_type):
    """
    Map an output format key to the one-line blurb shown for it.

    Keys without a dedicated blurb get a generic description.
    """
    blurbs = dict(
        json="📄 JSON format - Array of objects, ideal for API testing and data interchange",
        csv="📊 CSV format - Comma-separated values, perfect for data import and spreadsheet analysis",
        sql="🗄️ SQL format - INSERT statements with CREATE TABLE, ready for database seeding",
        xml="📋 XML format - Structured document format for enterprise systems",
        python_dict="🐍 Python format - List of dictionaries, ready to use in Python tests",
        yaml="⚙️ YAML format - Human-readable configuration format",
        excel_style="📑 Table format - Markdown table, easy to read and document",
    )
    # Known key: hand back its blurb; anything else gets the generic text.
    if output_type in blurbs:
        return blurbs[output_type]
    return "📄 Standard format for test data"
# ============================================
# Gradio UI Functions
# ============================================
def _coerce_record_count(record_count, default=5, minimum=1, maximum=100):
    """Turn the UI value into an int clamped to [minimum, maximum]; use default when it is not numeric."""
    try:
        count = int(record_count)
    except (TypeError, ValueError, OverflowError):
        return default
    return max(minimum, min(maximum, count))


def _extract_generated_block(result, data_format):
    """Return the body of the fenced code block holding the data, or None when no non-empty block exists."""
    fence_lang = "markdown" if data_format == "excel_style" else str(data_format)
    patterns = (
        # Preferred: the fence tagged with the language the prompt asked for.
        rf"```{re.escape(fence_lang)}[^\n]*\n(.*?)\n?```",
        # Fallback: first fenced block with any (or no) language tag.
        r"```[^\n`]*\n(.*?)\n?```",
    )
    for pattern in patterns:
        match = re.search(pattern, result, re.DOTALL)
        if match and match.group(1).strip():
            return match.group(1)
    return None


def _build_code_template(data_format, language, generated_data):
    """Build the usage snippet for one format, embedding at most 500 characters of the data."""
    header = f"# Generated test data for {language}"
    snippet = generated_data[:500]
    # json / python_dict embed the data as a literal and mark truncation with '...'.
    literal = snippet + '...' if len(generated_data) > 500 else generated_data

    if data_format == "json":
        lines = [
            header,
            "# Created with Synthetic Data Generator",
            "import json",
            f"test_data = {literal}",
            "for record in test_data:",
            "    # Your test logic here",
            '    print(f"Processing: {record}")',
            "    # Example validation",
            "    assert 'user_id' in record",
            "    assert 'email' in record",
            "",
        ]
    elif data_format == "python_dict":
        lines = [
            header,
            "# Test data as Python dictionary/list",
            f"test_data = {literal}",
            "for record in test_data:",
            "    print(f\"Testing with: {record.get('name', 'N/A')}\")",
            "",
        ]
    elif data_format == "csv":
        lines = [
            header,
            "import csv",
            "from io import StringIO",
            f'csv_data = """{snippet}"""',
            "reader = csv.DictReader(StringIO(csv_data))",
            "for row in reader:",
            '    print(f"Processing row: {row}")',
            "",
        ]
    elif data_format == "sql":
        lines = [
            header,
            "import sqlite3",
            f'sql_statements = """{snippet}"""',
            "conn = sqlite3.connect(':memory:')",
            "cursor = conn.cursor()",
            "for statement in sql_statements.split(';'):",
            "    if statement.strip():",
            "        cursor.execute(statement)",
            "conn.commit()",
            'print("Database seeded successfully!")',
            "",
        ]
    elif data_format == "xml":
        lines = [
            header,
            "import xml.etree.ElementTree as ET",
            f'xml_data = """{snippet}"""',
            "root = ET.fromstring(xml_data)",
            "for record in root:",
            '    print(f"Record: {record.tag}")',
            "",
        ]
    elif data_format == "yaml":
        lines = [
            header,
            "import yaml",
            f'yaml_data = """{snippet}"""',
            "test_data = yaml.safe_load(yaml_data)",
            "for record in test_data:",
            '    print(f"Testing with: {record}")',
            "",
        ]
    else:
        # Formats without a dedicated snippet (e.g. excel_style): keep the
        # raw data in a string so the template is still valid Python.
        lines = [
            header,
            "# Generated Data:",
            f'generated_data = """{snippet}"""',
            "# Use this data in your tests as needed",
        ]
    return "\n".join(lines)


def generate_synthetic_data(language, schema_json, record_count, data_format, template_name):
    """Main generation function for Gradio.

    Args:
        language: Target language name, forwarded to the generator and used
            in the code template header.
        schema_json: Schema text from the UI; blank input short-circuits.
        record_count: Requested record count; coerced to an int in 1..100,
            or 5 when it cannot be converted.
        data_format: Output format key (json, csv, sql, ...).
        template_name: Name of the selected schema template (shown in stats).

    Returns:
        A 4-tuple ``(data, code_template, stats_markdown, schema_json)``.
        The last element echoes the input schema back unchanged. Failures
        are reported inside the tuple rather than raised.
    """
    try:
        # Validate schema
        if not schema_json or not schema_json.strip():
            return (
                "### ⚠️ No Data Generated\n\nPlease provide a schema definition.",
                "# No data generated\nPlease provide a schema definition.",
                "### 📊 Generation Statistics\nNo data generated yet",
                schema_json
            )

        count = _coerce_record_count(record_count)

        # Generate synthetic data
        result = synthetic_generator.generate_data(
            schema_description=schema_json,
            data_type=data_format,
            language=language,
            record_count=count
        )

        # The LLM wrapper reports failures in-band as "LLM Error: ..." text;
        # surface those (and empty replies) as failures instead of as data.
        if not isinstance(result, str) or not result.strip():
            message = "The LLM returned an empty response."
            return (
                f"❌ Error: {message}",
                f"# Error generating data\n\n{message}",
                f"### ❌ Generation Failed\n\nError: {message}",
                schema_json
            )
        if result.startswith("LLM Error:"):
            return (
                f"❌ Error: {result}",
                f"# Error generating data\n\n{result}",
                f"### ❌ Generation Failed\n\nError: {result}",
                schema_json
            )

        # Extract data from result
        generated_data = _extract_generated_block(result, data_format)
        if generated_data is None:
            # No data found in expected format, return raw result
            if len(result) > 2000:
                shown = result[:2000] + "\n\n... (truncated)"
            else:
                shown = result
            return (
                shown,
                f"# Generated Output\n\n{result}",
                "### ⚠️ Generation completed but format may need review\n\nThe data may not be in the expected format. Please check the output above.",
                schema_json
            )

        # Generate usage example based on format
        code_template = _build_code_template(data_format, language, generated_data)

        # Generation statistics (a real Markdown table and bullet lists so the
        # rows do not collapse into a single paragraph when rendered)
        stats = "\n".join([
            "### 📊 Generation Statistics",
            "",
            "| Metric | Value |",
            "|---|---|",
            f"| Records Generated | {count} |",
            f"| Language | {language} |",
            f"| Format | {data_format.upper()} |",
            f"| Template | {template_name} |",
            "| Status | ✅ Success |",
            f"| Generated | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} |",
            "",
            "**Data Quality Notes:**",
            "",
            "- ✅ Realistic test data created",
            "- ✅ Includes edge cases and variations",
            "- ✅ Ready for immediate use in tests",
            "- ✅ Follows schema specifications",
            "",
            "**Special Cases Included:**",
            "",
            "- Boundary values",
            "- Edge cases",
            "- Null/empty values (where applicable)",
            "- Extreme values",
            "",
        ])
        return (
            generated_data,
            code_template,
            stats,
            schema_json
        )
    except Exception as e:
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}\n"
        return (
            error_msg,
            f"# Error generating data\n\n{str(e)}",
            f"### ❌ Generation Failed\n\nError: {str(e)}",
            schema_json
        )
def preview_schema(schema_json):
    """Preview and validate schema.

    Args:
        schema_json: Schema text from the UI.

    Returns:
        Markdown text: a warning for blank input, a fenced JSON preview
        (first 500 characters of the re-serialized schema) plus a field
        count when the text parses as JSON, or a fenced excerpt (first 300
        characters) when it does not. Unexpected errors are returned as an
        error message instead of being raised.
    """
    try:
        if not schema_json or not schema_json.strip():
            return "⚠️ No schema provided"

        # Try to parse as JSON for validation
        try:
            parsed = json.loads(schema_json)
        except json.JSONDecodeError:
            excerpt = schema_json[:300]
            # Fenced so Markdown keeps the text's own line breaks.
            return (
                "### 📝 Schema Description (Text Format)\n\n"
                f"```\n{excerpt}\n```\n\n"
                "Note: The schema is in text format. The AI will interpret and generate appropriate test data.\n"
            )

        pretty = json.dumps(parsed, indent=2, ensure_ascii=False)
        preview = pretty[:500]
        if len(pretty) > 500:
            # Make the cut visible rather than showing silently broken JSON.
            preview += "\n... (truncated)"
        field_info = len(parsed) if isinstance(parsed, dict) else 'Multiple records'
        # The preview sits inside a json fence; without it Markdown collapses
        # the indentation and line breaks into one paragraph.
        return (
            "### ✅ Valid JSON Schema\n\n"
            "Structure Preview:\n\n"
            f"```json\n{preview}\n```\n\n"
            f"Fields Detected: {field_info}\n\n"
            "This schema will generate realistic test data based on the field definitions.\n"
        )
    except Exception as e:
        return f"❌ Error previewing schema: {str(e)}"
# ============================================
# Gradio UI (Standalone with Home Tab)
# ============================================
with gr.Blocks(title="Synthetic Test Data Generator") as demo:
    gr.Markdown("# 🎲 Synthetic Test Data Generator")
    gr.Markdown("### Create Realistic Test Data for Your Applications with AI")

    # Explicit Tabs container with per-tab ids, so the Home button can switch
    # tabs through a `selected` update instead of querying the rendered DOM.
    with gr.Tabs() as main_tabs:
        # ============================================
        # Tab 0: Home
        # ============================================
        with gr.Tab("🏠 Home", id="home"):
            gr.Markdown("""
🎯 Welcome to Synthetic Test Data Generator
... (المحتوى)
""")
            # Stats and Use Cases
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 📊 Quick Stats")
                    gr.Markdown("""
Formats Supported: 7
Max Records: 100 per generation
Templates: 7 pre-built
Languages: 4
""")
                with gr.Column():
                    gr.Markdown("### 🎯 Use Cases")
                    gr.Markdown("""
API Testing
Database Seeding
Load Testing
UI Testing
Integration Tests
Demo Data Creation
""")
            # Quick Action Button
            gr.Markdown("---")
            gr.Markdown("### 🔧 Quick Action")
            quick_generate_btn = gr.Button("🚀 Go to Generator Tab", variant="primary")
            quick_generate_btn.click(
                fn=lambda: gr.update(selected="generator"),
                inputs=[],
                outputs=[main_tabs]
            )

        # ============================================
        # Tab 1: Generator (Main) - must live inside the demo block
        # ============================================
        with gr.Tab("Generator", id="generator") as generator_tab:

            def describe_selected_format(format_type):
                """Return the format blurb with the same prefix used for the initial text."""
                return "Format Info: " + get_output_format_description(format_type)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Configuration")
                    # Language Selection
                    data_language = gr.Dropdown(
                        choices=["python", "javascript", "java", "csharp"],
                        value="python",
                        label="🎯 Target Language"
                    )
                    # Format Selection
                    data_format = gr.Dropdown(
                        choices=["json", "csv", "sql", "xml", "python_dict", "yaml", "excel_style"],
                        value="json",
                        label="📄 Output Format"
                    )
                    # Format description
                    format_desc = gr.Markdown(describe_selected_format("json"))
                    # Record count
                    data_count = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=10,
                        step=1,
                        label="📊 Number of Records"
                    )
                    gr.Markdown("### 📝 Schema Definition")
                    # Template selection
                    data_template = gr.Dropdown(
                        choices=[
                            "custom", "user_profile", "ecommerce_order", "api_request_log",
                            "contact_info", "financial_transaction", "healthcare_patient"
                        ],
                        value="custom",
                        label="📁 Schema Template"
                    )
                    # Schema input
                    schema_input = gr.Textbox(
                        value='''{
"user_id": "uuid",
"full_name": "name",
"email": "email",
"age": {"type": "int", "min": 18, "max": 99},
"is_active": "boolean",
"country": "country"
}''',
                        lines=15,
                        label="Schema Definition"
                    )
                    # Schema preview
                    preview_btn = gr.Button("🔍 Preview Schema", size="sm")
                    schema_preview = gr.Markdown("")
                    # Actions
                    generate_btn = gr.Button("✨ Generate Test Data", variant="primary", size="lg")
                with gr.Column(scale=1):
                    gr.Markdown("### 📊 Generation Statistics")
                    data_stats = gr.Markdown("No data generated yet")
                    gr.Markdown("### 📝 Generated Data")
                    data_output = gr.Code(label="Test Data", language="json", lines=15)
                    gr.Markdown("### 💻 Ready-to-use Code Template")
                    code_output = gr.Code(label="Code Template", language="python", lines=12)

            # Helper functions
            def update_code_language(format_type):
                """Pick the syntax-highlighting mode of the data box for the chosen format."""
                # NOTE(review): only names gr.Code is believed to accept are
                # used; CSV has no matching mode, so it falls back to None
                # (plain text), and XML reuses the HTML mode. Confirm against
                # the installed Gradio version's supported language list.
                lang_map = {
                    "json": "json",
                    "csv": None,
                    "sql": "sql",
                    "xml": "html",
                    "python_dict": "python",
                    "yaml": "yaml",
                    "excel_style": "markdown"
                }
                return gr.update(language=lang_map.get(format_type))

            # Event Handlers
            data_template.change(
                fn=load_schema_template,
                inputs=[data_template],
                outputs=[schema_input]
            )
            preview_btn.click(
                fn=preview_schema,
                inputs=[schema_input],
                outputs=[schema_preview]
            )
            generate_btn.click(
                fn=generate_synthetic_data,
                inputs=[data_language, schema_input, data_count, data_format, data_template],
                outputs=[data_output, code_output, data_stats, schema_input]
            )
            # Keeps the "Format Info: " prefix that the initial text carries.
            data_format.change(
                fn=describe_selected_format,
                inputs=[data_format],
                outputs=[format_desc]
            )
            data_format.change(
                fn=update_code_language,
                inputs=[data_format],
                outputs=[data_output]
            )

        # ============================================
        # Tab 2: Templates Library
        # ============================================
        with gr.Tab("📚 Templates Library", id="templates"):
            gr.Markdown("# 📚 Schema Templates Library")
            gr.Markdown("Browse and learn from these example schemas")

            def show_template_details(template_name):
                """Return ``(schema_text, description)`` for a library template.

                Names without a description fall back to "user_profile" for
                both the schema and the description, so the pair always
                belongs to the same template.
                """
                descriptions = {
                    "user_profile": "User Profile Schema\n\nPerfect for testing user management systems, authentication, and profile features.\n\nUse Cases:\n- User registration tests\n- Profile management\n- User search/filtering\n- Data export features",
                    "ecommerce_order": "E-commerce Order Schema\n\nIdeal for testing shopping carts, checkout flows, and order management systems.\n\nUse Cases:\n- Order creation tests\n- Cart calculations\n- Inventory management\n- Order history features",
                    "api_request_log": "API Request Log Schema\n\nGreat for testing logging systems, analytics, and monitoring tools.\n\nUse Cases:\n- API monitoring tests\n- Log analysis\n- Performance metrics\n- Error tracking",
                    "contact_info": "Contact Information Schema\n\nUseful for CRM systems, address books, and contact management features.\n\nUse Cases:\n- Contact import/export\n- Address validation\n- Phone number formatting\n- Data enrichment tests",
                    "financial_transaction": "Financial Transaction Schema\n\nEssential for banking, payment systems, and financial applications.\n\nUse Cases:\n- Payment processing tests\n- Transaction history\n- Fraud detection\n- Statement generation",
                    "healthcare_patient": "Healthcare Patient Schema\n\nDesigned for healthcare systems, patient management, and medical records.\n\nUse Cases:\n- Patient registration\n- Medical records\n- Appointment scheduling\n- Insurance verification",
                }
                resolved = template_name if template_name in descriptions else "user_profile"
                return load_schema_template(resolved), descriptions[resolved]

            # Load initial template (passed to the constructors below rather
            # than assigned to `.value` after the components exist)
            initial_schema, initial_desc = show_template_details("user_profile")

            template_selector = gr.Dropdown(
                choices=[
                    "user_profile", "ecommerce_order", "api_request_log",
                    "contact_info", "financial_transaction", "healthcare_patient"
                ],
                value="user_profile",
                label="Select Template to View"
            )
            template_display = gr.Code(
                value=initial_schema,
                label="Template Schema",
                language="json",
                lines=20
            )
            template_description = gr.Markdown(initial_desc)

            template_selector.change(
                fn=show_template_details,
                inputs=[template_selector],
                outputs=[template_display, template_description]
            )

            gr.Markdown("---")
            gr.Markdown("### 💡 Tips for Creating Custom Schemas")
            gr.Markdown("""
- Use descriptive field names - The AI understands context (e.g., 'email' vs 'e')
- Specify data types - Include hints like 'string', 'integer', 'decimal', 'date'
- Add constraints - Use ranges like 'age: 18-80' or patterns like 'email format'
- Include enums - For fields with limited options (e.g., 'status: pending,active,closed')
- Structure nested data - Use JSON objects or arrays for complex relationships
""")

        # ============================================
        # Tab 3: About & Help
        # ============================================
        with gr.Tab("ℹ️ About & Help", id="about"):
            gr.Markdown("""
ℹ️ **About Synthetic Test Data Generator**
**What is this tool?**
This tool uses AI (Groq's LLM) to generate realistic, production-like test data based on your schema definitions. It helps QA engineers and developers create comprehensive test datasets quickly and efficiently.
... (باقي المحتوى)
""")
# ============================================
# Launch Application
# ============================================
if __name__ == "__main__":
    # Start-up banner: each message is framed by a 60-character rule.
    rule = "=" * 60
    banner = (
        rule,
        "🎲 Synthetic Test Data Generator",
        rule,
        "✅ All components loaded successfully!",
        rule,
        "🚀 Launching application...",
        rule,
    )
    for banner_line in banner:
        print(banner_line)

    # Launch settings gathered in one place, then handed to Gradio.
    launch_options = dict(
        share=True,
        debug=False,
        server_name="0.0.0.0",
        server_port=7860,
        theme=gr.themes.Soft(),
    )
    demo.launch(**launch_options)