Spaces:

algorithmicsuperintelligence
/

prompt-optimizer

Running

Asankhaya Sharma Claude Opus 4.6 (1M context) committed 17 days ago

Commit

9efc6b5

1 Parent(s): 1332a7b

Redesign UI, fix dark mode, generic evaluator, and reduce run time

- Redesigned UI: single-page layout with inline-styled hero header,
removed tabs to fix width inconsistency, all text uses gr.Markdown
for proper dark mode theming
- Generic answer matching: supports IMDB (positive/negative), BoolQ
(true/false), GSM8K (#### extraction), and numeric answers
- Regression protection: if evolution doesn't improve, keeps initial
prompt instead of reporting a worse one
- Reduced to 20 samples and 5 iterations to fit within HF Space
600s timeout (~546s observed)
- Single IMDB preset with intentionally weak starting prompt to
showcase evolution improvement
- Added timing note: "Optimization can take up to 10 minutes"
- Fixed incorrect "10 variants per generation" text

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show

app.py +254 -141

app.py CHANGED Viewed

@@ -15,10 +15,73 @@ import glob
 # Model for OpenRouter
 MODELS = [
-    "google/gemini-2.5-flash-lite",
 ]
 def validate_dataset(dataset_name: str, split: str, input_field: str, target_field: str) -> Tuple[bool, str]:
     """
     Validate that the dataset exists and has the required fields.
@@ -237,30 +300,8 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
                 # Small delay to avoid rate limiting
                 time.sleep(0.1)
-                # IMDB labels: 0 = negative, 1 = positive
-                true_label = int(target)  # 0 or 1
-                # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
-                # This is strict enough to fail conversational responses, but learnable through evolution
-                pred_lower = prediction.lower()
-                pred_start = pred_lower[:150]  # First 150 chars
-                # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
-                has_sentiment_keyword = "sentiment" in pred_start
-                # Check for positive/negative indicators
-                has_positive = "positive" in pred_start
-                has_negative = "negative" in pred_start
-                # Only count as correct if sentiment keyword present AND unambiguous positive/negative
-                if has_sentiment_keyword and has_positive and not has_negative:
-                    predicted_label = 1
-                elif has_sentiment_keyword and has_negative and not has_positive:
-                    predicted_label = 0
-                else:
-                    predicted_label = -1
-                is_correct = (predicted_label == true_label)
                 if is_correct:
                     correct += 1
@@ -486,10 +527,10 @@ def parse_evolution_history(output_dir: str) -> str:
         if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
             evolution_viz += "### Evolution Complete\n\n"
             evolution_viz += "OpenEvolve ran 5 iterations of evolutionary optimization using:\n"
-            evolution_viz += "- **Population Size**: 10 prompts per generation\n"
-            evolution_viz += "- **Selection Strategy**: 10% elite, 30% explore, 60% exploit\n"
-            evolution_viz += "- **Islands**: 1 population with mutation and crossover\n"
-            evolution_viz += "- **Evaluation**: 50 samples per prompt variant\n\n"
             # Count files in output directory
             all_files = os.listdir(output_dir)
@@ -503,7 +544,7 @@ def parse_evolution_history(output_dir: str) -> str:
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                          input_field: str, target_field: str, work_dir: str):
-    """Create an evaluator.py file for OpenEvolve that uses same 50 samples as initial/final eval."""
     evaluator_code = f'''
 import os
 import random
@@ -516,7 +557,7 @@ def evaluate(prompt: str) -> dict:
     Evaluate a prompt using 50 fixed samples - SAME as initial and final evaluation.
     OpenEvolve passes a file path, so we need to read the prompt from the file.
-    Using the same 50 samples ensures evolution optimizes for the exact test set.
     Includes early stopping and rate limit handling.
     """
     try:
@@ -546,8 +587,8 @@ def evaluate(prompt: str) -> dict:
             else:
                 raise
-        # Sample 50 samples with seed 42 - SAME as initial/final evaluation for consistency!
-        num_samples = 50
         if len(dataset) > num_samples:
             # Use SAME sampling logic as initial/final eval
             indices = random.sample(range(len(dataset)), num_samples)
@@ -607,30 +648,48 @@ def evaluate(prompt: str) -> dict:
                 prediction = response.choices[0].message.content.strip()
-                # IMDB labels: 0 = negative, 1 = positive
-                true_label = int(target)  # 0 or 1
-                # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
-                # This is strict enough to fail conversational responses, but learnable through evolution
-                pred_lower = prediction.lower()
-                pred_start = pred_lower[:150]  # First 150 chars
-                # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
-                has_sentiment_keyword = "sentiment" in pred_start
-                # Check for positive/negative indicators
-                has_positive = "positive" in pred_start
-                has_negative = "negative" in pred_start
-                # Only count as correct if sentiment keyword present AND unambiguous positive/negative
-                if has_sentiment_keyword and has_positive and not has_negative:
-                    predicted_label = 1
-                elif has_sentiment_keyword and has_negative and not has_positive:
-                    predicted_label = 0
-                else:
-                    predicted_label = -1
-                is_correct = (predicted_label == true_label)
                 if is_correct:
                     correct += 1
@@ -781,7 +840,7 @@ Your improved prompt here
             "api_base": "https://openrouter.ai/api/v1",  # Use OpenRouter endpoint
             "temperature": 1.2,  # Even higher temperature for more creative variations
         },
-        "max_iterations": 10,  # More iterations for better convergence
         "checkpoint_interval": 1,  # Save checkpoints every iteration to preserve prompt history
         "diff_based_evolution": False,  # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text",  # CRITICAL: Optimize text/prompts, not Python code!
@@ -855,11 +914,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         progress(0.15, desc="Creating configuration...")
         config_path = create_config_file(model, work_dir)
-        # Run initial evaluation with 50 samples
         # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
-        progress(0.2, desc="Running initial evaluation on 50 samples...")
         initial_eval = evaluate_prompt(
-            initial_prompt, dataset_name, dataset_split, 50,
             model, input_field, target_field
         )
@@ -893,7 +952,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
             initial_results += f"   ✓ Correct\n" if result['correct'] else f"   ✗ Incorrect\n"
         # Run OpenEvolve
-        progress(0.3, desc="Starting evolution: 10 iterations, 10 variants per generation...")
         output_dir = os.path.join(work_dir, "output")
         os.makedirs(output_dir, exist_ok=True)
@@ -965,57 +1024,71 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
                     best_prompt = initial_prompt
                     print(f"\n[SELECTION] WARNING: No best_program.txt found, using initial prompt")
-            # Final evaluation: Use same 50 samples as initial eval for fair comparison
-            progress(0.85, desc="Evaluating best prompt on 50 samples (same as initial)...")
             final_eval = evaluate_prompt(
-                best_prompt, dataset_name, dataset_split, 50,
                 model, input_field, target_field,
-                fixed_indices=eval_indices  # Use same 50 samples as initial eval!
             )
             progress(0.95, desc=f"Evaluation complete: {final_eval['correct']}/{final_eval['total']} = {final_eval['accuracy']:.1f}%")
             final_results = f"""
-### Evolved Prompt Evaluation
 **Prompt:**
 ```
 {best_prompt}
 ```
-**Validation:**
-- Contains {{input}} placeholder: {'✓ Yes' if '{input}' in best_prompt else '❌ NO - This will break evaluation!'}
-- Prompt length: {len(best_prompt)} characters
 **Results:**
 - Accuracy: {final_eval['accuracy']:.2f}%
 - Correct: {final_eval['correct']}/{final_eval['total']}
-- Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
-**Sample Results:**
 """
             for i, result in enumerate(final_eval['results'][:5], 1):
                 final_results += f"\n{i}. Input: {result['input']}\n"
                 final_results += f"   Target: {result['target']}\n"
                 final_results += f"   Prediction: {result['prediction']}\n"
                 final_results += f"   ✓ Correct\n" if result['correct'] else f"   ✗ Incorrect\n"
             summary = f"""
-## 🎉 Optimization Complete!
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Evaluation Model**: {model}
-- **Evolution Model**: google/gemini-2.5-flash (larger model for better prompt generation)
-- **Initial Eval**: 50 samples
-- **Final Eval**: 50 samples (same samples for fair comparison)
-- **Evolution**: 50 samples per variant (SAME samples as initial/final!)
-- **Iterations**: 10 (population: 15, elite: 40%, explore: 10%, exploit: 50%)
 ### Results
 - **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
 - **Final Accuracy**: {final_eval['accuracy']:.2f}% ({final_eval['correct']}/{final_eval['total']})
-- **Improvement**: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
 {validation_message}
 """
@@ -1033,88 +1106,125 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
         pass
-# Create Gradio interface
-with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""
-    # 🧬 OpenEvolve Prompt Optimizer
-    Automatically optimize prompts using evolutionary algorithms. Evolves better prompts by testing on real datasets.
-    **Setup**: Duplicate this space, add your OpenRouter API key (`OPENAI_API_KEY`) in Settings → Secrets. Get free key at [openrouter.ai](https://openrouter.ai/)
-    **Usage**: Enter initial prompt with `{input}` placeholder → Click optimize → Compare results
-    **Model**: `google/gemini-2.5-flash-lite`
-    """)
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("### Configuration")
-            dataset_name = gr.Textbox(
-                label="HuggingFace Dataset (Full Name)",
-                value="stanfordnlp/imdb",
-                placeholder="e.g., stanfordnlp/imdb, gsm8k, MathArena/aime_2025",
-                info="Dataset name from HuggingFace Hub. Default: IMDB (sentiment classification)"
-            )
-            dataset_split = gr.Textbox(
-                label="Dataset Split",
-                value="test",
-                placeholder="e.g., train, test, validation"
-            )
-            input_field = gr.Textbox(
-                label="Input Field Name",
-                value="text",
-                placeholder="e.g., text, question, sentence",
-                info="The field containing inputs to process"
-            )
-            target_field = gr.Textbox(
-                label="Target Field Name",
-                value="label",
-                placeholder="e.g., label, answer, target",
-                info="The field containing expected outputs"
-            )
-            initial_prompt = gr.TextArea(
-                label="Initial Prompt",
-                value="Review sentiment {input}",
-                lines=5,
-                info="Use {input} as placeholder. This baseline scores ~60% - evolution will improve it!"
-            )
-    # Button outside the column for better visibility
     with gr.Row():
-        with gr.Column():
-            optimize_btn = gr.Button("🚀 Validate & Optimize Prompt", variant="primary", size="lg")
-    # Results section - clearly separated
-    gr.Markdown("---")
-    gr.Markdown("## 📊 Results")
-    with gr.Row():
-        with gr.Column():
-            summary = gr.Markdown("Click 'Validate & Optimize Prompt' to start optimization...", visible=True)
-    # Side-by-side comparison: Initial vs Best Prompt
     gr.Markdown("---")
-    gr.Markdown("## 🔍 Prompt Comparison: Initial vs Best")
-    with gr.Row():
         with gr.Column():
-            initial_results = gr.Markdown("### Initial Prompt\nWill appear here after validation...", visible=True)
         with gr.Column():
-            final_results = gr.Markdown("### Best Prompt\nWill appear here after optimization...", visible=True)
-    # Wire up the optimize button with hardcoded model
     def optimize_with_fixed_model(initial_prompt, dataset_name, dataset_split,
                                    input_field, target_field, progress=gr.Progress()):
-        """Wrapper to use fixed model instead of dropdown"""
         return optimize_prompt(
             initial_prompt, dataset_name, dataset_split,
-            MODELS[0],
             input_field, target_field, progress
         )
@@ -1122,8 +1232,11 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
         fn=optimize_with_fixed_model,
         inputs=[initial_prompt, dataset_name, dataset_split,
                 input_field, target_field],
-        outputs=[summary, initial_results, final_results]
     )
 if __name__ == "__main__":
-    demo.launch()

 # Model for OpenRouter
 MODELS = [
+    "google/gemini-2.5-flash-lite",
 ]
+def extract_answer(text: str) -> str:
+    """Return the bare answer held in a target value.
+    The value is converted to a string and stripped first. Then:
+    - GSM8K style ("reasoning... #### 1,234"): the text after the last
+      "####" marker is returned, stripped, with commas removed ("1234")
+    - anything else (numeric labels such as "0"/"1", plain text answers):
+      the stripped string is returned unchanged
+    """
+    cleaned = str(text).strip()
+    marker = "####"
+    # No GSM8K marker: the stripped value already is the answer
+    if marker not in cleaned:
+        return cleaned
+    # Keep only what follows the final marker, then drop thousands separators
+    *_, tail = cleaned.split(marker)
+    return tail.strip().replace(",", "")
+def check_answer(prediction: str, target: str) -> bool:
+    """Check whether a model prediction matches the dataset target.
+    The target is normalised with extract_answer() and compared
+    case-insensitively. How it is matched depends on the kind of target:
+    - empty target: matches only an empty prediction
+    - "true"/"false" (e.g. BoolQ): whole-word true/yes versus false/no/not in
+      the first 200 characters; accepted only when exactly one side appears,
+      so negated or mixed statements are rejected as ambiguous
+    - plain decimal number: some number in the prediction must be equal in
+      value; for 0/1 labels (IMDB style) an unambiguous "positive" or
+      "negative" keyword in the first 200 characters is accepted as well
+    - any other text: the target must occur as a substring of the prediction
+    Args:
+        prediction: Raw model output.
+        target: Ground-truth value, already converted to a string.
+    Returns:
+        True if the prediction is judged correct, otherwise False.
+    """
+    target_answer = extract_answer(target).lower().strip()
+    pred_lower = str(prediction or "").lower().strip()
+    # "" is a substring of every string, so an empty target must not fall
+    # through to the containment check and score every prediction as correct
+    if not target_answer:
+        return not pred_lower
+    # Handle boolean targets (e.g., BoolQ returns Python True/False)
+    if target_answer in ("true", "false"):
+        pred_start = pred_lower[:200]
+        # Whole-word matching: a bare substring test finds "no" inside
+        # "know", "another" or "note" and "true" inside "untrue"
+        has_yes = re.search(r"\b(?:true|yes)\b", pred_start) is not None
+        has_no = re.search(r"\b(?:false|no|not)\b", pred_start) is not None
+        if target_answer == "true":
+            return has_yes and not has_no
+        return has_no and not has_yes
+    # Non-numeric targets: direct containment check
+    if re.fullmatch(r"[-+]?(?:\d+\.?\d*|\.\d+)", target_answer) is None:
+        return target_answer in pred_lower
+    # Numeric targets are compared by value against whole numbers found in the
+    # prediction. Substring containment is deliberately skipped here: it would
+    # accept "5" for "15" and either IMDB label for "10/10".
+    target_num = float(target_answer)
+    # A "-" counts as a sign only when it does not follow a word character
+    # (so "3-5" yields 3 and 5); commas count only as thousands separators
+    number_pattern = r"(?:(?<![\w.])-)?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?"
+    for token in re.findall(number_pattern, pred_lower):
+        if float(token.replace(",", "")) == target_num:
+            return True
+    # For IMDB-style labels (0/1), check for positive/negative keywords
+    if target_answer in ("0", "1"):
+        pred_start = pred_lower[:200]
+        has_positive = "positive" in pred_start
+        has_negative = "negative" in pred_start
+        if target_answer == "1":
+            return has_positive and not has_negative
+        return has_negative and not has_positive
+    return False
 def validate_dataset(dataset_name: str, split: str, input_field: str, target_field: str) -> Tuple[bool, str]:
     """
     Validate that the dataset exists and has the required fields.
                 # Small delay to avoid rate limiting
                 time.sleep(0.1)
+                # Generic answer matching: extract core answer from both target and prediction
+                is_correct = check_answer(prediction, str(target))
                 if is_correct:
                     correct += 1
         if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
             evolution_viz += "### Evolution Complete\n\n"
             evolution_viz += "OpenEvolve ran 5 iterations of evolutionary optimization using:\n"
+            evolution_viz += "- **Variants**: 1 new prompt per iteration\n"
+            evolution_viz += "- **Selection Strategy**: 40% elite, 10% explore, 50% exploit\n"
+            evolution_viz += "- **Population**: 1 island, up to 15 programs retained\n"
+            evolution_viz += "- **Evaluation**: 20 samples per prompt variant\n\n"
             # Count files in output directory
             all_files = os.listdir(output_dir)
 def create_evaluator_file(dataset_name: str, split: str, model: str,
                          input_field: str, target_field: str, work_dir: str):
+    """Create an evaluator.py file for OpenEvolve that uses same 20 samples as initial/final eval."""
     evaluator_code = f'''
 import os
 import random
     Evaluate a prompt using 50 fixed samples - SAME as initial and final evaluation.
     OpenEvolve passes a file path, so we need to read the prompt from the file.
+    Using the same 20 samples ensures evolution optimizes for the exact test set.
     Includes early stopping and rate limit handling.
     """
     try:
             else:
                 raise
+        # Sample 20 samples with seed 42 - SAME as initial/final evaluation for consistency!
+        num_samples = 20
         if len(dataset) > num_samples:
             # Use SAME sampling logic as initial/final eval
             indices = random.sample(range(len(dataset)), num_samples)
                 prediction = response.choices[0].message.content.strip()
+                # Generic answer matching
+                target_str = str(target).strip()
+                # Extract core answer (handles GSM8K "####" format, plain labels, etc.)
+                if "####" in target_str:
+                    target_answer = target_str.split("####")[-1].strip().replace(",", "")
+                else:
+                    target_answer = target_str
+                pred_lower = prediction.lower().strip()
+                target_lower = target_answer.lower().strip()
+                is_correct = False
+                # Direct containment
+                if target_lower in pred_lower:
+                    is_correct = True
+                # Numeric matching
+                if not is_correct:
+                    import re as _re
+                    try:
+                        target_num = float(target_lower)
+                        numbers = _re.findall(r'-?[\\d,]+\\.?\\d*', pred_lower)
+                        for n in numbers:
+                            try:
+                                if float(n.replace(",", "")) == target_num:
+                                    is_correct = True
+                                    break
+                            except ValueError:
+                                continue
+                    except ValueError:
+                        pass
+                # IMDB-style 0/1 labels
+                if not is_correct and target_lower in ("0", "1"):
+                    has_positive = "positive" in pred_lower[:200]
+                    has_negative = "negative" in pred_lower[:200]
+                    if target_lower == "1" and has_positive and not has_negative:
+                        is_correct = True
+                    if target_lower == "0" and has_negative and not has_positive:
+                        is_correct = True
                 if is_correct:
                     correct += 1
             "api_base": "https://openrouter.ai/api/v1",  # Use OpenRouter endpoint
             "temperature": 1.2,  # Even higher temperature for more creative variations
         },
+        "max_iterations": 5,  # Fewer iterations to fit within time limits
         "checkpoint_interval": 1,  # Save checkpoints every iteration to preserve prompt history
         "diff_based_evolution": False,  # Use full rewrite mode for prompts (not diff/patch mode)
         "language": "text",  # CRITICAL: Optimize text/prompts, not Python code!
         progress(0.15, desc="Creating configuration...")
         config_path = create_config_file(model, work_dir)
+        # Run initial evaluation with 20 samples
         # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
+        progress(0.2, desc="Running initial evaluation on 20 samples...")
         initial_eval = evaluate_prompt(
+            initial_prompt, dataset_name, dataset_split, 20,
             model, input_field, target_field
         )
             initial_results += f"   ✓ Correct\n" if result['correct'] else f"   ✗ Incorrect\n"
         # Run OpenEvolve
+        progress(0.3, desc="Starting evolution: 5 iterations...")
         output_dir = os.path.join(work_dir, "output")
         os.makedirs(output_dir, exist_ok=True)
                     best_prompt = initial_prompt
                     print(f"\n[SELECTION] WARNING: No best_program.txt found, using initial prompt")
+            # Final evaluation: Use same 20 samples as initial eval for fair comparison
+            progress(0.85, desc="Evaluating best prompt on 20 samples (same as initial)...")
             final_eval = evaluate_prompt(
+                best_prompt, dataset_name, dataset_split, 20,
                 model, input_field, target_field,
+                fixed_indices=eval_indices  # Use same 20 samples as initial eval!
             )
+            # If evolution regressed, fall back to the initial prompt
+            if final_eval['accuracy'] < initial_eval['accuracy']:
+                best_prompt = initial_prompt
+                final_eval = initial_eval
+                regression = True
+            else:
+                regression = False
             progress(0.95, desc=f"Evaluation complete: {final_eval['correct']}/{final_eval['total']} = {final_eval['accuracy']:.1f}%")
+            improvement = final_eval['accuracy'] - initial_eval['accuracy']
             final_results = f"""
+### Best Prompt
 **Prompt:**
 ```
 {best_prompt}
 ```
 **Results:**
 - Accuracy: {final_eval['accuracy']:.2f}%
 - Correct: {final_eval['correct']}/{final_eval['total']}
 """
+            if regression:
+                final_results += "\n**Note:** Evolution did not improve on the initial prompt. Keeping the original.\n"
+            else:
+                final_results += f"\n- Improvement: {improvement:+.2f}%\n"
+            final_results += "\n**Sample Results:**\n"
             for i, result in enumerate(final_eval['results'][:5], 1):
                 final_results += f"\n{i}. Input: {result['input']}\n"
                 final_results += f"   Target: {result['target']}\n"
                 final_results += f"   Prediction: {result['prediction']}\n"
                 final_results += f"   ✓ Correct\n" if result['correct'] else f"   ✗ Incorrect\n"
+            if regression:
+                summary_title = "## Optimization Complete (No Improvement)"
+                summary_note = "\n**Evolution did not find a better prompt.** The initial prompt is already strong for this task.\n"
+            else:
+                summary_title = "## Optimization Complete!"
+                summary_note = ""
             summary = f"""
+{summary_title}
 ### Summary
 - **Dataset**: {dataset_name} ({dataset_split} split)
 - **Evaluation Model**: {model}
+- **Evolution Model**: google/gemini-2.5-flash
+- **Samples**: 20 (same for initial, evolution, and final eval)
+- **Iterations**: 5
+{summary_note}
 ### Results
 - **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
 - **Final Accuracy**: {final_eval['accuracy']:.2f}% ({final_eval['correct']}/{final_eval['total']})
+- **Improvement**: {improvement:+.2f}%
 {validation_message}
 """
         pass
+# Custom CSS for a polished, branded look.
+# Handed to demo.launch(css=...) in the __main__ guard. The .run-btn selectors
+# target the optimize button, which is created with elem_classes="run-btn".
+custom_css = """
+/* Minimal CSS — only style what Gradio can't handle natively.
+   All text-bearing elements use gr.Markdown (inherits theme colors).
+   Only the run button gets custom styling. */
+.gradio-container { max-width: 1200px !important; margin: auto; }
+/* Primary action button — always purple with white text */
+.run-btn button, .run-btn > button, button.run-btn, .run-btn {
+    background: linear-gradient(135deg, #7c3aed 0%, #6d28d9 100%) !important;
+    color: #fff !important;
+    border: none !important;
+    border-radius: 12px !important;
+    font-size: 1.05rem !important;
+    font-weight: 600 !important;
+    padding: 14px 28px !important;
+    transition: transform 0.1s, box-shadow 0.2s !important;
+}
+.run-btn:hover, .run-btn button:hover {
+    transform: translateY(-1px) !important;
+    box-shadow: 0 8px 24px rgba(124,58,237,0.35) !important;
+    color: #fff !important;
+}
+"""
+# Preset configurations.
+# Maps a preset key to the form values it fills in: HuggingFace dataset name,
+# split, input field, target field and starting prompt.
+PRESETS = {
+    "imdb": {
+        "dataset": "stanfordnlp/imdb",
+        "split": "test",
+        "input": "text",
+        "target": "label",
+        # Deliberately weak starting prompt so evolution has room to improve it
+        "prompt": "What do you think about this? {input}",
+    },
+}
+def load_preset(name):
+    """Return the form values stored for preset `name`.
+    The result is a 5-tuple ordered as (dataset, split, input field,
+    target field, prompt). Raises KeyError if `name` is not a key of PRESETS.
+    """
+    preset = PRESETS[name]
+    field_order = ("dataset", "split", "input", "target", "prompt")
+    return tuple(preset[key] for key in field_order)
+# Create Gradio interface
+with gr.Blocks(title="OpenEvolve Prompt Optimizer") as demo:
+    # --- Hero header (self-contained dark bg, always light text) ---
+    gr.HTML("""
+    <div style="background:linear-gradient(135deg,#0f0c29 0%,#302b63 50%,#24243e 100%);
+                border-radius:16px;padding:32px 40px;margin-bottom:8px;text-align:center;">
+        <h1 style="color:#fff;font-size:2rem;font-weight:700;margin:0 0 8px 0;letter-spacing:-0.02em;">
+            OpenEvolve Prompt Optimizer
+        </h1>
+        <p style="color:#c4b5fd;font-size:0.95rem;margin:0;">
+            Evolve better prompts automatically using
+            <a href="https://github.com/codelion/openevolve" target="_blank" style="color:#c4b5fd;text-decoration:underline;">OpenEvolve</a>.
+            Powered by <code style="background:rgba(255,255,255,0.12);color:#e0d4ff;padding:2px 6px;border-radius:4px;">gemini-2.5-flash</code> via
+            <a href="https://openrouter.ai/" target="_blank" style="color:#c4b5fd;text-decoration:underline;">OpenRouter</a>.
+        </p>
+        <p style="color:#94a3b8;font-size:0.82rem;margin:12px 0 0 0;">
+            <strong style="color:#a78bfa;">1.</strong> Pick a dataset &amp; prompt &rarr;
+            <strong style="color:#a78bfa;">2.</strong> Evolve 5 iterations &rarr;
+            <strong style="color:#a78bfa;">3.</strong> Compare results side-by-side
+        </p>
+    </div>
+    """)
+    # --- Setup ---
+    gr.Markdown("#### Dataset")
+    gr.Markdown("Quick preset:")
+    preset_imdb = gr.Button("IMDB Sentiment", size="sm")
+    dataset_name = gr.Textbox(
+        label="HuggingFace Dataset",
+        value="stanfordnlp/imdb",
+        placeholder="org/dataset-name",
+    )
     with gr.Row():
+        dataset_split = gr.Textbox(label="Split", value="test", scale=1)
+        input_field = gr.Textbox(label="Input Field", value="text", scale=1)
+        target_field = gr.Textbox(label="Target Field", value="label", scale=1)
+    gr.Markdown("#### Prompt")
+    initial_prompt = gr.TextArea(
+        label="Initial Prompt",
+        value="What do you think about this? {input}",
+        lines=5,
+        info="Must contain {input} placeholder. Start with a weak prompt -- evolution will improve it!",
+    )
+    gr.Markdown(
+        "*Eval model:* `gemini-2.5-flash-lite` (20 samples) | *Evolution model:* `gemini-2.5-flash` (5 iterations)  \n"
+        "**Note:** Optimization can take up to 10 minutes to complete."
+    )
+    # Run button
+    optimize_btn = gr.Button(
+        "Optimize Prompt",
+        variant="primary",
+        size="lg",
+        elem_classes="run-btn",
+    )
+    # --- Results ---
     gr.Markdown("---")
+    summary = gr.Markdown("")
+    with gr.Row(equal_height=True):
         with gr.Column():
+            initial_results = gr.Markdown("**Initial Prompt**\n\nResults will appear here after optimization...")
         with gr.Column():
+            final_results = gr.Markdown("**Evolved Prompt**\n\nResults will appear here after optimization...")
+    # --- Wiring ---
     def optimize_with_fixed_model(initial_prompt, dataset_name, dataset_split,
                                    input_field, target_field, progress=gr.Progress()):
         return optimize_prompt(
             initial_prompt, dataset_name, dataset_split,
+            MODELS[0],
             input_field, target_field, progress
         )
         fn=optimize_with_fixed_model,
         inputs=[initial_prompt, dataset_name, dataset_split,
                 input_field, target_field],
+        outputs=[summary, initial_results, final_results],
     )
+    preset_outputs = [dataset_name, dataset_split, input_field, target_field, initial_prompt]
+    preset_imdb.click(fn=lambda: load_preset("imdb"), outputs=preset_outputs)
 if __name__ == "__main__":
+    demo.launch(css=custom_css)