Asankhaya Sharma Claude Opus 4.6 (1M context) committed on
Commit
9efc6b5
·
1 Parent(s): 1332a7b

Redesign UI, fix dark mode, generic evaluator, and reduce run time

Browse files

- Redesigned UI: single-page layout with inline-styled hero header,
removed tabs to fix width inconsistency, all text uses gr.Markdown
for proper dark mode theming
- Generic answer matching: supports IMDB (positive/negative), BoolQ
(true/false), GSM8K (#### extraction), and numeric answers
- Regression protection: if evolution doesn't improve, keeps initial
prompt instead of reporting a worse one
- Reduced to 20 samples and 5 iterations to fit within HF Space
600s timeout (~546s observed)
- Single IMDB preset with intentionally weak starting prompt to
showcase evolution improvement
- Added timing note: "Optimization can take up to 10 minutes"
- Fixed incorrect "10 variants per generation" text

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +254 -141
app.py CHANGED
@@ -15,10 +15,73 @@ import glob
15
 
16
  # Model for OpenRouter
17
  MODELS = [
18
- "google/gemini-2.5-flash-lite",
19
  ]
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def validate_dataset(dataset_name: str, split: str, input_field: str, target_field: str) -> Tuple[bool, str]:
23
  """
24
  Validate that the dataset exists and has the required fields.
@@ -237,30 +300,8 @@ def evaluate_prompt(prompt: str, dataset_name: str, split: str, num_samples: int
237
  # Small delay to avoid rate limiting
238
  time.sleep(0.1)
239
 
240
- # IMDB labels: 0 = negative, 1 = positive
241
- true_label = int(target) # 0 or 1
242
-
243
- # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
244
- # This is strict enough to fail conversational responses, but learnable through evolution
245
- pred_lower = prediction.lower()
246
- pred_start = pred_lower[:150] # First 150 chars
247
-
248
- # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
249
- has_sentiment_keyword = "sentiment" in pred_start
250
-
251
- # Check for positive/negative indicators
252
- has_positive = "positive" in pred_start
253
- has_negative = "negative" in pred_start
254
-
255
- # Only count as correct if sentiment keyword present AND unambiguous positive/negative
256
- if has_sentiment_keyword and has_positive and not has_negative:
257
- predicted_label = 1
258
- elif has_sentiment_keyword and has_negative and not has_positive:
259
- predicted_label = 0
260
- else:
261
- predicted_label = -1
262
-
263
- is_correct = (predicted_label == true_label)
264
 
265
  if is_correct:
266
  correct += 1
@@ -486,10 +527,10 @@ def parse_evolution_history(output_dir: str) -> str:
486
  if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
487
  evolution_viz += "### Evolution Complete\n\n"
488
  evolution_viz += "OpenEvolve ran 5 iterations of evolutionary optimization using:\n"
489
- evolution_viz += "- **Population Size**: 10 prompts per generation\n"
490
- evolution_viz += "- **Selection Strategy**: 10% elite, 30% explore, 60% exploit\n"
491
- evolution_viz += "- **Islands**: 1 population with mutation and crossover\n"
492
- evolution_viz += "- **Evaluation**: 50 samples per prompt variant\n\n"
493
 
494
  # Count files in output directory
495
  all_files = os.listdir(output_dir)
@@ -503,7 +544,7 @@ def parse_evolution_history(output_dir: str) -> str:
503
 
504
  def create_evaluator_file(dataset_name: str, split: str, model: str,
505
  input_field: str, target_field: str, work_dir: str):
506
- """Create an evaluator.py file for OpenEvolve that uses same 50 samples as initial/final eval."""
507
  evaluator_code = f'''
508
  import os
509
  import random
@@ -516,7 +557,7 @@ def evaluate(prompt: str) -> dict:
516
  Evaluate a prompt using 50 fixed samples - SAME as initial and final evaluation.
517
 
518
  OpenEvolve passes a file path, so we need to read the prompt from the file.
519
- Using the same 50 samples ensures evolution optimizes for the exact test set.
520
  Includes early stopping and rate limit handling.
521
  """
522
  try:
@@ -546,8 +587,8 @@ def evaluate(prompt: str) -> dict:
546
  else:
547
  raise
548
 
549
- # Sample 50 samples with seed 42 - SAME as initial/final evaluation for consistency!
550
- num_samples = 50
551
  if len(dataset) > num_samples:
552
  # Use SAME sampling logic as initial/final eval
553
  indices = random.sample(range(len(dataset)), num_samples)
@@ -607,30 +648,48 @@ def evaluate(prompt: str) -> dict:
607
 
608
  prediction = response.choices[0].message.content.strip()
609
 
610
- # IMDB labels: 0 = negative, 1 = positive
611
- true_label = int(target) # 0 or 1
612
 
613
- # FORMAT REQUIREMENT: Need "sentiment" keyword + positive/negative in first 150 chars
614
- # This is strict enough to fail conversational responses, but learnable through evolution
615
- pred_lower = prediction.lower()
616
- pred_start = pred_lower[:150] # First 150 chars
 
617
 
618
- # Must mention "sentiment" to get credit (helps evolution learn to add this keyword)
619
- has_sentiment_keyword = "sentiment" in pred_start
620
 
621
- # Check for positive/negative indicators
622
- has_positive = "positive" in pred_start
623
- has_negative = "negative" in pred_start
624
 
625
- # Only count as correct if sentiment keyword present AND unambiguous positive/negative
626
- if has_sentiment_keyword and has_positive and not has_negative:
627
- predicted_label = 1
628
- elif has_sentiment_keyword and has_negative and not has_positive:
629
- predicted_label = 0
630
- else:
631
- predicted_label = -1
632
 
633
- is_correct = (predicted_label == true_label)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
 
635
  if is_correct:
636
  correct += 1
@@ -781,7 +840,7 @@ Your improved prompt here
781
  "api_base": "https://openrouter.ai/api/v1", # Use OpenRouter endpoint
782
  "temperature": 1.2, # Even higher temperature for more creative variations
783
  },
784
- "max_iterations": 10, # More iterations for better convergence
785
  "checkpoint_interval": 1, # Save checkpoints every iteration to preserve prompt history
786
  "diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
787
  "language": "text", # CRITICAL: Optimize text/prompts, not Python code!
@@ -855,11 +914,11 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
855
  progress(0.15, desc="Creating configuration...")
856
  config_path = create_config_file(model, work_dir)
857
 
858
- # Run initial evaluation with 50 samples
859
  # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
860
- progress(0.2, desc="Running initial evaluation on 50 samples...")
861
  initial_eval = evaluate_prompt(
862
- initial_prompt, dataset_name, dataset_split, 50,
863
  model, input_field, target_field
864
  )
865
 
@@ -893,7 +952,7 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
893
  initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
894
 
895
  # Run OpenEvolve
896
- progress(0.3, desc="Starting evolution: 10 iterations, 10 variants per generation...")
897
 
898
  output_dir = os.path.join(work_dir, "output")
899
  os.makedirs(output_dir, exist_ok=True)
@@ -965,57 +1024,71 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
965
  best_prompt = initial_prompt
966
  print(f"\n[SELECTION] WARNING: No best_program.txt found, using initial prompt")
967
 
968
- # Final evaluation: Use same 50 samples as initial eval for fair comparison
969
- progress(0.85, desc="Evaluating best prompt on 50 samples (same as initial)...")
970
  final_eval = evaluate_prompt(
971
- best_prompt, dataset_name, dataset_split, 50,
972
  model, input_field, target_field,
973
- fixed_indices=eval_indices # Use same 50 samples as initial eval!
974
  )
975
 
 
 
 
 
 
 
 
 
976
  progress(0.95, desc=f"Evaluation complete: {final_eval['correct']}/{final_eval['total']} = {final_eval['accuracy']:.1f}%")
977
 
 
 
978
  final_results = f"""
979
- ### Evolved Prompt Evaluation
980
 
981
  **Prompt:**
982
  ```
983
  {best_prompt}
984
  ```
985
 
986
- **Validation:**
987
- - Contains {{input}} placeholder: {'✓ Yes' if '{input}' in best_prompt else '❌ NO - This will break evaluation!'}
988
- - Prompt length: {len(best_prompt)} characters
989
-
990
  **Results:**
991
  - Accuracy: {final_eval['accuracy']:.2f}%
992
  - Correct: {final_eval['correct']}/{final_eval['total']}
993
- - Improvement: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
994
-
995
- **Sample Results:**
996
  """
 
 
 
 
 
 
997
  for i, result in enumerate(final_eval['results'][:5], 1):
998
  final_results += f"\n{i}. Input: {result['input']}\n"
999
  final_results += f" Target: {result['target']}\n"
1000
  final_results += f" Prediction: {result['prediction']}\n"
1001
  final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
1002
 
 
 
 
 
 
 
 
1003
  summary = f"""
1004
- ## 🎉 Optimization Complete!
1005
 
1006
  ### Summary
1007
  - **Dataset**: {dataset_name} ({dataset_split} split)
1008
  - **Evaluation Model**: {model}
1009
- - **Evolution Model**: google/gemini-2.5-flash (larger model for better prompt generation)
1010
- - **Initial Eval**: 50 samples
1011
- - **Final Eval**: 50 samples (same samples for fair comparison)
1012
- - **Evolution**: 50 samples per variant (SAME samples as initial/final!)
1013
- - **Iterations**: 10 (population: 15, elite: 40%, explore: 10%, exploit: 50%)
1014
-
1015
  ### Results
1016
  - **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
1017
  - **Final Accuracy**: {final_eval['accuracy']:.2f}% ({final_eval['correct']}/{final_eval['total']})
1018
- - **Improvement**: {final_eval['accuracy'] - initial_eval['accuracy']:+.2f}%
1019
 
1020
  {validation_message}
1021
  """
@@ -1033,88 +1106,125 @@ def optimize_prompt(initial_prompt: str, dataset_name: str, dataset_split: str,
1033
  pass
1034
 
1035
 
1036
- # Create Gradio interface
1037
- with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as demo:
1038
- gr.Markdown("""
1039
- # 🧬 OpenEvolve Prompt Optimizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1040
 
1041
- Automatically optimize prompts using evolutionary algorithms. Evolves better prompts by testing on real datasets.
 
 
 
 
 
 
 
 
 
1042
 
1043
- **Setup**: Duplicate this space, add your OpenRouter API key (`OPENAI_API_KEY`) in Settings → Secrets. Get free key at [openrouter.ai](https://openrouter.ai/)
1044
 
1045
- **Usage**: Enter initial prompt with `{input}` placeholder → Click optimize → Compare results
 
 
1046
 
1047
- **Model**: `google/gemini-2.5-flash-lite`
1048
- """)
1049
 
1050
- with gr.Row():
1051
- with gr.Column():
1052
- gr.Markdown("### Configuration")
1053
-
1054
- dataset_name = gr.Textbox(
1055
- label="HuggingFace Dataset (Full Name)",
1056
- value="stanfordnlp/imdb",
1057
- placeholder="e.g., stanfordnlp/imdb, gsm8k, MathArena/aime_2025",
1058
- info="Dataset name from HuggingFace Hub. Default: IMDB (sentiment classification)"
1059
- )
1060
-
1061
- dataset_split = gr.Textbox(
1062
- label="Dataset Split",
1063
- value="test",
1064
- placeholder="e.g., train, test, validation"
1065
- )
1066
-
1067
- input_field = gr.Textbox(
1068
- label="Input Field Name",
1069
- value="text",
1070
- placeholder="e.g., text, question, sentence",
1071
- info="The field containing inputs to process"
1072
- )
1073
 
1074
- target_field = gr.Textbox(
1075
- label="Target Field Name",
1076
- value="label",
1077
- placeholder="e.g., label, answer, target",
1078
- info="The field containing expected outputs"
1079
- )
1080
 
1081
- initial_prompt = gr.TextArea(
1082
- label="Initial Prompt",
1083
- value="Review sentiment {input}",
1084
- lines=5,
1085
- info="Use {input} as placeholder. This baseline scores ~60% - evolution will improve it!"
1086
- )
1087
 
1088
- # Button outside the column for better visibility
 
 
 
 
1089
  with gr.Row():
1090
- with gr.Column():
1091
- optimize_btn = gr.Button("🚀 Validate & Optimize Prompt", variant="primary", size="lg")
1092
-
1093
- # Results section - clearly separated
1094
- gr.Markdown("---")
1095
- gr.Markdown("## 📊 Results")
 
 
 
 
 
 
 
 
 
1096
 
1097
- with gr.Row():
1098
- with gr.Column():
1099
- summary = gr.Markdown("Click 'Validate & Optimize Prompt' to start optimization...", visible=True)
 
 
 
 
1100
 
1101
- # Side-by-side comparison: Initial vs Best Prompt
1102
  gr.Markdown("---")
1103
- gr.Markdown("## 🔍 Prompt Comparison: Initial vs Best")
1104
 
1105
- with gr.Row():
1106
  with gr.Column():
1107
- initial_results = gr.Markdown("### Initial Prompt\nWill appear here after validation...", visible=True)
1108
  with gr.Column():
1109
- final_results = gr.Markdown("### Best Prompt\nWill appear here after optimization...", visible=True)
1110
 
1111
- # Wire up the optimize button with hardcoded model
1112
  def optimize_with_fixed_model(initial_prompt, dataset_name, dataset_split,
1113
  input_field, target_field, progress=gr.Progress()):
1114
- """Wrapper to use fixed model instead of dropdown"""
1115
  return optimize_prompt(
1116
  initial_prompt, dataset_name, dataset_split,
1117
- MODELS[0],
1118
  input_field, target_field, progress
1119
  )
1120
 
@@ -1122,8 +1232,11 @@ with gr.Blocks(title="OpenEvolve Prompt Optimizer", theme=gr.themes.Soft()) as d
1122
  fn=optimize_with_fixed_model,
1123
  inputs=[initial_prompt, dataset_name, dataset_split,
1124
  input_field, target_field],
1125
- outputs=[summary, initial_results, final_results]
1126
  )
1127
 
 
 
 
1128
  if __name__ == "__main__":
1129
- demo.launch()
 
15
 
16
# Model for OpenRouter
# NOTE: MODELS[0] is used as the fixed evaluation model throughout the app
# (see optimize_with_fixed_model, which passes MODELS[0] to optimize_prompt).
MODELS = [
    "google/gemini-2.5-flash-lite",
]
20
 
21
 
22
def extract_answer(text: str) -> str:
    """Extract the core answer from a target string.

    Handles:
    - GSM8K format: "reasoning...\n#### 2280" -> "2280"
    - Numeric labels: "0" or "1" -> "0" or "1"
    - Plain text answers (returned stripped but otherwise unchanged)
    """
    text = str(text).strip()
    # GSM8K-style targets put the final answer after "####".
    if "####" in text:
        answer = text.split("####")[-1].strip()
        # Normalize thousands separators ("1,234" -> "1234") for numeric matching.
        answer = answer.replace(",", "")
        return answer
    return text


def check_answer(prediction: str, target: str) -> bool:
    """Check whether *prediction* matches *target* using flexible, generic matching.

    Supports boolean targets (BoolQ), numeric targets (GSM8K and similar),
    IMDB-style 0/1 sentiment labels, and plain-text answers.
    """
    target_answer = extract_answer(target).lower().strip()
    pred_lower = prediction.lower().strip()

    # Handle boolean targets (e.g., BoolQ returns Python True/False).
    # FIX: use word-boundary matching — plain substring search counted "no"
    # inside words like "not"/"know" and "yes" inside "eyes", creating false
    # ambiguity (and wrong scores) on ordinary prose responses.
    if target_answer in ("true", "false"):
        pred_start = pred_lower[:200]
        has_yes = re.search(r"\b(?:true|yes)\b", pred_start) is not None
        has_no = re.search(r"\b(?:false|no)\b", pred_start) is not None
        if target_answer == "true":
            return has_yes and not has_no
        return has_no and not has_yes

    # Decide once whether the target is numeric. FIX: raw substring
    # containment is unsafe for numeric targets — label "1" matched inside
    # "10", so any digit-bearing response could score correct on 0/1 labels.
    try:
        target_num = float(target_answer)
        is_numeric = True
    except ValueError:
        is_numeric = False

    # Direct containment check — text targets only.
    if not is_numeric and target_answer in pred_lower:
        return True

    # Numeric targets: compare whole number tokens for exact equality.
    if is_numeric:
        for token in re.findall(r'-?[\d,]+\.?\d*', pred_lower):
            try:
                if float(token.replace(",", "")) == target_num:
                    return True
            except ValueError:
                continue

    # For IMDB-style labels (0 = negative, 1 = positive), fall back to
    # sentiment keywords near the start of the response.
    if target_answer in ("0", "1"):
        has_positive = "positive" in pred_lower[:200]
        has_negative = "negative" in pred_lower[:200]
        if target_answer == "1" and has_positive and not has_negative:
            return True
        if target_answer == "0" and has_negative and not has_positive:
            return True

    return False
83
+
84
+
85
  def validate_dataset(dataset_name: str, split: str, input_field: str, target_field: str) -> Tuple[bool, str]:
86
  """
87
  Validate that the dataset exists and has the required fields.
 
300
  # Small delay to avoid rate limiting
301
  time.sleep(0.1)
302
 
303
+ # Generic answer matching: extract core answer from both target and prediction
304
+ is_correct = check_answer(prediction, str(target))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
  if is_correct:
307
  correct += 1
 
527
  if not generation_files and not os.path.exists(log_file) and not os.path.exists(scores_file):
528
  evolution_viz += "### Evolution Complete\n\n"
529
  evolution_viz += "OpenEvolve ran 5 iterations of evolutionary optimization using:\n"
530
+ evolution_viz += "- **Variants**: 1 new prompt per iteration\n"
531
+ evolution_viz += "- **Selection Strategy**: 40% elite, 10% explore, 50% exploit\n"
532
+ evolution_viz += "- **Population**: 1 island, up to 15 programs retained\n"
533
+ evolution_viz += "- **Evaluation**: 20 samples per prompt variant\n\n"
534
 
535
  # Count files in output directory
536
  all_files = os.listdir(output_dir)
 
544
 
545
  def create_evaluator_file(dataset_name: str, split: str, model: str,
546
  input_field: str, target_field: str, work_dir: str):
547
+ """Create an evaluator.py file for OpenEvolve that uses same 20 samples as initial/final eval."""
548
  evaluator_code = f'''
549
  import os
550
  import random
 
557
  Evaluate a prompt using 20 fixed samples - SAME as initial and final evaluation.
558
 
559
  OpenEvolve passes a file path, so we need to read the prompt from the file.
560
+ Using the same 20 samples ensures evolution optimizes for the exact test set.
561
  Includes early stopping and rate limit handling.
562
  """
563
  try:
 
587
  else:
588
  raise
589
 
590
+ # Sample 20 samples with seed 42 - SAME as initial/final evaluation for consistency!
591
+ num_samples = 20
592
  if len(dataset) > num_samples:
593
  # Use SAME sampling logic as initial/final eval
594
  indices = random.sample(range(len(dataset)), num_samples)
 
648
 
649
  prediction = response.choices[0].message.content.strip()
650
 
651
+ # Generic answer matching
652
+ target_str = str(target).strip()
653
 
654
+ # Extract core answer (handles GSM8K "####" format, plain labels, etc.)
655
+ if "####" in target_str:
656
+ target_answer = target_str.split("####")[-1].strip().replace(",", "")
657
+ else:
658
+ target_answer = target_str
659
 
660
+ pred_lower = prediction.lower().strip()
661
+ target_lower = target_answer.lower().strip()
662
 
663
+ is_correct = False
 
 
664
 
665
+ # Direct containment
666
+ if target_lower in pred_lower:
667
+ is_correct = True
 
 
 
 
668
 
669
+ # Numeric matching
670
+ if not is_correct:
671
+ import re as _re
672
+ try:
673
+ target_num = float(target_lower)
674
+ numbers = _re.findall(r'-?[\\d,]+\\.?\\d*', pred_lower)
675
+ for n in numbers:
676
+ try:
677
+ if float(n.replace(",", "")) == target_num:
678
+ is_correct = True
679
+ break
680
+ except ValueError:
681
+ continue
682
+ except ValueError:
683
+ pass
684
+
685
+ # IMDB-style 0/1 labels
686
+ if not is_correct and target_lower in ("0", "1"):
687
+ has_positive = "positive" in pred_lower[:200]
688
+ has_negative = "negative" in pred_lower[:200]
689
+ if target_lower == "1" and has_positive and not has_negative:
690
+ is_correct = True
691
+ if target_lower == "0" and has_negative and not has_positive:
692
+ is_correct = True
693
 
694
  if is_correct:
695
  correct += 1
 
840
  "api_base": "https://openrouter.ai/api/v1", # Use OpenRouter endpoint
841
  "temperature": 1.2, # Even higher temperature for more creative variations
842
  },
843
+ "max_iterations": 5, # Fewer iterations to fit within time limits
844
  "checkpoint_interval": 1, # Save checkpoints every iteration to preserve prompt history
845
  "diff_based_evolution": False, # Use full rewrite mode for prompts (not diff/patch mode)
846
  "language": "text", # CRITICAL: Optimize text/prompts, not Python code!
 
914
  progress(0.15, desc="Creating configuration...")
915
  config_path = create_config_file(model, work_dir)
916
 
917
+ # Run initial evaluation with 20 samples
918
  # IMPORTANT: We save the indices to ensure final eval uses THE SAME samples
919
+ progress(0.2, desc="Running initial evaluation on 20 samples...")
920
  initial_eval = evaluate_prompt(
921
+ initial_prompt, dataset_name, dataset_split, 20,
922
  model, input_field, target_field
923
  )
924
 
 
952
  initial_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
953
 
954
  # Run OpenEvolve
955
+ progress(0.3, desc="Starting evolution: 5 iterations...")
956
 
957
  output_dir = os.path.join(work_dir, "output")
958
  os.makedirs(output_dir, exist_ok=True)
 
1024
  best_prompt = initial_prompt
1025
  print(f"\n[SELECTION] WARNING: No best_program.txt found, using initial prompt")
1026
 
1027
+ # Final evaluation: Use same 20 samples as initial eval for fair comparison
1028
+ progress(0.85, desc="Evaluating best prompt on 20 samples (same as initial)...")
1029
  final_eval = evaluate_prompt(
1030
+ best_prompt, dataset_name, dataset_split, 20,
1031
  model, input_field, target_field,
1032
+ fixed_indices=eval_indices # Use same 20 samples as initial eval!
1033
  )
1034
 
1035
+ # If evolution regressed, fall back to the initial prompt
1036
+ if final_eval['accuracy'] < initial_eval['accuracy']:
1037
+ best_prompt = initial_prompt
1038
+ final_eval = initial_eval
1039
+ regression = True
1040
+ else:
1041
+ regression = False
1042
+
1043
  progress(0.95, desc=f"Evaluation complete: {final_eval['correct']}/{final_eval['total']} = {final_eval['accuracy']:.1f}%")
1044
 
1045
+ improvement = final_eval['accuracy'] - initial_eval['accuracy']
1046
+
1047
  final_results = f"""
1048
+ ### Best Prompt
1049
 
1050
  **Prompt:**
1051
  ```
1052
  {best_prompt}
1053
  ```
1054
 
 
 
 
 
1055
  **Results:**
1056
  - Accuracy: {final_eval['accuracy']:.2f}%
1057
  - Correct: {final_eval['correct']}/{final_eval['total']}
 
 
 
1058
  """
1059
+ if regression:
1060
+ final_results += "\n**Note:** Evolution did not improve on the initial prompt. Keeping the original.\n"
1061
+ else:
1062
+ final_results += f"\n- Improvement: {improvement:+.2f}%\n"
1063
+
1064
+ final_results += "\n**Sample Results:**\n"
1065
  for i, result in enumerate(final_eval['results'][:5], 1):
1066
  final_results += f"\n{i}. Input: {result['input']}\n"
1067
  final_results += f" Target: {result['target']}\n"
1068
  final_results += f" Prediction: {result['prediction']}\n"
1069
  final_results += f" ✓ Correct\n" if result['correct'] else f" ✗ Incorrect\n"
1070
 
1071
+ if regression:
1072
+ summary_title = "## Optimization Complete (No Improvement)"
1073
+ summary_note = "\n**Evolution did not find a better prompt.** The initial prompt is already strong for this task.\n"
1074
+ else:
1075
+ summary_title = "## Optimization Complete!"
1076
+ summary_note = ""
1077
+
1078
  summary = f"""
1079
+ {summary_title}
1080
 
1081
  ### Summary
1082
  - **Dataset**: {dataset_name} ({dataset_split} split)
1083
  - **Evaluation Model**: {model}
1084
+ - **Evolution Model**: google/gemini-2.5-flash
1085
+ - **Samples**: 20 (same for initial, evolution, and final eval)
1086
+ - **Iterations**: 5
1087
+ {summary_note}
 
 
1088
  ### Results
1089
  - **Initial Accuracy**: {initial_eval['accuracy']:.2f}% ({initial_eval['correct']}/{initial_eval['total']})
1090
  - **Final Accuracy**: {final_eval['accuracy']:.2f}% ({final_eval['correct']}/{final_eval['total']})
1091
+ - **Improvement**: {improvement:+.2f}%
1092
 
1093
  {validation_message}
1094
  """
 
1106
  pass
1107
 
1108
 
1109
# Custom CSS for a polished, branded look
# NOTE(review): this is passed via demo.launch(css=custom_css) below; Gradio
# documents custom CSS on the gr.Blocks(css=...) constructor — verify launch()
# actually accepts a css kwarg for the pinned Gradio version.
custom_css = """
/* Minimal CSS — only style what Gradio can't handle natively.
   All text-bearing elements use gr.Markdown (inherits theme colors).
   Only the run button gets custom styling. */

.gradio-container { max-width: 1200px !important; margin: auto; }

/* Primary action button — always purple with white text */
.run-btn button, .run-btn > button, button.run-btn, .run-btn {
    background: linear-gradient(135deg, #7c3aed 0%, #6d28d9 100%) !important;
    color: #fff !important;
    border: none !important;
    border-radius: 12px !important;
    font-size: 1.05rem !important;
    font-weight: 600 !important;
    padding: 14px 28px !important;
    transition: transform 0.1s, box-shadow 0.2s !important;
}
.run-btn:hover, .run-btn button:hover {
    transform: translateY(-1px) !important;
    box-shadow: 0 8px 24px rgba(124,58,237,0.35) !important;
    color: #fff !important;
}
"""
1134
 
1135
# Preset configurations
PRESETS = {
    "imdb": {
        "dataset": "stanfordnlp/imdb",
        "split": "test",
        "input": "text",
        "target": "label",
        "prompt": "What do you think about this? {input}",
    },
}


def load_preset(name):
    """Return the (dataset, split, input, target, prompt) values for a preset.

    The returned ordering matches the Gradio components wired as the preset
    button's outputs.
    """
    preset = PRESETS[name]
    return tuple(preset[key] for key in ("dataset", "split", "input", "target", "prompt"))
1150
 
 
 
1151
 
1152
+ # Create Gradio interface
1153
+ with gr.Blocks(title="OpenEvolve Prompt Optimizer") as demo:
1154
+
1155
+ # --- Hero header (self-contained dark bg, always light text) ---
1156
+ gr.HTML("""
1157
+ <div style="background:linear-gradient(135deg,#0f0c29 0%,#302b63 50%,#24243e 100%);
1158
+ border-radius:16px;padding:32px 40px;margin-bottom:8px;text-align:center;">
1159
+ <h1 style="color:#fff;font-size:2rem;font-weight:700;margin:0 0 8px 0;letter-spacing:-0.02em;">
1160
+ OpenEvolve Prompt Optimizer
1161
+ </h1>
1162
+ <p style="color:#c4b5fd;font-size:0.95rem;margin:0;">
1163
+ Evolve better prompts automatically using
1164
+ <a href="https://github.com/codelion/openevolve" target="_blank" style="color:#c4b5fd;text-decoration:underline;">OpenEvolve</a>.
1165
+ Powered by <code style="background:rgba(255,255,255,0.12);color:#e0d4ff;padding:2px 6px;border-radius:4px;">gemini-2.5-flash</code> via
1166
+ <a href="https://openrouter.ai/" target="_blank" style="color:#c4b5fd;text-decoration:underline;">OpenRouter</a>.
1167
+ </p>
1168
+ <p style="color:#94a3b8;font-size:0.82rem;margin:12px 0 0 0;">
1169
+ <strong style="color:#a78bfa;">1.</strong> Pick a dataset &amp; prompt &rarr;
1170
+ <strong style="color:#a78bfa;">2.</strong> Evolve 5 iterations &rarr;
1171
+ <strong style="color:#a78bfa;">3.</strong> Compare results side-by-side
1172
+ </p>
1173
+ </div>
1174
+ """)
1175
 
1176
+ # --- Setup ---
1177
+ gr.Markdown("#### Dataset")
 
 
 
 
1178
 
1179
+ gr.Markdown("Quick preset:")
1180
+ preset_imdb = gr.Button("IMDB Sentiment", size="sm")
 
 
 
 
1181
 
1182
+ dataset_name = gr.Textbox(
1183
+ label="HuggingFace Dataset",
1184
+ value="stanfordnlp/imdb",
1185
+ placeholder="org/dataset-name",
1186
+ )
1187
  with gr.Row():
1188
+ dataset_split = gr.Textbox(label="Split", value="test", scale=1)
1189
+ input_field = gr.Textbox(label="Input Field", value="text", scale=1)
1190
+ target_field = gr.Textbox(label="Target Field", value="label", scale=1)
1191
+
1192
+ gr.Markdown("#### Prompt")
1193
+ initial_prompt = gr.TextArea(
1194
+ label="Initial Prompt",
1195
+ value="What do you think about this? {input}",
1196
+ lines=5,
1197
+ info="Must contain {input} placeholder. Start with a weak prompt -- evolution will improve it!",
1198
+ )
1199
+ gr.Markdown(
1200
+ "*Eval model:* `gemini-2.5-flash-lite` (20 samples) | *Evolution model:* `gemini-2.5-flash` (5 iterations) \n"
1201
+ "**Note:** Optimization can take up to 10 minutes to complete."
1202
+ )
1203
 
1204
+ # Run button
1205
+ optimize_btn = gr.Button(
1206
+ "Optimize Prompt",
1207
+ variant="primary",
1208
+ size="lg",
1209
+ elem_classes="run-btn",
1210
+ )
1211
 
1212
+ # --- Results ---
1213
  gr.Markdown("---")
1214
+ summary = gr.Markdown("")
1215
 
1216
+ with gr.Row(equal_height=True):
1217
  with gr.Column():
1218
+ initial_results = gr.Markdown("**Initial Prompt**\n\nResults will appear here after optimization...")
1219
  with gr.Column():
1220
+ final_results = gr.Markdown("**Evolved Prompt**\n\nResults will appear here after optimization...")
1221
 
1222
+ # --- Wiring ---
1223
    def optimize_with_fixed_model(initial_prompt, dataset_name, dataset_split,
                                  input_field, target_field, progress=gr.Progress()):
        """Wrapper that forwards to optimize_prompt using the fixed model MODELS[0]."""
        return optimize_prompt(
            initial_prompt, dataset_name, dataset_split,
            MODELS[0],
            input_field, target_field, progress
        )
1230
 
 
1232
  fn=optimize_with_fixed_model,
1233
  inputs=[initial_prompt, dataset_name, dataset_split,
1234
  input_field, target_field],
1235
+ outputs=[summary, initial_results, final_results],
1236
  )
1237
 
1238
+ preset_outputs = [dataset_name, dataset_split, input_field, target_field, initial_prompt]
1239
+ preset_imdb.click(fn=lambda: load_preset("imdb"), outputs=preset_outputs)
1240
+
1241
  if __name__ == "__main__":
1242
+ demo.launch(css=custom_css)