| """ |
| Best Prompt Evaluation Script |
| |
| This script evaluates baseline performance using simple prompts without optimization. |
| It serves as a baseline comparison for evolutionary prompt optimization results. |
| """ |

import asyncio
import csv
import os
import re
from datetime import datetime
from typing import Dict

from dotenv import load_dotenv
from tqdm.asyncio import tqdm as aio_tqdm

from evoagentx.benchmark.bigbenchhard import BIGBenchHard
from evoagentx.models import OpenAILLM, OpenAILLMConfig
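
# NOTE: the imports above assume the evoagentx package plus python-dotenv and
# tqdm are installed (e.g. pip install evoagentx python-dotenv tqdm).
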
class SinglePromptClassifier:
    """
    A simple classifier that sends every example through a single fixed prompt.

    This serves as the baseline for comparison with evolved prompts.
    """

    def __init__(self, model: OpenAILLM):
        """
        Initialize the baseline classifier.

        Args:
            model: The language model to use for inference
        """
        self.model = model
        self.task_instruction = (
            "After your reasoning, respond with only the option, like this: "
            "the answer is (A)"
        )
        self.chain_of_thought_prefix = "Let's think step by step."
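
    # The assembled prompt (see __call__ below) looks roughly like:
    #   Question: <task input>
    #   Let's think step by step.
    #   After your reasoning, respond with only the option, like this: the answer is (A)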

    def __call__(self, input_text: str) -> tuple[str, dict]:
        """
        Process an input with the fixed prompt.

        Args:
            input_text: The input text to process

        Returns:
            Tuple of (answer, metadata)
        """
        full_prompt = (
            f"Question: {input_text}\n"
            f"{self.chain_of_thought_prefix}\n"
            f"{self.task_instruction}"
        )
        response = self.model.generate(prompt=full_prompt)
        prediction = response.content.strip()

        # Extract the option that follows the phrase "the answer is".
        pattern = r"the answer is\s*(.*)"
        match = re.search(pattern, prediction, re.IGNORECASE)
        answer = match.group(1).strip().rstrip('.') if match else "N/A"
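        # Illustrative example: if the model replies
        #   "... therefore the answer is (D)."
        # the regex above yields "(D)" after stripping the trailing period.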

        return answer, {"full_prompt": full_prompt}


async def main():
    """Main execution function for baseline evaluation."""
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY not found in environment variables.")

    model_list = [
        "gpt-4.1-nano-2025-04-14",
    ]

    for model_name in model_list:
        llm_config = OpenAILLMConfig(
            model=model_name,
            openai_key=OPENAI_API_KEY,
            stream=False,
        )
        llm = OpenAILLM(config=llm_config)

        # Draw 50 dev examples with a fixed seed so runs are reproducible.
        benchmark = BIGBenchHard("geometric_shapes", dev_sample_num=50, seed=10)
        program = SinglePromptClassifier(model=llm)

        print(f"\n--- Evaluating on Test Set with model {model_name} ---")
        test_data = benchmark.get_test_data()
        results_list = []
        task_name = benchmark.task

        async def evaluate_example_concurrently(example: Dict) -> float:
            """
            Evaluate a single example asynchronously.

            Args:
                example: The example to evaluate

            Returns:
                The evaluation score (0.0 or 1.0)
            """
            # The model call blocks, so run it in a worker thread to keep the
            # event loop free for the other examples.
            prediction, meta = await asyncio.to_thread(
                program,
                input_text=example["input"],
            )
            score_dict = benchmark.evaluate(prediction, benchmark.get_label(example))

            results_list.append({
                "input": example["input"],
                "label": benchmark.get_label(example),
                "prediction": prediction,
                "em": score_dict.get("em", 0.0),
                "prompt": meta["full_prompt"],
                "model": model_name,
                "task": task_name,
            })
            return score_dict.get("em", 0.0)
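
        # Note: all test examples are dispatched at once, which can hit API
        # rate limits. A minimal sketch for bounding concurrency (assuming a
        # cap of 8 in-flight requests is acceptable):
        #
        #     semaphore = asyncio.Semaphore(8)
        #
        #     async def bounded_evaluate(example: Dict) -> float:
        #         async with semaphore:
        #             return await evaluate_example_concurrently(example)
        #
        # and gather bounded_evaluate(ex) below instead.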

        if test_data:
            tasks = [evaluate_example_concurrently(ex) for ex in test_data]
            results = await aio_tqdm.gather(*tasks, desc="Evaluating on Test Set")
            correct_count = sum(results)
            test_accuracy = correct_count / len(test_data)
            print(f"Test Accuracy: {test_accuracy:.4f}")

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_name = f"results_{model_name}_{task_name}_{timestamp}.csv"
            csv_path = os.path.join(os.path.dirname(__file__), csv_name)

            with open(csv_path, "w", encoding="utf-8", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=[
                    "input", "label", "prediction", "em", "prompt", "model", "task"
                ])
                # Summary line written before the CSV header; downstream
                # parsers should skip this first row.
                f.write(f"average_score,{test_accuracy:.4f}\n")
                writer.writeheader()
                writer.writerows(results_list)

            print(f"Detailed results saved to: {csv_path}")
        else:
            test_accuracy = 0.0

    # Returns the accuracy of the last model evaluated.
    return test_accuracy


if __name__ == "__main__":
    asyncio.run(main())