# iLOVE2D's picture
# Upload 2846 files
# 5374a2d verified
"""
Best Prompt Evaluation Script
This script evaluates baseline performance using simple prompts without optimization.
It serves as a baseline comparison for evolutionary prompt optimization results.
"""
import asyncio
import os
import re
import csv
from typing import Dict
from datetime import datetime
from dotenv import load_dotenv
from tqdm.asyncio import tqdm as aio_tqdm
from evoagentx.benchmark.bigbenchhard import BIGBenchHard
from evoagentx.models import OpenAILLM, OpenAILLMConfig
class SinglePromptSarcasmClassifier:
    """
    A simple classifier using a single fixed prompt for task processing.

    This serves as a baseline for comparison with evolved prompts. The fixed
    instruction asks the model to end with "the answer is (X)", and the answer
    is then extracted from the response with a regex.
    """

    # Compiled once at class-creation time; hoisted out of __call__ so the
    # pattern is not re-built on every model call.
    _ANSWER_RE = re.compile(r"the answer is\s*(.*)", re.IGNORECASE)

    def __init__(self, model: "OpenAILLM"):
        """
        Initialize the baseline classifier.

        Args:
            model: The language model to use for inference. Must expose a
                ``generate(prompt=...)`` method returning an object with a
                ``content`` string attribute.
        """
        self.model = model
        self.task_instruction = "After your reasoning, respond the answer only with option like this: the answer is (A)"
        # NOTE(review): defined but never used when the prompt is built in
        # __call__ — presumably intended to be part of the prompt; confirm
        # with the authors before relying on it.
        self.chain_of_thought_prefix = "Let's think step by step."

    def __call__(self, input: str) -> tuple[str, dict]:
        """
        Process input with the fixed prompt.

        Args:
            input: The input text to process. (The name shadows the builtin,
                but callers pass it by keyword, so it is kept for
                compatibility.)

        Returns:
            Tuple of (answer, metadata): ``answer`` is the text following
            "the answer is" with trailing periods stripped, or "N/A" when the
            pattern is absent; ``metadata`` carries the exact prompt sent.
        """
        full_prompt = f"Question:{input}{self.task_instruction}"
        response = self.model.generate(prompt=full_prompt)
        prediction = response.content.strip()
        # Extract answer using regex pattern (first occurrence, case-insensitive)
        match = self._ANSWER_RE.search(prediction)
        answer = match.group(1).strip().rstrip('.') if match else "N/A"
        return answer, {"full_prompt": full_prompt}
async def main():
    """Main execution function for baseline evaluation.

    Loads API credentials, evaluates the baseline classifier on the
    benchmark's test split for every model in ``model_list``, prints the
    accuracy, and writes a per-sample CSV next to this script.

    Returns:
        The test accuracy of the last evaluated model (0.0 when the test
        split is empty).

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    # Load environment configuration
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY not found in environment variables.")
    # Models to evaluate
    model_list = [
        "gpt-4.1-nano-2025-04-14",
    ]
    # Evaluate each model
    for model_name in model_list:
        # Configure language model
        llm_config = OpenAILLMConfig(
            model=model_name,
            openai_key=OPENAI_API_KEY,
            stream=False,
        )
        llm = OpenAILLM(config=llm_config)
        # Set up benchmark and classifier
        benchmark = BIGBenchHard("geometric_shapes", dev_sample_num=50, seed=10)
        program = SinglePromptSarcasmClassifier(model=llm)
        print(f"\n--- Evaluating on Test Set with model {model_name} ---")
        test_data = benchmark.get_test_data()
        task_name = benchmark.task

        async def evaluate_example_concurrently(example: Dict) -> Dict:
            """
            Evaluate a single example asynchronously.

            The blocking model call runs in a worker thread via
            ``asyncio.to_thread`` so many examples can be in flight at once.

            Args:
                example: The example to evaluate.

            Returns:
                The detailed result row for this example, including its
                "em" score.
            """
            prediction, meta = await asyncio.to_thread(
                program,
                input=example["input"]
            )
            score_dict = benchmark.evaluate(prediction, benchmark.get_label(example))
            # Return the row instead of appending to a shared list: gather()
            # preserves input order, so the CSV rows line up with test_data.
            # (Appending from concurrently completing tasks would record rows
            # in nondeterministic completion order.)
            return {
                "input": example["input"],
                "label": benchmark.get_label(example),
                "prediction": prediction,
                "em": score_dict.get("em", 0.0),
                "prompt": meta["full_prompt"],
                "model": model_name,
                "task": task_name
            }

        # Run evaluation on test set
        if test_data:
            tasks = [evaluate_example_concurrently(ex) for ex in test_data]
            results_list = await aio_tqdm.gather(*tasks, desc="Evaluating on Test Set")
            correct_count = sum(row["em"] for row in results_list)
            test_accuracy = correct_count / len(test_data)
            print(f"Test Accuracy: {test_accuracy:.4f}")
            # Save results to CSV with timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_name = f"results_{model_name}_{task_name}_{timestamp}.csv"
            csv_path = os.path.join(os.path.dirname(__file__), csv_name)
            with open(csv_path, "w", encoding="utf-8", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=[
                    "input", "label", "prediction", "em", "prompt", "model", "task"
                ])
                # Write average score at the top. NOTE: this preamble row is
                # non-standard CSV; kept for parity with existing consumers.
                f.write(f"平均分数,{test_accuracy:.4f}\n")
                writer.writeheader()
                writer.writerows(results_list)
            print(f"详细结果已保存到: {csv_path}")
        else:
            test_accuracy = 0.0
    return test_accuracy
if __name__ == "__main__":
    # Run the asynchronous baseline evaluation when executed as a script.
    asyncio.run(main())