# iLOVE2D's picture
# Upload 2846 files
# 5374a2d verified
"""
Best Prompt Evaluation Script
This script evaluates baseline performance using simple prompts without optimization.
It serves as a baseline comparison for evolutionary prompt optimization results.
"""
import asyncio
import os
import re
import csv
from typing import Dict
from datetime import datetime
from dotenv import load_dotenv
from tqdm.asyncio import tqdm as aio_tqdm
from evoagentx.benchmark.bigbenchhard import BIGBenchHard
from evoagentx.models import OpenAILLM, OpenAILLMConfig
class SinglePromptSarcasmClassifier:
    """
    A simple classifier using a single fixed prompt for task processing.

    This serves as a baseline for comparison with evolved prompts. The fixed
    instruction asks the model to end with "the answer is (X)", and the answer
    is then extracted from the response with a regex.
    """

    # Compiled once at class-creation time; hoisted out of __call__ so the
    # pattern is not re-built on every model call.
    _ANSWER_RE = re.compile(r"the answer is\s*(.*)", re.IGNORECASE)

    def __init__(self, model: "OpenAILLM"):
        """
        Initialize the baseline classifier.

        Args:
            model: The language model to use for inference. Must expose a
                ``generate(prompt=...)`` method returning an object with a
                ``content`` string attribute.
        """
        self.model = model
        self.task_instruction = "After your reasoning, respond the answer only with option like this: the answer is (A)"
        # NOTE(review): defined but never used when the prompt is built in
        # __call__ — presumably intended to be part of the prompt; confirm
        # with the authors before relying on it.
        self.chain_of_thought_prefix = "Let's think step by step."

    def __call__(self, input: str) -> tuple[str, dict]:
        """
        Process input with the fixed prompt.

        Args:
            input: The input text to process. (The name shadows the builtin,
                but callers pass it by keyword, so it is kept for
                compatibility.)

        Returns:
            Tuple of (answer, metadata): ``answer`` is the text following
            "the answer is" with trailing periods stripped, or "N/A" when the
            pattern is absent; ``metadata`` carries the exact prompt sent.
        """
        full_prompt = f"Question:{input}{self.task_instruction}"
        response = self.model.generate(prompt=full_prompt)
        prediction = response.content.strip()
        # Extract answer using regex pattern (first occurrence, case-insensitive)
        match = self._ANSWER_RE.search(prediction)
        answer = match.group(1).strip().rstrip('.') if match else "N/A"
        return answer, {"full_prompt": full_prompt}
async def main():
    """Main execution function for baseline evaluation.

    Loads API credentials, evaluates the baseline classifier on the
    benchmark's test split for every model in ``model_list``, prints the
    accuracy, and writes a per-sample CSV next to this script.

    Returns:
        The test accuracy of the last evaluated model (0.0 when the test
        split is empty).

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    # Load environment configuration
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    if not OPENAI_API_KEY:
        raise ValueError("OPENAI_API_KEY not found in environment variables.")
    # Models to evaluate
    model_list = [
        "gpt-4.1-nano-2025-04-14",
    ]
    # Evaluate each model
    for model_name in model_list:
        # Configure language model
        llm_config = OpenAILLMConfig(
            model=model_name,
            openai_key=OPENAI_API_KEY,
            stream=False,
        )
        llm = OpenAILLM(config=llm_config)
        # Set up benchmark and classifier
        benchmark = BIGBenchHard("geometric_shapes", dev_sample_num=50, seed=10)
        program = SinglePromptSarcasmClassifier(model=llm)
        print(f"\n--- Evaluating on Test Set with model {model_name} ---")
        test_data = benchmark.get_test_data()
        task_name = benchmark.task

        async def evaluate_example_concurrently(example: Dict) -> Dict:
            """
            Evaluate a single example asynchronously.

            The blocking model call runs in a worker thread via
            ``asyncio.to_thread`` so many examples can be in flight at once.

            Args:
                example: The example to evaluate.

            Returns:
                The detailed result row for this example, including its
                "em" score.
            """
            prediction, meta = await asyncio.to_thread(
                program,
                input=example["input"]
            )
            score_dict = benchmark.evaluate(prediction, benchmark.get_label(example))
            # Return the row instead of appending to a shared list: gather()
            # preserves input order, so the CSV rows line up with test_data.
            # (Appending from concurrently completing tasks would record rows
            # in nondeterministic completion order.)
            return {
                "input": example["input"],
                "label": benchmark.get_label(example),
                "prediction": prediction,
                "em": score_dict.get("em", 0.0),
                "prompt": meta["full_prompt"],
                "model": model_name,
                "task": task_name
            }

        # Run evaluation on test set
        if test_data:
            tasks = [evaluate_example_concurrently(ex) for ex in test_data]
            results_list = await aio_tqdm.gather(*tasks, desc="Evaluating on Test Set")
            correct_count = sum(row["em"] for row in results_list)
            test_accuracy = correct_count / len(test_data)
            print(f"Test Accuracy: {test_accuracy:.4f}")
            # Save results to CSV with timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_name = f"results_{model_name}_{task_name}_{timestamp}.csv"
            csv_path = os.path.join(os.path.dirname(__file__), csv_name)
            with open(csv_path, "w", encoding="utf-8", newline="") as f:
                writer = csv.DictWriter(f, fieldnames=[
                    "input", "label", "prediction", "em", "prompt", "model", "task"
                ])
                # Write average score at the top. NOTE: this preamble row is
                # non-standard CSV; kept for parity with existing consumers.
                f.write(f"平均分数,{test_accuracy:.4f}\n")
                writer.writeheader()
                writer.writerows(results_list)
            print(f"详细结果已保存到: {csv_path}")
        else:
            test_accuracy = 0.0
    return test_accuracy
if __name__ == "__main__":
    # Run the asynchronous baseline evaluation when executed as a script.
    asyncio.run(main())