Spaces:

AbdullahIsaMarkus
/

Final_Assignment_Agent

Sleeping

Final_Assignment_Agent / test_multimedia_gaia.py

Markus Clauss DIRU Vetsuisse

First agent trial

1637cd5 10 months ago

2.94 kB

	#!/usr/bin/env python3
	"""
	Test specific multimedia GAIA questions
	"""

	import os
	import re

	from dotenv import load_dotenv

	from app import BasicAgent

	# Load environment variables (python-dotenv reads them from a local .env
	# file when one exists) so the os.getenv("ANTHROPIC_API_KEY") lookup in
	# test_specific_questions() can find the key.
	load_dotenv()

	def _answer_matches(expected: str, answer: str) -> bool:
	    """Return True when ``expected`` appears in ``answer`` as a whole token.

	    The comparison is case-insensitive. A plain substring test is too
	    permissive for short expected values: "stem" is a substring of
	    "system", and "4" is a substring of "14". Requiring that the match is
	    not directly preceded or followed by a word character rejects those
	    false positives while still accepting "The answer is STEM.".
	    """
	    pattern = rf"(?<!\w){re.escape(expected)}(?!\w)"
	    return re.search(pattern, answer, flags=re.IGNORECASE) is not None


	def test_specific_questions() -> None:
	    """Run a fixed set of GAIA-style questions through the agent and score them.

	    Reads ANTHROPIC_API_KEY from the environment; if it is missing, an error
	    is printed and the function returns without running any question.
	    Otherwise each question is passed to a ``BasicAgent`` instance, the
	    answer is printed and compared with the expected value, and a final
	    score line is printed. Exceptions raised while answering or checking a
	    question are caught, reported, and counted as a failed question.

	    Returns:
	        None. All results are reported on standard output.
	    """

	    # Initialize agent
	    agent = BasicAgent()
	    api_key = os.getenv("ANTHROPIC_API_KEY")
	    if not api_key:
	        print("Error: ANTHROPIC_API_KEY not found in environment variables")
	        return

	    agent.set_api_key(api_key)

	    # Test specific questions
	    test_cases = [
	        {
	            "question": "What is 2 + 2?",
	            "expected": "4",
	            "type": "simple"
	        },
	        {
	            "question": 'In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of "a "moat" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline." Verma\'s "moat" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?',
	            "expected": "STEM",
	            "type": "youtube"
	        },
	        {
	            "question": "Tell me the amount of sales in the sales sheet for the attached excel file.",
	            "expected": "Unable to determine",
	            "type": "excel"
	        },
	        {
	            "question": "How many times is the word \"therefore\" used in the attached PDF?",
	            "expected": "Unable to determine",
	            "type": "pdf"
	        },
	        {
	            "question": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
	            "expected": "Unable to determine",
	            "type": "code"
	        }
	    ]

	    total = len(test_cases)
	    correct = 0
	    for i, test_case in enumerate(test_cases, 1):
	        question = test_case["question"]
	        expected = test_case["expected"]
	        q_type = test_case["type"]

	        print(f"\nTest {i} ({q_type}): {question[:80]}...")
	        print(f"Expected: {expected}")

	        try:
	            answer = agent(question)
	            print(f"Got: {answer}")

	            # Check if answer matches expected
	            if q_type in ["excel", "pdf", "code"] and "Unable to determine" in answer:
	                # No file is supplied with these questions, so declining to
	                # answer is the outcome counted as correct.
	                print("✅ Correctly handled inaccessible file")
	                correct += 1
	            elif _answer_matches(expected, answer):
	                # Whole-token match: "system" must not pass for "STEM",
	                # nor "14" for "4".
	                print("✅ Correct answer")
	                correct += 1
	            else:
	                print("❌ Incorrect answer")

	        except Exception as e:
	            # Keep going so one failing question does not abort the run;
	            # it simply does not count towards the score.
	            print(f"❌ Error: {e}")

	    print(f"\n{'='*80}")
	    print(f"Score: {correct}/{total} ({correct / total * 100:.0f}%)")
	    print(f"{'='*80}")

	# Run the question checks only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    test_specific_questions()