"""
Test specific multimedia GAIA questions.
"""
|
|
| import os |
| from dotenv import load_dotenv |
| from app import BasicAgent |
|
|
| |
# Load variables from a local .env file into the process environment at import
# time, so that settings read later via os.getenv (e.g. ANTHROPIC_API_KEY) can
# be supplied from that file.
load_dotenv()
|
|
def test_specific_questions():
    """Run a fixed set of GAIA-style questions through ``BasicAgent`` and print a score.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment variable.
    When it is missing, an error is printed and the function returns before
    the agent is created. Otherwise every question is passed to the agent,
    the answer is graded, and a per-question verdict plus a final
    ``correct/total`` summary are printed.

    Grading rules:
      * ``excel`` / ``pdf`` / ``code`` cases: only the question text is given
        to the agent, so an answer containing "Unable to determine" counts as
        correctly handling the inaccessible file.
      * Any case: the expected string appearing in the answer
        (case-insensitive) counts as correct.

    Returns:
        None. All results are reported on stdout.
    """
    # Validate the credential first so a missing key is reported before any
    # agent set-up work happens.
    api_key = os.getenv("ANTHROPIC_API_KEY")
    if not api_key:
        print("Error: ANTHROPIC_API_KEY not found in environment variables")
        return

    agent = BasicAgent()
    agent.set_api_key(api_key)

    test_cases = [
        {
            "question": "What is 2 + 2?",
            "expected": "4",
            "type": "simple",
        },
        {
            "question": 'In the video https://www.youtube.com/watch?v=1htKBjuUWec, Verma claims the existence of "a "moat" in the education system that provides a systemic advantage for those who know about it and can get into the pipeline." Verma\'s "moat" is a well-known advantage for students. What is the four-letter abbreviation used to describe this systemic advantage?',
            "expected": "STEM",
            "type": "youtube",
        },
        {
            "question": "Tell me the amount of sales in the sales sheet for the attached excel file.",
            "expected": "Unable to determine",
            "type": "excel",
        },
        {
            "question": "How many times is the word \"therefore\" used in the attached PDF?",
            "expected": "Unable to determine",
            "type": "pdf",
        },
        {
            "question": "In the attached Python code, I try to use the string method zfill. It does not work. Can you fix the problem for me and give me the only the complete corrected code?",
            "expected": "Unable to determine",
            "type": "code",
        },
    ]

    # Question types whose attachment is not available to the agent.
    file_types = ("excel", "pdf", "code")
    total = len(test_cases)
    correct = 0

    for i, test_case in enumerate(test_cases, 1):
        question = test_case["question"]
        expected = test_case["expected"]
        q_type = test_case["type"]

        print(f"\nTest {i} ({q_type}): {question[:80]}...")
        print(f"Expected: {expected}")

        # Best-effort boundary: one failing question must not abort the whole
        # run, so any agent error is reported and the loop moves on.
        try:
            answer = agent(question)
        except Exception as e:
            print(f"❌ Error: {e}")
            continue

        print(f"Got: {answer}")

        # Grade on text so a None / non-string answer is scored as incorrect
        # instead of raising inside the substring checks.
        answer_text = "" if answer is None else str(answer)

        if q_type in file_types and "Unable to determine" in answer_text:
            print("✅ Correctly handled inaccessible file")
            correct += 1
        elif expected.lower() in answer_text.lower():
            print("✅ Correct answer")
            correct += 1
        else:
            print("❌ Incorrect answer")

    print(f"\n{'='*80}")
    print(f"Score: {correct}/{total} ({correct/total*100:.0f}%)")
    print(f"{'='*80}")
|
|
# Run the question checks only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    test_specific_questions()