| """ |
| LLM Integration Tests - Stage 3 Validation |
| Author: @mangubee |
| Date: 2026-01-02 |
| |
| Tests for Stage 3 LLM integration: |
| - Planning with LLM |
| - Tool selection via function calling |
| - Answer synthesis from evidence |
| - Full workflow with mocked LLM responses |
| """ |
|
|
| import pytest |
| from unittest.mock import patch, MagicMock |
| from src.agent.llm_client import ( |
| plan_question, |
| select_tools_with_function_calling, |
| synthesize_answer |
| ) |
| from src.tools import TOOLS |
|
|
|
|
class TestPlanningFunction:
    """Tests for the LLM-driven planning step (plan_question)."""

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_question_basic(self, mock_anthropic):
        """A simple question produces a non-empty textual plan."""
        # Stub the Anthropic client so no network call is made.
        fake_client = MagicMock()
        fake_reply = MagicMock()
        fake_reply.content = [
            MagicMock(text="1. Search for information\n2. Analyze results")
        ]
        fake_client.messages.create.return_value = fake_reply
        mock_anthropic.return_value = fake_client

        plan = plan_question(
            question="What is the capital of France?",
            available_tools=TOOLS,
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan: {plan[:50]}...")

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_with_files(self, mock_anthropic):
        """Supplying file_paths still yields a non-empty plan."""
        fake_client = MagicMock()
        fake_reply = MagicMock()
        fake_reply.content = [
            MagicMock(text="1. Parse file\n2. Extract data\n3. Calculate answer")
        ]
        fake_client.messages.create.return_value = fake_reply
        mock_anthropic.return_value = fake_client

        plan = plan_question(
            question="What is the total in the spreadsheet?",
            available_tools=TOOLS,
            file_paths=["data.xlsx"],
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan with files: {plan[:50]}...")
|
|
|
|
class TestToolSelection:
    """Tests for tool selection via LLM function calling."""

    @staticmethod
    def _make_tool_use(name, params, call_id):
        """Build a mock Anthropic tool_use content item."""
        item = MagicMock()
        item.type = "tool_use"
        item.name = name
        item.input = params
        item.id = call_id
        return item

    @patch('src.agent.llm_client.Anthropic')
    def test_select_single_tool(self, mock_anthropic):
        """One tool_use item maps to exactly one tool call with params."""
        fake_client = MagicMock()
        fake_reply = MagicMock()
        fake_reply.content = [
            self._make_tool_use("search", {"query": "capital of France"}, "call_001")
        ]
        fake_client.messages.create.return_value = fake_reply
        mock_anthropic.return_value = fake_client

        tool_calls = select_tools_with_function_calling(
            question="What is the capital of France?",
            plan="1. Search for capital of France",
            available_tools=TOOLS,
        )

        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 1
        assert tool_calls[0]["tool"] == "search"
        assert "query" in tool_calls[0]["params"]
        print(f"✓ Selected tool: {tool_calls[0]}")

    @patch('src.agent.llm_client.Anthropic')
    def test_select_multiple_tools(self, mock_anthropic):
        """Multiple tool_use items preserve their order in the result."""
        fake_client = MagicMock()
        fake_reply = MagicMock()
        fake_reply.content = [
            self._make_tool_use("parse_file", {"file_path": "data.xlsx"}, "call_001"),
            self._make_tool_use("safe_eval", {"expression": "sum(values)"}, "call_002"),
        ]
        fake_client.messages.create.return_value = fake_reply
        mock_anthropic.return_value = fake_client

        tool_calls = select_tools_with_function_calling(
            question="What is the sum in data.xlsx?",
            plan="1. Parse file\n2. Calculate sum",
            available_tools=TOOLS,
        )

        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 2
        assert tool_calls[0]["tool"] == "parse_file"
        assert tool_calls[1]["tool"] == "safe_eval"
        print(f"✓ Selected {len(tool_calls)} tools")
|
|
|
|
class TestAnswerSynthesis:
    """Tests for LLM-based answer synthesis from collected evidence.

    Each test stubs the Anthropic client so synthesize_answer sees a
    canned completion and no network traffic occurs.
    """

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_simple_answer(self, mock_anthropic):
        """Test synthesizing answer from single evidence."""
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="Paris")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=["[search] Paris is the capital and most populous city of France"]
        )

        assert isinstance(answer, str)
        assert len(answer) > 0
        assert answer == "Paris"
        print(f"✓ Synthesized answer: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_from_multiple_evidence(self, mock_anthropic):
        """Test synthesizing answer from multiple evidence sources."""
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="42")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Name the evidence list so the report below can't drift from its
        # actual size (previously the count was hard-coded as the literal 3).
        evidence_items = [
            "[search] The answer to life is 42",
            "[safe_eval] 6 * 7 = 42",
            "[parse_file] Result: 42"
        ]
        answer = synthesize_answer(
            question="What is the answer?",
            evidence=evidence_items
        )

        assert isinstance(answer, str)
        assert answer == "42"
        print(f"✓ Synthesized answer from {len(evidence_items)} evidence items: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_with_conflicts(self, mock_anthropic):
        """Test synthesizing answer when evidence conflicts."""
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="Paris")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=[
                "[search] Paris is the capital of France (source: Wikipedia, 2024)",
                "[search] Lyon was briefly capital during revolution (source: old text, 1793)"
            ]
        )

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"✓ Resolved conflict, answer: {answer}")
|
|
|
class TestEndToEndWorkflow:
    """End-to-end agent workflow test with all LLM calls mocked."""

    # NOTE: @patch decorators are applied bottom-up, so the innermost
    # patch (tavily_search) arrives as the first mock argument.
    @patch('src.agent.llm_client.Anthropic')
    @patch('src.tools.web_search.tavily_search')
    def test_full_search_workflow(self, mock_tavily, mock_anthropic):
        """Run plan → search → answer and check the final answer string."""
        from src.agent import GAIAAgent

        # Canned web-search result.
        mock_tavily.return_value = "Paris is the capital and most populous city of France"

        fake_client = MagicMock()

        # Reply 1: the planning completion.
        plan_reply = MagicMock()
        plan_reply.content = [MagicMock(text="1. Search for capital of France")]

        # Reply 2: a tool_use item selecting web_search.
        tool_item = MagicMock()
        tool_item.type = "tool_use"
        tool_item.name = "web_search"
        tool_item.input = {"query": "capital of France"}
        tool_item.id = "call_001"
        tool_reply = MagicMock()
        tool_reply.content = [tool_item]

        # Reply 3: the synthesized final answer.
        answer_reply = MagicMock()
        answer_reply.content = [MagicMock(text="Paris")]

        # Serve the three canned replies in workflow order.
        fake_client.messages.create.side_effect = [
            plan_reply,
            tool_reply,
            answer_reply,
        ]
        mock_anthropic.return_value = fake_client

        agent = GAIAAgent()
        answer = agent("What is the capital of France?")

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"✓ Full workflow completed, answer: {answer}")
|
|
|
|
if __name__ == "__main__":
    # Standalone runner: executes every test case directly (the @patch
    # decorators still inject their mocks when methods are called this way).
    banner = "=" * 70
    print(f"\n{banner}")
    print("GAIA Agent - Stage 3 LLM Integration Tests")
    print(f"{banner}\n")

    suites = (
        (TestPlanningFunction(), (
            "test_plan_question_basic",
            "test_plan_with_files",
        )),
        (TestToolSelection(), (
            "test_select_single_tool",
            "test_select_multiple_tools",
        )),
        (TestAnswerSynthesis(), (
            "test_synthesize_simple_answer",
            "test_synthesize_from_multiple_evidence",
            "test_synthesize_with_conflicts",
        )),
        (TestEndToEndWorkflow(), (
            "test_full_search_workflow",
        )),
    )
    for suite, case_names in suites:
        for case_name in case_names:
            getattr(suite, case_name)()

    print(f"\n{banner}")
    print("✓ All Stage 3 LLM integration tests passed!")
    print(f"{banner}\n")
|