"""
LocalMate Agent Test Script - Comprehensive Tool Coverage

Tests 5 queries covering ALL tools in both modes (single and ReAct):
1. Greeting (no tools) - tests greeting detection
2. Text search (retrieve_context_text)
3. Location search (find_nearby_places)
4. Social search (search_social_media)
5. Complex query (multiple tools in ReAct mode)

Run: python tests/test_react_comparison.py
"""

import asyncio
import json
import time
from datetime import datetime
from pathlib import Path

import httpx


# Base URL of the API under test and the user id sent with every chat request.
API_BASE = "http://localhost:8000/api/v1"
USER_ID = "test_comprehensive"

# Pacing, in seconds. The three delays are passed to asyncio.sleep() between
# requests / between the two modes (presumably to stay under the LLM
# provider's rate limit - confirm before lowering them). REQUEST_TIMEOUT is
# the per-request HTTP timeout handed to the client.
SINGLE_MODE_DELAY = 20
REACT_MODE_DELAY = 60
MODE_SWITCH_DELAY = 60
REQUEST_TIMEOUT = 60

# LLM backend forwarded in the body of each chat request and echoed in the
# generated report.
PROVIDER = "Google"
MODEL = "gemini-2.5-flash"


# One entry per scenario. "query" is the (Vietnamese) message sent to the
# agent, "expected_tools" lists the tools the agent should call for it, and
# "description" / "tool_coverage" are labels used in console output and in the
# generated report.
TEST_CASES = [
    {
        "id": 1,
        "query": "xin chào",  # "hello"
        "description": "Greeting - No tools expected",
        "expected_tools": [],
        "tool_coverage": "No tools (greeting detection)",
    },
    {
        "id": 2,
        "query": "Quán cafe view đẹp ở Đà Nẵng",  # "cafes with a nice view in Da Nang"
        "description": "Text search - Semantic search in reviews",
        "expected_tools": ["retrieve_context_text"],
        "tool_coverage": "retrieve_context_text",
    },
    {
        "id": 3,
        "query": "Nhà hàng gần Cầu Rồng",  # "restaurants near the Dragon Bridge"
        "description": "Location search - Neo4j spatial query",
        "expected_tools": ["find_nearby_places"],
        "tool_coverage": "find_nearby_places",
    },
    {
        "id": 4,
        "query": "Review quán ăn hot trên tiktok Đà Nẵng",  # "reviews of eateries trending on TikTok, Da Nang"
        "description": "Social search - Brave API news/trends",
        "expected_tools": ["search_social_media"],
        "tool_coverage": "search_social_media",
    },
    {
        "id": 5,
        # "well-reviewed cafes with a nice atmosphere near My Khe beach"
        "query": "Quán cafe không gian đẹp gần biển Mỹ Khê có review tốt",
        "description": "Complex query - Multiple tools (ReAct advantage)",
        "expected_tools": ["find_nearby_places", "retrieve_context_text"],
        "tool_coverage": "Multiple tools",
    },
]


async def run_test(client: httpx.AsyncClient, test_case: dict, react_mode: bool) -> dict:
    """Send one test query to the ``/chat`` endpoint and summarise the outcome.

    Args:
        client: HTTP client used to POST the request.
        test_case: One entry of ``TEST_CASES``. ``id`` and ``query`` are
            required; ``description``, ``tool_coverage`` and
            ``expected_tools`` are copied into the result when present.
        react_mode: Forwarded as ``react_mode`` in the request body.

    Returns:
        A dict that always contains ``success``, ``test_id``, ``query``,
        ``description``, ``tool_coverage``, ``expected_tools``,
        ``react_mode`` and ``total_duration_ms``. On success it also has
        ``response`` (first 300 characters), ``workflow``, ``tools_used``,
        ``places_count`` and ``api_duration_ms``; on failure it has ``error``.
        Request errors are reported through ``error`` rather than raised.
    """
    # The identifying fields go into every result, failed or not, so that the
    # report can still label a failed run and compare the other mode's tools
    # against the right expectation.
    result = {
        "success": False,
        "test_id": test_case["id"],
        "query": test_case["query"],
        "description": test_case.get("description", ""),
        "tool_coverage": test_case.get("tool_coverage", ""),
        "expected_tools": test_case.get("expected_tools", []),
        "react_mode": react_mode,
    }
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments during the (long) request.
    started = time.perf_counter()

    try:
        response = await client.post(
            f"{API_BASE}/chat",
            json={
                "message": test_case["query"],
                "user_id": USER_ID,
                "provider": PROVIDER,
                "model": MODEL,
                "react_mode": react_mode,
                "max_steps": 5,
            },
            timeout=float(REQUEST_TIMEOUT),
        )
        result["total_duration_ms"] = (time.perf_counter() - started) * 1000

        if response.status_code != 200:
            result["error"] = f"HTTP {response.status_code}: {response.text[:200]}"
            return result

        data = response.json()
        # "or" fallbacks: a field that is present but null in the JSON body
        # must not turn an otherwise successful reply into a failure.
        result.update(
            response=(data.get("response") or "")[:300],
            workflow=data.get("workflow") or {},
            tools_used=data.get("tools_used") or [],
            places_count=len(data.get("places") or []),
            api_duration_ms=data.get("duration_ms") or 0,
            success=True,
        )
    except Exception as exc:
        # Deliberately broad: one failing request (timeout, connection error,
        # malformed body) is recorded and must not abort the remaining tests.
        # The exception type is included because some exceptions stringify to
        # an empty message.
        result["error"] = f"{type(exc).__name__}: {exc}"
        result["total_duration_ms"] = (time.perf_counter() - started) * 1000

    return result


def check_tool_match(expected: list, actual: list) -> str:
    """Classify how the tools an agent used compare with the expected ones.

    Order and duplicates are ignored; ``None`` is treated as "no tools".

    Args:
        expected: Tool names the test case expects.
        actual: Tool names reported as used.

    Returns:
        ``"✅ Match"`` when both contain the same tools (including both
        empty), ``"⚠️ Extra tools"`` when every expected tool was used along
        with others, ``"⚠️ Partial"`` when only some expected tools were used,
        and ``"❌ Mismatch"`` when none were.
    """
    wanted = set(expected or ())
    used = set(actual or ())

    if wanted == used:
        return "✅ Match"
    if wanted <= used:
        return "⚠️ Extra tools"
    if wanted & used:
        return "⚠️ Partial"
    return "❌ Mismatch"


# The search tools whose coverage is reported per mode.
_SEARCH_TOOLS = ("retrieve_context_text", "find_nearby_places", "search_social_media")


def _mark(ok: bool) -> str:
    """Return the pass/fail symbol used in the report tables."""
    return "✅" if ok else "❌"


def _cell(value) -> str:
    """Make *value* safe for one Markdown table cell (no newlines, escaped pipes)."""
    return " ".join(str(value).split()).replace("|", "\\|")


def _summarize_mode(results: list) -> dict:
    """Aggregate one mode's results: totals, mean API duration and tools seen."""
    passed = [r for r in results if r.get("success")]
    tools = set()
    for r in passed:
        tools.update(r.get("tools_used") or [])
    return {
        "total": len(results),
        "passed": len(passed),
        # Averaged over successful runs only; max() guards the all-failed case.
        "avg_ms": sum(r.get("api_duration_ms") or 0 for r in passed) / max(1, len(passed)),
        "tools": tools,
        "covers_all": set(_SEARCH_TOOLS) <= tools,
        # Only a run that was *expected* to use no tools counts as greeting
        # detection; a search query that silently used no tools does not.
        "greeting_ok": any(
            not r.get("tools_used") and not r.get("expected_tools") for r in passed
        ),
    }


def _tools_and_match(result: dict, expected: list) -> tuple:
    """Return the (tools cell, match cell) pair for one run in the results table."""
    if not result.get("success"):
        return "❌ Error", "❌"
    used = result.get("tools_used") or []
    return ", ".join(used) or "✅ (none)", check_tool_match(expected, used)


def _detail_row(label: str, result: dict) -> str:
    """Format one mode's row of a per-test detail table."""
    if result.get("success"):
        tools = ", ".join(result.get("tools_used") or []) or "None"
        return (
            f"| {label} | ✅ | {result.get('api_duration_ms') or 0:.0f}ms "
            f"| {_cell(tools)} | {result.get('places_count', 0)} |"
        )
    error = str(result.get("error") or "Unknown")[:50]
    return f"| {label} | ❌ | - | Error: {_cell(error)} | - |"


def generate_report(single_results: list, react_results: list) -> str:
    """Render the single-mode vs. ReAct-mode comparison as Markdown.

    Args:
        single_results: Result dicts (as returned by ``run_test``) for the
            single-mode run.
        react_results: Result dicts for the ReAct-mode run, in the same test
            order. Results are paired by position; extra entries in the
            longer list are ignored in the per-test sections.

    Returns:
        The full report text: summary, tool coverage, a per-test results
        table, per-test details and a verdict.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    single = _summarize_mode(single_results)
    react = _summarize_mode(react_results)
    single_all = single["passed"] == single["total"]
    react_all = react["passed"] == react["total"]
    pairs = list(zip(single_results, react_results))

    out = [
        "# LocalMate Agent Comprehensive Test Report",
        "",
        f"**Generated:** {timestamp}",
        f"**Provider:** {PROVIDER}",
        f"**Model:** {MODEL}",
        "",
        "---",
        "",
        "## Summary",
        "",
        "| Metric | Single Mode | ReAct Mode |",
        "|--------|:-----------:|:----------:|",
        f"| Success Rate | {single['passed']}/{single['total']} "
        f"| {react['passed']}/{react['total']} |",
        f"| Avg Duration | {single['avg_ms']:.0f}ms | {react['avg_ms']:.0f}ms |",
        f"| Unique Tools | {len(single['tools'])} | {len(react['tools'])} |",
        "",
        "### Tools Covered",
        "",
        "| Tool | Single Mode | ReAct Mode |",
        "|------|:-----------:|:----------:|",
    ]
    out += [
        f"| `{tool}` | {_mark(tool in single['tools'])} | {_mark(tool in react['tools'])} |"
        for tool in _SEARCH_TOOLS
    ]
    out += [
        f"| No tools (greeting) | {_mark(single['greeting_ok'])} "
        f"| {_mark(react['greeting_ok'])} |",
        "",
        "---",
        "",
        "## Test Results",
        "",
        "| ID | Description | Single Tools | ReAct Tools | Match |",
        "|----|-------------|--------------|-------------|-------|",
    ]

    for s_res, r_res in pairs:
        # Fall back to the ReAct result's metadata so a failed single-mode
        # run does not blank the description or the expected tool list.
        expected = s_res.get("expected_tools") or r_res.get("expected_tools") or []
        desc = (s_res.get("description") or r_res.get("description") or "")[:30]
        s_tools, s_match = _tools_and_match(s_res, expected)
        r_tools, r_match = _tools_and_match(r_res, expected)
        out.append(
            f"| {s_res.get('test_id', '?')} | {_cell(desc)} | {_cell(s_tools)} "
            f"| {_cell(r_tools)} | {s_match}/{r_match} |"
        )

    out += ["", "---", "", "## Detailed Results", ""]

    for position, (s_res, r_res) in enumerate(pairs, start=1):
        test_id = s_res.get("test_id", position)
        query = s_res.get("query", "N/A")
        description = s_res.get("description") or r_res.get("description") or ""
        coverage = s_res.get("tool_coverage") or r_res.get("tool_coverage") or ""

        out += [
            f"### Test {test_id}: {description}",
            "",
            f"**Query:** `{query}`",
            f"**Expected Tools:** {coverage}",
            "",
            "| Mode | Status | Duration | Tools Used | Places |",
            "|------|--------|----------|------------|--------|",
            _detail_row("Single", s_res),
            _detail_row("ReAct", r_res),
            "",
        ]

        # Short previews of the agent's answer, for successful runs only.
        if s_res.get("success"):
            out += [f"**Single Response:** {(s_res.get('response') or '')[:150]}...", ""]
        if r_res.get("success"):
            out += [f"**ReAct Response:** {(r_res.get('response') or '')[:150]}...", ""]

        out += ["---", ""]

    overall = "🎉 ALL TESTS PASSED!" if single_all and react_all else "⚠️ Some tests failed"
    out += [
        "## Verdict",
        "",
        "| Criteria | Single Mode | ReAct Mode |",
        "|----------|:-----------:|:----------:|",
        f"| All tests passed | {_mark(single_all)} | {_mark(react_all)} |",
        f"| All 3 search tools covered | {_mark(single['covers_all'])} "
        f"| {_mark(react['covers_all'])} |",
        f"| Greeting detection works | {_mark(single['greeting_ok'])} "
        f"| {_mark(react['greeting_ok'])} |",
        "",
        f"**Overall:** {overall}",
    ]

    return "\n".join(out) + "\n"


async def _run_mode(client: httpx.AsyncClient, *, react_mode: bool, delay: float) -> list:
    """Run every test case in one mode, sleeping ``delay`` seconds between requests."""
    results = []
    total = len(TEST_CASES)

    for index, test in enumerate(TEST_CASES):
        print(f" [{test['id']}/{total}] {test['description'][:40]}...")
        result = await run_test(client, test, react_mode=react_mode)
        results.append(result)

        if result.get("success"):
            tools = ", ".join(result.get("tools_used") or []) or "None"
            line = f"✅ Tools: [{tools}] | Places: {result.get('places_count', 0)}"
            if react_mode:
                # The number of workflow steps is only shown for ReAct runs.
                steps = len((result.get("workflow") or {}).get("steps") or [])
                line += f" | Steps: {steps}"
            line += f" | {result.get('api_duration_ms') or 0:.0f}ms"
        else:
            # Show why the request failed instead of an empty tool list.
            line = f"❌ Error: {result.get('error', 'Unknown')}"
        print(f" {line}")

        # No pause after the last case of the mode.
        if index < total - 1:
            await asyncio.sleep(delay)

    return results


async def main():
    """Run all test cases in single mode, then in ReAct mode, and write the report.

    Progress and a pass/fail summary are printed to stdout. The Markdown
    report is written as ``react_comparison_report.md`` in the directory that
    contains this script.
    """
    print("=" * 60)
    print("LocalMate Agent Comprehensive Test")
    print(f"Provider: {PROVIDER} | Model: {MODEL}")
    print("=" * 60)
    print()

    async with httpx.AsyncClient() as client:
        print(f"📝 Running Single Mode Tests (react_mode=false, {SINGLE_MODE_DELAY}s delay)...")
        print("-" * 50)
        single_results = await _run_mode(client, react_mode=False, delay=SINGLE_MODE_DELAY)

        print()
        print(f"⏸️ Waiting {MODE_SWITCH_DELAY}s before ReAct mode...")
        await asyncio.sleep(MODE_SWITCH_DELAY)

        print()
        print(f"🧠 Running ReAct Mode Tests (react_mode=true, {REACT_MODE_DELAY}s delay)...")
        print("-" * 50)
        react_results = await _run_mode(client, react_mode=True, delay=REACT_MODE_DELAY)

    print()
    print("📊 Generating report...")
    report = generate_report(single_results, react_results)

    report_path = Path(__file__).resolve().parent / "react_comparison_report.md"
    # The report contains emoji and Vietnamese text, so the encoding is explicit.
    report_path.write_text(report, encoding="utf-8")

    print(f"✅ Report saved to: {report_path}")
    print()

    single_success = sum(1 for r in single_results if r.get("success"))
    react_success = sum(1 for r in react_results if r.get("success"))

    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Single Mode: {single_success}/{len(single_results)} passed")
    print(f"ReAct Mode: {react_success}/{len(react_results)} passed")
    print()

    if single_success == len(single_results) and react_success == len(react_results):
        print("🎉 ALL TESTS PASSED!")
    else:
        print("⚠️ Some tests failed - check report for details")


if __name__ == "__main__":
    # Run the async test runner to completion when executed as a script.
    asyncio.run(main())