""" LocalMate Agent Test Script - Comprehensive Tool Coverage Tests 5 queries covering ALL tools in both modes: 1. Greeting (no tools) - tests greeting detection 2. Text search (retrieve_context_text) 3. Location search (find_nearby_places) 4. Social search (search_social_media) 5. Complex query (multiple tools in ReAct mode) Run: python tests/test_react_comparison.py """ import asyncio import json import time from datetime import datetime import httpx # ============================================================================= # CONFIGURATION # ============================================================================= # API Settings - Use localhost for local testing # API_BASE = "https://cuong2004-localmate.hf.space/api/v1" API_BASE = "http://localhost:8000/api/v1" USER_ID = "test_comprehensive" # Delay Settings (in seconds) SINGLE_MODE_DELAY = 20 # Delay between queries in single mode REACT_MODE_DELAY = 60 # Delay between queries in ReAct mode MODE_SWITCH_DELAY = 60 # Delay between switching modes REQUEST_TIMEOUT = 60 # Timeout for each API request # Provider settings # PROVIDER = "MegaLLM" # MODEL = "deepseek-ai/deepseek-v3.1-terminus" PROVIDER = "Google" # MODEL = "gemini-3-flash-preview" MODEL = "gemini-2.5-flash" # ============================================================================= # 5 TEST CASES - Covering ALL tools # ============================================================================= TEST_CASES = [ { "id": 1, "query": "xin chào", "description": "Greeting - No tools expected", "expected_tools": [], "tool_coverage": "No tools (greeting detection)", }, { "id": 2, "query": "Quán cafe view đẹp ở Đà Nẵng", "description": "Text search - Semantic search in reviews", "expected_tools": ["retrieve_context_text"], "tool_coverage": "retrieve_context_text", }, { "id": 3, "query": "Nhà hàng gần Cầu Rồng", "description": "Location search - Neo4j spatial query", "expected_tools": ["find_nearby_places"], "tool_coverage": "find_nearby_places", }, { "id": 4, "query": "Review quán ăn hot trên tiktok Đà Nẵng", "description": "Social search - Brave API news/trends", "expected_tools": ["search_social_media"], "tool_coverage": "search_social_media", }, { "id": 5, "query": "Quán cafe không gian đẹp gần biển Mỹ Khê có review tốt", "description": "Complex query - Multiple tools (ReAct advantage)", "expected_tools": ["find_nearby_places", "retrieve_context_text"], "tool_coverage": "Multiple tools", }, ] async def run_test(client: httpx.AsyncClient, test_case: dict, react_mode: bool) -> dict: """Run a single test case and return results.""" start_time = time.time() try: response = await client.post( f"{API_BASE}/chat", json={ "message": test_case["query"], "user_id": USER_ID, "provider": PROVIDER, "model": MODEL, "react_mode": react_mode, "max_steps": 5, }, timeout=float(REQUEST_TIMEOUT), ) duration = (time.time() - start_time) * 1000 if response.status_code == 200: data = response.json() return { "success": True, "test_id": test_case["id"], "query": test_case["query"], "description": test_case["description"], "tool_coverage": test_case["tool_coverage"], "expected_tools": test_case["expected_tools"], "react_mode": react_mode, "response": data.get("response", "")[:300], "workflow": data.get("workflow", {}), "tools_used": data.get("tools_used", []), "places_count": len(data.get("places", [])), "api_duration_ms": data.get("duration_ms", 0), "total_duration_ms": duration, } else: return { "success": False, "test_id": test_case["id"], "query": test_case["query"], "react_mode": react_mode, "error": f"HTTP {response.status_code}: {response.text[:200]}", "total_duration_ms": duration, } except Exception as e: return { "success": False, "test_id": test_case["id"], "query": test_case["query"], "react_mode": react_mode, "error": str(e), "total_duration_ms": (time.time() - start_time) * 1000, } def check_tool_match(expected: list, actual: list) -> str: """Check if expected tools match actual tools used.""" if not expected and not actual: return "✅ Match" if set(expected) == set(actual): return "✅ Match" if set(expected).issubset(set(actual)): return "⚠️ Extra tools" if any(t in actual for t in expected): return "⚠️ Partial" return "❌ Mismatch" def generate_report(single_results: list, react_results: list) -> str: """Generate detailed markdown report.""" timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") # Calculate stats single_success = sum(1 for r in single_results if r.get('success')) react_success = sum(1 for r in react_results if r.get('success')) single_avg = sum(r.get('api_duration_ms', 0) for r in single_results if r.get('success')) / max(1, single_success) react_avg = sum(r.get('api_duration_ms', 0) for r in react_results if r.get('success')) / max(1, react_success) # Collect all unique tools used all_tools_single = set() all_tools_react = set() for r in single_results: if r.get('success'): all_tools_single.update(r.get('tools_used', [])) for r in react_results: if r.get('success'): all_tools_react.update(r.get('tools_used', [])) report = f"""# LocalMate Agent Comprehensive Test Report **Generated:** {timestamp} **Provider:** {PROVIDER} **Model:** {MODEL} --- ## Summary | Metric | Single Mode | ReAct Mode | |--------|:-----------:|:----------:| | Success Rate | {single_success}/{len(single_results)} | {react_success}/{len(react_results)} | | Avg Duration | {single_avg:.0f}ms | {react_avg:.0f}ms | | Unique Tools | {len(all_tools_single)} | {len(all_tools_react)} | ### Tools Covered | Tool | Single Mode | ReAct Mode | |------|:-----------:|:----------:| | `retrieve_context_text` | {"✅" if "retrieve_context_text" in all_tools_single else "❌"} | {"✅" if "retrieve_context_text" in all_tools_react else "❌"} | | `find_nearby_places` | {"✅" if "find_nearby_places" in all_tools_single else "❌"} | {"✅" if "find_nearby_places" in all_tools_react else "❌"} | | `search_social_media` | {"✅" if "search_social_media" in all_tools_single else "❌"} | {"✅" if "search_social_media" in all_tools_react else "❌"} | | No tools (greeting) | {"✅" if any(not r.get('tools_used') for r in single_results if r.get('success')) else "❌"} | {"✅" if any(not r.get('tools_used') for r in react_results if r.get('success')) else "❌"} | --- ## Test Results | ID | Description | Single Tools | ReAct Tools | Match | |----|-------------|--------------|-------------|-------| """ for single, react in zip(single_results, react_results): test_id = single.get("test_id", "?") desc = single.get("description", "")[:30] expected = single.get("expected_tools", []) if single.get("success"): single_tools = ", ".join(single.get("tools_used", [])) or "∅ (none)" single_match = check_tool_match(expected, single.get("tools_used", [])) else: single_tools = "❌ Error" single_match = "❌" if react.get("success"): react_tools = ", ".join(react.get("tools_used", [])) or "∅ (none)" react_match = check_tool_match(expected, react.get("tools_used", [])) else: react_tools = "❌ Error" react_match = "❌" report += f"| {test_id} | {desc} | {single_tools} | {react_tools} | {single_match}/{react_match} |\n" report += "\n---\n\n## Detailed Results\n\n" for i, (single, react) in enumerate(zip(single_results, react_results)): test_id = single.get("test_id", i + 1) query = single.get("query", "N/A") description = single.get("description", "") coverage = single.get("tool_coverage", "") report += f"""### Test {test_id}: {description} **Query:** `{query}` **Expected Tools:** {coverage} | Mode | Status | Duration | Tools Used | Places | |------|--------|----------|------------|--------| """ if single.get("success"): s_tools = ", ".join(single.get("tools_used", [])) or "None" report += f"| Single | ✅ | {single.get('api_duration_ms', 0):.0f}ms | {s_tools} | {single.get('places_count', 0)} |\n" else: report += f"| Single | ❌ | - | Error: {single.get('error', 'Unknown')[:50]} | - |\n" if react.get("success"): r_tools = ", ".join(react.get("tools_used", [])) or "None" report += f"| ReAct | ✅ | {react.get('api_duration_ms', 0):.0f}ms | {r_tools} | {react.get('places_count', 0)} |\n" else: report += f"| ReAct | ❌ | - | Error: {react.get('error', 'Unknown')[:50]} | - |\n" report += "\n" # Show response preview for successful tests if single.get("success"): report += f"**Single Response:** {single.get('response', '')[:150]}...\n\n" if react.get("success"): report += f"**ReAct Response:** {react.get('response', '')[:150]}...\n\n" report += "---\n\n" # Overall verdict all_tools_expected = {"retrieve_context_text", "find_nearby_places", "search_social_media"} single_coverage = all_tools_expected.issubset(all_tools_single) react_coverage = all_tools_expected.issubset(all_tools_react) report += f"""## Verdict | Criteria | Single Mode | ReAct Mode | |----------|:-----------:|:----------:| | All tests passed | {"✅" if single_success == len(single_results) else "❌"} | {"✅" if react_success == len(react_results) else "❌"} | | All 3 search tools covered | {"✅" if single_coverage else "❌"} | {"✅" if react_coverage else "❌"} | | Greeting detection works | {"✅" if any(not r.get('tools_used') and r.get('success') for r in single_results) else "❌"} | {"✅" if any(not r.get('tools_used') and r.get('success') for r in react_results) else "❌"} | **Overall:** {"🎉 ALL TESTS PASSED!" if single_success == len(single_results) and react_success == len(react_results) else "⚠️ Some tests failed"} """ return report async def main(): """Main test runner.""" print("=" * 60) print("LocalMate Agent Comprehensive Test") print(f"Provider: {PROVIDER} | Model: {MODEL}") print("=" * 60) print() single_results = [] react_results = [] async with httpx.AsyncClient() as client: # Test Single Mode (react_mode=False) print(f"📌 Running Single Mode Tests (react_mode=false, {SINGLE_MODE_DELAY}s delay)...") print("-" * 50) for i, test in enumerate(TEST_CASES): print(f" [{test['id']}/5] {test['description'][:40]}...") result = await run_test(client, test, react_mode=False) single_results.append(result) status = "✅" if result.get("success") else "❌" tools = ", ".join(result.get("tools_used", [])) or "None" places = result.get("places_count", 0) print(f" {status} Tools: [{tools}] | Places: {places} | {result.get('api_duration_ms', 0):.0f}ms") if i < len(TEST_CASES) - 1: await asyncio.sleep(SINGLE_MODE_DELAY) print() print(f"⏸️ Waiting {MODE_SWITCH_DELAY}s before ReAct mode...") await asyncio.sleep(MODE_SWITCH_DELAY) # Test ReAct Mode (react_mode=True) print() print(f"🧠 Running ReAct Mode Tests (react_mode=true, {REACT_MODE_DELAY}s delay)...") print("-" * 50) for i, test in enumerate(TEST_CASES): print(f" [{test['id']}/5] {test['description'][:40]}...") result = await run_test(client, test, react_mode=True) react_results.append(result) status = "✅" if result.get("success") else "❌" tools = ", ".join(result.get("tools_used", [])) or "None" places = result.get("places_count", 0) steps = len(result.get("workflow", {}).get("steps", [])) print(f" {status} Tools: [{tools}] | Places: {places} | Steps: {steps} | {result.get('api_duration_ms', 0):.0f}ms") if i < len(TEST_CASES) - 1: await asyncio.sleep(REACT_MODE_DELAY) # Generate report print() print("📝 Generating report...") report = generate_report(single_results, react_results) # Save report import os script_dir = os.path.dirname(os.path.abspath(__file__)) report_path = os.path.join(script_dir, "react_comparison_report.md") with open(report_path, "w", encoding="utf-8") as f: f.write(report) print(f"✅ Report saved to: {report_path}") print() # Quick summary single_success = sum(1 for r in single_results if r.get('success')) react_success = sum(1 for r in react_results if r.get('success')) print("=" * 60) print("SUMMARY") print("=" * 60) print(f"Single Mode: {single_success}/{len(single_results)} passed") print(f"ReAct Mode: {react_success}/{len(react_results)} passed") print() if single_success == len(single_results) and react_success == len(react_results): print("🎉 ALL TESTS PASSED!") else: print("⚠️ Some tests failed - check report for details") if __name__ == "__main__": asyncio.run(main())