# LocalMate / tests / test_react_comparison.py
# Cuong2004's picture
# fix intent and add plan
# 51ba917
"""
LocalMate Agent Test Script - Comprehensive Tool Coverage
Tests 5 queries covering ALL tools in both modes:
1. Greeting (no tools) - tests greeting detection
2. Text search (retrieve_context_text)
3. Location search (find_nearby_places)
4. Social search (search_social_media)
5. Complex query (multiple tools in ReAct mode)
Run: python tests/test_react_comparison.py
"""
import asyncio
import json
import time
from datetime import datetime
import httpx
# =============================================================================
# CONFIGURATION
# =============================================================================

# Base URL of the LocalMate API; run_test() posts to f"{API_BASE}/chat".
# Defaults to a local server; the commented-out URL is an alternative endpoint.
# API_BASE = "https://cuong2004-localmate.hf.space/api/v1"
API_BASE = "http://localhost:8000/api/v1"
# Sent as "user_id" in the JSON body of every chat request.
USER_ID = "test_comprehensive"

# Delay settings (in seconds), applied with asyncio.sleep() in main().
SINGLE_MODE_DELAY = 20  # Pause between consecutive queries in single mode
REACT_MODE_DELAY = 60  # Pause between consecutive queries in ReAct mode
MODE_SWITCH_DELAY = 60  # Pause after the single-mode run, before ReAct mode starts
REQUEST_TIMEOUT = 60  # Per-request timeout passed to client.post() in run_test()

# Provider/model pair sent with every request and echoed in the report header.
# The commented-out values are alternatives kept for quick switching.
# PROVIDER = "MegaLLM"
# MODEL = "deepseek-ai/deepseek-v3.1-terminus"
PROVIDER = "Google"
# MODEL = "gemini-3-flash-preview"
MODEL = "gemini-2.5-flash"
# =============================================================================
# 5 TEST CASES - one per tool, plus a greeting and a multi-tool query
# =============================================================================
# Each case carries:
#   id             - number shown in progress output and in the report
#   query          - message sent to the chat endpoint
#   description    - human-readable label for the report
#   expected_tools - tool names compared against the response's "tools_used"
#   tool_coverage  - short label printed under "Expected Tools" in the report
TEST_CASES = [
    # 1. Greeting: no tool call is expected.
    {
        "id": 1,
        "query": "xin chào",
        "description": "Greeting - No tools expected",
        "expected_tools": [],
        "tool_coverage": "No tools (greeting detection)",
    },
    # 2. Free-text search.
    {
        "id": 2,
        "query": "Quán cafe view đẹp ở Đà Nẵng",
        "description": "Text search - Semantic search in reviews",
        "expected_tools": ["retrieve_context_text"],
        "tool_coverage": "retrieve_context_text",
    },
    # 3. Search around a landmark.
    {
        "id": 3,
        "query": "Nhà hàng gần Cầu Rồng",
        "description": "Location search - Neo4j spatial query",
        "expected_tools": ["find_nearby_places"],
        "tool_coverage": "find_nearby_places",
    },
    # 4. Social / trending content.
    {
        "id": 4,
        "query": "Review quán ăn hot trên tiktok Đà Nẵng",
        "description": "Social search - Brave API news/trends",
        "expected_tools": ["search_social_media"],
        "tool_coverage": "search_social_media",
    },
    # 5. Query that should need more than one tool.
    {
        "id": 5,
        "query": "Quán cafe không gian đẹp gần biển Mỹ Khê có review tốt",
        "description": "Complex query - Multiple tools (ReAct advantage)",
        "expected_tools": ["find_nearby_places", "retrieve_context_text"],
        "tool_coverage": "Multiple tools",
    },
]
async def run_test(client: httpx.AsyncClient, test_case: dict, react_mode: bool) -> dict:
    """Send one test query to the chat endpoint and summarise the outcome.

    Args:
        client: Open HTTP client used to post the request.
        test_case: Entry of ``TEST_CASES``; its ``id``, ``query``,
            ``description``, ``tool_coverage`` and ``expected_tools`` keys
            are read.
        react_mode: Value sent as ``react_mode`` in the request body.

    Returns:
        A result dict. It always holds ``success``, ``test_id``, ``query``,
        ``description``, ``tool_coverage``, ``expected_tools``,
        ``react_mode`` and ``total_duration_ms``. On HTTP 200 it also holds
        ``response`` (first 300 characters), ``workflow``, ``tools_used``,
        ``places_count`` and ``api_duration_ms``; otherwise it holds
        ``error``.
    """
    # Test metadata goes into every outcome, so the report can label the row
    # and evaluate the tool match even when this particular request failed.
    result = {
        "success": False,
        "test_id": test_case["id"],
        "query": test_case["query"],
        "description": test_case["description"],
        "tool_coverage": test_case["tool_coverage"],
        "expected_tools": test_case["expected_tools"],
        "react_mode": react_mode,
    }

    # perf_counter is monotonic, so the elapsed time cannot be skewed by a
    # system clock adjustment in the middle of a request.
    started = time.perf_counter()
    try:
        response = await client.post(
            f"{API_BASE}/chat",
            json={
                "message": test_case["query"],
                "user_id": USER_ID,
                "provider": PROVIDER,
                "model": MODEL,
                "react_mode": react_mode,
                "max_steps": 5,
            },
            timeout=float(REQUEST_TIMEOUT),
        )
        if response.status_code == 200:
            data = response.json()
            # "or" fallbacks: a JSON null would otherwise break the slicing
            # and len() below, and the join/format calls of later consumers.
            result.update(
                success=True,
                response=(data.get("response") or "")[:300],
                workflow=data.get("workflow") or {},
                tools_used=data.get("tools_used") or [],
                places_count=len(data.get("places") or []),
                api_duration_ms=data.get("duration_ms") or 0,
            )
        else:
            result["error"] = f"HTTP {response.status_code}: {response.text[:200]}"
    except Exception as e:
        # Deliberately broad: a failing request must be recorded as a failed
        # test, not abort the whole run. The exception type is included
        # because some exceptions (timeouts in particular) have an empty str().
        result["error"] = f"{type(e).__name__}: {e}"

    result["total_duration_ms"] = (time.perf_counter() - started) * 1000
    return result
def check_tool_match(expected: list, actual: list) -> str:
    """Classify how the tools actually used compare with the expected ones.

    Order and duplicates are ignored. ``None`` is treated like an empty
    list, so a missing tool list does not raise.

    Args:
        expected: Tool names the test expects.
        actual: Tool names reported as used.

    Returns:
        ``"✅ Match"`` when both name sets are equal (including both empty),
        ``"⚠️ Extra tools"`` when every expected tool was used along with
        others, ``"⚠️ Partial"`` when only some expected tools were used,
        and ``"❌ Mismatch"`` when none of them were.
    """
    wanted = set(expected or ())
    used = set(actual or ())

    if wanted == used:
        return "✅ Match"
    if wanted <= used:
        return "⚠️ Extra tools"
    if wanted & used:
        return "⚠️ Partial"
    return "❌ Mismatch"
def _md_cell(text) -> str:
    """Flatten *text* so it fits inside a single Markdown table cell."""
    # A raw newline ends the table row and a bare pipe opens a new column,
    # so collapse whitespace runs and escape pipes.
    return " ".join(str(text).split()).replace("|", "\\|")


def _mark(ok) -> str:
    """Return the check mark for a truthy value and the cross otherwise."""
    return "✅" if ok else "❌"


def _mode_stats(results: list) -> tuple:
    """Return ``(success_count, avg_api_ms, tools_seen)`` for one mode."""
    passed = [r for r in results if r.get("success")]
    # max(1, ...) keeps the average defined when no request succeeded.
    avg_ms = sum(r.get("api_duration_ms") or 0 for r in passed) / max(1, len(passed))
    tools_seen = {tool for r in passed for tool in (r.get("tools_used") or [])}
    return len(passed), avg_ms, tools_seen


def _greeting_ok(results: list) -> bool:
    """Return True when a successful test that expects no tools used none."""
    return any(
        r.get("success") and not r.get("expected_tools") and not r.get("tools_used")
        for r in results
    )


def _tools_and_match(result: dict, expected: list) -> tuple:
    """Return ``(tools cell, match verdict)`` for one summary-table entry."""
    if not result.get("success"):
        return "❌ Error", "❌"
    used = result.get("tools_used") or []
    return ", ".join(used) or "∅ (none)", check_tool_match(expected, used)


def _detail_row(mode: str, result: dict) -> str:
    """Return the per-mode row of a test's detail table."""
    if result.get("success"):
        tools = ", ".join(result.get("tools_used") or []) or "None"
        api_ms = result.get("api_duration_ms") or 0
        places = result.get("places_count", 0)
        return f"| {mode} | ✅ | {api_ms:.0f}ms | {tools} | {places} |"
    # Error text often carries a raw HTTP body; sanitise it for the table.
    error = _md_cell(str(result.get("error", "Unknown"))[:50])
    return f"| {mode} | ❌ | - | Error: {error} | - |"


def generate_report(single_results: list, react_results: list) -> str:
    """Generate the detailed Markdown comparison report.

    Args:
        single_results: Result dicts of the single-mode run, in test order.
        react_results: Result dicts of the ReAct-mode run, in the same order.
            The two lists are paired position by position; surplus entries
            of the longer list are ignored in the per-test sections.

    Returns:
        The report as one Markdown string: summary tables, a per-test tool
        comparison, per-test details with response previews, and a verdict.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    single_success, single_avg, all_tools_single = _mode_stats(single_results)
    react_success, react_avg, all_tools_react = _mode_stats(react_results)
    single_all_passed = single_success == len(single_results)
    react_all_passed = react_success == len(react_results)

    search_tools = ("retrieve_context_text", "find_nearby_places", "search_social_media")
    single_coverage = set(search_tools) <= all_tools_single
    react_coverage = set(search_tools) <= all_tools_react

    # Judged on tests that expect no tools, so a search query that happened
    # to run without tools does not count as a working greeting shortcut.
    single_greeting = _greeting_ok(single_results)
    react_greeting = _greeting_ok(react_results)

    pairs = list(zip(single_results, react_results))

    # Lines are collected and joined once at the end. Blank entries separate
    # every Markdown block: a "---" placed directly under a text line would
    # otherwise be parsed as a setext heading underline.
    lines = [
        "# LocalMate Agent Comprehensive Test Report",
        "",
        f"**Generated:** {timestamp}",
        f"**Provider:** {PROVIDER}",
        f"**Model:** {MODEL}",
        "",
        "---",
        "",
        "## Summary",
        "",
        "| Metric | Single Mode | ReAct Mode |",
        "|--------|:-----------:|:----------:|",
        f"| Success Rate | {single_success}/{len(single_results)} | {react_success}/{len(react_results)} |",
        f"| Avg Duration | {single_avg:.0f}ms | {react_avg:.0f}ms |",
        f"| Unique Tools | {len(all_tools_single)} | {len(all_tools_react)} |",
        "",
        "### Tools Covered",
        "",
        "| Tool | Single Mode | ReAct Mode |",
        "|------|:-----------:|:----------:|",
    ]
    lines += [
        f"| `{tool}` | {_mark(tool in all_tools_single)} | {_mark(tool in all_tools_react)} |"
        for tool in search_tools
    ]
    lines += [
        f"| No tools (greeting) | {_mark(single_greeting)} | {_mark(react_greeting)} |",
        "",
        "---",
        "",
        "## Test Results",
        "",
        "| ID | Description | Single Tools | ReAct Tools | Match |",
        "|----|-------------|--------------|-------------|-------|",
    ]

    for single, react in pairs:
        test_id = single.get("test_id", "?")
        # Fall back to the ReAct result for test metadata: a failed
        # single-mode result may not carry it, and judging the ReAct run
        # against an empty expectation would report a wrong verdict.
        desc = (single.get("description") or react.get("description") or "")[:30]
        expected = single.get("expected_tools") or react.get("expected_tools") or []
        single_tools, single_match = _tools_and_match(single, expected)
        react_tools, react_match = _tools_and_match(react, expected)
        lines.append(
            f"| {test_id} | {desc} | {single_tools} | {react_tools} | {single_match}/{react_match} |"
        )

    lines += ["", "---", "", "## Detailed Results", ""]

    for i, (single, react) in enumerate(pairs):
        test_id = single.get("test_id", i + 1)
        query = single.get("query", "N/A")
        description = single.get("description") or react.get("description") or ""
        coverage = single.get("tool_coverage") or react.get("tool_coverage") or ""
        lines += [
            f"### Test {test_id}: {description}",
            "",
            f"**Query:** `{query}`",
            f"**Expected Tools:** {coverage}",
            "",
            "| Mode | Status | Duration | Tools Used | Places |",
            "|------|--------|----------|------------|--------|",
            _detail_row("Single", single),
            _detail_row("ReAct", react),
            "",
        ]
        # Response previews are shown for successful requests only.
        if single.get("success"):
            lines += [f"**Single Response:** {(single.get('response') or '')[:150]}...", ""]
        if react.get("success"):
            lines += [f"**ReAct Response:** {(react.get('response') or '')[:150]}...", ""]
        lines += ["---", ""]

    all_passed = single_all_passed and react_all_passed
    lines += [
        "## Verdict",
        "",
        "| Criteria | Single Mode | ReAct Mode |",
        "|----------|:-----------:|:----------:|",
        f"| All tests passed | {_mark(single_all_passed)} | {_mark(react_all_passed)} |",
        f"| All 3 search tools covered | {_mark(single_coverage)} | {_mark(react_coverage)} |",
        f"| Greeting detection works | {_mark(single_greeting)} | {_mark(react_greeting)} |",
        "",
        "**Overall:** " + ("🎉 ALL TESTS PASSED!" if all_passed else "⚠️ Some tests failed"),
    ]
    return "\n".join(lines) + "\n"
async def _run_mode(client: httpx.AsyncClient, react_mode: bool, delay: float) -> list:
    """Run every entry of TEST_CASES in one mode and return the result dicts.

    Args:
        client: Open HTTP client handed on to run_test().
        react_mode: Mode flag handed on to run_test().
        delay: Seconds to sleep between consecutive queries; there is no
            sleep after the last one.
    """
    results = []
    total = len(TEST_CASES)
    for i, test in enumerate(TEST_CASES):
        print(f" [{test['id']}/{total}] {test['description'][:40]}...")
        result = await run_test(client, test, react_mode=react_mode)
        results.append(result)

        if result.get("success"):
            tools = ", ".join(result.get("tools_used") or []) or "None"
            places = result.get("places_count", 0)
            api_ms = result.get("api_duration_ms") or 0
            # The step count is only part of the ReAct progress line.
            steps_part = ""
            if react_mode:
                steps = (result.get("workflow") or {}).get("steps") or []
                steps_part = f" | Steps: {len(steps)}"
            print(f" ✅ Tools: [{tools}] | Places: {places}{steps_part} | {api_ms:.0f}ms")
        else:
            # Show why the request failed instead of an empty tools line.
            print(f" ❌ Error: {str(result.get('error', 'Unknown'))[:100]}")

        if i < total - 1:
            await asyncio.sleep(delay)
    return results


async def main():
    """Run all test cases in both modes, write the report, print a summary.

    The single-mode run comes first, then a pause of MODE_SWITCH_DELAY
    seconds, then the ReAct-mode run. The Markdown report is written to
    ``react_comparison_report.md`` in the directory of this script.
    """
    print("=" * 60)
    print("LocalMate Agent Comprehensive Test")
    print(f"Provider: {PROVIDER} | Model: {MODEL}")
    print("=" * 60)
    print()

    async with httpx.AsyncClient() as client:
        # Test Single Mode (react_mode=False)
        print(f"📌 Running Single Mode Tests (react_mode=false, {SINGLE_MODE_DELAY}s delay)...")
        print("-" * 50)
        single_results = await _run_mode(client, react_mode=False, delay=SINGLE_MODE_DELAY)

        print()
        print(f"⏸️ Waiting {MODE_SWITCH_DELAY}s before ReAct mode...")
        await asyncio.sleep(MODE_SWITCH_DELAY)

        # Test ReAct Mode (react_mode=True)
        print()
        print(f"🧠 Running ReAct Mode Tests (react_mode=true, {REACT_MODE_DELAY}s delay)...")
        print("-" * 50)
        react_results = await _run_mode(client, react_mode=True, delay=REACT_MODE_DELAY)

    # Generate report
    print()
    print("📝 Generating report...")
    report = generate_report(single_results, react_results)

    # Save report next to this script
    import os

    script_dir = os.path.dirname(os.path.abspath(__file__))
    report_path = os.path.join(script_dir, "react_comparison_report.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"✅ Report saved to: {report_path}")
    print()

    # Quick summary
    single_success = sum(1 for r in single_results if r.get("success"))
    react_success = sum(1 for r in react_results if r.get("success"))
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Single Mode: {single_success}/{len(single_results)} passed")
    print(f"ReAct Mode: {react_success}/{len(react_results)} passed")
    print()
    if single_success == len(single_results) and react_success == len(react_results):
        print("🎉 ALL TESTS PASSED!")
    else:
        print("⚠️ Some tests failed - check report for details")


if __name__ == "__main__":
    asyncio.run(main())