File size: 4,017 Bytes
c7fed4d
 
 
 
 
 
 
 
 
8275699
c7fed4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
API routes for agent performance metrics.
"""

from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Response
from prometheus_client import CONTENT_TYPE_LATEST

from src.core import get_logger
from src.api.auth import User
from src.api.dependencies import get_current_user
from src.services.agent_metrics import agent_metrics_service


# Router that the agent-metrics endpoints in this module register on.
router = APIRouter()
# Module-level logger obtained from the project's get_logger helper.
logger = get_logger("api.agent_metrics")


@router.get("/agents/{agent_name}/stats")
async def get_agent_stats(
    agent_name: str,
    current_user: User = Depends(get_current_user)
):
    """Get detailed statistics for a specific agent.

    Args:
        agent_name: Name of the agent taken from the URL path.
        current_user: Resolved by the ``get_current_user`` dependency; not
            used in the body, declared so the dependency runs for this route.

    Returns:
        A dict with ``"status": "success"`` and the service's stats under
        ``"data"``.

    Raises:
        HTTPException: 404 when the metrics service reports
            ``status == "no_data"`` for the agent; 500 when any other
            error occurs while fetching the stats.
    """
    try:
        stats = await agent_metrics_service.get_agent_stats(agent_name)

        if stats.get("status") == "no_data":
            raise HTTPException(
                status_code=404,
                detail=f"No metrics found for agent: {agent_name}"
            )

        return {
            "status": "success",
            "data": stats
        }

    except HTTPException:
        # Let deliberate HTTP errors (the 404 above) pass through unchanged;
        # without this clause the generic handler below would turn them
        # into a 500.
        raise
    except Exception as e:
        logger.error(f"Error getting agent stats: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e


@router.get("/agents/summary")
async def get_all_agents_summary(
    current_user: User = Depends(get_current_user)
):
    """Return the metrics service's summary across all agents.

    The summary is wrapped in a ``{"status": "success", "data": ...}``
    envelope. Any exception raised while fetching it is logged and
    reported to the client as an HTTP 500 carrying the exception text.
    """
    try:
        overview = await agent_metrics_service.get_all_agents_summary()
    except Exception as exc:
        logger.error(f"Error getting agents summary: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

    return {"status": "success", "data": overview}


@router.get("/prometheus")
async def get_prometheus_metrics():
    """
    Serve the collected metrics in the Prometheus exposition format.

    No authentication dependency is declared on this route, so a
    Prometheus scraper can call it without credentials. Failures while
    generating the payload are logged and surfaced as an HTTP 500.
    """
    try:
        payload = agent_metrics_service.get_prometheus_metrics()
        # Content type is passed both as media_type and as an explicit header.
        content_type_header = {"Content-Type": CONTENT_TYPE_LATEST}
        return Response(
            content=payload,
            media_type=CONTENT_TYPE_LATEST,
            headers=content_type_header,
        )
    except Exception as exc:
        logger.error(f"Error generating Prometheus metrics: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))


@router.post("/agents/{agent_name}/reset")
async def reset_agent_metrics(
    agent_name: str,
    current_user: User = Depends(get_current_user)
):
    """Reset the metrics for the agent named in the path.

    Calls the metrics service's ``reset_metrics`` with ``agent_name`` and
    returns a success message on completion. Any exception is logged and
    converted into an HTTP 500 carrying the exception text.
    """
    try:
        await agent_metrics_service.reset_metrics(agent_name)
    except Exception as exc:
        logger.error(f"Error resetting agent metrics: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

    confirmation = f"Metrics reset for agent: {agent_name}"
    return {"status": "success", "message": confirmation}


@router.post("/reset")
async def reset_all_metrics(
    current_user: User = Depends(get_current_user)
):
    """Reset the metrics for all agents.

    Calls the metrics service's ``reset_metrics`` with no agent name and
    returns a success message on completion. Any exception is logged and
    converted into an HTTP 500 carrying the exception text.
    """
    try:
        await agent_metrics_service.reset_metrics()
    except Exception as exc:
        logger.error(f"Error resetting all metrics: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

    return {
        "status": "success",
        "message": "All agent metrics have been reset",
    }


@router.get("/health")
async def metrics_health_check():
    """Report whether the metrics service can produce a summary.

    Fetches the all-agents summary as a liveness probe. On success the
    response carries the ``total_agents`` and ``total_requests`` values
    from that summary (defaulting to 0 when absent). On failure the error
    is logged and an "unhealthy" payload is returned instead of raising.
    """
    try:
        # Fetching the summary doubles as the check that the service works.
        overview = await agent_metrics_service.get_all_agents_summary()
        agents_tracked = overview.get("total_agents", 0)
        total_requests = overview.get("total_requests", 0)
    except Exception as exc:
        logger.error(f"Metrics service health check failed: {exc}")
        return {
            "status": "unhealthy",
            "service": "agent_metrics",
            "error": str(exc),
        }

    return {
        "status": "healthy",
        "service": "agent_metrics",
        "agents_tracked": agents_tracked,
        "total_requests": total_requests,
    }