anderson-ufrj
feat: implement comprehensive monitoring and observability stack
c97e35f
raw
history blame
9.34 kB
{
"dashboard": {
"id": null,
"title": "Cidadão.AI - Agent Performance",
"tags": ["cidadao-ai", "agents"],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"graphTooltip": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"refresh": "30s",
"panels": [
{
"id": 1,
"title": "Agent Task Success Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(cidadao_ai_agent_tasks_total{status=\"success\"}[5m])) / sum(rate(cidadao_ai_agent_tasks_total[5m])) * 100",
"legendFormat": "Overall Success Rate"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 90},
{"color": "green", "value": 95}
]
}
}
}
},
{
"id": 2,
"title": "Active Agents",
"type": "stat",
"targets": [
{
"expr": "count(count by (agent_name) (cidadao_ai_agent_tasks_total))",
"legendFormat": "Active Agents"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
}
},
{
"id": 3,
"title": "Average Task Duration",
"type": "stat",
"targets": [
{
"expr": "sum(rate(cidadao_ai_agent_task_duration_seconds_sum[5m])) / sum(rate(cidadao_ai_agent_task_duration_seconds_count[5m]))",
"legendFormat": "Avg Duration"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"steps": [
{"color": "green", "value": 0},
{"color": "yellow", "value": 5},
{"color": "red", "value": 10}
]
}
}
}
},
{
"id": 4,
"title": "Reflection Iterations",
"type": "stat",
"targets": [
{
"expr": "avg(cidadao_ai_agent_reflection_iterations)",
"legendFormat": "Avg Reflections"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
}
},
{
"id": 5,
"title": "Agent Performance by Type",
"type": "graph",
"targets": [
{
"expr": "rate(cidadao_ai_agent_tasks_total{status=\"success\"}[5m])",
"legendFormat": "{{agent_name}} - Success"
},
{
"expr": "rate(cidadao_ai_agent_tasks_total{status=\"error\"}[5m])",
"legendFormat": "{{agent_name}} - Error"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"legend": {
"show": true,
"values": true,
"current": true,
"alignAsTable": true
}
},
{
"id": 6,
"title": "Task Duration by Agent",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le, agent_name) (rate(cidadao_ai_agent_task_duration_seconds_bucket[5m])))",
"legendFormat": "{{agent_name}} - P95"
},
{
"expr": "histogram_quantile(0.50, sum by (le, agent_name) (rate(cidadao_ai_agent_task_duration_seconds_bucket[5m])))",
"legendFormat": "{{agent_name}} - P50"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"yAxes": [
{
"label": "Duration (seconds)",
"min": 0
},
{
"show": false
}
]
},
{
"id": 7,
"title": "Agent Status Distribution",
"type": "piechart",
"targets": [
{
"expr": "sum by (status) (increase(cidadao_ai_agent_tasks_total[$__range]))",
"legendFormat": "{{status}}"
}
],
"gridPos": {
"h": 8,
"w": 8,
"x": 0,
"y": 12
}
},
{
"id": 8,
"title": "Top Performing Agents",
"type": "table",
"targets": [
{
"expr": "topk(10, sum by (agent_name) (increase(cidadao_ai_agent_tasks_total{status=\"success\"}[1h])))",
"legendFormat": "{{agent_name}}"
}
],
"gridPos": {
"h": 8,
"w": 8,
"x": 8,
"y": 12
},
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"agent_name": "Agent",
"Value": "Tasks/hour"
}
}
}
]
},
{
"id": 9,
"title": "Agent Error Distribution",
"type": "table",
"targets": [
{
"expr": "sum by (agent_name, error_type) (increase(cidadao_ai_agent_errors_total[$__range]))",
"legendFormat": "{{agent_name}} - {{error_type}}"
}
],
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 12
}
},
{
"id": 10,
"title": "Zumbi Agent - Anomaly Detection",
"type": "graph",
"targets": [
{
"expr": "rate(cidadao_ai_anomalies_detected_total{agent=\"zumbi\"}[5m])",
"legendFormat": "{{anomaly_type}}"
}
],
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 20
}
},
{
"id": 11,
"title": "Anita Agent - Analysis Accuracy",
"type": "stat",
"targets": [
{
"expr": "avg(cidadao_ai_analysis_accuracy{agent=\"anita\"})",
"legendFormat": "Analysis Accuracy"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 12,
"y": 20
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 12,
"title": "Tiradentes Agent - Report Generation",
"type": "stat",
"targets": [
{
"expr": "sum(increase(cidadao_ai_reports_generated_total{agent=\"tiradentes\"}[1h]))",
"legendFormat": "Reports/hour"
}
],
"gridPos": {
"h": 6,
"w": 6,
"x": 18,
"y": 20
}
},
{
"id": 13,
"title": "Agent Memory Usage",
"type": "graph",
"targets": [
{
"expr": "cidadao_ai_agent_memory_usage_bytes",
"legendFormat": "{{agent_name}}"
}
],
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 26
},
"yAxes": [
{
"label": "Memory (bytes)",
"min": 0
},
{
"show": false
}
]
},
{
"id": 14,
"title": "Agent Communication Matrix",
"type": "heatmap",
"targets": [
{
"expr": "sum by (source_agent, target_agent) (cidadao_ai_agent_messages_total)",
"legendFormat": "{{source_agent}} -> {{target_agent}}"
}
],
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 26
}
},
{
"id": 15,
"title": "Quality Score Trends",
"type": "graph",
"targets": [
{
"refId": "A",
"expr": "avg_over_time(cidadao_ai_agent_quality_score[30m])",
"legendFormat": "{{agent_name}}"
}
],
"gridPos": {
"h": 6,
"w": 24,
"x": 0,
"y": 32
},
"yAxes": [
{
"label": "Quality Score",
"min": 0,
"max": 1
},
{
"show": false
}
],
"alert": {
"conditions": [
{
"evaluator": {
"params": [0.8],
"type": "lt"
},
"operator": {
"type": "and"
},
"query": {
"params": ["A", "5m", "now"]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"for": "3m",
"frequency": "30s",
"handler": 1,
"name": "Agent Quality Score Below Threshold",
"noDataState": "no_data",
"notifications": []
}
}
]
}
}