anderson-ufrj
feat: implement comprehensive monitoring and observability stack
c97e35f
raw
history blame
10.2 kB
{
"dashboard": {
"id": null,
"title": "Cidadão.AI - System Overview",
"tags": ["cidadao-ai", "overview"],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"graphTooltip": 1,
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
"time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"]
},
"refresh": "30s",
"panels": [
{
"id": 1,
"title": "System Health Status",
"type": "stat",
"targets": [
{
"expr": "up{job=\"cidadao-ai-backend\"}",
"legendFormat": "Service Status"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Active Investigations",
"type": "stat",
"targets": [
{
"expr": "cidadao_ai_investigations_total{status=\"active\"}",
"legendFormat": "Active Count"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
}
},
{
"id": 3,
"title": "API Response Time P95",
"type": "stat",
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le) (rate(cidadao_ai_request_duration_seconds_bucket[5m])))",
"legendFormat": "P95 Latency"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 1},
{"color": "red", "value": 2}
]
}
}
}
},
{
"id": 4,
"title": "Anomalies Detected (24h)",
"type": "stat",
"targets": [
{
"expr": "sum(increase(cidadao_ai_anomalies_detected_total[24h]))",
"legendFormat": "Anomalies"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 18,
"y": 0
}
},
{
"id": 5,
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(cidadao_ai_request_duration_seconds_count[5m])",
"legendFormat": "Requests/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"yAxes": [
{
"label": "Requests/sec",
"min": 0
},
{
"show": false
}
]
},
{
"id": 6,
"title": "Agent Tasks Performance",
"type": "graph",
"targets": [
{
"expr": "rate(cidadao_ai_agent_tasks_total[5m])",
"legendFormat": "{{agent_name}} - {{status}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"legend": {
"show": true,
"values": true,
"current": true
}
},
{
"id": 7,
"title": "SLO Compliance",
"type": "table",
"targets": [
{
"expr": "cidadao_ai_slo_compliance_percentage",
"legendFormat": "{{slo_name}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 12
},
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {},
"indexByName": {},
"renameByName": {
"slo_name": "SLO",
"Value": "Compliance %"
}
}
}
],
"fieldConfig": {
"defaults": {
"custom": {
"align": "auto"
},
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 95},
{"color": "green", "value": 99}
]
}
}
}
},
{
"id": 8,
"title": "Error Budget Consumption",
"type": "graph",
"targets": [
{
"expr": "cidadao_ai_error_budget_consumed_percentage",
"legendFormat": "{{slo_name}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 12
},
"yAxes": [
{
"label": "Budget Consumed %",
"max": 100,
"min": 0
},
{
"show": false
}
],
"alert": {
"conditions": [
{
"evaluator": {
"params": [80],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": ["A", "5m", "now"]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"for": "2m",
"frequency": "10s",
"handler": 1,
"name": "High Error Budget Consumption",
"noDataState": "no_data",
"notifications": []
}
},
{
"id": 9,
"title": "Database Connection Pool",
"type": "graph",
"targets": [
{
"expr": "cidadao_ai_database_connections_active",
"legendFormat": "Active Connections"
},
{
"expr": "cidadao_ai_database_connections_total",
"legendFormat": "Total Pool Size"
}
],
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 20
}
},
{
"id": 10,
"title": "Cache Hit Rate",
"type": "stat",
"targets": [
{
"expr": "sum(rate(cidadao_ai_cache_hits_total[5m])) / (sum(rate(cidadao_ai_cache_hits_total[5m])) + sum(rate(cidadao_ai_cache_misses_total[5m]))) * 100",
"legendFormat": "Hit Rate %"
}
],
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 20
},
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 70},
{"color": "green", "value": 80}
]
}
}
}
},
{
"id": 11,
"title": "External API Health",
"type": "table",
"targets": [
{
"expr": "up{job=\"external-apis\"}",
"legendFormat": "{{instance}}"
}
],
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 20
}
},
{
"id": 12,
"title": "Investigation Success Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(cidadao_ai_investigations_total{status=\"completed\"}[5m])) / sum(rate(cidadao_ai_investigations_total[5m])) * 100",
"legendFormat": "Success Rate %"
}
],
"gridPos": {
"h": 6,
"w": 12,
"x": 0,
"y": 26
},
"yAxes": [
{
"label": "Success Rate %",
"max": 100,
"min": 0
},
{
"show": false
}
]
},
{
"id": 13,
"title": "Top Anomaly Types",
"type": "piechart",
"targets": [
{
"expr": "topk(5, sum by (anomaly_type) (cidadao_ai_anomalies_detected_total))",
"legendFormat": "{{anomaly_type}}"
}
],
"gridPos": {
"h": 6,
"w": 12,
"x": 12,
"y": 26
}
},
{
"id": 14,
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "process_resident_memory_bytes{job=\"cidadao-ai-backend\"}",
"legendFormat": "Memory Usage"
}
],
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 32
},
"yAxes": [
{
"label": "Bytes",
"logBase": 1
},
{
"show": false
}
]
},
{
"id": 15,
"title": "CPU Usage",
"type": "graph",
"targets": [
{
"expr": "rate(process_cpu_seconds_total{job=\"cidadao-ai-backend\"}[5m]) * 100",
"legendFormat": "CPU Usage %"
}
],
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 32
},
"yAxes": [
{
"label": "CPU %",
"max": 100,
"min": 0
},
{
"show": false
}
]
},
{
"id": 16,
"title": "Alert Status",
"type": "table",
"targets": [
{
"expr": "ALERTS{alertstate=\"firing\"}",
"legendFormat": "{{alertname}}"
}
],
"gridPos": {
"h": 6,
"w": 8,
"x": 16,
"y": 32
},
"transformations": [
{
"id": "organize",
"options": {
"renameByName": {
"alertname": "Alert",
"severity": "Severity",
"instance": "Instance"
}
}
}
]
}
]
}
}