anderson-ufrj
committed on
Commit · 2395a41
1 Parent(s): 640d7fb
feat(monitoring): configure comprehensive Prometheus alerts and Alertmanager
- Add critical alerts for API availability (trigger when availability drops below 99%)
- Configure latency alerts (warning at 2s, critical at 5s)
- Set up agent failure rate monitoring
- Add investigation backlog and stalled processing alerts
- Configure cache hit rate alerts (trigger when hit rate drops below 70%)
- Monitor circuit breaker states
- Add resource utilization alerts (memory, CPU)
- Configure database connection pool monitoring
- Set up anomaly detection rate alerts
- Add SLO violation alerts for key metrics
- Configure Alertmanager with routing rules
- Set up notification channels (email, Slack, PagerDuty)
- Add inhibition rules to prevent alert storms
- Create alert templates for better formatting
- Update Prometheus config to include new alerts
monitoring/alertmanager/alertmanager.yml
ADDED
@@ -0,0 +1,175 @@
+global:
+  # SMTP configuration for email alerts
+  smtp_smarthost: 'smtp.gmail.com:587'
+  smtp_from: '[email protected]'
+  smtp_auth_username: '[email protected]'
+  smtp_auth_password: 'YOUR_APP_PASSWORD'
+  smtp_require_tls: true
+
+  # Slack webhook URL
+  slack_api_url: 'YOUR_SLACK_WEBHOOK_URL'
+
+  # PagerDuty configuration
+  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
+
+# Templates for alert formatting
+templates:
+  - '/etc/alertmanager/templates/*.tmpl'
+
+# The root route for all alerts
+route:
+  # Default receiver for alerts
+  receiver: 'default'
+
+  # Group alerts by these labels
+  group_by: ['alertname', 'cluster', 'service', 'severity']
+
+  # Wait before sending grouped alerts
+  group_wait: 30s
+
+  # How long to wait before sending new alerts for a group
+  group_interval: 5m
+
+  # How long to wait before repeating alerts
+  repeat_interval: 12h
+
+  # Child routes for specific routing
+  routes:
+    # Critical alerts go to PagerDuty
+    - match:
+        severity: critical
+        pagerduty: 'true'
+      receiver: pagerduty
+      continue: true
+
+    # All critical alerts also go to Slack
+    - match:
+        severity: critical
+      receiver: slack-critical
+
+    # Warning alerts go to Slack
+    - match:
+        severity: warning
+      receiver: slack-warning
+
+    # SLO violations have special handling
+    - match:
+        slo: 'true'
+      receiver: slo-violations
+      group_interval: 1h
+      repeat_interval: 4h
+
+    # ML team alerts
+    - match:
+        team: ml
+      receiver: ml-team
+
+    # Backend team alerts
+    - match:
+        team: backend
+      receiver: backend-team
+
+# Receivers define notification destinations
+receivers:
+  # Default receiver - logs only
+  - name: 'default'
+    webhook_configs:
+      - url: 'http://localhost:9093/api/v1/alerts'
+        send_resolved: true
+
+  # PagerDuty for critical incidents
+  - name: 'pagerduty'
+    pagerduty_configs:
+      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
+        description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
+        details:
+          firing: '{{ .Alerts.Firing | len }}'
+          resolved: '{{ .Alerts.Resolved | len }}'
+          description: '{{ .CommonAnnotations.description }}'
+          runbook: '{{ .CommonAnnotations.runbook_url }}'
+
+  # Slack channels for different severities
+  - name: 'slack-critical'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
+        channel: '#alerts-critical'
+        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
+        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
+        send_resolved: true
+        color: 'danger'
+        actions:
+          - type: button
+            text: 'View Dashboard'
+            url: 'https://grafana.cidadao.ai/d/cidadao-ai-overview'
+          - type: button
+            text: 'Runbook'
+            url: '{{ .CommonAnnotations.runbook_url }}'
+
+  - name: 'slack-warning'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_WARNING_WEBHOOK'
+        channel: '#alerts-warning'
+        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
+        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
+        send_resolved: true
+        color: 'warning'
+
+  # SLO violations channel
+  - name: 'slo-violations'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_SLO_WEBHOOK'
+        channel: '#slo-violations'
+        title: '📊 SLO Violation: {{ .GroupLabels.alertname }}'
+        text: |
+          *Service:* Cidadão.AI Backend
+          *SLO:* {{ .GroupLabels.alertname }}
+          *Current Value:* {{ .CommonAnnotations.description }}
+          *Time:* {{ (index .Alerts 0).StartsAt.Format "15:04:05 MST" }}
+        send_resolved: true
+        color: '#ff9800'
+    email_configs:
+      - to: '[email protected]'
+        headers:
+          Subject: 'SLO Violation: {{ .GroupLabels.alertname }}'
+
+  # Team-specific receivers
+  - name: 'ml-team'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_ML_WEBHOOK'
+        channel: '#ml-alerts'
+        title: '🤖 ML Alert: {{ .GroupLabels.alertname }}'
+        text: '{{ .CommonAnnotations.description }}'
+    email_configs:
+      - to: '[email protected]'
+
+  - name: 'backend-team'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_BACKEND_WEBHOOK'
+        channel: '#backend-alerts'
+        title: '🔧 Backend Alert: {{ .GroupLabels.alertname }}'
+        text: '{{ .CommonAnnotations.description }}'
+    email_configs:
+      - to: '[email protected]'
+
+# Inhibition rules prevent certain alerts when others are firing
+inhibit_rules:
+  # Don't alert on high latency if API is down
+  - source_match:
+      alertname: 'APIDown'
+    target_match:
+      alertname: 'APIHighLatency'
+    equal: ['service']
+
+  # Don't alert on agent failures if all agents are failing
+  - source_match:
+      alertname: 'AllAgentsFailing'
+    target_match:
+      alertname: 'AgentHighFailureRate'
+    equal: ['service']
+
+  # Don't alert on cache hit rate if Redis is down
+  - source_match:
+      alertname: 'RedisDown'
+    target_match:
+      alertname: 'LowCacheHitRate'
+    equal: ['service']
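
Note: the SMTP password, Slack webhook URLs, and PagerDuty service key are committed here as inline placeholders. A minimal sketch of how they could be moved out of the file, assuming a reasonably recent Alertmanager that supports the *_file variants of these fields and assuming the secret paths shown (none of this is part of the commit):

global:
  # read secrets from mounted files instead of storing them in the config
  smtp_auth_password_file: /etc/alertmanager/secrets/smtp_password
  slack_api_url_file: /etc/alertmanager/secrets/slack_webhook

receivers:
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key_file: /etc/alertmanager/secrets/pagerduty_service_key

Routing behavior is unchanged by this: a critical alert labelled pagerduty="true" still pages first and, because of continue: true, falls through to slack-critical as well.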
monitoring/alertmanager/templates/cidadao-ai.tmpl
ADDED
@@ -0,0 +1,82 @@
+{{ define "cidadao.ai.title" }}
+[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }}
+{{ end }}
+
+{{ define "cidadao.ai.text" }}
+{{ range .Alerts }}
+*Alert:* {{ .Labels.alertname }}
+*Severity:* {{ .Labels.severity }}
+*Service:* {{ .Labels.service }}
+*Instance:* {{ .Labels.instance }}
+*Description:* {{ .Annotations.description }}
+*Started:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
+{{ if .Annotations.runbook_url }}*Runbook:* <{{ .Annotations.runbook_url }}|View Runbook>{{ end }}
+{{ end }}
+{{ end }}
+
+{{ define "cidadao.ai.html" }}
+<!DOCTYPE html>
+<html>
+<head>
+<style>
+  body { font-family: Arial, sans-serif; }
+  .alert-box {
+    border: 2px solid #ddd;
+    padding: 10px;
+    margin: 10px 0;
+    border-radius: 5px;
+  }
+  .critical { border-color: #f44336; background-color: #ffebee; }
+  .warning { border-color: #ff9800; background-color: #fff3e0; }
+  .info { border-color: #2196f3; background-color: #e3f2fd; }
+  .resolved { border-color: #4caf50; background-color: #e8f5e9; }
+  h2 { margin-top: 0; }
+  .label { font-weight: bold; }
+  .metadata { color: #666; font-size: 0.9em; }
+</style>
+</head>
+<body>
+<h1>Cidadão.AI Alert Notification</h1>
+
+{{ range .Alerts }}
+<div class="alert-box {{ .Labels.severity }}">
+  <h2>{{ .Labels.alertname }}</h2>
+
+  <p><span class="label">Status:</span> {{ .Status }}</p>
+  <p><span class="label">Severity:</span> {{ .Labels.severity | toUpper }}</p>
+  <p><span class="label">Service:</span> {{ .Labels.service }}</p>
+  <p><span class="label">Instance:</span> {{ .Labels.instance }}</p>
+
+  <p><span class="label">Description:</span><br>
+  {{ .Annotations.description }}</p>
+
+  <div class="metadata">
+    <p><span class="label">Started:</span> {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}</p>
+    {{ if eq .Status "resolved" }}
+    <p><span class="label">Ended:</span> {{ .EndsAt.Format "2006-01-02 15:04:05 MST" }}</p>
+    {{ end }}
+
+    {{ if .Annotations.runbook_url }}
+    <p><span class="label">Runbook:</span> <a href="{{ .Annotations.runbook_url }}">View Documentation</a></p>
+    {{ end }}
+  </div>
+
+  <details>
+    <summary>All Labels</summary>
+    <ul>
+    {{ range $key, $value := .Labels }}
+      <li><strong>{{ $key }}:</strong> {{ $value }}</li>
+    {{ end }}
+    </ul>
+  </details>
+</div>
+{{ end }}
+
+<hr>
+<p style="color: #666; font-size: 0.8em;">
+  This alert was generated by the Cidadão.AI monitoring system.
+  For more information, visit the <a href="https://grafana.cidadao.ai">Grafana Dashboard</a>.
+</p>
+</body>
+</html>
+{{ end }}
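
Note: the file above defines cidadao.ai.title, cidadao.ai.text, and cidadao.ai.html, but the receivers in alertmanager.yml still format their notifications inline. A sketch, not part of this commit, of how a receiver could reference the named templates instead (the Slack receiver shown is the existing slack-critical; the email block is only illustrative):

receivers:
  - name: 'slack-critical'
    slack_configs:
      - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
        channel: '#alerts-critical'
        title: '{{ template "cidadao.ai.title" . }}'
        text: '{{ template "cidadao.ai.text" . }}'
        send_resolved: true
    email_configs:
      - to: '[email protected]'
        html: '{{ template "cidadao.ai.html" . }}'

Because templates: already globs /etc/alertmanager/templates/*.tmpl, the named definitions are available to every receiver without further configuration.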
monitoring/prometheus/alerts.yml
ADDED
@@ -0,0 +1,262 @@
+groups:
+  - name: cidadao_ai_critical_alerts
+    interval: 30s
+    rules:
+      # API Availability Alerts
+      - alert: APIHighErrorRate
+        expr: |
+          (
+            sum(rate(cidadao_ai_http_errors_total[5m]))
+            /
+            sum(rate(cidadao_ai_http_requests_total[5m]))
+          ) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+          team: backend
+        annotations:
+          summary: "High API error rate detected"
+          description: "API error rate is {{ $value | humanizePercentage }} over the last 5 minutes, exceeding the 5% threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-high-error-rate"
+
+      - alert: APIDown
+        expr: up{job="cidadao-ai-backend"} == 0
+        for: 1m
+        labels:
+          severity: critical
+          team: backend
+          pagerduty: "true"
+        annotations:
+          summary: "Cidadão.AI API is down"
+          description: "The API service has been down for more than 1 minute"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-down"
+
+      # Latency Alerts
+      - alert: APIHighLatency
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(cidadao_ai_request_duration_seconds_bucket[5m])) by (le)
+          ) > 2.0
+        for: 5m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "High API latency detected"
+          description: "P95 latency is {{ $value }}s, exceeding the 2s threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-high-latency"
+
+      - alert: APICriticalLatency
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(cidadao_ai_request_duration_seconds_bucket[5m])) by (le)
+          ) > 5.0
+        for: 3m
+        labels:
+          severity: critical
+          team: backend
+          pagerduty: "true"
+        annotations:
+          summary: "Critical API latency detected"
+          description: "P95 latency is {{ $value }}s, exceeding the 5s critical threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-critical-latency"
+
+      # Agent Performance Alerts
+      - alert: AgentHighFailureRate
+        expr: |
+          (
+            sum(rate(cidadao_ai_agent_tasks_total{status="failed"}[5m])) by (agent_name)
+            /
+            sum(rate(cidadao_ai_agent_tasks_total[5m])) by (agent_name)
+          ) > 0.10
+        for: 5m
+        labels:
+          severity: warning
+          team: ml
+        annotations:
+          summary: "Agent {{ $labels.agent_name }} has high failure rate"
+          description: "Agent {{ $labels.agent_name }} failure rate is {{ $value | humanizePercentage }} over the last 5 minutes"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/agent-high-failure-rate"
+
+      - alert: AllAgentsFailing
+        expr: |
+          (
+            sum(rate(cidadao_ai_agent_tasks_total{status="failed"}[5m]))
+            /
+            sum(rate(cidadao_ai_agent_tasks_total[5m]))
+          ) > 0.50
+        for: 2m
+        labels:
+          severity: critical
+          team: ml
+          pagerduty: "true"
+        annotations:
+          summary: "Multiple agents are failing"
+          description: "Overall agent failure rate is {{ $value | humanizePercentage }}, indicating systemic issues"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/all-agents-failing"
+
+      # Investigation Alerts
+      - alert: InvestigationBacklog
+        expr: cidadao_ai_active_investigations > 100
+        for: 10m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "High number of active investigations"
+          description: "{{ $value }} investigations are currently active, indicating potential processing delays"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/investigation-backlog"
+
+      - alert: InvestigationProcessingStalled
+        expr: |
+          rate(cidadao_ai_investigations_total{status="completed"}[10m]) == 0
+          and
+          cidadao_ai_active_investigations > 10
+        for: 15m
+        labels:
+          severity: critical
+          team: backend
+        annotations:
+          summary: "Investigation processing appears to be stalled"
+          description: "No investigations have completed in the last 15 minutes even though more than 10 are still active"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/investigation-stalled"
+
+      # Cache Performance Alerts
+      - alert: LowCacheHitRate
+        expr: |
+          (
+            sum(rate(cidadao_ai_cache_operations_total{result="hit"}[5m]))
+            /
+            sum(rate(cidadao_ai_cache_operations_total[5m]))
+          ) < 0.70
+        for: 10m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "Cache hit rate below threshold"
+          description: "Cache hit rate is {{ $value | humanizePercentage }}, below the 70% threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/low-cache-hit-rate"
+
+      # Circuit Breaker Alerts
+      - alert: CircuitBreakerOpen
+        expr: cidadao_ai_circuit_breaker_state{state="open"} == 1
+        for: 2m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "Circuit breaker {{ $labels.service_name }} is open"
+          description: "Service {{ $labels.service_name }} circuit breaker has been open for more than 2 minutes"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/circuit-breaker-open"
+
+      - alert: MultipleCircuitBreakersOpen
+        expr: |
+          count(cidadao_ai_circuit_breaker_state{state="open"} == 1) >= 3
+        for: 1m
+        labels:
+          severity: critical
+          team: backend
+          pagerduty: "true"
+        annotations:
+          summary: "Multiple circuit breakers are open"
+          description: "{{ $value }} circuit breakers are currently open, indicating widespread service issues"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/multiple-circuit-breakers"
+
+      # Resource Utilization Alerts
+      - alert: HighMemoryUsage
+        expr: |
+          (
+            container_memory_usage_bytes{pod=~"cidadao-ai-.*"}
+            /
+            container_spec_memory_limit_bytes{pod=~"cidadao-ai-.*"}
+          ) > 0.90
+        for: 5m
+        labels:
+          severity: warning
+          team: devops
+        annotations:
+          summary: "High memory usage in {{ $labels.pod }}"
+          description: "Memory usage is {{ $value | humanizePercentage }} of the limit"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/high-memory-usage"
+
+      - alert: HighCPUUsage
+        expr: |
+          (
+            rate(container_cpu_usage_seconds_total{pod=~"cidadao-ai-.*"}[5m])
+          ) > 0.80
+        for: 10m
+        labels:
+          severity: warning
+          team: devops
+        annotations:
+          summary: "High CPU usage in {{ $labels.pod }}"
+          description: "CPU usage is {{ $value | humanizePercentage }} over the last 10 minutes"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/high-cpu-usage"
+
+      # Database Connection Alerts
+      - alert: DatabaseConnectionPoolExhausted
+        expr: |
+          (
+            cidadao_ai_database_connections_in_use
+            /
+            cidadao_ai_database_connections_total
+          ) > 0.90
+        for: 5m
+        labels:
+          severity: critical
+          team: backend
+        annotations:
+          summary: "Database connection pool near exhaustion"
+          description: "{{ $value | humanizePercentage }} of database connections are in use"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/db-connection-pool"
+
+      # Anomaly Detection Alerts
+      - alert: HighAnomalyDetectionRate
+        expr: |
+          sum(increase(cidadao_ai_anomalies_detected_total[1h])) > 100
+        for: 5m
+        labels:
+          severity: warning
+          team: compliance
+        annotations:
+          summary: "Unusually high rate of anomaly detections"
+          description: "{{ $value }} anomalies detected over the last hour, which is unusually high"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/high-anomaly-rate"
+
+      # SLO Violation Alerts
+      - alert: SLOViolationAPIAvailability
+        expr: |
+          (
+            1 - (
+              sum(increase(cidadao_ai_http_errors_total[1h]))
+              /
+              sum(increase(cidadao_ai_http_requests_total[1h]))
+            )
+          ) < 0.99
+        for: 5m
+        labels:
+          severity: critical
+          team: backend
+          slo: "true"
+        annotations:
+          summary: "API Availability SLO violation"
+          description: "API availability is {{ $value | humanizePercentage }}, below the 99% SLO"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/slo-availability"
+
+      - alert: SLOViolationLatency
+        expr: |
+          (
+            sum(increase(cidadao_ai_request_duration_seconds_bucket{le="0.2"}[1h]))
+            /
+            sum(increase(cidadao_ai_request_duration_seconds_count[1h]))
+          ) < 0.95
+        for: 5m
+        labels:
+          severity: warning
+          team: backend
+          slo: "true"
+        annotations:
+          summary: "Latency SLO violation"
+          description: "Only {{ $value | humanizePercentage }} of requests completed under 200ms, below the 95% SLO"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/slo-latency"
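
Note: a rule file of this size is easy to break silently, and promtool can unit-test it. Below is a minimal sketch, assuming a hypothetical tests/alerts_test.yml next to the rule file and an instance label of cidadao-ai:8000 (both assumptions, not part of this commit). It checks that APIDown fires once the backend target has reported down for more than a minute:

# tests/alerts_test.yml (run with: promtool test rules tests/alerts_test.yml)
rule_files:
  - ../alerts.yml

evaluation_interval: 30s

tests:
  - interval: 30s
    input_series:
      # the scrape target reports down from t=0 onwards
      - series: 'up{job="cidadao-ai-backend", instance="cidadao-ai:8000"}'
        values: '0x10'
    alert_rule_test:
      - eval_time: 5m
        alertname: APIDown
        exp_alerts:
          - exp_labels:
              severity: critical
              team: backend
              pagerduty: "true"
              job: cidadao-ai-backend
              instance: cidadao-ai:8000
            exp_annotations:
              summary: "Cidadão.AI API is down"
              description: "The API service has been down for more than 1 minute"
              runbook_url: "https://wiki.cidadao.ai/runbooks/api-down"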
monitoring/prometheus/prometheus.yml
CHANGED
@@ -6,17 +6,16 @@ global:
 
 rule_files:
   - "rules/*.yml"
+  - "alerts.yml"
 
 scrape_configs:
   # Cidadão.AI Backend Application Metrics
   - job_name: 'cidadao-ai-backend'
     static_configs:
-      - targets: ['cidadao-ai:
-    metrics_path: /
+      - targets: ['cidadao-ai:8000', 'localhost:8000']
+    metrics_path: /api/v1/observability/metrics
     scrape_interval: 10s
     scrape_timeout: 5s
-    params:
-      format: ['prometheus']
 
   # Prometheus Self-Monitoring
   - job_name: 'prometheus'
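
Note: this hunk adds alerts.yml to rule_files, but the visible part of prometheus.yml contains no alerting block; loading the rules only makes Prometheus evaluate them, it still has to be told where Alertmanager listens. A minimal sketch of that stanza, assuming Alertmanager is reachable as alertmanager:9093 (the hostname is an assumption, not part of this commit):

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

If this block already exists elsewhere in the file, nothing further is needed.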