anderson-ufrj committed
Commit 2395a41 · 1 Parent(s): 640d7fb

feat(monitoring): configure comprehensive Prometheus alerts and Alertmanager


- Add critical alerts for API availability (triggers below 99%)
- Configure latency alerts (warning at 2s, critical at 5s)
- Set up agent failure rate monitoring
- Add investigation backlog and stalled processing alerts
- Configure cache hit rate alerts (triggers below 70% hit rate)
- Monitor circuit breaker states
- Add resource utilization alerts (memory, CPU)
- Configure database connection pool monitoring
- Set up anomaly detection rate alerts
- Add SLO violation alerts for key metrics
- Configure Alertmanager with routing rules
- Set up notification channels (email, Slack, PagerDuty)
- Add inhibition rules to prevent alert storms
- Create alert templates for better formatting
- Update Prometheus config to include new alerts

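For orientation, the three files below are what Prometheus and Alertmanager load at runtime. A minimal Docker Compose sketch of how they could be wired together follows; the compose file name, image tags, and mount paths are assumptions for illustration and are not part of this commit.

# docker-compose.monitoring.yml (illustrative sketch, not committed here)
services:
  prometheus:
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    volumes:
      # mounts monitoring/prometheus/prometheus.yml and alerts.yml
      - ./monitoring/prometheus:/etc/prometheus
    ports:
      - '9090:9090'

  alertmanager:
    image: prom/alertmanager:latest
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    volumes:
      # mounts alertmanager.yml and templates/*.tmpl
      - ./monitoring/alertmanager:/etc/alertmanager
    ports:
      - '9093:9093'
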
monitoring/alertmanager/alertmanager.yml ADDED
@@ -0,0 +1,175 @@
+ global:
+   # SMTP configuration for email alerts
+   smtp_smarthost: 'smtp.gmail.com:587'
+   smtp_from: '[email protected]'
+   smtp_auth_username: '[email protected]'
+   smtp_auth_password: 'YOUR_APP_PASSWORD'
+   smtp_require_tls: true
+
+   # Slack webhook URL
+   slack_api_url: 'YOUR_SLACK_WEBHOOK_URL'
+
+   # PagerDuty configuration
+   pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
+
+ # Templates for alert formatting
+ templates:
+   - '/etc/alertmanager/templates/*.tmpl'
+
+ # The root route for all alerts
+ route:
+   # Default receiver for alerts
+   receiver: 'default'
+
+   # Group alerts by these labels
+   group_by: ['alertname', 'cluster', 'service', 'severity']
+
+   # Wait before sending grouped alerts
+   group_wait: 30s
+
+   # How long to wait before sending new alerts for a group
+   group_interval: 5m
+
+   # How long to wait before repeating alerts
+   repeat_interval: 12h
+
+   # Child routes for specific routing
+   routes:
+     # Critical alerts go to PagerDuty
+     - match:
+         severity: critical
+         pagerduty: 'true'
+       receiver: pagerduty
+       continue: true
+
+     # All critical alerts also go to Slack
+     - match:
+         severity: critical
+       receiver: slack-critical
+
+     # Warning alerts go to Slack
+     - match:
+         severity: warning
+       receiver: slack-warning
+
+     # SLO violations have special handling
+     - match:
+         slo: 'true'
+       receiver: slo-violations
+       group_interval: 1h
+       repeat_interval: 4h
+
+     # ML team alerts
+     - match:
+         team: ml
+       receiver: ml-team
+
+     # Backend team alerts
+     - match:
+         team: backend
+       receiver: backend-team
+
+ # Receivers define notification destinations
+ receivers:
+   # Default receiver - logs only
+   - name: 'default'
+     webhook_configs:
+       - url: 'http://localhost:9093/api/v1/alerts'
+         send_resolved: true
+
+   # PagerDuty for critical incidents
+   - name: 'pagerduty'
+     pagerduty_configs:
+       - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
+         description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
+         details:
+           firing: '{{ .Alerts.Firing | len }}'
+           resolved: '{{ .Alerts.Resolved | len }}'
+           description: '{{ .CommonAnnotations.description }}'
+           runbook: '{{ .CommonAnnotations.runbook_url }}'
+
+   # Slack channels for different severities
+   - name: 'slack-critical'
+     slack_configs:
+       - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
+         channel: '#alerts-critical'
+         title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
+         text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
+         send_resolved: true
+         color: 'danger'
+         actions:
+           - type: button
+             text: 'View Dashboard'
+             url: 'https://grafana.cidadao.ai/d/cidadao-ai-overview'
+           - type: button
+             text: 'Runbook'
+             url: '{{ .CommonAnnotations.runbook_url }}'
+
+   - name: 'slack-warning'
+     slack_configs:
+       - api_url: 'YOUR_SLACK_WARNING_WEBHOOK'
+         channel: '#alerts-warning'
+         title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
+         text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
+         send_resolved: true
+         color: 'warning'
+
+   # SLO violations channel
+   - name: 'slo-violations'
+     slack_configs:
+       - api_url: 'YOUR_SLACK_SLO_WEBHOOK'
+         channel: '#slo-violations'
+         title: '📊 SLO Violation: {{ .GroupLabels.alertname }}'
+         text: |
+           *Service:* Cidadão.AI Backend
+           *SLO:* {{ .GroupLabels.alertname }}
+           *Current Value:* {{ .CommonAnnotations.description }}
+           *Time:* {{ (index .Alerts 0).StartsAt.Format "15:04:05 MST" }}
+         send_resolved: true
+         color: '#ff9800'
+     email_configs:
+       - to: '[email protected]'
+         headers:
+           Subject: 'SLO Violation: {{ .GroupLabels.alertname }}'
+
+   # Team-specific receivers
+   - name: 'ml-team'
+     slack_configs:
+       - api_url: 'YOUR_SLACK_ML_WEBHOOK'
+         channel: '#ml-alerts'
+         title: '🤖 ML Alert: {{ .GroupLabels.alertname }}'
+         text: '{{ .CommonAnnotations.description }}'
+     email_configs:
+       - to: '[email protected]'
+
+   - name: 'backend-team'
+     slack_configs:
+       - api_url: 'YOUR_SLACK_BACKEND_WEBHOOK'
+         channel: '#backend-alerts'
+         title: '🔧 Backend Alert: {{ .GroupLabels.alertname }}'
+         text: '{{ .CommonAnnotations.description }}'
+     email_configs:
+       - to: '[email protected]'
+
+ # Inhibition rules prevent certain alerts when others are firing
+ inhibit_rules:
+   # Don't alert on high latency if API is down
+   - source_match:
+       alertname: 'APIDown'
+     target_match:
+       alertname: 'APIHighLatency'
+     equal: ['service']
+
+   # Don't alert on agent failures if all agents are failing
+   - source_match:
+       alertname: 'AllAgentsFailing'
+     target_match:
+       alertname: 'AgentHighFailureRate'
+     equal: ['service']
+
+   # Don't alert on cache hit rate if Redis is down
+   - source_match:
+       alertname: 'RedisDown'
+     target_match:
+       alertname: 'LowCacheHitRate'
+     equal: ['service']
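
The placeholders above (YOUR_APP_PASSWORD, the webhook URLs, YOUR_PAGERDUTY_SERVICE_KEY) should not ship in a committed file. Recent Alertmanager releases accept *_file variants for most of these secrets; a minimal sketch, assuming a version that supports those fields and hypothetical secret paths:

global:
  smtp_auth_password_file: '/etc/alertmanager/secrets/smtp_password'   # hypothetical path
  slack_api_url_file: '/etc/alertmanager/secrets/slack_webhook'        # hypothetical path

receivers:
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key_file: '/etc/alertmanager/secrets/pagerduty_key'    # hypothetical path
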
monitoring/alertmanager/templates/cidadao-ai.tmpl ADDED
@@ -0,0 +1,82 @@
+ {{ define "cidadao.ai.title" }}
+ [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }}
+ {{ end }}
+
+ {{ define "cidadao.ai.text" }}
+ {{ range .Alerts }}
+ *Alert:* {{ .Labels.alertname }}
+ *Severity:* {{ .Labels.severity }}
+ *Service:* {{ .Labels.service }}
+ *Instance:* {{ .Labels.instance }}
+ *Description:* {{ .Annotations.description }}
+ *Started:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
+ {{ if .Annotations.runbook_url }}*Runbook:* <{{ .Annotations.runbook_url }}|View Runbook>{{ end }}
+ {{ end }}
+ {{ end }}
+
+ {{ define "cidadao.ai.html" }}
+ <!DOCTYPE html>
+ <html>
+ <head>
+   <style>
+     body { font-family: Arial, sans-serif; }
+     .alert-box {
+       border: 2px solid #ddd;
+       padding: 10px;
+       margin: 10px 0;
+       border-radius: 5px;
+     }
+     .critical { border-color: #f44336; background-color: #ffebee; }
+     .warning { border-color: #ff9800; background-color: #fff3e0; }
+     .info { border-color: #2196f3; background-color: #e3f2fd; }
+     .resolved { border-color: #4caf50; background-color: #e8f5e9; }
+     h2 { margin-top: 0; }
+     .label { font-weight: bold; }
+     .metadata { color: #666; font-size: 0.9em; }
+   </style>
+ </head>
+ <body>
+   <h1>Cidadão.AI Alert Notification</h1>
+
+   {{ range .Alerts }}
+   <div class="alert-box {{ .Labels.severity }}">
+     <h2>{{ .Labels.alertname }}</h2>
+
+     <p><span class="label">Status:</span> {{ .Status }}</p>
+     <p><span class="label">Severity:</span> {{ .Labels.severity | toUpper }}</p>
+     <p><span class="label">Service:</span> {{ .Labels.service }}</p>
+     <p><span class="label">Instance:</span> {{ .Labels.instance }}</p>
+
+     <p><span class="label">Description:</span><br>
+     {{ .Annotations.description }}</p>
+
+     <div class="metadata">
+       <p><span class="label">Started:</span> {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}</p>
+       {{ if not .EndsAt.IsZero }}
+       <p><span class="label">Ended:</span> {{ .EndsAt.Format "2006-01-02 15:04:05 MST" }}</p>
+       {{ end }}
+
+       {{ if .Annotations.runbook_url }}
+       <p><span class="label">Runbook:</span> <a href="{{ .Annotations.runbook_url }}">View Documentation</a></p>
+       {{ end }}
+     </div>
+
+     <details>
+       <summary>All Labels</summary>
+       <ul>
+         {{ range $key, $value := .Labels }}
+         <li><strong>{{ $key }}:</strong> {{ $value }}</li>
+         {{ end }}
+       </ul>
+     </details>
+   </div>
+   {{ end }}
+
+   <hr>
+   <p style="color: #666; font-size: 0.8em;">
+     This alert was generated by the Cidadão.AI monitoring system.
+     For more information, visit the <a href="https://grafana.cidadao.ai">Grafana Dashboard</a>.
+   </p>
+ </body>
+ </html>
+ {{ end }}
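
The alertmanager.yml above builds Slack titles and bodies inline, so these named templates only take effect if a receiver references them. A minimal sketch of how the slack-critical receiver could call them instead (the e-mail address is a placeholder, not from this commit):

- name: 'slack-critical'
  slack_configs:
    - channel: '#alerts-critical'
      title: '{{ template "cidadao.ai.title" . }}'
      text: '{{ template "cidadao.ai.text" . }}'
      send_resolved: true
  email_configs:
    - to: 'oncall@example.org'                     # placeholder address
      html: '{{ template "cidadao.ai.html" . }}'
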
monitoring/prometheus/alerts.yml ADDED
@@ -0,0 +1,262 @@
+ groups:
+   - name: cidadao_ai_critical_alerts
+     interval: 30s
+     rules:
+       # API Availability Alerts
+       - alert: APIHighErrorRate
+         expr: |
+           (
+             sum(rate(cidadao_ai_http_errors_total[5m]))
+             /
+             sum(rate(cidadao_ai_http_requests_total[5m]))
+           ) > 0.05
+         for: 5m
+         labels:
+           severity: critical
+           team: backend
+         annotations:
+           summary: "High API error rate detected"
+           description: "API error rate is {{ $value | humanizePercentage }} over the last 5 minutes, exceeding the 5% threshold"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/api-high-error-rate"
+
+       - alert: APIDown
+         expr: up{job="cidadao-ai-backend"} == 0
+         for: 1m
+         labels:
+           severity: critical
+           team: backend
+           pagerduty: "true"
+         annotations:
+           summary: "Cidadão.AI API is down"
+           description: "The API service has been down for more than 1 minute"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/api-down"
+
+       # Latency Alerts
+       - alert: APIHighLatency
+         expr: |
+           histogram_quantile(0.95,
+             sum(rate(cidadao_ai_request_duration_seconds_bucket[5m])) by (le)
+           ) > 2.0
+         for: 5m
+         labels:
+           severity: warning
+           team: backend
+         annotations:
+           summary: "High API latency detected"
+           description: "P95 latency is {{ $value }}s, exceeding the 2s threshold"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/api-high-latency"
+
+       - alert: APICriticalLatency
+         expr: |
+           histogram_quantile(0.95,
+             sum(rate(cidadao_ai_request_duration_seconds_bucket[5m])) by (le)
+           ) > 5.0
+         for: 3m
+         labels:
+           severity: critical
+           team: backend
+           pagerduty: "true"
+         annotations:
+           summary: "Critical API latency detected"
+           description: "P95 latency is {{ $value }}s, exceeding the 5s critical threshold"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/api-critical-latency"
+
+       # Agent Performance Alerts
+       - alert: AgentHighFailureRate
+         expr: |
+           (
+             sum(rate(cidadao_ai_agent_tasks_total{status="failed"}[5m])) by (agent_name)
+             /
+             sum(rate(cidadao_ai_agent_tasks_total[5m])) by (agent_name)
+           ) > 0.10
+         for: 5m
+         labels:
+           severity: warning
+           team: ml
+         annotations:
+           summary: "Agent {{ $labels.agent_name }} has high failure rate"
+           description: "Agent {{ $labels.agent_name }} failure rate is {{ $value | humanizePercentage }} over the last 5 minutes"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/agent-high-failure-rate"
+
+       - alert: AllAgentsFailing
+         expr: |
+           (
+             sum(rate(cidadao_ai_agent_tasks_total{status="failed"}[5m]))
+             /
+             sum(rate(cidadao_ai_agent_tasks_total[5m]))
+           ) > 0.50
+         for: 2m
+         labels:
+           severity: critical
+           team: ml
+           pagerduty: "true"
+         annotations:
+           summary: "Multiple agents are failing"
+           description: "Overall agent failure rate is {{ $value | humanizePercentage }}, indicating systemic issues"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/all-agents-failing"
+
+       # Investigation Alerts
+       - alert: InvestigationBacklog
+         expr: cidadao_ai_active_investigations > 100
+         for: 10m
+         labels:
+           severity: warning
+           team: backend
+         annotations:
+           summary: "High number of active investigations"
+           description: "{{ $value }} investigations are currently active, indicating potential processing delays"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/investigation-backlog"
+
+       - alert: InvestigationProcessingStalled
+         expr: |
+           rate(cidadao_ai_investigations_total{status="completed"}[10m]) == 0
+           and
+           cidadao_ai_active_investigations > 10
+         for: 15m
+         labels:
+           severity: critical
+           team: backend
+         annotations:
+           summary: "Investigation processing appears to be stalled"
+           description: "No investigations completed in the last 15 minutes even though more than 10 investigations are active"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/investigation-stalled"
+
+       # Cache Performance Alerts
+       - alert: LowCacheHitRate
+         expr: |
+           (
+             sum(rate(cidadao_ai_cache_operations_total{result="hit"}[5m]))
+             /
+             sum(rate(cidadao_ai_cache_operations_total[5m]))
+           ) < 0.70
+         for: 10m
+         labels:
+           severity: warning
+           team: backend
+         annotations:
+           summary: "Cache hit rate below threshold"
+           description: "Cache hit rate is {{ $value | humanizePercentage }}, below the 70% threshold"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/low-cache-hit-rate"
+
+       # Circuit Breaker Alerts
+       - alert: CircuitBreakerOpen
+         expr: cidadao_ai_circuit_breaker_state{state="open"} == 1
+         for: 2m
+         labels:
+           severity: warning
+           team: backend
+         annotations:
+           summary: "Circuit breaker {{ $labels.service_name }} is open"
+           description: "Service {{ $labels.service_name }} circuit breaker has been open for more than 2 minutes"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/circuit-breaker-open"
+
+       - alert: MultipleCircuitBreakersOpen
+         expr: |
+           count(cidadao_ai_circuit_breaker_state{state="open"} == 1) >= 3
+         for: 1m
+         labels:
+           severity: critical
+           team: backend
+           pagerduty: "true"
+         annotations:
+           summary: "Multiple circuit breakers are open"
+           description: "{{ $value }} circuit breakers are currently open, indicating widespread service issues"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/multiple-circuit-breakers"
+
+       # Resource Utilization Alerts
+       - alert: HighMemoryUsage
+         expr: |
+           (
+             container_memory_usage_bytes{pod=~"cidadao-ai-.*"}
+             /
+             container_spec_memory_limit_bytes{pod=~"cidadao-ai-.*"}
+           ) > 0.90
+         for: 5m
+         labels:
+           severity: warning
+           team: devops
+         annotations:
+           summary: "High memory usage in {{ $labels.pod }}"
+           description: "Memory usage is {{ $value | humanizePercentage }} of the limit"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/high-memory-usage"
+
+       - alert: HighCPUUsage
+         expr: |
+           (
+             rate(container_cpu_usage_seconds_total{pod=~"cidadao-ai-.*"}[5m])
+           ) > 0.80
+         for: 10m
+         labels:
+           severity: warning
+           team: devops
+         annotations:
+           summary: "High CPU usage in {{ $labels.pod }}"
+           description: "CPU usage is {{ $value | humanizePercentage }} over the last 10 minutes"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/high-cpu-usage"
+
+       # Database Connection Alerts
+       - alert: DatabaseConnectionPoolExhausted
+         expr: |
+           (
+             cidadao_ai_database_connections_in_use
+             /
+             cidadao_ai_database_connections_total
+           ) > 0.90
+         for: 5m
+         labels:
+           severity: critical
+           team: backend
+         annotations:
+           summary: "Database connection pool near exhaustion"
+           description: "{{ $value | humanizePercentage }} of database connections are in use"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/db-connection-pool"
+
+       # Anomaly Detection Alerts
+       - alert: HighAnomalyDetectionRate
+         expr: |
+           sum(increase(cidadao_ai_anomalies_detected_total[1h])) > 100
+         for: 5m
+         labels:
+           severity: warning
+           team: compliance
+         annotations:
+           summary: "Unusually high rate of anomaly detections"
+           description: "{{ $value }} anomalies detected in the last hour, which is unusually high"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/high-anomaly-rate"
+
+       # SLO Violation Alerts
+       - alert: SLOViolationAPIAvailability
+         expr: |
+           (
+             1 - (
+               sum(increase(cidadao_ai_http_errors_total[1h]))
+               /
+               sum(increase(cidadao_ai_http_requests_total[1h]))
+             )
+           ) < 0.99
+         for: 5m
+         labels:
+           severity: critical
+           team: backend
+           slo: "true"
+         annotations:
+           summary: "API Availability SLO violation"
+           description: "API availability is {{ $value | humanizePercentage }}, below the 99% SLO"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/slo-availability"
+
+       - alert: SLOViolationLatency
+         expr: |
+           (
+             sum(increase(cidadao_ai_request_duration_seconds_bucket{le="0.2"}[1h]))
+             /
+             sum(increase(cidadao_ai_request_duration_seconds_count[1h]))
+           ) < 0.95
+         for: 5m
+         labels:
+           severity: warning
+           team: backend
+           slo: "true"
+         annotations:
+           summary: "Latency SLO violation"
+           description: "Only {{ $value | humanizePercentage }} of requests completed under 200ms, below the 95% SLO"
+           runbook_url: "https://wiki.cidadao.ai/runbooks/slo-latency"
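
These rules can be checked offline with promtool (promtool check rules, promtool test rules). A small unit-test sketch for the APIDown rule; the test file name, the instance label, and the series values are illustrative, not part of this commit:

# tests/alerts_test.yml (illustrative)
rule_files:
  - ../alerts.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # target scrapeable for two minutes, then down
      - series: 'up{job="cidadao-ai-backend", instance="cidadao-ai:8000"}'
        values: '1 1 0 0 0 0'
    alert_rule_test:
      - eval_time: 5m
        alertname: APIDown
        exp_alerts:
          - exp_labels:
              job: cidadao-ai-backend
              instance: cidadao-ai:8000
              severity: critical
              team: backend
              pagerduty: "true"
            exp_annotations:
              summary: "Cidadão.AI API is down"
              description: "The API service has been down for more than 1 minute"
              runbook_url: "https://wiki.cidadao.ai/runbooks/api-down"
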
monitoring/prometheus/prometheus.yml CHANGED
@@ -6,17 +6,16 @@ global:
 
 rule_files:
   - "rules/*.yml"
+   - "alerts.yml"
 
 scrape_configs:
   # Cidadão.AI Backend Application Metrics
   - job_name: 'cidadao-ai-backend'
     static_configs:
-       - targets: ['cidadao-ai:7860']
-     metrics_path: /health/metrics
+       - targets: ['cidadao-ai:8000', 'localhost:8000']
+     metrics_path: /api/v1/observability/metrics
     scrape_interval: 10s
     scrape_timeout: 5s
-     params:
-       format: ['prometheus']
 
   # Prometheus Self-Monitoring
   - job_name: 'prometheus'
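
The hunk above registers alerts.yml as a rule file, but the rules only reach the receivers configured in alertmanager.yml if prometheus.yml also contains an alerting block pointing at Alertmanager, which this diff does not show. A minimal sketch, assuming Alertmanager is reachable at alertmanager:9093:

# prometheus.yml (sketch of the alerting section, not shown in this diff)
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']   # assumed hostname and default port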