feat(monitoring): configure comprehensive Prometheus alerts and Alertmanager
global:
  # SMTP configuration for email alerts
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'YOUR_APP_PASSWORD'
  smtp_require_tls: true
  # Slack webhook URL
  slack_api_url: 'YOUR_SLACK_WEBHOOK_URL'
  # PagerDuty configuration
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
# Templates for alert formatting
templates:
  - '/etc/alertmanager/templates/*.tmpl'
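# The glob above loads custom notification templates written in Go templating.
# A minimal sketch of such a file; the file name and template name are illustrative
# assumptions, not part of this commit:
#
#   # /etc/alertmanager/templates/slack.tmpl
#   {{ define "slack.cidadao.title" }}
#   [{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} ({{ .Alerts.Firing | len }} firing)
#   {{ end }}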
# The root route for all alerts
route:
  # Default receiver for alerts that match no child route
  receiver: 'default'
  # Group alerts into a single notification by these labels
  group_by: ['alertname', 'cluster', 'service', 'severity']
  # How long to wait before sending the first notification for a new group
  group_wait: 30s
  # How long to wait before notifying about new alerts added to an existing group
  group_interval: 5m
  # How long to wait before re-sending a notification that is still firing
  repeat_interval: 12h
  # Child routes for specific routing; the example alert rule after this block shows
  # the labels (severity, pagerduty, slo, team) they match on
  routes:
    # Critical alerts labelled for paging go to PagerDuty...
    - match:
        severity: critical
        pagerduty: 'true'
      receiver: pagerduty
      # ...and keep matching, so they also hit the Slack route below
      continue: true
    # All critical alerts also go to Slack
    - match:
        severity: critical
      receiver: slack-critical
    # Warning alerts go to Slack
    - match:
        severity: warning
      receiver: slack-warning
    # SLO violations have special handling
    - match:
        slo: 'true'
      receiver: slo-violations
      group_interval: 1h
      repeat_interval: 4h
    # ML team alerts
    - match:
        team: ml
      receiver: ml-team
    # Backend team alerts
    - match:
        team: backend
      receiver: backend-team
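# A minimal sketch of a Prometheus alerting rule that carries the labels routed on
# above; the alert name matches the APIDown alert referenced in inhibit_rules below,
# but the expression, job name, threshold and runbook URL are illustrative assumptions:
#
#   groups:
#     - name: api-availability
#       rules:
#         - alert: APIDown
#           expr: up{job="cidadao-ai-backend"} == 0
#           for: 2m
#           labels:
#             severity: critical
#             pagerduty: 'true'
#             team: backend
#             service: api
#           annotations:
#             summary: "API instance {{ $labels.instance }} is down"
#             description: "Prometheus has not scraped {{ $labels.instance }} for 2 minutes."
#             runbook_url: "https://docs.cidadao.ai/runbooks/api-down"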
# Receivers define notification destinations
receivers:
  # Default receiver - logs only
  - name: 'default'
    webhook_configs:
      - url: 'http://localhost:9093/api/v1/alerts'
        send_resolved: true
  # PagerDuty for critical incidents
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
        description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          resolved: '{{ .Alerts.Resolved | len }}'
          description: '{{ .CommonAnnotations.description }}'
          runbook: '{{ .CommonAnnotations.runbook_url }}'
  # Slack channels for different severities
  - name: 'slack-critical'
    slack_configs:
      - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
        channel: '#alerts-critical'
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
        send_resolved: true
        color: 'danger'
        actions:
          - type: button
            text: 'View Dashboard'
            url: 'https://grafana.cidadao.ai/d/cidadao-ai-overview'
          - type: button
            text: 'Runbook'
            url: '{{ .CommonAnnotations.runbook_url }}'
  - name: 'slack-warning'
    slack_configs:
      - api_url: 'YOUR_SLACK_WARNING_WEBHOOK'
        channel: '#alerts-warning'
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
        send_resolved: true
        color: 'warning'
  # SLO violations channel
  - name: 'slo-violations'
    slack_configs:
      - api_url: 'YOUR_SLACK_SLO_WEBHOOK'
        channel: '#slo-violations'
        title: '📊 SLO Violation: {{ .GroupLabels.alertname }}'
        # .StartsAt is only defined per alert, not on the grouped notification data,
        # so take it from the first alert in the group
        text: |
          *Service:* Cidadão.AI Backend
          *SLO:* {{ .GroupLabels.alertname }}
          *Current Value:* {{ .CommonAnnotations.description }}
          *Time:* {{ (index .Alerts 0).StartsAt.Format "15:04:05 MST" }}
        send_resolved: true
        color: '#ff9800'
    email_configs:
      - to: '[email protected]'
        headers:
          Subject: 'SLO Violation: {{ .GroupLabels.alertname }}'
  # Team-specific receivers
  - name: 'ml-team'
    slack_configs:
      - api_url: 'YOUR_SLACK_ML_WEBHOOK'
        channel: '#ml-alerts'
        title: '🤖 ML Alert: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'
    email_configs:
      - to: '[email protected]'
  - name: 'backend-team'
    slack_configs:
      - api_url: 'YOUR_SLACK_BACKEND_WEBHOOK'
        channel: '#backend-alerts'
        title: '🔧 Backend Alert: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'
    email_configs:
      - to: '[email protected]'
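# Prometheus only delivers alerts to these receivers if it is pointed at this
# Alertmanager. A minimal sketch of the corresponding prometheus.yml section; the
# target address and rule-file path are assumptions (default port 9093, same host):
#
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets: ['localhost:9093']
#   rule_files:
#     - 'alerts/*.yml'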
# Inhibition rules prevent certain alerts when others are firing
inhibit_rules:
  # Don't alert on high latency if the API is down
  - source_match:
      alertname: 'APIDown'
    target_match:
      alertname: 'APIHighLatency'
    equal: ['service']
  # Don't alert on agent failures if all agents are failing
  - source_match:
      alertname: 'AllAgentsFailing'
    target_match:
      alertname: 'AgentHighFailureRate'
    equal: ['service']
  # Don't alert on cache hit rate if Redis is down
  - source_match:
      alertname: 'RedisDown'
    target_match:
      alertname: 'LowCacheHitRate'
    equal: ['service']
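# The config can be validated and hot-reloaded without restarting Alertmanager;
# the path below assumes a standard /etc/alertmanager layout:
#
#   amtool check-config /etc/alertmanager/alertmanager.yml
#   curl -X POST http://localhost:9093/-/reload   # or send SIGHUP to the process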