---
# Defaults shared by the receivers defined in this file.
global:
  # Outgoing e-mail via SMTP with TLS required.
  # NOTE(review): the from/username addresses look redacted and the password
  # is a placeholder - supply real values before deploying.
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_require_tls: true
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'YOUR_APP_PASSWORD'

  # Slack webhook URL (placeholder value - replace with the real webhook).
  slack_api_url: 'YOUR_SLACK_WEBHOOK_URL'

  # PagerDuty events endpoint.
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

# Notification template files, selected by glob pattern.
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree. Alerts that match no child route are delivered to the
# root receiver ('default').
route:
  receiver: 'default'
  group_by: ['alertname', 'cluster', 'service', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h

  # Child routes are evaluated top to bottom. Evaluation stops at the first
  # match unless that route sets `continue: true`.
  # NOTE(review): `match` is deprecated in recent Alertmanager releases in
  # favour of `matchers`; kept here for compatibility with older versions.
  routes:
    # Critical alerts explicitly labelled pagerduty='true' page on-call and
    # then fall through so they are also posted to Slack.
    - match:
        severity: critical
        pagerduty: 'true'
      receiver: pagerduty
      continue: true

    # Severity channels. `continue: true` is required here: without it every
    # critical/warning alert would stop at these routes and the SLO and team
    # routes below could never match such alerts.
    - match:
        severity: critical
      receiver: slack-critical
      continue: true

    - match:
        severity: warning
      receiver: slack-warning
      continue: true

    # SLO violations are batched and repeated on their own, slower schedule.
    - match:
        slo: 'true'
      receiver: slo-violations
      group_interval: 1h
      repeat_interval: 4h

    # Team-specific routing.
    - match:
        team: ml
      receiver: ml-team

    - match:
        team: backend
      receiver: backend-team

# Notification targets referenced by name from the routing tree.
receivers:
  # Fallback receiver used by the root route.
  # NOTE(review): this URL targets localhost:9093 with an /api/v1/alerts path,
  # which looks like Alertmanager's own API rather than a webhook consumer -
  # confirm the intended endpoint.
  - name: 'default'
    webhook_configs:
      - url: 'http://localhost:9093/api/v1/alerts'
        send_resolved: true

  # On-call paging (service key is a placeholder).
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
        description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          resolved: '{{ .Alerts.Resolved | len }}'
          description: '{{ .CommonAnnotations.description }}'
          runbook: '{{ .CommonAnnotations.runbook_url }}'

  # Critical alerts: Slack message with dashboard and runbook buttons.
  - name: 'slack-critical'
    slack_configs:
      - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
        channel: '#alerts-critical'
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
        send_resolved: true
        color: 'danger'
        actions:
          - type: button
            text: 'View Dashboard'
            url: 'https://grafana.cidadao.ai/d/cidadao-ai-overview'
          - type: button
            text: 'Runbook'
            url: '{{ .CommonAnnotations.runbook_url }}'

  # Warning alerts: Slack only.
  - name: 'slack-warning'
    slack_configs:
      - api_url: 'YOUR_SLACK_WARNING_WEBHOOK'
        channel: '#alerts-warning'
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
        send_resolved: true
        color: 'warning'

  # SLO violations: Slack message plus e-mail.
  - name: 'slo-violations'
    slack_configs:
      - api_url: 'YOUR_SLACK_SLO_WEBHOOK'
        channel: '#slo-violations'
        title: '📊 SLO Violation: {{ .GroupLabels.alertname }}'
        # StartsAt is a field of an individual alert, not of the top-level
        # notification data, so it is read from the first alert in the group.
        text: |
          *Service:* Cidadão.AI Backend
          *SLO:* {{ .GroupLabels.alertname }}
          *Current Value:* {{ .CommonAnnotations.description }}
          *Time:* {{ with index .Alerts 0 }}{{ .StartsAt.Format "15:04:05 MST" }}{{ end }}
        send_resolved: true
        color: '#ff9800'
    email_configs:
      # NOTE(review): recipient address looks redacted - restore the real one.
      - to: '[email protected]'
        headers:
          Subject: 'SLO Violation: {{ .GroupLabels.alertname }}'

  # ML team: Slack plus e-mail.
  - name: 'ml-team'
    slack_configs:
      - api_url: 'YOUR_SLACK_ML_WEBHOOK'
        channel: '#ml-alerts'
        title: '🤖 ML Alert: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'
    email_configs:
      # NOTE(review): recipient address looks redacted - restore the real one.
      - to: '[email protected]'

  # Backend team: Slack plus e-mail.
  - name: 'backend-team'
    slack_configs:
      - api_url: 'YOUR_SLACK_BACKEND_WEBHOOK'
        channel: '#backend-alerts'
        title: '🔧 Backend Alert: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'
    email_configs:
      # NOTE(review): recipient address looks redacted - restore the real one.
      - to: '[email protected]'

# Inhibition rules: each entry pairs a source alert with a target alert and
# requires the `service` label to be equal on both.
inhibit_rules:
  # APIDown (source) / APIHighLatency (target).
  - source_match:
      alertname: 'APIDown'
    target_match:
      alertname: 'APIHighLatency'
    equal:
      - 'service'

  # AllAgentsFailing (source) / AgentHighFailureRate (target).
  - source_match:
      alertname: 'AllAgentsFailing'
    target_match:
      alertname: 'AgentHighFailureRate'
    equal:
      - 'service'

  # RedisDown (source) / LowCacheHitRate (target).
  - source_match:
      alertname: 'RedisDown'
    target_match:
      alertname: 'LowCacheHitRate'
    equal:
      - 'service'