feat(monitoring): configure comprehensive Prometheus alerts and Alertmanager
global:
  # SMTP configuration for email alerts
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'YOUR_APP_PASSWORD'
  smtp_require_tls: true
  # Slack webhook URL
  slack_api_url: 'YOUR_SLACK_WEBHOOK_URL'
  # PagerDuty configuration
  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
# Templates for alert formatting
templates:
  - '/etc/alertmanager/templates/*.tmpl'
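# The glob above loads custom notification templates written in Go templating.
# A minimal sketch of such a file; the file name and template name are illustrative
# assumptions, not part of this commit:
#
#   # /etc/alertmanager/templates/slack.tmpl
#   {{ define "slack.cidadao.title" }}
#   [{{ .Status | toUpper }}] {{ .GroupLabels.alertname }} ({{ .Alerts.Firing | len }} firing)
#   {{ end }}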
# The root route for all alerts
route:
  # Default receiver for alerts that match no child route
  receiver: 'default'
  # Group alerts into a single notification by these labels
  group_by: ['alertname', 'cluster', 'service', 'severity']
  # How long to wait before sending the first notification for a new group
  group_wait: 30s
  # How long to wait before notifying about new alerts added to an existing group
  group_interval: 5m
  # How long to wait before re-sending a notification that is still firing
  repeat_interval: 12h
  # Child routes for specific routing; the example alert rule after this block shows
  # the labels (severity, pagerduty, slo, team) they match on
  routes:
    # Critical alerts labelled for paging go to PagerDuty...
    - match:
        severity: critical
        pagerduty: 'true'
      receiver: pagerduty
      # ...and keep matching, so they also hit the Slack route below
      continue: true
    # All critical alerts also go to Slack
    - match:
        severity: critical
      receiver: slack-critical
    # Warning alerts go to Slack
    - match:
        severity: warning
      receiver: slack-warning
    # SLO violations have special handling
    - match:
        slo: 'true'
      receiver: slo-violations
      group_interval: 1h
      repeat_interval: 4h
    # ML team alerts
    - match:
        team: ml
      receiver: ml-team
    # Backend team alerts
    - match:
        team: backend
      receiver: backend-team
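# A minimal sketch of a Prometheus alerting rule that carries the labels routed on
# above; the alert name matches the APIDown alert referenced in inhibit_rules below,
# but the expression, job name, threshold and runbook URL are illustrative assumptions:
#
#   groups:
#     - name: api-availability
#       rules:
#         - alert: APIDown
#           expr: up{job="cidadao-ai-backend"} == 0
#           for: 2m
#           labels:
#             severity: critical
#             pagerduty: 'true'
#             team: backend
#             service: api
#           annotations:
#             summary: "API instance {{ $labels.instance }} is down"
#             description: "Prometheus has not scraped {{ $labels.instance }} for 2 minutes."
#             runbook_url: "https://docs.cidadao.ai/runbooks/api-down"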
# Receivers define notification destinations
receivers:
  # Default receiver - logs only
  - name: 'default'
    webhook_configs:
      - url: 'http://localhost:9093/api/v1/alerts'
        send_resolved: true
  # PagerDuty for critical incidents
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
        description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          resolved: '{{ .Alerts.Resolved | len }}'
          description: '{{ .CommonAnnotations.description }}'
          runbook: '{{ .CommonAnnotations.runbook_url }}'
  # Slack channels for different severities
  - name: 'slack-critical'
    slack_configs:
      - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
        channel: '#alerts-critical'
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
        send_resolved: true
        color: 'danger'
        actions:
          - type: button
            text: 'View Dashboard'
            url: 'https://grafana.cidadao.ai/d/cidadao-ai-overview'
          - type: button
            text: 'Runbook'
            url: '{{ .CommonAnnotations.runbook_url }}'
  - name: 'slack-warning'
    slack_configs:
      - api_url: 'YOUR_SLACK_WARNING_WEBHOOK'
        channel: '#alerts-warning'
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
        send_resolved: true
        color: 'warning'
  # SLO violations channel
  - name: 'slo-violations'
    slack_configs:
      - api_url: 'YOUR_SLACK_SLO_WEBHOOK'
        channel: '#slo-violations'
        title: '📊 SLO Violation: {{ .GroupLabels.alertname }}'
        # .StartsAt is only defined per alert, not on the grouped notification data,
        # so take it from the first alert in the group
        text: |
          *Service:* Cidadão.AI Backend
          *SLO:* {{ .GroupLabels.alertname }}
          *Current Value:* {{ .CommonAnnotations.description }}
          *Time:* {{ (index .Alerts 0).StartsAt.Format "15:04:05 MST" }}
        send_resolved: true
        color: '#ff9800'
    email_configs:
      - to: '[email protected]'
        headers:
          Subject: 'SLO Violation: {{ .GroupLabels.alertname }}'
  # Team-specific receivers
  - name: 'ml-team'
    slack_configs:
      - api_url: 'YOUR_SLACK_ML_WEBHOOK'
        channel: '#ml-alerts'
        title: '🤖 ML Alert: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'
    email_configs:
      - to: '[email protected]'
  - name: 'backend-team'
    slack_configs:
      - api_url: 'YOUR_SLACK_BACKEND_WEBHOOK'
        channel: '#backend-alerts'
        title: '🔧 Backend Alert: {{ .GroupLabels.alertname }}'
        text: '{{ .CommonAnnotations.description }}'
    email_configs:
      - to: '[email protected]'
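# Prometheus only delivers alerts to these receivers if it is pointed at this
# Alertmanager. A minimal sketch of the corresponding prometheus.yml section; the
# target address and rule-file path are assumptions (default port 9093, same host):
#
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets: ['localhost:9093']
#   rule_files:
#     - 'alerts/*.yml'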
# Inhibition rules prevent certain alerts when others are firing
inhibit_rules:
  # Don't alert on high latency if the API is down
  - source_match:
      alertname: 'APIDown'
    target_match:
      alertname: 'APIHighLatency'
    equal: ['service']
  # Don't alert on agent failures if all agents are failing
  - source_match:
      alertname: 'AllAgentsFailing'
    target_match:
      alertname: 'AgentHighFailureRate'
    equal: ['service']
  # Don't alert on cache hit rate if Redis is down
  - source_match:
      alertname: 'RedisDown'
    target_match:
      alertname: 'LowCacheHitRate'
    equal: ['service']
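# The config can be validated and hot-reloaded without restarting Alertmanager;
# the path below assumes a standard /etc/alertmanager layout:
#
#   amtool check-config /etc/alertmanager/alertmanager.yml
#   curl -X POST http://localhost:9093/-/reload   # or send SIGHUP to the process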