anderson-ufrj
committed on
Commit · 2395a41
1 Parent(s): 640d7fb
feat(monitoring): configure comprehensive Prometheus alerts and Alertmanager
- Add critical alerts for API availability (trigger when availability drops below 99%)
- Configure latency alerts (warning at 2s, critical at 5s)
- Set up agent failure rate monitoring
- Add investigation backlog and stalled processing alerts
- Configure cache hit rate alerts (trigger when hit rate drops below 70%)
- Monitor circuit breaker states
- Add resource utilization alerts (memory, CPU)
- Configure database connection pool monitoring
- Set up anomaly detection rate alerts
- Add SLO violation alerts for key metrics
- Configure Alertmanager with routing rules
- Set up notification channels (email, Slack, PagerDuty)
- Add inhibition rules to prevent alert storms
- Create alert templates for better formatting
- Update Prometheus config to include new alerts
monitoring/alertmanager/alertmanager.yml
ADDED
@@ -0,0 +1,175 @@
+global:
+  # SMTP configuration for email alerts
+  smtp_smarthost: 'smtp.gmail.com:587'
+  smtp_from: '[email protected]'
+  smtp_auth_username: '[email protected]'
+  smtp_auth_password: 'YOUR_APP_PASSWORD'
+  smtp_require_tls: true
+
+  # Slack webhook URL
+  slack_api_url: 'YOUR_SLACK_WEBHOOK_URL'
+
+  # PagerDuty configuration
+  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
+
+# Templates for alert formatting
+templates:
+  - '/etc/alertmanager/templates/*.tmpl'
+
+# The root route for all alerts
+route:
+  # Default receiver for alerts
+  receiver: 'default'
+
+  # Group alerts by these labels
+  group_by: ['alertname', 'cluster', 'service', 'severity']
+
+  # Wait before sending grouped alerts
+  group_wait: 30s
+
+  # How long to wait before sending new alerts for a group
+  group_interval: 5m
+
+  # How long to wait before repeating alerts
+  repeat_interval: 12h
+
+  # Child routes for specific routing
+  routes:
+    # Critical alerts go to PagerDuty
+    - match:
+        severity: critical
+        pagerduty: 'true'
+      receiver: pagerduty
+      continue: true
+
+    # All critical alerts also go to Slack
+    - match:
+        severity: critical
+      receiver: slack-critical
+
+    # Warning alerts go to Slack
+    - match:
+        severity: warning
+      receiver: slack-warning
+
+    # SLO violations have special handling
+    - match:
+        slo: 'true'
+      receiver: slo-violations
+      group_interval: 1h
+      repeat_interval: 4h
+
+    # ML team alerts
+    - match:
+        team: ml
+      receiver: ml-team
+
+    # Backend team alerts
+    - match:
+        team: backend
+      receiver: backend-team
+
+# Receivers define notification destinations
+receivers:
+  # Default receiver - logs only
+  - name: 'default'
+    webhook_configs:
+      - url: 'http://localhost:9093/api/v1/alerts'
+        send_resolved: true
+
+  # PagerDuty for critical incidents
+  - name: 'pagerduty'
+    pagerduty_configs:
+      - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
+        description: '{{ .GroupLabels.alertname }} - {{ .CommonAnnotations.summary }}'
+        details:
+          firing: '{{ .Alerts.Firing | len }}'
+          resolved: '{{ .Alerts.Resolved | len }}'
+          description: '{{ .CommonAnnotations.description }}'
+          runbook: '{{ .CommonAnnotations.runbook_url }}'
+
+  # Slack channels for different severities
+  - name: 'slack-critical'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
+        channel: '#alerts-critical'
+        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
+        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
+        send_resolved: true
+        color: 'danger'
+        actions:
+          - type: button
+            text: 'View Dashboard'
+            url: 'https://grafana.cidadao.ai/d/cidadao-ai-overview'
+          - type: button
+            text: 'Runbook'
+            url: '{{ .CommonAnnotations.runbook_url }}'
+
+  - name: 'slack-warning'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_WARNING_WEBHOOK'
+        channel: '#alerts-warning'
+        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
+        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
+        send_resolved: true
+        color: 'warning'
+
+  # SLO violations channel
+  - name: 'slo-violations'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_SLO_WEBHOOK'
+        channel: '#slo-violations'
+        title: '📊 SLO Violation: {{ .GroupLabels.alertname }}'
+        text: |
+          *Service:* Cidadão.AI Backend
+          *SLO:* {{ .GroupLabels.alertname }}
+          *Current Value:* {{ .CommonAnnotations.description }}
+          *Time:* {{ (index .Alerts 0).StartsAt.Format "15:04:05 MST" }}
+        send_resolved: true
+        color: '#ff9800'
+    email_configs:
+      - to: '[email protected]'
+        headers:
+          Subject: 'SLO Violation: {{ .GroupLabels.alertname }}'
+
+  # Team-specific receivers
+  - name: 'ml-team'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_ML_WEBHOOK'
+        channel: '#ml-alerts'
+        title: '🤖 ML Alert: {{ .GroupLabels.alertname }}'
+        text: '{{ .CommonAnnotations.description }}'
+    email_configs:
+      - to: '[email protected]'
+
+  - name: 'backend-team'
+    slack_configs:
+      - api_url: 'YOUR_SLACK_BACKEND_WEBHOOK'
+        channel: '#backend-alerts'
+        title: '🔧 Backend Alert: {{ .GroupLabels.alertname }}'
+        text: '{{ .CommonAnnotations.description }}'
+    email_configs:
+      - to: '[email protected]'
+
+# Inhibition rules prevent certain alerts when others are firing
+inhibit_rules:
+  # Don't alert on high latency if API is down
+  - source_match:
+      alertname: 'APIDown'
+    target_match:
+      alertname: 'APIHighLatency'
+    equal: ['service']
+
+  # Don't alert on agent failures if all agents are failing
+  - source_match:
+      alertname: 'AllAgentsFailing'
+    target_match:
+      alertname: 'AgentHighFailureRate'
+    equal: ['service']
+
+  # Don't alert on cache hit rate if Redis is down
+  - source_match:
+      alertname: 'RedisDown'
+    target_match:
+      alertname: 'LowCacheHitRate'
+    equal: ['service']
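
Note: the SMTP password, Slack webhook URLs, and PagerDuty service key are committed here as inline placeholders. A minimal sketch of how they could be moved out of the file, assuming a reasonably recent Alertmanager that supports the *_file variants of these fields and assuming the secret paths shown (none of this is part of the commit):

global:
  # read secrets from mounted files instead of storing them in the config
  smtp_auth_password_file: /etc/alertmanager/secrets/smtp_password
  slack_api_url_file: /etc/alertmanager/secrets/slack_webhook

receivers:
  - name: 'pagerduty'
    pagerduty_configs:
      - service_key_file: /etc/alertmanager/secrets/pagerduty_service_key

Routing behavior is unchanged by this: a critical alert labelled pagerduty="true" still pages first and, because of continue: true, falls through to slack-critical as well.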
monitoring/alertmanager/templates/cidadao-ai.tmpl
ADDED
@@ -0,0 +1,82 @@
+{{ define "cidadao.ai.title" }}
+[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }}
+{{ end }}
+
+{{ define "cidadao.ai.text" }}
+{{ range .Alerts }}
+*Alert:* {{ .Labels.alertname }}
+*Severity:* {{ .Labels.severity }}
+*Service:* {{ .Labels.service }}
+*Instance:* {{ .Labels.instance }}
+*Description:* {{ .Annotations.description }}
+*Started:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
+{{ if .Annotations.runbook_url }}*Runbook:* <{{ .Annotations.runbook_url }}|View Runbook>{{ end }}
+{{ end }}
+{{ end }}
+
+{{ define "cidadao.ai.html" }}
+<!DOCTYPE html>
+<html>
+<head>
+<style>
+  body { font-family: Arial, sans-serif; }
+  .alert-box {
+    border: 2px solid #ddd;
+    padding: 10px;
+    margin: 10px 0;
+    border-radius: 5px;
+  }
+  .critical { border-color: #f44336; background-color: #ffebee; }
+  .warning { border-color: #ff9800; background-color: #fff3e0; }
+  .info { border-color: #2196f3; background-color: #e3f2fd; }
+  .resolved { border-color: #4caf50; background-color: #e8f5e9; }
+  h2 { margin-top: 0; }
+  .label { font-weight: bold; }
+  .metadata { color: #666; font-size: 0.9em; }
+</style>
+</head>
+<body>
+<h1>Cidadão.AI Alert Notification</h1>
+
+{{ range .Alerts }}
+<div class="alert-box {{ .Labels.severity }}">
+  <h2>{{ .Labels.alertname }}</h2>
+
+  <p><span class="label">Status:</span> {{ .Status }}</p>
+  <p><span class="label">Severity:</span> {{ .Labels.severity | toUpper }}</p>
+  <p><span class="label">Service:</span> {{ .Labels.service }}</p>
+  <p><span class="label">Instance:</span> {{ .Labels.instance }}</p>
+
+  <p><span class="label">Description:</span><br>
+  {{ .Annotations.description }}</p>
+
+  <div class="metadata">
+    <p><span class="label">Started:</span> {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}</p>
+    {{ if eq .Status "resolved" }}
+    <p><span class="label">Ended:</span> {{ .EndsAt.Format "2006-01-02 15:04:05 MST" }}</p>
+    {{ end }}
+
+    {{ if .Annotations.runbook_url }}
+    <p><span class="label">Runbook:</span> <a href="{{ .Annotations.runbook_url }}">View Documentation</a></p>
+    {{ end }}
+  </div>
+
+  <details>
+    <summary>All Labels</summary>
+    <ul>
+    {{ range $key, $value := .Labels }}
+      <li><strong>{{ $key }}:</strong> {{ $value }}</li>
+    {{ end }}
+    </ul>
+  </details>
+</div>
+{{ end }}
+
+<hr>
+<p style="color: #666; font-size: 0.8em;">
+  This alert was generated by the Cidadão.AI monitoring system.
+  For more information, visit the <a href="https://grafana.cidadao.ai">Grafana Dashboard</a>.
+</p>
+</body>
+</html>
+{{ end }}
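
Note: the file above defines cidadao.ai.title, cidadao.ai.text, and cidadao.ai.html, but the receivers in alertmanager.yml still format their notifications inline. A sketch, not part of this commit, of how a receiver could reference the named templates instead (the Slack receiver shown is the existing slack-critical; the email block is only illustrative):

receivers:
  - name: 'slack-critical'
    slack_configs:
      - api_url: 'YOUR_SLACK_CRITICAL_WEBHOOK'
        channel: '#alerts-critical'
        title: '{{ template "cidadao.ai.title" . }}'
        text: '{{ template "cidadao.ai.text" . }}'
        send_resolved: true
    email_configs:
      - to: '[email protected]'
        html: '{{ template "cidadao.ai.html" . }}'

Because templates: already globs /etc/alertmanager/templates/*.tmpl, the named definitions are available to every receiver without further configuration.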
monitoring/prometheus/alerts.yml
ADDED
@@ -0,0 +1,262 @@
+groups:
+  - name: cidadao_ai_critical_alerts
+    interval: 30s
+    rules:
+      # API Availability Alerts
+      - alert: APIHighErrorRate
+        expr: |
+          (
+            sum(rate(cidadao_ai_http_errors_total[5m]))
+            /
+            sum(rate(cidadao_ai_http_requests_total[5m]))
+          ) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+          team: backend
+        annotations:
+          summary: "High API error rate detected"
+          description: "API error rate is {{ $value | humanizePercentage }} over the last 5 minutes, exceeding the 5% threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-high-error-rate"
+
+      - alert: APIDown
+        expr: up{job="cidadao-ai-backend"} == 0
+        for: 1m
+        labels:
+          severity: critical
+          team: backend
+          pagerduty: "true"
+        annotations:
+          summary: "Cidadão.AI API is down"
+          description: "The API service has been down for more than 1 minute"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-down"
+
+      # Latency Alerts
+      - alert: APIHighLatency
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(cidadao_ai_request_duration_seconds_bucket[5m])) by (le)
+          ) > 2.0
+        for: 5m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "High API latency detected"
+          description: "P95 latency is {{ $value }}s, exceeding the 2s threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-high-latency"
+
+      - alert: APICriticalLatency
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(cidadao_ai_request_duration_seconds_bucket[5m])) by (le)
+          ) > 5.0
+        for: 3m
+        labels:
+          severity: critical
+          team: backend
+          pagerduty: "true"
+        annotations:
+          summary: "Critical API latency detected"
+          description: "P95 latency is {{ $value }}s, exceeding the 5s critical threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/api-critical-latency"
+
+      # Agent Performance Alerts
+      - alert: AgentHighFailureRate
+        expr: |
+          (
+            sum(rate(cidadao_ai_agent_tasks_total{status="failed"}[5m])) by (agent_name)
+            /
+            sum(rate(cidadao_ai_agent_tasks_total[5m])) by (agent_name)
+          ) > 0.10
+        for: 5m
+        labels:
+          severity: warning
+          team: ml
+        annotations:
+          summary: "Agent {{ $labels.agent_name }} has high failure rate"
+          description: "Agent {{ $labels.agent_name }} failure rate is {{ $value | humanizePercentage }} over the last 5 minutes"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/agent-high-failure-rate"
+
+      - alert: AllAgentsFailing
+        expr: |
+          (
+            sum(rate(cidadao_ai_agent_tasks_total{status="failed"}[5m]))
+            /
+            sum(rate(cidadao_ai_agent_tasks_total[5m]))
+          ) > 0.50
+        for: 2m
+        labels:
+          severity: critical
+          team: ml
+          pagerduty: "true"
+        annotations:
+          summary: "Multiple agents are failing"
+          description: "Overall agent failure rate is {{ $value | humanizePercentage }}, indicating systemic issues"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/all-agents-failing"
+
+      # Investigation Alerts
+      - alert: InvestigationBacklog
+        expr: cidadao_ai_active_investigations > 100
+        for: 10m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "High number of active investigations"
+          description: "{{ $value }} investigations are currently active, indicating potential processing delays"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/investigation-backlog"
+
+      - alert: InvestigationProcessingStalled
+        expr: |
+          rate(cidadao_ai_investigations_total{status="completed"}[10m]) == 0
+          and
+          cidadao_ai_active_investigations > 10
+        for: 15m
+        labels:
+          severity: critical
+          team: backend
+        annotations:
+          summary: "Investigation processing appears to be stalled"
+          description: "No investigations have completed in the last 15 minutes even though more than 10 are still active"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/investigation-stalled"
+
+      # Cache Performance Alerts
+      - alert: LowCacheHitRate
+        expr: |
+          (
+            sum(rate(cidadao_ai_cache_operations_total{result="hit"}[5m]))
+            /
+            sum(rate(cidadao_ai_cache_operations_total[5m]))
+          ) < 0.70
+        for: 10m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "Cache hit rate below threshold"
+          description: "Cache hit rate is {{ $value | humanizePercentage }}, below the 70% threshold"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/low-cache-hit-rate"
+
+      # Circuit Breaker Alerts
+      - alert: CircuitBreakerOpen
+        expr: cidadao_ai_circuit_breaker_state{state="open"} == 1
+        for: 2m
+        labels:
+          severity: warning
+          team: backend
+        annotations:
+          summary: "Circuit breaker {{ $labels.service_name }} is open"
+          description: "Service {{ $labels.service_name }} circuit breaker has been open for more than 2 minutes"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/circuit-breaker-open"
+
+      - alert: MultipleCircuitBreakersOpen
+        expr: |
+          count(cidadao_ai_circuit_breaker_state{state="open"} == 1) >= 3
+        for: 1m
+        labels:
+          severity: critical
+          team: backend
+          pagerduty: "true"
+        annotations:
+          summary: "Multiple circuit breakers are open"
+          description: "{{ $value }} circuit breakers are currently open, indicating widespread service issues"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/multiple-circuit-breakers"
+
+      # Resource Utilization Alerts
+      - alert: HighMemoryUsage
+        expr: |
+          (
+            container_memory_usage_bytes{pod=~"cidadao-ai-.*"}
+            /
+            container_spec_memory_limit_bytes{pod=~"cidadao-ai-.*"}
+          ) > 0.90
+        for: 5m
+        labels:
+          severity: warning
+          team: devops
+        annotations:
+          summary: "High memory usage in {{ $labels.pod }}"
+          description: "Memory usage is {{ $value | humanizePercentage }} of the limit"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/high-memory-usage"
+
+      - alert: HighCPUUsage
+        expr: |
+          (
+            rate(container_cpu_usage_seconds_total{pod=~"cidadao-ai-.*"}[5m])
+          ) > 0.80
+        for: 10m
+        labels:
+          severity: warning
+          team: devops
+        annotations:
+          summary: "High CPU usage in {{ $labels.pod }}"
+          description: "CPU usage is {{ $value | humanizePercentage }} over the last 10 minutes"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/high-cpu-usage"
+
+      # Database Connection Alerts
+      - alert: DatabaseConnectionPoolExhausted
+        expr: |
+          (
+            cidadao_ai_database_connections_in_use
+            /
+            cidadao_ai_database_connections_total
+          ) > 0.90
+        for: 5m
+        labels:
+          severity: critical
+          team: backend
+        annotations:
+          summary: "Database connection pool near exhaustion"
+          description: "{{ $value | humanizePercentage }} of database connections are in use"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/db-connection-pool"
+
+      # Anomaly Detection Alerts
+      - alert: HighAnomalyDetectionRate
+        expr: |
+          sum(increase(cidadao_ai_anomalies_detected_total[1h])) > 100
+        for: 5m
+        labels:
+          severity: warning
+          team: compliance
+        annotations:
+          summary: "Unusually high rate of anomaly detections"
+          description: "{{ $value }} anomalies detected over the last hour, which is unusually high"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/high-anomaly-rate"
+
+      # SLO Violation Alerts
+      - alert: SLOViolationAPIAvailability
+        expr: |
+          (
+            1 - (
+              sum(increase(cidadao_ai_http_errors_total[1h]))
+              /
+              sum(increase(cidadao_ai_http_requests_total[1h]))
+            )
+          ) < 0.99
+        for: 5m
+        labels:
+          severity: critical
+          team: backend
+          slo: "true"
+        annotations:
+          summary: "API Availability SLO violation"
+          description: "API availability is {{ $value | humanizePercentage }}, below the 99% SLO"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/slo-availability"
+
+      - alert: SLOViolationLatency
+        expr: |
+          (
+            sum(increase(cidadao_ai_request_duration_seconds_bucket{le="0.2"}[1h]))
+            /
+            sum(increase(cidadao_ai_request_duration_seconds_count[1h]))
+          ) < 0.95
+        for: 5m
+        labels:
+          severity: warning
+          team: backend
+          slo: "true"
+        annotations:
+          summary: "Latency SLO violation"
+          description: "Only {{ $value | humanizePercentage }} of requests completed under 200ms, below the 95% SLO"
+          runbook_url: "https://wiki.cidadao.ai/runbooks/slo-latency"
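
Note: a rule file of this size is easy to break silently, and promtool can unit-test it. Below is a minimal sketch, assuming a hypothetical tests/alerts_test.yml next to the rule file and an instance label of cidadao-ai:8000 (both assumptions, not part of this commit). It checks that APIDown fires once the backend target has reported down for more than a minute:

# tests/alerts_test.yml (run with: promtool test rules tests/alerts_test.yml)
rule_files:
  - ../alerts.yml

evaluation_interval: 30s

tests:
  - interval: 30s
    input_series:
      # the scrape target reports down from t=0 onwards
      - series: 'up{job="cidadao-ai-backend", instance="cidadao-ai:8000"}'
        values: '0x10'
    alert_rule_test:
      - eval_time: 5m
        alertname: APIDown
        exp_alerts:
          - exp_labels:
              severity: critical
              team: backend
              pagerduty: "true"
              job: cidadao-ai-backend
              instance: cidadao-ai:8000
            exp_annotations:
              summary: "Cidadão.AI API is down"
              description: "The API service has been down for more than 1 minute"
              runbook_url: "https://wiki.cidadao.ai/runbooks/api-down"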
monitoring/prometheus/prometheus.yml
CHANGED
@@ -6,17 +6,16 @@ global:
 
 rule_files:
   - "rules/*.yml"
+  - "alerts.yml"
 
 scrape_configs:
   # Cidadão.AI Backend Application Metrics
   - job_name: 'cidadao-ai-backend'
     static_configs:
-      - targets: ['cidadao-ai:
-    metrics_path: /
+      - targets: ['cidadao-ai:8000', 'localhost:8000']
+    metrics_path: /api/v1/observability/metrics
     scrape_interval: 10s
     scrape_timeout: 5s
-    params:
-      format: ['prometheus']
 
   # Prometheus Self-Monitoring
   - job_name: 'prometheus'
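
Note: this hunk adds alerts.yml to rule_files, but the visible part of prometheus.yml contains no alerting block; loading the rules only makes Prometheus evaluate them, it still has to be told where Alertmanager listens. A minimal sketch of that stanza, assuming Alertmanager is reachable as alertmanager:9093 (the hostname is an assumption, not part of this commit):

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

If this block already exists elsewhere in the file, nothing further is needed.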