---
# ========================================
# PROMETHEUS ALERTING RULES
# Enterprise Grade Monitoring
# ========================================
groups:
  # ========================================
  # Backend API Alerts
  # ========================================
  - name: backend_alerts
    interval: 30s
    rules:
      - alert: BackendDown
        expr: up{job="backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Backend API is down"
          description: "Backend API has been down for more than 1 minute"

      # Ratio of 5xx responses; 0.05 = 5% error rate sustained for 2 minutes.
      - alert: BackendHighErrorRate
        expr: rate(http_requests_total{job="backend", status=~"5.."}[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Backend high error rate"
          description: "Backend error rate is {{ $value | humanizePercentage }}"

      # p95 latency derived from the request-duration histogram.
      - alert: BackendHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 2
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Backend high response time"
          description: "95th percentile response time is {{ $value }}s"

      # Share of 2xx responses among all responses; below 95% is critical.
      - alert: BackendLowSuccessRate
        expr: rate(http_requests_total{job="backend", status=~"2.."}[5m]) / rate(http_requests_total{job="backend"}[5m]) < 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Backend low success rate"
          description: "Success rate is {{ $value | humanizePercentage }}"

  # ========================================
  # Database Alerts
  # ========================================
  - name: database_alerts
    interval: 30s
    rules:
      - alert: PostgreSQLDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL database has been down for more than 1 minute"

      - alert: PostgreSQLHighConnections
        expr: pg_stat_activity_count > 150
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL high connection count"
          description: "PostgreSQL has {{ $value }} connections (> 150)"

      - alert: PostgreSQLReplicationLag
        expr: pg_replication_lag_seconds > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL replication lag"
          description: "Replication lag is {{ $value }}s"

      - alert: PostgreSQLSlowQueries
        expr: rate(pg_stat_statements_seconds_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL slow queries detected"
          description: "Slow query rate is {{ $value }}s/s"

  # ========================================
  # Redis Alerts
  # ========================================
  - name: redis_alerts
    interval: 30s
    rules:
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis has been down for more than 1 minute"

      - alert: RedisHighMemoryUsage
        expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis high memory usage"
          description: "Redis memory usage is {{ $value | humanizePercentage }}"

      - alert: RedisRejectedConnections
        expr: rate(redis_rejected_connections_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Redis rejected connections"
          description: "Redis is rejecting connections"

  # ========================================
  # Worker Alerts
  # ========================================
  - name: worker_alerts
    interval: 30s
    rules:
      - alert: PDFWorkerDown
        expr: up{job="pdf-worker"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "PDF Worker is down"
          description: "PDF Worker has been down for more than 2 minutes"

      - alert: ExerciseWorkerDown
        expr: up{job="exercise-worker"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Exercise Worker is down"
          description: "Exercise Worker has been down for more than 2 minutes"

      - alert: NotificationWorkerDown
        expr: up{job="notification-worker"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Notification Worker is down"
          description: "Notification Worker has been down for more than 2 minutes"

      # CPU busy % per instance, derived from the idle counter.
      - alert: WorkerHighCPUUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Worker high CPU usage"
          description: "CPU usage is {{ $value }}%"

  # ========================================
  # Infrastructure Alerts
  # ========================================
  - name: infrastructure_alerts
    interval: 30s
    rules:
      - alert: NodeHighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node high memory usage"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      - alert: NodeDiskFull
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node disk is filling up"
          description: "Disk has {{ $value | humanizePercentage }} available"

      - alert: NodeHighLoad
        expr: node_load1 > 4
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Node high load average"
          description: "Load average is {{ $value }}"

      # NOTE(review): threshold "> 0" fires on any container (re)start in a
      # 15m window, not only frequent restarts — confirm intended sensitivity.
      - alert: ContainerHighRestartRate
        expr: rate(container_start_count_total[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container restarting frequently"
          description: "Container {{ $labels.name }} is restarting"