# Proyecto: 45 ejercicios universitarios (Basic → Advanced), renderizado LaTeX,
# IA generativa (Z.ai/DashScope), Docker (9 servicios), tests 123/123,
# seguridad enterprise (JWT, XSS, rate limiting).
# Stack: Next.js 14 + Node.js 20, PostgreSQL 15 + Redis 7, Docker Compose, Nginx + SSL.
# Estado: producción lista.
# ========================================
# PROMETHEUS ALERTING RULES
# Enterprise Grade Monitoring
# ========================================
groups:
  # ========================================
  # Backend API Alerts
  # ========================================
  - name: backend_alerts
    interval: 30s
    rules:
      # Scrape target unreachable — the most fundamental availability signal.
      - alert: BackendDown
        expr: up{job="backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Backend API is down"
          description: "Backend API has been down for more than 1 minute"

      # More than 5% of requests returning HTTP 5xx over the last 5 minutes.
      - alert: BackendHighErrorRate
        expr: rate(http_requests_total{job="backend", status=~"5.."}[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Backend high error rate"
          description: "Backend error rate is {{ $value | humanizePercentage }}"

      # p95 latency above 2 seconds, computed from the request-duration histogram.
      - alert: BackendHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 2
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Backend high response time"
          description: "95th percentile response time is {{ $value }}s"

      # Share of 2xx responses below 95%.
      # NOTE(review): with zero traffic the expression is 0/0 = NaN, which does
      # NOT fire — confirm that "no traffic" should not alert here.
      - alert: BackendLowSuccessRate
        expr: rate(http_requests_total{job="backend", status=~"2.."}[5m]) / rate(http_requests_total{job="backend"}[5m]) < 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Backend low success rate"
          description: "Success rate is {{ $value | humanizePercentage }}"
# ========================================
|
|
# Database Alerts
|
|
# ========================================
|
|
- name: database_alerts
|
|
interval: 30s
|
|
rules:
|
|
- alert: PostgreSQLDown
|
|
expr: up{job="postgres"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "PostgreSQL is down"
|
|
description: "PostgreSQL database has been down for more than 1 minute"
|
|
|
|
- alert: PostgreSQLHighConnections
|
|
expr: pg_stat_activity_count > 150
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "PostgreSQL high connection count"
|
|
description: "PostgreSQL has {{ $value }} connections (> 150)"
|
|
|
|
- alert: PostgreSQLReplicationLag
|
|
expr: pg_replication_lag_seconds > 30
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "PostgreSQL replication lag"
|
|
description: "Replication lag is {{ $value }}s"
|
|
|
|
- alert: PostgreSQLSlowQueries
|
|
expr: rate(pg_stat_statements_seconds_total[5m]) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "PostgreSQL slow queries detected"
|
|
description: "Slow query rate is {{ $value }}s/s"
|
|
|
|
# ========================================
|
|
# Redis Alerts
|
|
# ========================================
|
|
- name: redis_alerts
|
|
interval: 30s
|
|
rules:
|
|
- alert: RedisDown
|
|
expr: up{job="redis"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Redis is down"
|
|
description: "Redis has been down for more than 1 minute"
|
|
|
|
- alert: RedisHighMemoryUsage
|
|
expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Redis high memory usage"
|
|
description: "Redis memory usage is {{ $value | humanizePercentage }}"
|
|
|
|
- alert: RedisRejectedConnections
|
|
expr: rate(redis_rejected_connections_total[5m]) > 0
|
|
for: 1m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Redis rejected connections"
|
|
description: "Redis is rejecting connections"
|
|
|
|
# ========================================
|
|
# Worker Alerts
|
|
# ========================================
|
|
- name: worker_alerts
|
|
interval: 30s
|
|
rules:
|
|
- alert: PDFWorkerDown
|
|
expr: up{job="pdf-worker"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "PDF Worker is down"
|
|
|
|
- alert: ExerciseWorkerDown
|
|
expr: up{job="exercise-worker"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Exercise Worker is down"
|
|
|
|
- alert: NotificationWorkerDown
|
|
expr: up{job="notification-worker"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Notification Worker is down"
|
|
|
|
- alert: WorkerHighCPUUsage
|
|
expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Worker high CPU usage"
|
|
description: "CPU usage is {{ $value }}%"
|
|
|
|
# ========================================
|
|
# Infrastructure Alerts
|
|
# ========================================
|
|
- name: infrastructure_alerts
|
|
interval: 30s
|
|
rules:
|
|
- alert: NodeHighMemoryUsage
|
|
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Node high memory usage"
|
|
description: "Memory usage is {{ $value | humanizePercentage }}"
|
|
|
|
- alert: NodeDiskFull
|
|
expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Node disk is filling up"
|
|
description: "Disk has {{ $value | humanizePercentage }} available"
|
|
|
|
- alert: NodeHighLoad
|
|
expr: node_load1 > 4
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Node high load average"
|
|
description: "Load average is {{ $value }}"
|
|
|
|
- alert: ContainerHighRestartRate
|
|
expr: rate(container_start_count_total[15m]) > 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Container restarting frequently"
|
|
description: "Container {{ $labels.name }} is restarting"
|