🎓 Initial commit: Math2 Platform - Linear Algebra Platform PRO

✨ Features:
- 45 university-level exercises (Basic → Advanced)
- Professional LaTeX rendering
- Generative AI (Z.ai/DashScope)
- 9-service Docker stack
- Tests: 123/123 passing
- Enterprise security (JWT, XSS, rate limiting)

🐳 Infrastructure:
- Next.js 14 + Node.js 20
- PostgreSQL 15 + Redis 7
- Complete Docker Compose setup
- Nginx + SSL ready

📚 Documentation:
- 5 complete technical reports
- Professional README
- Automated deployment scripts

Status: Production ready ✅
monitoring/alertmanager/alertmanager.yml (new file)
@@ -0,0 +1,88 @@
# ========================================
# ALERTMANAGER CONFIGURATION
# Enterprise Alert Routing
# ========================================

global:
  smtp_smarthost: '${SMTP_HOST:-localhost:587}'
  smtp_from: '${SMTP_FROM:-alerts@mathplatform.com}'
  smtp_auth_username: '${SMTP_USER:-}'
  smtp_auth_password: '${SMTP_PASSWORD:-}'
  slack_api_url: '${SLACK_WEBHOOK_URL:-}'
  telegram_api_url: 'https://api.telegram.org'

# Templates
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Inhibition rules
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']

# Route tree
route:
  receiver: 'default-receiver'
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts
    - match:
        severity: critical
      receiver: 'critical-receiver'
      continue: true

    # Database alerts
    - match:
        job: postgres
      receiver: 'database-receiver'
      group_interval: 10m

    # Backend alerts
    - match:
        job: backend
      receiver: 'backend-receiver'
      group_interval: 5m

# Receivers
receivers:
  - name: 'default-receiver'
    slack_configs:
      - channel: '#alerts'
        title: 'Math Platform Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        send_resolved: true

  - name: 'critical-receiver'
    slack_configs:
      - channel: '#critical-alerts'
        title: 'CRITICAL: Math Platform'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
        send_resolved: true
    email_configs:
      - to: '${CRITICAL_EMAIL:-admin@mathplatform.com}'
        subject: 'CRITICAL Alert: {{ .GroupLabels.alertname }}'
        html: '{{ template "email.default.html" . }}'
        send_resolved: true
    telegram_configs:
      - bot_token: '${TELEGRAM_BOT_TOKEN}'
        chat_id: '${TELEGRAM_ADMIN_CHAT_ID}'
        message: '🔴 CRITICAL: {{ .GroupLabels.alertname }} - {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
        send_resolved: true

  - name: 'database-receiver'
    slack_configs:
      - channel: '#database-alerts'
        title: 'Database Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  - name: 'backend-receiver'
    slack_configs:
      - channel: '#backend-alerts'
        title: 'Backend Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
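Note that Alertmanager does not expand ${VAR:-default} placeholders on its own, so this file has to be rendered (for example with envsubst) before the container starts. A minimal docker-compose sketch for wiring the config in, assuming the repository layout above; the image tag and the pre-render step are illustrative, not part of this commit:

services:
  alertmanager:
    image: prom/alertmanager:v0.27.0
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    ports:
      - '9093:9093'
    volumes:
      # Rendered ahead of time, e.g.:
      #   envsubst < alertmanager.yml.tmpl > alertmanager.yml
      - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - ./monitoring/alertmanager/templates:/etc/alertmanager/templates:ro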
monitoring/grafana/dashboards/dashboards.yml (new file)
@@ -0,0 +1,12 @@
apiVersion: 1

providers:
  - name: 'Math Platform Dashboards'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    editable: true
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
monitoring/grafana/datasources/datasources.yml (new file)
@@ -0,0 +1,22 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: "15s"
      httpMethod: POST
      manageAlerts: true
      alertmanagerUid: alertmanager

  - name: Alertmanager
    type: alertmanager
    uid: alertmanager  # must match alertmanagerUid in the Prometheus datasource above
    access: proxy
    url: http://alertmanager:9093
    editable: false
    jsonData:
      implementation: prometheus
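Both provisioning files are read by Grafana at startup; the dashboards provider then scans options.path for dashboard JSON files. A minimal sketch of the corresponding service definition, assuming the repository layout above (the image tag and host port are illustrative):

services:
  grafana:
    image: grafana/grafana:10.4.0
    ports:
      - '3030:3000'
    volumes:
      # dashboards.yml plus the dashboard JSON files it points at
      - ./monitoring/grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      # datasources.yml
      - ./monitoring/grafana/datasources:/etc/grafana/provisioning/datasources:ro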
monitoring/prometheus/prometheus.yml (new file)
@@ -0,0 +1,88 @@
# ========================================
# PROMETHEUS CONFIGURATION
# Enterprise Monitoring Setup
# ========================================

global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'math-platform'
    replica: '{{.ExternalURL}}'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Load rules once and periodically evaluate them
rule_files:
  - /etc/prometheus/rules/*.yml

# Scrape configurations
scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Backend API
  - job_name: 'backend'
    static_configs:
      - targets: ['backend:3001']
    metrics_path: '/metrics'
    scrape_interval: 10s
    scrape_timeout: 5s

  # Frontend
  - job_name: 'frontend'
    static_configs:
      - targets: ['frontend:3000']
    scrape_interval: 30s

  # PostgreSQL (via postgres_exporter)
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
    scrape_interval: 15s

  # Redis (via redis_exporter)
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
    scrape_interval: 15s

  # Workers
  - job_name: 'pdf-worker'
    static_configs:
      - targets: ['pdf-worker:3002']
    scrape_interval: 30s

  - job_name: 'exercise-worker'
    static_configs:
      - targets: ['exercise-worker:3003']
    scrape_interval: 30s

  - job_name: 'notification-worker'
    static_configs:
      - targets: ['notification-worker:3004']
    scrape_interval: 30s

  # Nginx
  - job_name: 'nginx'
    static_configs:
      - targets: ['nginx:9113']
    scrape_interval: 30s

  # Node Exporter (host metrics)
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
    scrape_interval: 15s

  # Docker Daemon
  - job_name: 'docker'
    static_configs:
      - targets: ['docker-exporter:9323']
    scrape_interval: 30s
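The postgres, redis, and node-exporter jobs assume exporter sidecars running next to the main services, with service names matching the scrape targets above. A sketch of those three; the images are the usual exporters, but the tags and the DSN credentials are assumptions:

services:
  postgres-exporter:
    image: prometheuscommunity/postgres-exporter:v0.15.0
    environment:
      # Assumed user/database; must match the postgres service
      - DATA_SOURCE_NAME=postgresql://postgres:${POSTGRES_PASSWORD}@postgres:5432/postgres?sslmode=disable
  redis-exporter:
    image: oliver006/redis_exporter:v1.58.0
    environment:
      - REDIS_ADDR=redis://redis:6379
  node-exporter:
    image: prom/node-exporter:v1.7.0
    pid: host
    volumes:
      - /:/host:ro,rslave
    command:
      - '--path.rootfs=/host'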
monitoring/prometheus/rules/alerts.yml (new file)
@@ -0,0 +1,203 @@
# ========================================
# PROMETHEUS ALERTING RULES
# Enterprise Grade Monitoring
# ========================================

groups:
  # ========================================
  # Backend API Alerts
  # ========================================
  - name: backend_alerts
    interval: 30s
    rules:
      - alert: BackendDown
        expr: up{job="backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Backend API is down"
          description: "Backend API has been down for more than 1 minute"

      - alert: BackendHighErrorRate
        expr: rate(http_requests_total{job="backend", status=~"5.."}[5m]) > 0.05
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Backend high error rate"
          description: "Backend error rate is {{ $value | humanizePercentage }}"

      - alert: BackendHighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="backend"}[5m])) > 2
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Backend high response time"
          description: "95th percentile response time is {{ $value }}s"

      - alert: BackendLowSuccessRate
        expr: rate(http_requests_total{job="backend", status=~"2.."}[5m]) / rate(http_requests_total{job="backend"}[5m]) < 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Backend low success rate"
          description: "Success rate is {{ $value | humanizePercentage }}"

  # ========================================
  # Database Alerts
  # ========================================
  - name: database_alerts
    interval: 30s
    rules:
      - alert: PostgreSQLDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL database has been down for more than 1 minute"

      - alert: PostgreSQLHighConnections
        expr: pg_stat_activity_count > 150
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL high connection count"
          description: "PostgreSQL has {{ $value }} connections (> 150)"

      - alert: PostgreSQLReplicationLag
        expr: pg_replication_lag_seconds > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL replication lag"
          description: "Replication lag is {{ $value }}s"

      - alert: PostgreSQLSlowQueries
        expr: rate(pg_stat_statements_seconds_total[5m]) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL slow queries detected"
          description: "Slow query rate is {{ $value }}s/s"

  # ========================================
  # Redis Alerts
  # ========================================
  - name: redis_alerts
    interval: 30s
    rules:
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis has been down for more than 1 minute"

      - alert: RedisHighMemoryUsage
        expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis high memory usage"
          description: "Redis memory usage is {{ $value | humanizePercentage }}"

      - alert: RedisRejectedConnections
        expr: rate(redis_rejected_connections_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Redis rejected connections"
          description: "Redis is rejecting connections"

  # ========================================
  # Worker Alerts
  # ========================================
  - name: worker_alerts
    interval: 30s
    rules:
      - alert: PDFWorkerDown
        expr: up{job="pdf-worker"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "PDF Worker is down"

      - alert: ExerciseWorkerDown
        expr: up{job="exercise-worker"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Exercise Worker is down"

      - alert: NotificationWorkerDown
        expr: up{job="notification-worker"} == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Notification Worker is down"

      - alert: WorkerHighCPUUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Worker high CPU usage"
          description: "CPU usage is {{ $value }}%"

  # ========================================
  # Infrastructure Alerts
  # ========================================
  - name: infrastructure_alerts
    interval: 30s
    rules:
      - alert: NodeHighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node high memory usage"
          description: "Memory usage is {{ $value | humanizePercentage }}"

      - alert: NodeDiskFull
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node disk is filling up"
          description: "Disk has {{ $value | humanizePercentage }} available"

      - alert: NodeHighLoad
        expr: node_load1 > 4
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Node high load average"
          description: "Load average is {{ $value }}"

      - alert: ContainerHighRestartRate
        expr: rate(container_start_count_total[15m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container restarting frequently"
          description: "Container {{ $labels.name }} is restarting"
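These rules can be unit-tested with promtool (promtool test rules <file>). A minimal sketch covering BackendDown; the test filename and the instance label are hypothetical:

# alerts_test.yml (hypothetical filename)
rule_files:
  - alerts.yml

evaluation_interval: 30s

tests:
  - interval: 30s
    input_series:
      # Backend target scraped as down from t=0
      - series: 'up{job="backend", instance="backend:3001"}'
        values: '0 0 0 0 0 0'
    alert_rule_test:
      # After 2m the 1m "for" clause has elapsed, so the alert fires
      - eval_time: 2m
        alertname: BackendDown
        exp_alerts:
          - exp_labels:
              severity: critical
              job: backend
              instance: backend:3001
            exp_annotations:
              summary: "Backend API is down"
              description: "Backend API has been down for more than 1 minute"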